Browse Source

new merge (sched_policies not yet done)

Andra Hugo 14 years ago
parent
commit
64ee8754cc
100 changed files with 2572 additions and 1138 deletions
  1. 1 0
      .gitignore
  2. 2 0
      AUTHORS
  3. 7 1
      Makefile.am
  4. 42 23
      configure.ac
  5. 6 1
      doc/Makefile.am
  6. 160 0
      doc/c-extensions.texi
  7. 48 11
      doc/starpu.texi
  8. 2 0
      examples/Makefile.am
  9. 2 2
      examples/basic_examples/hello_world.c
  10. 3 3
      examples/basic_examples/mult.c
  11. 384 0
      examples/basic_examples/mult_impl.c
  12. 2 2
      examples/basic_examples/vector_scal_cpu.c
  13. 10 9
      examples/cholesky/cholesky_models.c
  14. 1 1
      examples/filters/fblock_cpu.c
  15. 1 1
      examples/filters/fmatrix.c
  16. 1 1
      examples/filters/fvector.c
  17. 13 12
      examples/heat/lu_kernels_model.c
  18. 1 1
      examples/mandelbrot/mandelbrot.c
  19. 6 6
      examples/ppm_downscaler/ppm_downscaler.c
  20. 8 8
      examples/ppm_downscaler/yuv_downscaler.c
  21. 1 1
      examples/profiling/profiling.c
  22. 4 4
      examples/reductions/dot_product.c
  23. 2 2
      examples/reductions/minmax_reduction.c
  24. 1 1
      examples/socl/mandelbrot/mandelbrot.c
  25. 2 2
      examples/spmv/spmv.c
  26. 1 0
      examples/starpufft/.gitignore
  27. 1 0
      examples/stencil/.gitignore
  28. 2 2
      examples/stencil/stencil-blocks.c
  29. 33 25
      examples/stencil/stencil-kernels.c
  30. 3 3
      examples/stencil/stencil.c
  31. 1 1
      examples/tag_example/tag_example.c
  32. 1 1
      examples/tag_example/tag_example2.c
  33. 1 1
      examples/tag_example/tag_example3.c
  34. 2 2
      examples/tag_example/tag_restartable.c
  35. 3 3
      examples/top/hello_world_top.c
  36. 2 0
      gcc-plugin/README
  37. 5 0
      gcc-plugin/examples/Makefile.am
  38. 8 36
      gcc-plugin/examples/matrix-mult.c
  39. 5 2
      gcc-plugin/src/Makefile.am
  40. 234 0
      gcc-plugin/src/c-expr.y
  41. 8 0
      gcc-plugin/src/starpu-gcc-config.h.in
  42. 148 96
      gcc-plugin/src/starpu.c
  43. 1 0
      gcc-plugin/tests/Makefile.am
  44. 2 2
      gcc-plugin/tests/acquire-errors.c
  45. 4 1
      gcc-plugin/tests/lib-user.c
  46. 8 2
      gcc-plugin/tests/register-errors.c
  47. 39 1
      gcc-plugin/tests/register.c
  48. 11 5
      gcc-plugin/tests/run-test.in
  49. 2 2
      gcc-plugin/tests/unregister-errors.c
  50. 27 0
      gcc-plugin/tests/wait-errors.c
  51. 1 0
      include/starpu_config.h.in
  52. 2 0
      include/starpu_data_filters.h
  53. 5 4
      include/starpu_perfmodel.h
  54. 6 4
      include/starpu_scheduler.h
  55. 19 1
      include/starpu_task.h
  56. 3 2
      include/starpu_task_bundle.h
  57. 2 0
      m4/.gitignore
  58. 9 1
      m4/gcc.m4
  59. 1 0
      mpi/.gitignore
  60. 1 0
      mpi/tests/.gitignore
  61. 8 6
      socl/src/Makefile.am
  62. 3 4
      socl/src/cl_createcommandqueue.c
  63. 11 11
      socl/src/cl_createkernel.c
  64. 2 2
      socl/src/cl_createprogramwithsource.c
  65. 4 3
      socl/src/cl_enqueuebarrier.c
  66. 40 28
      socl/src/cl_enqueuecopybuffer.c
  67. 25 48
      socl/src/cl_enqueuemapbuffer.c
  68. 15 5
      socl/src/cl_enqueuemarker.c
  69. 124 250
      socl/src/cl_enqueuendrangekernel.c
  70. 36 24
      socl/src/cl_enqueuereadbuffer.c
  71. 4 16
      socl/src/cl_enqueuetask.c
  72. 18 14
      socl/src/cl_enqueueunmapmemobject.c
  73. 3 4
      socl/src/cl_enqueuewaitforevents.c
  74. 42 33
      socl/src/cl_enqueuewritebuffer.c
  75. 9 6
      socl/src/cl_finish.c
  76. 2 1
      socl/src/cl_geteventinfo.c
  77. 1 1
      socl/src/cl_getkernelinfo.c
  78. 5 3
      socl/src/cl_setkernelarg.c
  79. 6 1
      socl/src/cl_waitforevents.c
  80. 235 0
      socl/src/command.c
  81. 198 0
      socl/src/command.h
  82. 40 0
      socl/src/command_list.c
  83. 28 0
      socl/src/command_list.h
  84. 124 53
      socl/src/command_queue.c
  85. 8 4
      socl/src/command_queue.h
  86. 76 0
      socl/src/debug.c
  87. 4 57
      socl/src/debug.h
  88. 13 15
      socl/src/event.c
  89. 5 0
      socl/src/event.h
  90. 6 2
      socl/src/gc.c
  91. 1 1
      socl/src/gc.h
  92. 0 123
      socl/src/graph.c
  93. 0 73
      socl/src/graph.h
  94. 0 4
      socl/src/init.c
  95. 39 18
      socl/src/socl.h
  96. 61 35
      socl/src/task.c
  97. 17 4
      socl/src/task.h
  98. 24 0
      socl/src/util.c
  99. 24 0
      socl/src/util.h
  100. 0 0
      src/common/htable32.c

+ 1 - 0
.gitignore

@@ -185,3 +185,4 @@ starpu.log
 /gcc-plugin/tests/unregister
 /gcc-plugin/tests/unregister
 /gcc-plugin/tests/lib-user
 /gcc-plugin/tests/lib-user
 /gcc-plugin/examples/matrix-mult
 /gcc-plugin/examples/matrix-mult
+/gcc-plugin/src/c-expr.c

+ 2 - 0
AUTHORS

@@ -10,3 +10,5 @@ William Braik <wbraik@gmail.com>
 Yann Courtois <yann.courtois33@gmail.com>
 Yann Courtois <yann.courtois33@gmail.com>
 Jean-Marie Couteyen <jm.couteyen@gmail.com>
 Jean-Marie Couteyen <jm.couteyen@gmail.com>
 Anthony Roy <theanthony33@gmail.com>
 Anthony Roy <theanthony33@gmail.com>
+David Gómez <david_gomez1380@yahoo.com.mx>
+Nguyen Quôc Dinh <nguyen.quocdinh@gmail.com>

+ 7 - 1
Makefile.am

@@ -18,6 +18,8 @@ ACLOCAL_AMFLAGS=-I m4
 CLEANFILES = *.gcno *.gcda *.linkinfo
 CLEANFILES = *.gcno *.gcda *.linkinfo
 
 
 SUBDIRS = src
 SUBDIRS = src
+SUBDIRS += tools tests doc
+
 if USE_MPI
 if USE_MPI
 SUBDIRS += mpi
 SUBDIRS += mpi
 endif
 endif
@@ -26,7 +28,7 @@ if BUILD_SOCL
 SUBDIRS += socl
 SUBDIRS += socl
 endif
 endif
 
 
-SUBDIRS += tools examples tests doc
+SUBDIRS += examples
 
 
 if COND_OPT
 if COND_OPT
 SUBDIRS += tests/opt examples/opt
 SUBDIRS += tests/opt examples/opt
@@ -68,6 +70,8 @@ clean-local:
 install-exec-local:
 install-exec-local:
 	$(MKDIR_P) $(DESTDIR)$(bindir)
 	$(MKDIR_P) $(DESTDIR)$(bindir)
 	$(INSTALL_STRIP_PROGRAM) starpu-top/StarPU-Top $(DESTDIR)$(bindir)
 	$(INSTALL_STRIP_PROGRAM) starpu-top/StarPU-Top $(DESTDIR)$(bindir)
+uninstall-local:
+	$(RM) $(DESTDIR)$(bindir)/StarPU-Top
 endif
 endif
 
 
 if STARPU_HAVE_WINDOWS
 if STARPU_HAVE_WINDOWS
@@ -77,3 +81,5 @@ txtdir = ${docdir}
 endif
 endif
 txt_DATA = AUTHORS COPYING.LGPL README
 txt_DATA = AUTHORS COPYING.LGPL README
 EXTRA_DIST = AUTHORS COPYING.LGPL README
 EXTRA_DIST = AUTHORS COPYING.LGPL README
+
+include starpu-top/extradist

+ 42 - 23
configure.ac

@@ -2,6 +2,7 @@
 #
 #
 # Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
 # Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2011  Télécom-SudParis
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
 # it under the terms of the GNU Lesser General Public License as published by
@@ -33,8 +34,8 @@ AC_PROG_SED
 AC_PROG_LN_S
 AC_PROG_LN_S
 AC_PROG_F77
 AC_PROG_F77
 
 
-AC_LIBTOOL_WIN32_DLL
-AC_PROG_LIBTOOL
+LT_PREREQ([2.2])
+LT_INIT([win32-dll])
 
 
 AC_PROG_INSTALL
 AC_PROG_INSTALL
 AC_PROG_MKDIR_P
 AC_PROG_MKDIR_P
@@ -158,18 +159,6 @@ if test x$enable_libnuma = xyes; then
 fi
 fi
 
 
 ###############################################################################
 ###############################################################################
-#									      #
-#				SCED_CTX settings			      #
-#									      #
-###############################################################################
-AC_MSG_CHECKING(maximum number of sched_ctxs)
-AC_ARG_ENABLE(max_sched_ctxs, [AS_HELP_STRING([--enable-max-sched-ctxs=<number>],
-			[maximum number of sched_ctxs])],
-			max_sched_ctxs=$enableval, max_sched_ctxs=10)
-AC_MSG_RESULT($max_sched_ctxs)
-AC_DEFINE_UNQUOTED(STARPU_NMAX_SCHED_CTXS, [$max_sched_ctxs], [Maximum number of sched_ctxs supported])
-
-###############################################################################
 #                                                                             #
 #                                                                             #
 #                                 CPUs settings                               #
 #                                 CPUs settings                               #
 #                                                                             #
 #                                                                             #
@@ -282,6 +271,7 @@ AC_DEFUN([STARPU_CHECK_CUDA],
 
 
     if test "$have_valid_cuda" = "no" ; then
     if test "$have_valid_cuda" = "no" ; then
         LDFLAGS="${SAVED_LDFLAGS}"
         LDFLAGS="${SAVED_LDFLAGS}"
+	unset STARPU_CUDA_LDFLAGS
     fi
     fi
 ])
 ])
 
 
@@ -460,8 +450,10 @@ if test x$have_curand = xyes; then
 fi
 fi
 
 
 # Peer transfers are only supported since CUDA 4.0
 # Peer transfers are only supported since CUDA 4.0
+# Disable them if user explicity wants to disable them
+AC_ARG_ENABLE(cuda_memcpy_peer, [AS_HELP_STRING([--disable-cuda-memcpy-peer], [do not allow peer transfers when using CUDA 4.0])],, [enable_cuda_memcpy_peer=yes])
 have_cuda_memcpy_peer=no
 have_cuda_memcpy_peer=no
-if test x$enable_cuda = xyes; then
+if test x$enable_cuda_memcpy_peer = xyes -a x$enable_cuda = xyes ; then
     SAVED_LDFLAGS="${LDFLAGS}"
     SAVED_LDFLAGS="${LDFLAGS}"
     LDFLAGS="${LDFLAGS} ${STARPU_CUDA_LDFLAGS}"
     LDFLAGS="${LDFLAGS} ${STARPU_CUDA_LDFLAGS}"
     AC_CHECK_FUNC([cudaMemcpyPeer], have_cuda_memcpy_peer=yes, have_cuda_memcpy_peer=no)
     AC_CHECK_FUNC([cudaMemcpyPeer], have_cuda_memcpy_peer=yes, have_cuda_memcpy_peer=no)
@@ -898,6 +890,15 @@ AC_MSG_CHECKING(Maximum number of workers)
 AC_MSG_RESULT($nmaxworkers)
 AC_MSG_RESULT($nmaxworkers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
 
 
+# Computes the maximum number of implementations per arch
+AC_MSG_CHECKING(maximum number of implementations)
+AC_ARG_ENABLE(maximplementations, [AS_HELP_STRING([--enable-maximplementations=<number>],
+		[maximum number of implementations])],
+		maximplementations=$enableval, maximplementations=1)
+AC_MSG_RESULT($maximplementations)
+AC_DEFINE_UNQUOTED(STARPU_MAXIMPLEMENTATIONS, [$maximplementations],
+		[maximum number of implementations])
+
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
 #                                    MPI                                      #
 #                                    MPI                                      #
@@ -1116,6 +1117,10 @@ else
    run_gcc_plugin_test_suite="no"
    run_gcc_plugin_test_suite="no"
 fi
 fi
 
 
+# Bison is used to generate the C expression parser.  The generated
+# parser is part of the distribution, though.
+AC_PROG_YACC
+
 AM_CONDITIONAL([BUILD_GCC_PLUGIN], [test "x$build_gcc_plugin" = "xyes"])
 AM_CONDITIONAL([BUILD_GCC_PLUGIN], [test "x$build_gcc_plugin" = "xyes"])
 AM_CONDITIONAL([HAVE_GUILE], [test "x$GUILE" != "x"])
 AM_CONDITIONAL([HAVE_GUILE], [test "x$GUILE" != "x"])
 
 
@@ -1285,6 +1290,10 @@ AC_MSG_CHECKING(which BLAS lib should be used)
 AC_MSG_RESULT($blas_lib)
 AC_MSG_RESULT($blas_lib)
 AC_SUBST(BLAS_LIB,$blas_lib)
 AC_SUBST(BLAS_LIB,$blas_lib)
 
 
+##########################################
+# FFT                                    #
+##########################################
+
 have_fftw=no
 have_fftw=no
 have_fftwf=no
 have_fftwf=no
 have_fftwl=no
 have_fftwl=no
@@ -1295,7 +1304,7 @@ PKG_CHECK_MODULES([FFTW],  [fftw3],  [
   have_fftw=yes
   have_fftw=yes
 ], [:])
 ], [:])
 AM_CONDITIONAL(STARPU_HAVE_FFTW, [test x$have_fftw = xyes])
 AM_CONDITIONAL(STARPU_HAVE_FFTW, [test x$have_fftw = xyes])
-
+ 
 PKG_CHECK_MODULES([FFTWF], [fftw3f], [
 PKG_CHECK_MODULES([FFTWF], [fftw3f], [
   AC_DEFINE([STARPU_HAVE_FFTWF], [1], [Define to 1 if you have the libfftw3f library.])
   AC_DEFINE([STARPU_HAVE_FFTWF], [1], [Define to 1 if you have the libfftw3f library.])
   AC_SUBST([STARPU_HAVE_FFTWF], [1])
   AC_SUBST([STARPU_HAVE_FFTWF], [1])
@@ -1310,6 +1319,10 @@ PKG_CHECK_MODULES([FFTWL], [fftw3l], [
 ], [:])
 ], [:])
 AM_CONDITIONAL(STARPU_HAVE_FFTWL, [test x$have_fftwl = xyes])
 AM_CONDITIONAL(STARPU_HAVE_FFTWL, [test x$have_fftwl = xyes])
 
 
+##########################################
+# hwloc                                  #
+##########################################
+
 AC_ARG_WITH([hwloc], [AS_HELP_STRING([--without-hwloc], [Disable hwloc (enabled by default)])])
 AC_ARG_WITH([hwloc], [AS_HELP_STRING([--without-hwloc], [Disable hwloc (enabled by default)])])
 SAVED_LDFLAGS="${LDFLAGS}"
 SAVED_LDFLAGS="${LDFLAGS}"
 SAVED_CPPFLAGS="${CPPFLAGS}"
 SAVED_CPPFLAGS="${CPPFLAGS}"
@@ -1364,23 +1377,29 @@ AM_CONDITIONAL([COND_OPT], [test "$want_optional_tests" = yes])
 # File configuration
 # File configuration
 AC_CONFIG_COMMANDS([executable-scripts], [
 AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x tests/regression/regression.sh
   chmod +x tests/regression/regression.sh
-  chmod +x gcc-plugin/tests/run-test
 ])
 ])
 
 
 AC_CONFIG_FILES(tests/regression/regression.sh tests/regression/profiles tests/regression/profiles.build.only)
 AC_CONFIG_FILES(tests/regression/regression.sh tests/regression/profiles tests/regression/profiles.build.only)
 AC_CONFIG_HEADER(src/common/config.h include/starpu_config.h)
 AC_CONFIG_HEADER(src/common/config.h include/starpu_config.h)
 
 
-AC_CONFIG_HEADERS([gcc-plugin/src/starpu-gcc-config.h])
+if test $build_gcc_plugin == "yes" ; then
+    AC_CONFIG_HEADERS([gcc-plugin/src/starpu-gcc-config.h])
+    AC_OUTPUT([
+	    gcc-plugin/Makefile
+	    gcc-plugin/src/Makefile
+	    gcc-plugin/tests/Makefile
+	    gcc-plugin/tests/run-test
+	    gcc-plugin/examples/Makefile
+            ])
+    AC_CONFIG_COMMANDS([executable-plugin-scripts], [
+            chmod +x gcc-plugin/tests/run-test
+            ])
+fi 
 
 
 AC_OUTPUT([
 AC_OUTPUT([
 	Makefile
 	Makefile
 	src/Makefile
 	src/Makefile
 	tools/Makefile
 	tools/Makefile
-	gcc-plugin/Makefile
-	gcc-plugin/src/Makefile
-	gcc-plugin/tests/Makefile
-	gcc-plugin/tests/run-test
-	gcc-plugin/examples/Makefile
 	socl/Makefile
 	socl/Makefile
 	socl/src/Makefile
 	socl/src/Makefile
 	libstarpu.pc
 	libstarpu.pc

+ 6 - 1
doc/Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
 # Copyright (C) 2009  Université de Bordeaux 1
 # Copyright (C) 2009  Université de Bordeaux 1
-# Copyright (C) 2010  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
 # it under the terms of the GNU Lesser General Public License as published by
@@ -16,6 +16,8 @@
 
 
 info_TEXINFOS = starpu.texi
 info_TEXINFOS = starpu.texi
 
 
+starpu_TEXINFOS = c-extensions.texi
+
 MAINTAINERCLEANFILES = starpu.pdf
 MAINTAINERCLEANFILES = starpu.pdf
 
 
 EXTRA_DIST = starpu.pdf \
 EXTRA_DIST = starpu.pdf \
@@ -28,6 +30,9 @@ EXTRA_DIST = starpu.pdf \
 
 
 AM_MAKEINFOHTMLFLAGS = --css-include=$(top_srcdir)/doc/starpu.css --no-headers --no-split
 AM_MAKEINFOHTMLFLAGS = --css-include=$(top_srcdir)/doc/starpu.css --no-headers --no-split
 
 
+uninstall-local:
+	$(RM) $(DESTDIR)$(infodir)/dir
+
 #$(top_srcdir)/doc/starpu.texi: vector_scal_c.texi vector_scal_cuda.texi vector_scal_opencl.texi vector_scal_opencl_codelet.texi
 #$(top_srcdir)/doc/starpu.texi: vector_scal_c.texi vector_scal_cuda.texi vector_scal_opencl.texi vector_scal_opencl_codelet.texi
 #vector_scal_c.texi: $(top_srcdir)/examples/basic_examples/vector_scal.c
 #vector_scal_c.texi: $(top_srcdir)/examples/basic_examples/vector_scal.c
 #	cat $< | sed 's/{/@{/g' | sed 's/}/@}/g' | sed 's/\t/    /g' > $@
 #	cat $< | sed 's/{/@{/g' | sed 's/}/@}/g' | sed 's/\t/    /g' > $@

+ 160 - 0
doc/c-extensions.texi

@@ -0,0 +1,160 @@
+@c This is part of the StarPU Handbook.
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+
+@node C Extensions
+@chapter C Extensions
+
+@cindex C extensions
+@cindex GCC plug-in
+
+When configured with @code{--enable-gcc-extensions}, StarPU builds a
+plug-in for the GNU Compiler Collection (GCC), which defines extensions
+to the C language that make it easier to write StarPU code@footnote{This
+feature is only available for GCC 4.5 and later.}.  Those extensions
+include syntactic sugar for defining tasks and their implementations,
+invoking a task, and manipulating data buffers.
+
+This section does not require detailed knowledge of the StarPU library.
+
+Note: as of StarPU @value{VERSION}, this is still an area under
+development and subject to change.
+
+@menu
+* Defining Tasks::              Defining StarPU tasks
+* Registered Data Buffers::     Manipulating data buffers
+@end menu
+
+@node Defining Tasks
+@section Defining Tasks
+
+@cindex task
+@cindex task implementation
+
+The StarPU GCC plug-in views @dfn{tasks} as ``extended'' C functions:
+
+@enumerate
+@item
+tasks may have several implementations---e.g., one for CPUs, one written
+in OpenCL, one written in CUDA;
+@item
+when a task is invoked, it may run in parallel, and StarPU is free to
+choose any of its implementations.
+@end enumerate
+
+Tasks and their implementations must be @emph{declared}.  These
+declarations are annotated with @dfn{attributes} (@pxref{Attribute
+Syntax, attributes in GNU C,, gcc, Using the GNU Compiler Collection
+(GCC)}): the declaration of a task is a regular C function declaration
+with an additional @code{task} attribute, and task implementations are
+declared with a @code{task_implementation} attribute.
+
+The following function attributes are provided:
+
+@table @code
+
+@item task
+@cindex @code{task} attribute
+Declare the given function as a StarPU task.  Its return type must be
+@code{void}, and it must not be defined---instead, a definition will
+automatically be provided by the compiler.
+
+Under the hood, declaring a task leads to the declaration of the
+corresponding @code{codelet} (@pxref{Codelet and Tasks}).  If one or
+more task implementations are declared in the same compilation unit,
+then the codelet and the function itself are also defined; they inherit
+the scope of the task.
+
+Scalar arguments to the task are passed by value and copied to the
+target device if need be---technically, they are passed as the
+@code{cl_arg} buffer (@pxref{Codelets and Tasks, @code{cl_arg}}).
+
+Pointer arguments are assumed to be registered data buffers---the
+@code{buffers} argument of a task (@pxref{Codelets and Tasks,
+@code{buffers}}); @code{const}-qualified pointer arguments are viewed as
+read-only buffers (@code{STARPU_R}), and non-@code{const}-qualified
+buffers are assumed to be used read-write (@code{STARPU_RW}).
+
+@item task_implementation (@var{target}, @var{task})
+@cindex @code{task_implementation} attribute
+Declare the given function as an implementation of @var{task} to run on
+@var{target}.  @var{target} must be a string, currently one of
+@code{"cpu"} or @code{"cuda"}.
+@c FIXME: Update when OpenCL support is ready.
+
+@end table
+
+Here is an example:
+
+@example
+static void matmul (const float *A, const float *B, float *C,
+		    size_t nx, size_t ny, size_t nz)
+  __attribute__ ((task));
+
+static void matmul_cpu (const float *A, const float *B, float *C,
+			size_t nx, size_t ny, size_t nz)
+  __attribute__ ((task_implementation ("cpu", matmul)));
+
+
+static void
+matmul_cpu (const float *A, const float *B, float *C,
+	    size_t nx, size_t ny, size_t nz)
+@{
+  size_t i, j, k;
+
+  for (j = 0; j < ny; j++)
+    for (i = 0; i < nx; i++)
+      @{
+	for (k = 0; k < nz; k++)
+	  C[j * nx + i] += A[j * nz + k] * B[k * nx + i];
+      @}
+@}
+@end example
+
+@noindent
+A @code{matmult} task is defined; it has only one implementation,
+@code{matmult_cpu}, which runs on the CPU.  Variables @var{A} and
+@var{B} are input buffers, whereas @var{C} is considered an input/output
+buffer.  The task can be invoked like a regular C function:
+
+@example
+matmul (&A[i * zdim * bydim + k * bzdim * bydim],
+        &B[k * xdim * bzdim + j * bxdim * bzdim],
+        &C[i * xdim * bydim + j * bxdim * bydim],
+        bxdim, bydim, bzdim);
+@end example
+
+@noindent
+This leads to an @dfn{asynchronous invocation}, whereby @code{matmult}'s
+implementation may run in parallel with the continuation of the caller.
+
+The next section describes how memory buffers must be handled in
+StarPU-GCC code.
+
+
+@node Registered Data Buffers
+@section Registered Data Buffers
+
+Data buffers such as matrices and vectors that are to be passed to tasks
+must be @dfn{registered}.  Registration allows StarPU to handle data
+transfers among devices---e.g., transferring an input buffer from the
+CPU's main memory to a task scheduled to run a GPU (@pxref{StarPU Data
+Management Library}).
+
+The following pragmas are provided:
+
+@table @code
+
+@item #pragma starpu register @var{ptr} [@var{size}]
+Register @var{ptr} as a @var{size}-element buffer.
+
+@item #pragma starpu unregister @var{ptr}
+@item #pragma starpu acquire @var{ptr}
+
+@end table
+
+FIXME: finish
+
+@c Local Variables:
+@c TeX-master: "guile.texi"
+@c ispell-local-dictionary: "american"
+@c End:

+ 48 - 11
doc/starpu.texi

@@ -49,8 +49,9 @@ was last updated on @value{UPDATED}.
 * Configuring StarPU::          How to configure StarPU
 * Configuring StarPU::          How to configure StarPU
 * StarPU API::                  The API to use StarPU
 * StarPU API::                  The API to use StarPU
 * Advanced Topics::             Advanced use of StarPU
 * Advanced Topics::             Advanced use of StarPU
-* Full source code for the 'Scaling a Vector' example::  
+* C Extensions::                Easier StarPU programming with GCC
 
 
+* Full source code for the 'Scaling a Vector' example::
 * Function Index::              Index of C functions.
 * Function Index::              Index of C functions.
 @end menu
 @end menu
 
 
@@ -1227,16 +1228,16 @@ Partitioning can be applied several times, see
 @section Performance model example
 @section Performance model example
 
 
 To achieve good scheduling, StarPU scheduling policies need to be able to
 To achieve good scheduling, StarPU scheduling policies need to be able to
-estimate in advance the duration of a task. This is done by giving to codelets a
-performance model. There are several kinds of performance models.
+estimate in advance the duration of a task. This is done by giving to codelets
+a performance model, by defining a @code{starpu_perfmodel_t} structure and
+providing its address in the @code{model} field of the @code{starpu_codelet}
+structure. The @code{symbol} and @code{type} fields of @code{starpu_perfmodel_t}
+are mandatory, to give a name to the model, and the type of the model, since
+there are several kinds of performance models.
 
 
 @itemize
 @itemize
 @item
 @item
-Providing an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_model} field),
-see for instance
-@code{examples/common/blas_model.h} and @code{examples/common/blas_model.c}. It can also be provided for each architecture (@code{STARPU_PER_ARCH} model type and @code{per_arch} field)
-@item
-Measured at runtime (STARPU_HISTORY_BASED model type). This assumes that for a
+Measured at runtime (@code{STARPU_HISTORY_BASED} model type). This assumes that for a
 given set of data input/output sizes, the performance will always be about the
 given set of data input/output sizes, the performance will always be about the
 same. This is very true for regular kernels on GPUs for instance (<0.1% error),
 same. This is very true for regular kernels on GPUs for instance (<0.1% error),
 and just a bit less true on CPUs (~=1% error). This also assumes that there are
 and just a bit less true on CPUs (~=1% error). This also assumes that there are
@@ -1276,7 +1277,7 @@ starpu_codelet cl = @{
 @end cartouche
 @end cartouche
 
 
 @item
 @item
-Measured at runtime and refined by regression (STARPU_REGRESSION_*_BASED
+Measured at runtime and refined by regression (@code{STARPU_REGRESSION_*_BASED}
 model type). This still assumes performance regularity, but can work
 model type). This still assumes performance regularity, but can work
 with various data input sizes, by applying regression over observed
 with various data input sizes, by applying regression over observed
 execution times. STARPU_REGRESSION_BASED uses an a*n^b regression
 execution times. STARPU_REGRESSION_BASED uses an a*n^b regression
@@ -1286,7 +1287,12 @@ STARPU_REGRESSION_BASED, but costs a lot more to compute). For instance,
 model for the @code{memset} operation.
 model for the @code{memset} operation.
 
 
 @item
 @item
-Provided explicitly by the application (STARPU_PER_ARCH model type): the
+Provided as an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_model} field),
+see for instance
+@code{examples/common/blas_model.h} and @code{examples/common/blas_model.c}.
+
+@item
+Provided explicitly by the application (@code{STARPU_PER_ARCH} model type): the
 @code{.per_arch[i].cost_model} fields have to be filled with pointers to
 @code{.per_arch[i].cost_model} fields have to be filled with pointers to
 functions which return the expected duration of the task in micro-seconds, one
 functions which return the expected duration of the task in micro-seconds, one
 per architecture.
 per architecture.
@@ -1877,6 +1883,22 @@ TODO
 
 
 @c what kind of information do we get ?
 @c what kind of information do we get ?
 
 
+The bus speed measured by StarPU can be displayed by using the
+@code{starpu_machine_display} tool, for instance:
+
+@example
+StarPU has found :
+        3 CUDA devices
+                CUDA 0 (Tesla C2050 02:00.0)
+                CUDA 1 (Tesla C2050 03:00.0)
+                CUDA 2 (Tesla C2050 84:00.0)
+from    to RAM          to CUDA 0       to CUDA 1       to CUDA 2
+RAM     0.000000        5176.530428     5176.492994     5191.710722
+CUDA 0  4523.732446     0.000000        2414.074751     2417.379201
+CUDA 1  4523.718152     2414.078822     0.000000        2417.375119
+CUDA 2  4534.229519     2417.069025     2417.060863     0.000000
+@end example
+
 @node StarPU-Top
 @node StarPU-Top
 @subsection StarPU-Top interface
 @subsection StarPU-Top interface
 
 
@@ -2057,7 +2079,7 @@ This will create an @code{activity.data} file in the current
 directory. A profile of the application showing the activity of StarPU
 directory. A profile of the application showing the activity of StarPU
 during the execution of the program can be generated:
 during the execution of the program can be generated:
 @example
 @example
-$ starpu_top.sh activity.data
+$ starpu_top activity.data
 @end example
 @end example
 
 
 This will create a file named @code{activity.eps} in the current directory.
 This will create a file named @code{activity.eps} in the current directory.
@@ -2558,6 +2580,7 @@ Enable flags for the @code{gcov} coverage tool.
 * --with-cuda-dir::             
 * --with-cuda-dir::             
 * --with-cuda-include-dir::             
 * --with-cuda-include-dir::             
 * --with-cuda-lib-dir::             
 * --with-cuda-lib-dir::             
+* --disable-cuda-memcpy-peer::
 * --enable-maxopencldev::       
 * --enable-maxopencldev::       
 * --disable-opencl::            
 * --disable-opencl::            
 * --with-opencl-dir::           
 * --with-opencl-dir::           
@@ -2624,6 +2647,13 @@ notably contain the CUDA shared libraries (e.g. libcuda.so). This defaults to
 
 
 @end table
 @end table
 
 
+@node --disable-cuda-memcpy-peer
+@subsubsection @code{--disable-cuda-memcpy-peer}
+@table @asis
+@item @emph{Description}
+Explicitely disables peer transfers when using CUDA 4.0
+@end table
+
 @node --enable-maxopencldev
 @node --enable-maxopencldev
 @subsubsection @code{--enable-maxopencldev=<number>}
 @subsubsection @code{--enable-maxopencldev=<number>}
 @table @asis
 @table @asis
@@ -4893,6 +4923,13 @@ static struct starpu_sched_policy_s dummy_sched_policy = @{
 
 
 
 
 @c ---------------------------------------------------------------------
 @c ---------------------------------------------------------------------
+@c C Extensions
+@c ---------------------------------------------------------------------
+
+@include c-extensions.texi
+
+
+@c ---------------------------------------------------------------------
 @c Appendices
 @c Appendices
 @c ---------------------------------------------------------------------
 @c ---------------------------------------------------------------------
 
 

+ 2 - 0
examples/Makefile.am

@@ -2,6 +2,7 @@
 #
 #
 # Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
 # Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2011  Télécom-SudParis
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
 # it under the terms of the GNU Lesser General Public License as published by
@@ -152,6 +153,7 @@ examplebin_PROGRAMS +=				\
 	basic_examples/mult			\
 	basic_examples/mult			\
 	basic_examples/block			\
 	basic_examples/block			\
 	basic_examples/variable			\
 	basic_examples/variable			\
+	basic_examples/mult_impl                \
 	filters/fvector				\
 	filters/fvector				\
 	filters/fblock				\
 	filters/fblock				\
 	filters/fmatrix				\
 	filters/fmatrix				\

+ 2 - 2
examples/basic_examples/hello_world.c

@@ -52,12 +52,12 @@ struct params {
 };
 };
 void cpu_func(void *buffers[], void *cl_arg)
 void cpu_func(void *buffers[], void *cl_arg)
 {
 {
-	struct params *params = cl_arg;
+	struct params *params = (struct params *) cl_arg;
 
 
 	FPRINTF(stdout, "Hello world (params = {%i, %f} )\n", params->i, params->f);
 	FPRINTF(stdout, "Hello world (params = {%i, %f} )\n", params->i, params->f);
 }
 }
 
 
-starpu_codelet cl;
+starpu_codelet cl = {};
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {

+ 3 - 3
examples/basic_examples/mult.c

@@ -127,9 +127,9 @@ static void init_problem_data(void)
 
 
 	/* we initialize matrices A, B and C in the usual way */
 	/* we initialize matrices A, B and C in the usual way */
 
 
-	A = malloc(zdim*ydim*sizeof(float));
-	B = malloc(xdim*zdim*sizeof(float));
-	C = malloc(xdim*ydim*sizeof(float));
+	A = (float *) malloc(zdim*ydim*sizeof(float));
+	B = (float *) malloc(xdim*zdim*sizeof(float));
+	C = (float *) malloc(xdim*ydim*sizeof(float));
 
 
 	/* fill the A and B matrices */
 	/* fill the A and B matrices */
 	srand(2009);
 	srand(2009);

+ 384 - 0
examples/basic_examples/mult_impl.c

@@ -0,0 +1,384 @@
+/*/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Télécom-SudParis
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <string.h>
+#include <math.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <pthread.h>
+#include <signal.h>
+
+#include <starpu.h>
+
+static float *A, *B, *C;
+static starpu_data_handle A_handle, B_handle, C_handle;
+
+static unsigned nslicesx = 4;
+static unsigned nslicesy = 4;
+static unsigned xdim = 1024;
+static unsigned ydim = 1024;
+static unsigned zdim = 512;
+
+
+double mult_gemm_cost(starpu_buffer_descr *descr)
+{
+	/* C = A * B */
+	uint32_t nxC, nyC, nxA;
+
+
+	nxC = starpu_matrix_get_nx(descr[2].handle);
+	nyC = starpu_matrix_get_ny(descr[2].handle);
+	nxA = starpu_matrix_get_nx(descr[0].handle);
+
+	//printf("nxC %d nxC %d nxA %d\n", nxC, nyC, nxA);
+
+	double cost = ((double)nxC)*((double)nyC)*((double)nxA/1000.0f/4.11f);
+
+	printf("cost %e \n", cost);
+
+	return cost;
+}
+
+static void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
+{
+	float *subA, *subB, *subC;
+	uint32_t nxC, nyC, nyA;
+	uint32_t ldA, ldB, ldC;
+	printf("On application: Hello, this is kernel cpu_mult\n\n");
+	/* .blas.ptr gives a pointer to the first element of the local copy */
+	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	/* .blas.nx is the number of rows (consecutive elements) and .blas.ny
+	 * is the number of lines that are separated by .blas.ld elements (ld
+	 * stands for leading dimension).
+	 * NB: in case some filters were used, the leading dimension is not
+	 * guaranteed to be the same in main memory (on the original matrix)
+	 * and on the accelerator! */
+	nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	ldC = STARPU_MATRIX_GET_LD(descr[2]);
+
+	/* we assume a FORTRAN-ordering! */
+	unsigned i,j,k;
+	for (i = 0; i < nyC; i++)
+	{
+		for (j = 0; j < nxC; j++)
+		{
+			float sum = 0.0;
+
+			for (k = 0; k < nyA; k++)
+			{
+				sum += subA[j+k*ldA]*subB[k+i*ldB];
+			}
+
+			subC[j + i*ldC] = sum;
+		}
+	}
+}
+
+static void cpu_mult_2(void *descr[], __attribute__((unused))  void *arg)
+{
+	float *subA, *subB, *subC;
+	uint32_t nxC, nyC, nyA;
+	uint32_t ldA, ldB, ldC;
+	printf("On application: this is kernel cpu_mult_2\n\n");
+	/* .blas.ptr gives a pointer to the first element of the local copy */
+	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	ldC = STARPU_MATRIX_GET_LD(descr[2]);
+
+	/* we assume a FORTRAN-ordering! */
+	unsigned i,j,k;
+	for (j = 0; j < nxC; j++)
+	{
+		for (i = 0; i < nyC; i++)
+		{
+			float sum = 0.0;
+
+			for (k = 0; k < nyA; k++)
+			{
+				sum += subA[j+k*ldA]*subB[k+i*ldB];
+			}
+
+			subC[j + i*ldC] = sum;
+		}
+	}
+}
+
+
+
+static void init_problem_data(void)
+{
+	unsigned i,j;
+
+	/* we initialize matrices A, B and C in the usual way */
+
+	A = malloc(zdim*ydim*sizeof(float));
+	B = malloc(xdim*zdim*sizeof(float));
+	C = malloc(xdim*ydim*sizeof(float));
+
+	/* fill the A and B matrices */
+	srand(2009);
+	for (j=0; j < ydim; j++) {
+		for (i=0; i < zdim; i++) {
+			A[j+i*ydim] = (float)(starpu_drand48());
+		}
+	}
+
+	for (j=0; j < zdim; j++) {
+		for (i=0; i < xdim; i++) {
+			B[j+i*zdim] = (float)(starpu_drand48());
+		}
+	}
+
+	for (j=0; j < ydim; j++) {
+		for (i=0; i < xdim; i++) {
+			C[j+i*ydim] = (float)(0);
+		}
+	}
+}
+
+static void partition_mult_data(void)
+{
+	/* note that we assume a FORTRAN ordering here! */
+
+	starpu_matrix_data_register(&A_handle, 0, (uintptr_t)A,
+		ydim, ydim, zdim, sizeof(float));
+	starpu_matrix_data_register(&B_handle, 0, (uintptr_t)B,
+		zdim, zdim, xdim, sizeof(float));
+	starpu_matrix_data_register(&C_handle, 0, (uintptr_t)C,
+		ydim, ydim, xdim, sizeof(float));
+
+	/* A filter is a method to partition a data into disjoint chunks, it is
+	 * described by the means of the "struct starpu_data_filter" structure that
+	 * contains a function that is applied on a data handle to partition it
+	 * into smaller chunks, and an argument that is passed to the function
+	 * (eg. the number of blocks to create here).
+	 */
+
+	struct starpu_data_filter vert = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nslicesx,
+		.get_nchildren = NULL,
+		.get_child_ops = NULL
+	};
+
+	struct starpu_data_filter horiz = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nslicesy,
+		.get_nchildren = NULL,
+		.get_child_ops = NULL
+	};
+
+/*
+ *	Illustration with nslicex = 4 and nslicey = 2, it is possible to access
+ *	sub-data by using the "starpu_data_get_sub_data" method, which takes a data handle,
+ *	the number of filters to apply, and the indexes for each filters, for
+ *	instance:
+ *
+ *		A' handle is starpu_data_get_sub_data(A_handle, 1, 1);
+ *		B' handle is starpu_data_get_sub_data(B_handle, 1, 2);
+ *		C' handle is starpu_data_get_sub_data(C_handle, 2, 2, 1);
+ *
+ *	Note that here we applied 2 filters recursively onto C.
+ *
+ *	"starpu_data_get_sub_data(C_handle, 1, 3)" would return a handle to the 4th column
+ *	of blocked matrix C for example.
+ *
+ *		              |---|---|---|---|
+ *		              |   |   | B'|   | B
+ *		              |---|---|---|---|
+ *		                0   1   2   3
+ *		     |----|   |---|---|---|---|
+ *		     |    |   |   |   |   |   |
+ *		     |    | 0 |   |   |   |   |
+ *		     |----|   |---|---|---|---|
+ *		     | A' |   |   |   | C'|   |
+ *		     |    |   |   |   |   |   |
+ *		     |----|   |---|---|---|---|
+ *		       A              C
+ *
+ *	IMPORTANT: applying filters is equivalent to partitionning a piece of
+ *	data in a hierarchical manner, so that memory consistency is enforced
+ *	for each of the elements independantly. The tasks should therefore NOT
+ *	access inner nodes (eg. one column of C or the whole C) but only the
+ *	leafs of the tree (ie. blocks here). Manipulating inner nodes is only
+ *	possible by disapplying the filters (using starpu_data_unpartition), to
+ *	enforce memory consistency.
+ */
+
+	starpu_data_partition(B_handle, &vert);
+	starpu_data_partition(A_handle, &horiz);
+
+	/* starpu_data_map_filters is a variable-arity function, the first argument
+	 * is the handle of the data to partition, the second argument is the
+	 * number of filters to apply recursively. Filters are applied in the
+	 * same order as the arguments.
+	 * This would be equivalent to starpu_data_partition(C_handle, &vert) and
+	 * then applying horiz on each sub-data (ie. each column of C)
+	 */
+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
+}
+
+static struct starpu_perfmodel_t starpu_dgemm_model_common = {
+	.cost_model = mult_gemm_cost,
+	.type = STARPU_HISTORY_BASED,//STARPU_COMMON, //STARPU_PER_ARCH,
+	.symbol = "mult_perf_model"
+};
+
+/*
+static struct starpu_perfmodel_t mult_perf_model = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "mult_perf_model"
+};
+*/
+
+struct starpu_conf conf = {
+		.sched_policy_name = "heft",
+		.calibrate = 1,
+		.ncpus = 4
+};
+
+
+static starpu_codelet cl = {
+        /* we can only execute that kernel on a CPU yet */
+        .where = STARPU_CPU,
+        //.starpu_impl_multiple = 1,
+        /* CPU implementation of the codelet */
+        .cpu_func = STARPU_MULTIPLE_CPU_IMPLEMENTATIONS,
+        .cpu_funcs = {cpu_mult,cpu_mult_2},
+        /* the codelet manipulates 3 buffers that are managed by the
+         * DSM */
+        .nbuffers = 3,
+        /* in case the scheduling policy may use performance models */
+        .model = &starpu_dgemm_model_common
+};
+
+static void launch_tasks(void)
+{
+	/* partition the work into slices */
+	unsigned taskx, tasky;
+
+	for (taskx = 0; taskx < nslicesx; taskx++)
+	{
+		for (tasky = 0; tasky < nslicesy; tasky++)
+		{
+			/* C[taskx, tasky] = A[tasky] B[taskx] */
+
+			/* by default, starpu_task_create() returns an
+ 			 * asynchronous task (ie. task->synchronous = 0) */
+			struct starpu_task *task = starpu_task_create();
+
+			/* this task implements codelet "cl" */
+			task->cl = &cl;
+
+			/*
+			 *              |---|---|---|---|
+			 *              |   | * |   |   | B
+			 *              |---|---|---|---|
+			 *                    X
+			 *     |----|   |---|---|---|---|
+			 *     |****| Y |   |***|   |   |
+			 *     |****|   |   |***|   |   |
+			 *     |----|   |---|---|---|---|
+			 *     |    |   |   |   |   |   |
+			 *     |    |   |   |   |   |   |
+			 *     |----|   |---|---|---|---|
+			 *       A              C
+			 */
+
+			/* there was a single filter applied to matrices A
+			 * (respectively B) so we grab the handle to the chunk
+			 * identified by "tasky" (respectively "taskx). The "1"
+			 * tells StarPU that there is a single argument to the
+			 * variable-arity function starpu_data_get_sub_data */
+			task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, tasky);
+			task->buffers[0].mode = STARPU_R;
+			task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, taskx);
+			task->buffers[1].mode = STARPU_R;
+
+			/* 2 filters were applied on matrix C, so we give
+			 * starpu_data_get_sub_data 2 arguments. The order of the arguments
+			 * must match the order in which the filters were
+			 * applied.
+			 * NB: starpu_data_get_sub_data(C_handle, 1, k) would have returned
+			 * a handle to the column number k of matrix C.
+			 * NB2: starpu_data_get_sub_data(C_handle, 2, taskx, tasky) is
+			 * equivalent to
+			 * starpu_data_get_sub_data(starpu_data_get_sub_data(C_handle, 1, taskx), 1, tasky)*/
+			task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
+			task->buffers[2].mode = STARPU_W;
+
+			/* this is not a blocking call since task->synchronous = 0 */
+			int summit_task;
+			summit_task = starpu_task_submit(task);
+			printf("task is submmited or not %d\n",summit_task);
+
+		}
+	}
+}
+
+int main(void)
+{
+	/* start the runtime */
+	starpu_init(&conf);
+
+	/* initialize matrices A, B and C and register them to StarPU */
+	init_problem_data();
+
+	/* partition matrices into blocks that can be manipulated by the
+ 	 * codelets */
+	partition_mult_data();
+
+	/* submit all tasks in an asynchronous fashion */
+	launch_tasks();
+
+	/* wait for termination */
+	starpu_task_wait_for_all();
+
+	/* remove the filters applied by the means of starpu_data_map_filters; now
+ 	 * it's not possible to manipulate a subset of C using starpu_data_get_sub_data until
+	 * starpu_data_map_filters is called again on C_handle.
+	 * The second argument is the memory node where the different subsets
+	 * should be reassembled, 0 = main memory (RAM) */
+	starpu_data_unpartition(C_handle, 0);
+
+	/* stop monitoring matrix C : after this, it is not possible to pass C
+	 * (or any subset of C) as a codelet input/output. This also implements
+	 * a barrier so that the piece of data is put back into main memory in
+	 * case it was only available on a GPU for instance. */
+	starpu_data_unregister(C_handle);
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 2 - 2
examples/basic_examples/vector_scal_cpu.c

@@ -24,7 +24,7 @@
 void scal_cpu_func(void *buffers[], void *cl_arg)
 void scal_cpu_func(void *buffers[], void *cl_arg)
 {
 {
 	unsigned i;
 	unsigned i;
-	float *factor = cl_arg;
+	float *factor = (float *) cl_arg;
 
 
 	/*
 	/*
 	 * The "buffers" array matches the task->buffers array: for instance
 	 * The "buffers" array matches the task->buffers array: for instance
@@ -37,7 +37,7 @@ void scal_cpu_func(void *buffers[], void *cl_arg)
 	 * migrated/replicated), and elemsize gives the size of each elements.
 	 * migrated/replicated), and elemsize gives the size of each elements.
 	 */
 	 */
 
 
-	starpu_vector_interface_t *vector = buffers[0];
+	starpu_vector_interface_t *vector = (starpu_vector_interface_t *) buffers[0];
 
 
 	/* length of the vector */
 	/* length of the vector */
 	unsigned n = STARPU_VECTOR_GET_NX(vector);
 	unsigned n = STARPU_VECTOR_GET_NX(vector);

+ 10 - 9
examples/cholesky/cholesky_models.c

@@ -2,6 +2,7 @@
  *
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -125,27 +126,27 @@ static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
 }
 }
 
 
 struct starpu_perfmodel_t chol_model_11 = {
 struct starpu_perfmodel_t chol_model_11 = {
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_11_cost },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_11_cost }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_11_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_11_cost }
 	},
 	},
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "chol_model_11"
 	.symbol = "chol_model_11"
 };
 };
 
 
 struct starpu_perfmodel_t chol_model_21 = {
 struct starpu_perfmodel_t chol_model_21 = {
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_21_cost },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_21_cost }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_21_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_21_cost }
 	},
 	},
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "chol_model_21"
 	.symbol = "chol_model_21"
 };
 };
 
 
 struct starpu_perfmodel_t chol_model_22 = {
 struct starpu_perfmodel_t chol_model_22 = {
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_22_cost },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_22_cost }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_22_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_22_cost }
 	},
 	},
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "chol_model_22"
 	.symbol = "chol_model_22"

+ 1 - 1
examples/filters/fblock_cpu.c

@@ -19,7 +19,7 @@
 void cpu_func(void *buffers[], void *cl_arg)
 void cpu_func(void *buffers[], void *cl_arg)
 {
 {
         unsigned i, j, k;
         unsigned i, j, k;
-        int *factor = cl_arg;
+        int *factor = (int *) cl_arg;
 	int *block = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
 	int *block = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
 	int nx = (int)STARPU_BLOCK_GET_NX(buffers[0]);
 	int nx = (int)STARPU_BLOCK_GET_NX(buffers[0]);
 	int ny = (int)STARPU_BLOCK_GET_NY(buffers[0]);
 	int ny = (int)STARPU_BLOCK_GET_NY(buffers[0]);

+ 1 - 1
examples/filters/fmatrix.c

@@ -25,7 +25,7 @@
 void cpu_func(void *buffers[], void *cl_arg)
 void cpu_func(void *buffers[], void *cl_arg)
 {
 {
         unsigned i, j;
         unsigned i, j;
-        int *factor = cl_arg;
+        int *factor = (int *) cl_arg;
 
 
         /* length of the matrix */
         /* length of the matrix */
         unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
         unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);

+ 1 - 1
examples/filters/fvector.c

@@ -24,7 +24,7 @@
 void cpu_func(void *buffers[], void *cl_arg)
 void cpu_func(void *buffers[], void *cl_arg)
 {
 {
         unsigned i;
         unsigned i;
-        int *factor = cl_arg;
+        int *factor = (int *) cl_arg;
 
 
         /* length of the vector */
         /* length of the vector */
         unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
         unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);

+ 13 - 12
examples/heat/lu_kernels_model.c

@@ -2,6 +2,7 @@
  *
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -215,9 +216,9 @@ double task_22_cost_cpu(starpu_buffer_descr *descr)
 
 
 struct starpu_perfmodel_t model_11 = {
 struct starpu_perfmodel_t model_11 = {
 	.cost_model = task_11_cost,
 	.cost_model = task_11_cost,
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = task_11_cost_cpu },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_11_cost_cuda }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_11_cost_cpu },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_11_cost_cuda }
 	},
 	},
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
 #ifdef STARPU_ATLAS
@@ -231,9 +232,9 @@ struct starpu_perfmodel_t model_11 = {
 
 
 struct starpu_perfmodel_t model_12 = {
 struct starpu_perfmodel_t model_12 = {
 	.cost_model = task_12_cost,
 	.cost_model = task_12_cost,
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = task_12_cost_cpu },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_12_cost_cuda }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_12_cost_cpu },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_12_cost_cuda }
 	},
 	},
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
 #ifdef STARPU_ATLAS
@@ -247,9 +248,9 @@ struct starpu_perfmodel_t model_12 = {
 
 
 struct starpu_perfmodel_t model_21 = {
 struct starpu_perfmodel_t model_21 = {
 	.cost_model = task_21_cost,
 	.cost_model = task_21_cost,
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = task_21_cost_cpu },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_21_cost_cuda }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_21_cost_cpu },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_21_cost_cuda }
 	},
 	},
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
 #ifdef STARPU_ATLAS
@@ -263,9 +264,9 @@ struct starpu_perfmodel_t model_21 = {
 
 
 struct starpu_perfmodel_t model_22 = {
 struct starpu_perfmodel_t model_22 = {
 	.cost_model = task_22_cost,
 	.cost_model = task_22_cost,
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = task_22_cost_cpu },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_22_cost_cuda }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_22_cost_cpu },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_22_cost_cuda }
 	},
 	},
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
 #ifdef STARPU_ATLAS

+ 1 - 1
examples/mandelbrot/mandelbrot.c

@@ -333,7 +333,7 @@ static void compute_block_spmd(void *descr[], void *cl_arg)
 
 
 	while (1)
 	while (1)
 	{
 	{
-		local_iy = STARPU_ATOMIC_ADD(pcnt, 1) - 1;
+		local_iy = STARPU_ATOMIC_ADD((unsigned int *)pcnt, 1) - 1;
 		if (local_iy >= block_size)
 		if (local_iy >= block_size)
 			break;
 			break;
 
 

+ 6 - 6
examples/ppm_downscaler/ppm_downscaler.c

@@ -28,7 +28,7 @@
 
 
 struct ppm_image *allocate_new_ppm(int ncols, int nlines, int coldepth)
 struct ppm_image *allocate_new_ppm(int ncols, int nlines, int coldepth)
 {
 {
-	struct ppm_image *ppm = malloc(sizeof(struct ppm_image));
+	struct ppm_image *ppm = (struct ppm_image *) malloc(sizeof(struct ppm_image));
 	assert(ppm);
 	assert(ppm);
 
 
 	ppm->ncols = ncols;
 	ppm->ncols = ncols;
@@ -36,9 +36,9 @@ struct ppm_image *allocate_new_ppm(int ncols, int nlines, int coldepth)
 	ppm->coldepth = coldepth;
 	ppm->coldepth = coldepth;
 
 
 #ifdef STARPU_HAVE_MEMALIGN
 #ifdef STARPU_HAVE_MEMALIGN
-	ppm->data = memalign(16384, ncols*nlines*sizeof(struct ppm_color));
+	ppm->data = (struct ppm_color *) memalign(16384, ncols*nlines*sizeof(struct ppm_color));
 #else
 #else
-	ppm->data = malloc(ncols*nlines*sizeof(struct ppm_color));
+	ppm->data = (struct ppm_color *) malloc(ncols*nlines*sizeof(struct ppm_color));
 #endif
 #endif
 	assert(ppm->data);
 	assert(ppm->data);
 
 
@@ -49,7 +49,7 @@ struct ppm_image *file_to_ppm(char *filename)
 {
 {
 	int ret;
 	int ret;
 
 
-	struct ppm_image *ppm = malloc(sizeof(struct ppm_image));
+	struct ppm_image *ppm = (struct ppm_image *) malloc(sizeof(struct ppm_image));
 	assert(ppm);
 	assert(ppm);
 	
 	
 	FILE *file = fopen(filename, "r");
 	FILE *file = fopen(filename, "r");
@@ -64,9 +64,9 @@ struct ppm_image *file_to_ppm(char *filename)
 	
 	
 	/* allocate a buffer for the image */
 	/* allocate a buffer for the image */
 #ifdef STARPU_HAVE_MEMALIGN
 #ifdef STARPU_HAVE_MEMALIGN
-	ppm->data = memalign(16384, ppm->ncols*ppm->nlines*sizeof(struct ppm_color));
+	ppm->data = (struct ppm_color *) memalign(16384, ppm->ncols*ppm->nlines*sizeof(struct ppm_color));
 #else
 #else
-	ppm->data = malloc(ppm->ncols*ppm->nlines*sizeof(struct ppm_color));
+	ppm->data = (struct ppm_color *) malloc(ppm->ncols*ppm->nlines*sizeof(struct ppm_color));
 #endif
 #endif
 	assert(ppm->data);
 	assert(ppm->data);
 
 

+ 8 - 8
examples/ppm_downscaler/yuv_downscaler.c

@@ -123,7 +123,7 @@ int main(int argc, char **argv)
 	FILE *f_in = fopen(filename_in, "r");
 	FILE *f_in = fopen(filename_in, "r");
 	assert(f_in);
 	assert(f_in);
 
 
-	struct yuv_frame *yuv_in_buffer = malloc(nframes*FRAMESIZE);
+	struct yuv_frame *yuv_in_buffer = (struct yuv_frame *) malloc(nframes*FRAMESIZE);
 	fread(yuv_in_buffer, FRAMESIZE, nframes, f_in);
 	fread(yuv_in_buffer, FRAMESIZE, nframes, f_in);
 
 
 	/* allocate room for an output buffer */
 	/* allocate room for an output buffer */
@@ -131,16 +131,16 @@ int main(int argc, char **argv)
 	assert(f_out);
 	assert(f_out);
 
 
 /*	fprintf(stderr, "Alloc output file ...\n"); */
 /*	fprintf(stderr, "Alloc output file ...\n"); */
-	struct yuv_new_frame *yuv_out_buffer = calloc(nframes, NEW_FRAMESIZE);
+	struct yuv_new_frame *yuv_out_buffer = (struct yuv_new_frame *) calloc(nframes, NEW_FRAMESIZE);
 	assert(yuv_out_buffer);
 	assert(yuv_out_buffer);
 
 
-	starpu_data_handle *frame_y_handle = calloc(nframes, sizeof(starpu_data_handle));
-	starpu_data_handle *frame_u_handle = calloc(nframes, sizeof(starpu_data_handle));
-	starpu_data_handle *frame_v_handle = calloc(nframes, sizeof(starpu_data_handle));
+	starpu_data_handle *frame_y_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
+	starpu_data_handle *frame_u_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
+	starpu_data_handle *frame_v_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
 
 
-	starpu_data_handle *new_frame_y_handle = calloc(nframes, sizeof(starpu_data_handle));
-	starpu_data_handle *new_frame_u_handle = calloc(nframes, sizeof(starpu_data_handle));
-	starpu_data_handle *new_frame_v_handle = calloc(nframes, sizeof(starpu_data_handle));
+	starpu_data_handle *new_frame_y_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
+	starpu_data_handle *new_frame_u_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
+	starpu_data_handle *new_frame_v_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
 
 
 	starpu_init(NULL);
 	starpu_init(NULL);
 
 

+ 1 - 1
examples/profiling/profiling.c

@@ -53,7 +53,7 @@ int main(int argc, char **argv)
 		.nbuffers = 0
 		.nbuffers = 0
 	};
 	};
 
 
-	struct starpu_task **tasks = malloc(niter*sizeof(struct starpu_task *));
+	struct starpu_task **tasks = (struct starpu_task **) malloc(niter*sizeof(struct starpu_task *));
 	assert(tasks);
 	assert(tasks);
 
 
 	unsigned i;
 	unsigned i;

+ 4 - 4
examples/reductions/dot_product.c

@@ -164,11 +164,11 @@ int main(int argc, char **argv)
 	unsigned long nelems = nblocks*entries_per_block;
 	unsigned long nelems = nblocks*entries_per_block;
 	size_t size = nelems*sizeof(float);
 	size_t size = nelems*sizeof(float);
 
 
-	x = malloc(size);
-	y = malloc(size);
+	x = (float *) malloc(size);
+	y = (float *) malloc(size);
 
 
-	x_handles = calloc(nblocks, sizeof(starpu_data_handle));
-	y_handles = calloc(nblocks, sizeof(starpu_data_handle));
+	x_handles = (starpu_data_handle *) calloc(nblocks, sizeof(starpu_data_handle));
+	y_handles = (starpu_data_handle *) calloc(nblocks, sizeof(starpu_data_handle));
 
 
 	assert(x && y);
 	assert(x && y);
 
 

+ 2 - 2
examples/reductions/minmax_reduction.c

@@ -129,8 +129,8 @@ int main(int argc, char **argv)
 	unsigned long nelems = nblocks*entries_per_bock;
 	unsigned long nelems = nblocks*entries_per_bock;
 	size_t size = nelems*sizeof(TYPE);
 	size_t size = nelems*sizeof(TYPE);
 
 
-	x = malloc(size);
-	x_handles = calloc(nblocks, sizeof(starpu_data_handle));
+	x = (TYPE *) malloc(size);
+	x_handles = (starpu_data_handle *) calloc(nblocks, sizeof(starpu_data_handle));
 	
 	
 	assert(x && x_handles);
 	assert(x && x_handles);
 
 

+ 1 - 1
examples/socl/mandelbrot/mandelbrot.c

@@ -21,7 +21,7 @@
 #include <unistd.h>
 #include <unistd.h>
 
 
 /* Uncomment this to activate X11 display */
 /* Uncomment this to activate X11 display */
-#define USE_X11
+//#define USE_X11
 
 
 #define SHORT_LOG 1
 #define SHORT_LOG 1
 #define ROUND_ROBIN
 #define ROUND_ROBIN

+ 2 - 2
examples/spmv/spmv.c

@@ -44,8 +44,8 @@ static void parse_args(int argc, char **argv)
  * same number of non-zero entries. */
  * same number of non-zero entries. */
 static void csr_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 static void csr_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 {
 {
-	starpu_csr_interface_t *csr_father = father_interface;
-	starpu_csr_interface_t *csr_child = child_interface;
+	starpu_csr_interface_t *csr_father = (starpu_csr_interface_t *) father_interface;
+	starpu_csr_interface_t *csr_child = (starpu_csr_interface_t *) child_interface;
 
 
 	uint32_t nrow = csr_father->nrow;
 	uint32_t nrow = csr_father->nrow;
 	size_t elemsize = csr_father->elemsize;
 	size_t elemsize = csr_father->elemsize;

+ 1 - 0
examples/starpufft/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 1 - 0
examples/stencil/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 2 - 2
examples/stencil/stencil-blocks.c

@@ -90,7 +90,7 @@ int MPI_TAG1(int z, int iter, int dir)
 /* Compute the size of the different blocks */
 /* Compute the size of the different blocks */
 static void compute_block_sizes(void)
 static void compute_block_sizes(void)
 {
 {
-	block_sizes_z = malloc(nbz*sizeof(unsigned));
+	block_sizes_z = (unsigned *) malloc(nbz*sizeof(unsigned));
 	STARPU_ASSERT(block_sizes_z);
 	STARPU_ASSERT(block_sizes_z);
 
 
 	/* Perhaps the last chunk is smaller */
 	/* Perhaps the last chunk is smaller */
@@ -136,7 +136,7 @@ void create_blocks_array(unsigned _sizex, unsigned _sizey, unsigned _sizez, unsi
 	sizez = _sizez;
 	sizez = _sizez;
 
 
 	/* Create a grid of block descriptors */
 	/* Create a grid of block descriptors */
-	blocks = calloc(nbz, sizeof(struct block_description));
+	blocks = (struct block_description *) calloc(nbz, sizeof(struct block_description));
 	STARPU_ASSERT(blocks);
 	STARPU_ASSERT(blocks);
 
 
 	/* What is the size of the different blocks ? */
 	/* What is the size of the different blocks ? */

+ 33 - 25
examples/stencil/stencil-kernels.c

@@ -165,10 +165,12 @@ static void check_load(starpu_block_interface_t *block, starpu_block_interface_t
 /*
 /*
  * Load a neighbour's boundary into block, CPU version
  * Load a neighbour's boundary into block, CPU version
  */
  */
-static void load_subblock_from_buffer_cpu(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_from_buffer_cpu(void *_block,
+					void *_boundary,
 					unsigned firstz)
 					unsigned firstz)
 {
 {
+	starpu_block_interface_t *block = (starpu_block_interface_t *)_block;
+	starpu_block_interface_t *boundary = (starpu_block_interface_t *)_boundary;
 	check_load(block, boundary);
 	check_load(block, boundary);
 
 
 	/* We do a contiguous memory transfer */
 	/* We do a contiguous memory transfer */
@@ -184,10 +186,12 @@ static void load_subblock_from_buffer_cpu(starpu_block_interface_t *block,
  * Load a neighbour's boundary into block, CUDA version
  * Load a neighbour's boundary into block, CUDA version
  */
  */
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-static void load_subblock_from_buffer_cuda(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_from_buffer_cuda(void *_block,
+					void *_boundary,
 					unsigned firstz)
 					unsigned firstz)
 {
 {
+	starpu_block_interface_t *block = (starpu_block_interface_t *)_block;
+	starpu_block_interface_t *boundary = (starpu_block_interface_t *)_boundary;
 	check_load(block, boundary);
 	check_load(block, boundary);
 
 
 	/* We do a contiguous memory transfer */
 	/* We do a contiguous memory transfer */
@@ -245,16 +249,16 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
 	for (i=1; i<=K; i++)
 	for (i=1; i<=K; i++)
 	{
 	{
 		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
 		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
+		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
 
 
 		/* Shadow data */
 		/* Shadow data */
 		cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 		cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 
 
 		/* And perform actual computation */
 		/* And perform actual computation */
 #ifdef LIFE
 #ifdef LIFE
-		cuda_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+		cuda_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 #else
 #else
-		cudaMemcpyAsync(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+		cudaMemcpyAsync(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
 #endif /* LIFE */
 #endif /* LIFE */
 	}
 	}
 
 
@@ -338,16 +342,16 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
 	for (i=1; i<=K; i++)
 	for (i=1; i<=K; i++)
 	{
 	{
 		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
 		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
+		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
 
 
 		/* Shadow data */
 		/* Shadow data */
 		opencl_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 		opencl_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 
 
 		/* And perform actual computation */
 		/* And perform actual computation */
 #ifdef LIFE
 #ifdef LIFE
-		opencl_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+		opencl_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 #else
 #else
-                clEnqueueCopyBuffer(cq, old, new, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), 0, NULL, NULL);
+                clEnqueueCopyBuffer(cq, old, newer, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), 0, NULL, NULL);
 #endif /* LIFE */
 #endif /* LIFE */
 	}
 	}
 
 
@@ -365,7 +369,7 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
  */
  */
 static void update_func_cpu(void *descr[], void *arg)
 static void update_func_cpu(void *descr[], void *arg)
 {
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
 	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
 	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
 	if (block->bz == 0)
 	if (block->bz == 0)
@@ -405,8 +409,8 @@ fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 
 
 	for (i=1; i<=K; i++)
 	for (i=1; i<=K; i++)
 	{
 	{
-		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
+		starpu_block_interface_t *oldb = (starpu_block_interface_t *) descr[i%2], *newb = (starpu_block_interface_t *) descr[(i+1)%2];
+		TYPE *old = (TYPE*) oldb->ptr, *newer = (TYPE*) newb->ptr;
 
 
 		/* Shadow data */
 		/* Shadow data */
 		unsigned ldy = oldb->ldy, ldz = oldb->ldz;
 		unsigned ldy = oldb->ldy, ldz = oldb->ldz;
@@ -424,9 +428,9 @@ fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 
 
 		/* And perform actual computation */
 		/* And perform actual computation */
 #ifdef LIFE
 #ifdef LIFE
-		life_update(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+		life_update(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 #else
 #else
-		memcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new));
+		memcpy(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer));
 #endif /* LIFE */
 #endif /* LIFE */
 	}
 	}
 
 
@@ -465,10 +469,12 @@ starpu_codelet cl_update = {
  */
  */
 
 
 /* CPU version */
 /* CPU version */
-static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_into_buffer_cpu(void *_block,
+					void *_boundary,
 					unsigned firstz)
 					unsigned firstz)
 {
 {
+	starpu_block_interface_t *block = (starpu_block_interface_t *)_block;
+	starpu_block_interface_t *boundary = (starpu_block_interface_t *)_boundary;
 	check_load(block, boundary);
 	check_load(block, boundary);
 
 
 	/* We do a contiguous memory transfer */
 	/* We do a contiguous memory transfer */
@@ -482,10 +488,12 @@ static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
 
 
 /* CUDA version */
 /* CUDA version */
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_into_buffer_cuda(void *_block,
+					void *_boundary,
 					unsigned firstz)
 					unsigned firstz)
 {
 {
+	starpu_block_interface_t *block = (starpu_block_interface_t *)_block;
+	starpu_block_interface_t *boundary = (starpu_block_interface_t *)_boundary;
 	check_load(block, boundary);
 	check_load(block, boundary);
 
 
 	/* We do a contiguous memory transfer */
 	/* We do a contiguous memory transfer */
@@ -527,7 +535,7 @@ unsigned bottom_per_worker[STARPU_NMAXWORKERS];
 /* top save, CPU version */
 /* top save, CPU version */
 static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
 static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
 {
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
 	top_per_worker[workerid]++;
 	top_per_worker[workerid]++;
 
 
@@ -543,7 +551,7 @@ static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
 /* bottom save, CPU version */
 /* bottom save, CPU version */
 static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *arg)
 static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *arg)
 {
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
 	bottom_per_worker[workerid]++;
 	bottom_per_worker[workerid]++;
 
 
@@ -557,7 +565,7 @@ static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *a
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg)
 static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg)
 {
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
 	top_per_worker[workerid]++;
 	top_per_worker[workerid]++;
 
 
@@ -574,7 +582,7 @@ static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg
 /* bottom save, CUDA version */
 /* bottom save, CUDA version */
 static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *arg)
 static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *arg)
 {
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
 	bottom_per_worker[workerid]++;
 	bottom_per_worker[workerid]++;
 
 
@@ -590,7 +598,7 @@ static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 static void dummy_func_top_opencl(void *descr[] __attribute__((unused)), void *arg)
 static void dummy_func_top_opencl(void *descr[] __attribute__((unused)), void *arg)
 {
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
 	top_per_worker[workerid]++;
 	top_per_worker[workerid]++;
 
 
@@ -610,7 +618,7 @@ static void dummy_func_top_opencl(void *descr[] __attribute__((unused)), void *a
 /* bottom save, OPENCL version */
 /* bottom save, OPENCL version */
 static void dummy_func_bottom_opencl(void *descr[] __attribute__((unused)), void *arg)
 static void dummy_func_bottom_opencl(void *descr[] __attribute__((unused)), void *arg)
 {
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
 	bottom_per_worker[workerid]++;
 	bottom_per_worker[workerid]++;
 
 

+ 3 - 3
examples/stencil/stencil.c

@@ -132,9 +132,9 @@ static void init_problem(int argc, char **argv, int rank, int world_size)
 	display_memory_consumption(rank);
 	display_memory_consumption(rank);
 
 
 	who_runs_what_len = 2*niter;
 	who_runs_what_len = 2*niter;
-	who_runs_what = calloc(nbz * who_runs_what_len, sizeof(*who_runs_what));
-	who_runs_what_index = calloc(nbz, sizeof(*who_runs_what_index));
-	last_tick = calloc(nbz, sizeof(*last_tick));
+	who_runs_what = (int *) calloc(nbz * who_runs_what_len, sizeof(*who_runs_what));
+	who_runs_what_index = (int *) calloc(nbz, sizeof(*who_runs_what_index));
+	last_tick = (struct timeval *) calloc(nbz, sizeof(*last_tick));
 }
 }
 
 
 /*
 /*

+ 1 - 1
examples/tag_example/tag_example.c

@@ -30,7 +30,7 @@
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define TAG(i, j, iter)	((starpu_tag_t) ( ((uint64_t)(iter)<<48) |  ((uint64_t)(j)<<24) | (i)) )
 #define TAG(i, j, iter)	((starpu_tag_t) ( ((uint64_t)(iter)<<48) |  ((uint64_t)(j)<<24) | (i)) )
 
 
-starpu_codelet cl;
+starpu_codelet cl = {};
 
 
 #define Ni	64
 #define Ni	64
 #define Nj	32
 #define Nj	32

+ 1 - 1
examples/tag_example/tag_example2.c

@@ -30,7 +30,7 @@
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define TAG(i, iter)	((starpu_tag_t)  (((uint64_t)iter)<<32 | (i)) )
 #define TAG(i, iter)	((starpu_tag_t)  (((uint64_t)iter)<<32 | (i)) )
 
 
-starpu_codelet cl;
+starpu_codelet cl = {};
 
 
 #define Ni	64
 #define Ni	64
 #define Nk	256
 #define Nk	256

+ 1 - 1
examples/tag_example/tag_example3.c

@@ -30,7 +30,7 @@
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define TAG(i, iter)	((starpu_tag_t)  (((uint64_t)iter)<<32 | (i)) )
 #define TAG(i, iter)	((starpu_tag_t)  (((uint64_t)iter)<<32 | (i)) )
 
 
-starpu_codelet cl;
+starpu_codelet cl = {};
 
 
 #define Ni	64
 #define Ni	64
 #define Nk	256
 #define Nk	256

+ 2 - 2
examples/tag_example/tag_restartable.c

@@ -35,7 +35,7 @@
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define TAG(i, iter)	((starpu_tag_t)  (((uint64_t)((iter)%Nrolls))<<32 | (i)) )
 #define TAG(i, iter)	((starpu_tag_t)  (((uint64_t)((iter)%Nrolls))<<32 | (i)) )
 
 
-starpu_codelet cl;
+starpu_codelet cl = {};
 
 
 #define Ni	64
 #define Ni	64
 #define Nk	256
 #define Nk	256
@@ -134,7 +134,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 	FPRINTF(stderr, "ITER : %u\n", nk);
 	FPRINTF(stderr, "ITER : %u\n", nk);
 
 
 	for (i = 0; i < Nrolls; i++) {
 	for (i = 0; i < Nrolls; i++) {
-		tasks[i] = malloc(ni * sizeof(*tasks[i]));
+		tasks[i] = (struct starpu_task **) malloc(ni * sizeof(*tasks[i]));
 
 
 		create_task_grid(i);
 		create_task_grid(i);
 	}
 	}

+ 3 - 3
examples/top/hello_world_top.c

@@ -62,7 +62,7 @@ struct params {
 };
 };
 void cpu_func(void *buffers[], void *cl_arg)
 void cpu_func(void *buffers[], void *cl_arg)
 {
 {
-	struct params *params = cl_arg;
+	struct params *params = (struct params *) cl_arg;
 
 
 	//loosing time for top example...
 	//loosing time for top example...
 	int sum = 0;
 	int sum = 0;
@@ -82,14 +82,14 @@ void cpu_func(void *buffers[], void *cl_arg)
 
 
 void callback_name_changed(starputop_param* param)
 void callback_name_changed(starputop_param* param)
 {
 {
-	char* message = malloc(256);
+	char* message = (char *) malloc(256);
 	sprintf(message, "Name have been changed to %s", names[name_selected]);
 	sprintf(message, "Name have been changed to %s", names[name_selected]);
 	starputop_debug_log(message);
 	starputop_debug_log(message);
 }
 }
 
 
 void callback_number_addition_changed(starputop_param* param)
 void callback_number_addition_changed(starputop_param* param)
 {
 {
-	char* message = malloc(256);
+	char* message = (char *) malloc(256);
 	sprintf(message, "Number of addition is now %d", number_of_addition);
 	sprintf(message, "Number of addition is now %d", number_of_addition);
 
 
 	starputop_debug_log(message);
 	starputop_debug_log(message);

+ 2 - 0
gcc-plugin/README

@@ -8,3 +8,5 @@ tasks.
 Plug-ins are supported starting from GCC 4.5.
 Plug-ins are supported starting from GCC 4.5.
 
 
 To run the test suite, GNU Guile 1.8.x or 2.0.x is needed.
 To run the test suite, GNU Guile 1.8.x or 2.0.x is needed.
+
+When building from SVN, GNU Bison 2.5+ is required.

+ 5 - 0
gcc-plugin/examples/Makefile.am

@@ -17,4 +17,9 @@ noinst_PROGRAMS =				\
   matrix-mult
   matrix-mult
 
 
 AM_LDFLAGS = $(top_builddir)/src/libstarpu.la
 AM_LDFLAGS = $(top_builddir)/src/libstarpu.la
+
+AM_CPPFLAGS =						\
+  -I$(top_srcdir)/include				\
+  $(STARPU_OPENCL_CPPFLAGS) $(STARPU_CUDA_CPPFLAGS)
+
 AM_CFLAGS = -fplugin="$(builddir)/../src/.libs/starpu.so" -Wall
 AM_CFLAGS = -fplugin="$(builddir)/../src/.libs/starpu.so" -Wall

+ 8 - 36
gcc-plugin/examples/matrix-mult.c

@@ -187,34 +187,15 @@ main (int argc, char **argv)
   gettimeofday (&start_register, NULL);
   gettimeofday (&start_register, NULL);
   for (i = 0; i < nslicesy; i++)
   for (i = 0; i < nslicesy; i++)
     for (j = 0; j < nslicesz; j++)
     for (j = 0; j < nslicesz; j++)
-      {
-	/* TODO: Get rid of the `ptr' and `size' variables when the pragma
-	   parser supports arbitrary C expressions.  */
-
-	typeof (A) ptr = &A[i*zdim*bydim + j*bzdim*bydim];
-	size_t size = (bzdim * bydim);
-#pragma starpu register ptr size
-      }
+#pragma starpu register &A[i*zdim*bydim + j*bzdim*bydim] (bzdim * bydim)
 
 
   for (i = 0; i < nslicesz; i++)
   for (i = 0; i < nslicesz; i++)
-    {
-      for (j = 0; j < nslicesx; j++)
-	{
-	  typeof (B) ptr = &B[i*xdim*bzdim + j*bxdim*bzdim];
-	  size_t size = (bxdim * bzdim);
-#pragma starpu register ptr size
-	}
-    }
+    for (j = 0; j < nslicesx; j++)
+#pragma starpu register &B[i*xdim*bzdim + j*bxdim*bzdim] (bxdim * bzdim)
 
 
   for (i = 0; i < nslicesy; i++)
   for (i = 0; i < nslicesy; i++)
-    {
-      for (j = 0; j < nslicesx; j++)
-	{
-	  typeof (C) ptr = &C[i*xdim*bydim + j*bxdim*bydim];
-	  size_t size = (bxdim * bydim);
-#pragma starpu register ptr size
-	}
-    }
+    for (j = 0; j < nslicesx; j++)
+#pragma starpu register &C[i*xdim*bydim + j*bxdim*bydim] (bxdim * bydim)
 
 
 
 
   gettimeofday (&end_register, NULL);
   gettimeofday (&end_register, NULL);
@@ -256,25 +237,16 @@ main (int argc, char **argv)
   gettimeofday (&start_unregister, NULL);
   gettimeofday (&start_unregister, NULL);
   for (i = 0; i < nslicesy; i++)
   for (i = 0; i < nslicesy; i++)
     for (j = 0; j < nslicesz; j++)
     for (j = 0; j < nslicesz; j++)
-      {
-	typeof (A) ptr =  &A[i*zdim*bydim + j*bzdim*bydim];
-#pragma starpu unregister ptr
-      }
+#pragma starpu unregister &A[i*zdim*bydim + j*bzdim*bydim]
 
 
 
 
   for (i = 0; i < nslicesz; i++)
   for (i = 0; i < nslicesz; i++)
     for (j = 0; j < nslicesx; j++)
     for (j = 0; j < nslicesx; j++)
-      {
-	typeof (B) ptr = &B[i*xdim*bzdim + j*bxdim*bzdim];
-#pragma starpu unregister ptr
-      }
+#pragma starpu unregister &B[i*xdim*bzdim + j*bxdim*bzdim]
 
 
   for (i = 0; i < nslicesy; i++)
   for (i = 0; i < nslicesy; i++)
     for (j = 0; j < nslicesx; j++)
     for (j = 0; j < nslicesx; j++)
-      {
-	typeof (C) ptr = &C[i*xdim*bydim + j*bxdim*bydim];
-#pragma starpu unregister ptr
-      }
+#pragma starpu unregister &C[i*xdim*bydim + j*bxdim*bydim]
 
 
   gettimeofday (&end_unregister, NULL);
   gettimeofday (&end_unregister, NULL);
   gettimeofday (&end_all, NULL);
   gettimeofday (&end_all, NULL);

+ 5 - 2
gcc-plugin/src/Makefile.am

@@ -17,7 +17,10 @@
 # requires a name prefixed by `lib'.
 # requires a name prefixed by `lib'.
 pkglib_LTLIBRARIES = starpu.la
 pkglib_LTLIBRARIES = starpu.la
 
 
-starpu_la_SOURCES = starpu.c
+starpu_la_SOURCES = starpu.c c-expr.y
+
+AM_CPPFLAGS =						\
+  -I$(top_srcdir)/include				\
+  -I$(GCC_PLUGIN_INCLUDE_DIR) -Wall -DYYERROR_VERBOSE=1
 
 
-AM_CPPFLAGS = -I$(GCC_PLUGIN_INCLUDE_DIR) -Wall
 AM_LDFLAGS = -module
 AM_LDFLAGS = -module

+ 234 - 0
gcc-plugin/src/c-expr.y

@@ -0,0 +1,234 @@
+/* GCC-StarPU
+   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+
+   GCC-StarPU is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   GCC-StarPU is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC-StarPU.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* Parser for simple C expressions in pragmas.  */
+
+%define api.pure
+%parse-param { location_t loc }
+%parse-param { const char *pragma }
+%parse-param { tree *seq }
+%debug
+
+%{
+  #include <starpu-gcc-config.h>
+
+  #include <gcc-plugin.h>
+  #include <plugin.h>
+  #include <tree.h>
+  #include <cpplib.h>
+
+  #ifdef HAVE_C_FAMILY_C_COMMON_H
+  # include <c-family/c-common.h>
+  #elif HAVE_C_COMMON_H
+  # include <c-common.h>
+  #endif
+
+  #ifdef HAVE_C_FAMILY_C_PRAGMA_H
+  # include <c-family/c-pragma.h>
+  #elif HAVE_C_PRAGMA_H
+  # include <c-pragma.h>
+  #endif
+
+  #if !HAVE_DECL_BUILD_ARRAY_REF
+  /* This declaration is missing in GCC 4.6.1.  */
+  extern tree build_array_ref (location_t loc, tree array, tree index);
+  #endif
+
+
+  #define YYSTYPE tree
+  #define YYLTYPE location_t
+
+  static void
+  yyerror (location_t loc, const char *pragma, tree *seq,
+	   char const *message)
+  {
+    error_at (loc, "parse error in pragma %qs: %s", pragma, message);
+  }
+
+  /* Return SOMETHING if it's a VAR_DECL, an identifier bound to a VAR_DECL,
+     or another object; raise an error otherwise.  */
+
+  static tree
+  ensure_bound (location_t loc, tree something)
+  {
+    gcc_assert (something != NULL_TREE);
+
+    if (DECL_P (something))
+      return something;
+    else if (TREE_CODE (something) == IDENTIFIER_NODE)
+      {
+	tree var = lookup_name (something);
+	if (var == NULL_TREE)
+	  {
+	    error_at (loc, "unbound variable %qE", something);
+	    return error_mark_node;
+	  }
+	else
+	  return var;
+      }
+
+    return something;
+  }
+
+  static tree
+  build_component_ref (location_t loc, tree what, tree field)
+  {
+    sorry ("struct field access not implemented yet"); /* XXX */
+    return error_mark_node;
+  }
+%}
+
+%code {
+  /* Mapping of libcpp token names to Bison-generated token names.  This is
+     not ideal but Bison cannot be told to use the `enum cpp_ttype'
+     values.  */
+  static const int cpplib_bison_token_map[] =
+    {
+      [CPP_NAME] = YCPP_NAME,
+      [CPP_NUMBER] = YCPP_NUM,
+      [CPP_AND] = YCPP_AND,
+      [CPP_OPEN_SQUARE] = YCPP_OPEN_SQUARE,
+      [CPP_CLOSE_SQUARE] = YCPP_CLOSE_SQUARE,
+      [CPP_OPEN_PAREN] = YCPP_OPEN_PAREN,
+      [CPP_CLOSE_PAREN] = YCPP_CLOSE_PAREN,
+      [CPP_PLUS] = YCPP_PLUS,
+      [CPP_MINUS] = YCPP_MINUS,
+      [CPP_MULT] = YCPP_MULT,
+      [CPP_DIV] = YCPP_DIV,
+      [CPP_DOT] = YCPP_DOT,
+      [CPP_DEREF] = YCPP_DEREF
+    };
+
+  static int
+  yylex (YYSTYPE *lvalp)
+  {
+    int ret;
+
+    ret = pragma_lex (lvalp);
+    if (ret < sizeof cpplib_bison_token_map / sizeof cpplib_bison_token_map[0])
+      ret = cpplib_bison_token_map[ret];
+    else
+      ret = -1;
+
+    return ret;
+  }
+}
+
+%token YCPP_NAME "identifier"
+%token YCPP_NUM "integer"
+%token YCPP_AND "&"
+%token YCPP_OPEN_SQUARE "["
+%token YCPP_CLOSE_SQUARE "]"
+%token YCPP_OPEN_PAREN "("
+%token YCPP_CLOSE_PAREN ")"
+%token YCPP_PLUS "+"
+%token YCPP_MINUS "-"
+%token YCPP_MULT "*"
+%token YCPP_DIV "/"
+%token YCPP_DOT "."
+%token YCPP_DEREF "->"
+
+%% /* Grammar rules.  */
+
+ /* Always return a TREE_LIST rather than a raw chain, because the elements
+    of that list may be already chained for other purposes---e.g., PARM_DECLs
+    of a function are chained together.  */
+
+sequence: expression {
+          gcc_assert (*seq == NULL_TREE);
+	  *seq = tree_cons (NULL_TREE, $1, NULL_TREE);
+	  $$ = *seq;
+      }
+      | expression sequence {
+	  gcc_assert ($2 == *seq);
+	  *seq = tree_cons (NULL_TREE, $1, $2);
+	  $$ = *seq;
+      }
+;
+
+expression: identifier | binary_expression | unary_expression;
+
+/* XXX: `ensure_bound' below leads to errors raised even for non-significant
+   arguments---e.g., junk after pragma.  */
+identifier: YCPP_NAME  { $$ = ensure_bound (loc, $1); }
+;
+
+binary_expression: multiplicative_expression
+     | additive_expression
+;
+
+multiplicative_expression: multiplicative_expression YCPP_MULT cast_expression {
+       $$ = build_binary_op (UNKNOWN_LOCATION, MULT_EXPR, $1, $3, 0);
+     }
+     | multiplicative_expression YCPP_DIV cast_expression {
+       $$ = build_binary_op (UNKNOWN_LOCATION, TRUNC_DIV_EXPR, $1, $3, 0);
+     }
+     | cast_expression
+;
+
+additive_expression: multiplicative_expression
+     | additive_expression YCPP_PLUS multiplicative_expression {
+       $$ = build_binary_op (UNKNOWN_LOCATION, PLUS_EXPR, $1, $3, 0);
+     }
+     | additive_expression YCPP_MINUS multiplicative_expression {
+       $$ = build_binary_op (UNKNOWN_LOCATION, MINUS_EXPR, $1, $3, 0);
+     }
+;
+
+cast_expression: unary_expression
+		 /* XXX: No support for '(' TYPE-NAME ')' UNARY-EXPRESSION.  */
+;
+
+unary_expression:
+       primary_expression
+     | postfix_expression
+     | YCPP_AND cast_expression {
+       $$ = build_addr (ensure_bound (loc, $2), current_function_decl);
+     }
+;
+
+postfix_expression:
+       primary_expression
+     | postfix_expression YCPP_OPEN_SQUARE expression YCPP_CLOSE_SQUARE {
+#if 1
+	 /* Build the array ref with proper error checking.  */
+	 $$ = build_array_ref (loc, ensure_bound (loc, $1),
+			       ensure_bound (loc, $3));
+#else /* TIMTOWTDI */
+	 $$ = build_indirect_ref (loc,
+	       build_binary_op (loc, PLUS_EXPR, ensure_bound (loc, $1), ensure_bound (loc, $3), 0),
+		RO_ARRAY_INDEXING);
+#endif
+     }
+     | postfix_expression YCPP_DOT identifier {
+        $$ = build_component_ref (loc, ensure_bound (loc, $1), $2);
+     }
+     | postfix_expression YCPP_DEREF identifier {
+        $$ = build_component_ref (loc,
+               build_indirect_ref (loc, ensure_bound (loc, $1), RO_ARRAY_INDEXING),
+               $2);
+     }
+;
+
+primary_expression: identifier
+     | constant
+     | YCPP_OPEN_PAREN expression YCPP_CLOSE_PAREN { $$ = $2; }
+;
+
+constant: YCPP_NUM { $$ = $1; }
+;
+
+%%

+ 8 - 0
gcc-plugin/src/starpu-gcc-config.h.in

@@ -21,3 +21,11 @@
 #undef HAVE_DECL_BUILD_CALL_EXPR_LOC_ARRAY
 #undef HAVE_DECL_BUILD_CALL_EXPR_LOC_ARRAY
 
 
 #undef HAVE_DECL_BUILD_CALL_EXPR_LOC_VEC
 #undef HAVE_DECL_BUILD_CALL_EXPR_LOC_VEC
+
+#undef HAVE_DECL_BUILD_ARRAY_REF
+
+#undef HAVE_C_FAMILY_C_COMMON_H
+#undef HAVE_C_COMMON_H
+
+#undef HAVE_C_FAMILY_C_PRAGMA_H
+#undef HAVE_C_PRAGMA_H

+ 148 - 96
gcc-plugin/src/starpu.c

@@ -30,8 +30,19 @@ int plugin_is_GPL_compatible;
 #include <cpplib.h>
 #include <cpplib.h>
 #include <tree.h>
 #include <tree.h>
 #include <tree-iterator.h>
 #include <tree-iterator.h>
-#include <c-common.h>
-#include <c-pragma.h>
+
+#ifdef HAVE_C_FAMILY_C_COMMON_H
+# include <c-family/c-common.h>
+#elif HAVE_C_COMMON_H
+# include <c-common.h>
+#endif
+
+#ifdef HAVE_C_FAMILY_C_PRAGMA_H
+# include <c-family/c-pragma.h>
+#elif HAVE_C_PRAGMA_H
+# include <c-pragma.h>
+#endif
+
 #include <tm.h>
 #include <tm.h>
 #include <gimple.h>
 #include <gimple.h>
 #include <tree-pass.h>
 #include <tree-pass.h>
@@ -75,6 +86,9 @@ static tree build_codelet_declaration (tree task_decl);
 static tree build_task_body (const_tree task_decl);
 static tree build_task_body (const_tree task_decl);
 static tree build_pointer_lookup (tree pointer);
 static tree build_pointer_lookup (tree pointer);
 
 
+static bool task_p (const_tree decl);
+static bool task_implementation_p (const_tree decl);
+
 
 
 /* Lookup the StarPU function NAME in the global scope and store the result
 /* Lookup the StarPU function NAME in the global scope and store the result
    in VAR (this can't be done from `lower_starpu'.)  */
    in VAR (this can't be done from `lower_starpu'.)  */
@@ -119,7 +133,9 @@ build_call_expr_loc_vec (location_t loc, tree fndecl, VEC(tree,gc) *vec)
 
 
 
 
 /* Build a reference to the INDEXth element of ARRAY.  `build_array_ref' is
 /* Build a reference to the INDEXth element of ARRAY.  `build_array_ref' is
-   not exported, so we roll our own.  */
+   not exported, so we roll our own.
+   FIXME: This version may not work for array types and doesn't do as much
+   type-checking as `build_array_ref'.  */
 
 
 static tree
 static tree
 array_ref (tree array, size_t index)
 array_ref (tree array, size_t index)
@@ -347,48 +363,45 @@ handle_pragma_shutdown (struct cpp_reader *reader)
 static void
 static void
 handle_pragma_wait (struct cpp_reader *reader)
 handle_pragma_wait (struct cpp_reader *reader)
 {
 {
-  tree fndecl;
+  if (task_implementation_p (current_function_decl))
+    {
+      location_t loc;
 
 
-  fndecl = lookup_name (get_identifier ("starpu_task_wait_for_all"));
-  gcc_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
+      loc = cpp_peek_token (reader, 0)->src_loc;
+
+      /* TODO: In the future we could generate a task for the continuation
+	 and have it depend on what's before here.  */
+      error_at (loc, "task implementation is not allowed to wait");
+    }
+  else
+    {
+      tree fndecl;
+
+      fndecl = lookup_name (get_identifier ("starpu_task_wait_for_all"));
+      gcc_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
 
 
-  add_stmt (build_call_expr (fndecl, 0));
+      add_stmt (build_call_expr (fndecl, 0));
+    }
 }
 }
 
 
-/* Parse a pointer variable for PRAGMA, raising the appropriate error if
-   needed.  Return the pointer variable on success, NULL_TREE otherwise.  */
+/* The minimal C expression parser.  */
+
+extern int yyparse (location_t, const char *, tree *);
+extern int yydebug;
+
+/* Parse expressions from the CPP reader for PRAGMA, which is located at LOC.
+   Return a TREE_LIST of C expressions.  */
 
 
 static tree
 static tree
-read_pragma_pointer_variable (const char *pragma, location_t loc)
+read_pragma_expressions (const char *pragma, location_t loc)
 {
 {
-  tree token, var = NULL_TREE;
-  enum cpp_ttype type;
-
-  type = pragma_lex (&token);
-  if (type == CPP_EOF)
-    error_at (loc, "unterminated %<starpu %s%> pragma", pragma);
-  else if (type != CPP_NAME)
-    error_at (loc, "identifier expected");
-  else
-    {
-      /* Get the variable name.  */
-      tree var_name = token;
-      tree decl = lookup_name (var_name);
-
-      if (decl == NULL_TREE || !DECL_P (decl))
-	error_at (loc, "unbound variable %qE", var_name);
-      else if (!POINTER_TYPE_P (TREE_TYPE (decl))
-	       && TREE_CODE (TREE_TYPE (decl)) != ARRAY_TYPE)
-	error_at (loc, "%qE is neither a pointer nor an array", var_name);
-      else
-	{
-	  var = decl;
-	  TREE_USED (var) = true;
-	  DECL_READ_P (var) = true;
-	}
-    }
+  tree expr = NULL_TREE;
+
+  if (yyparse (loc, pragma, &expr))
+    /* Parse error or memory exhaustion.  */
+    expr = NULL_TREE;
 
 
-  return var;
+  return expr;
 }
 }
 
 
 /* Process `#pragma starpu register VAR [COUNT]' and emit the corresponding
 /* Process `#pragma starpu register VAR [COUNT]' and emit the corresponding
@@ -397,19 +410,37 @@ read_pragma_pointer_variable (const char *pragma, location_t loc)
 static void
 static void
 handle_pragma_register (struct cpp_reader *reader)
 handle_pragma_register (struct cpp_reader *reader)
 {
 {
-  tree token, var;
+  tree args, ptr, count_arg;
   location_t loc;
   location_t loc;
-  enum cpp_ttype type;
 
 
   loc = cpp_peek_token (reader, 0)->src_loc;
   loc = cpp_peek_token (reader, 0)->src_loc;
 
 
-  var = read_pragma_pointer_variable ("register", loc);
-  if (var == NULL_TREE)
+  args = read_pragma_expressions ("register", loc);
+  if (args == NULL_TREE)
+    /* Parse error, presumably already handled by the parser.  */
     return;
     return;
 
 
-  if (TREE_CODE (TREE_TYPE (var)) == ARRAY_TYPE
-      && !DECL_EXTERNAL (var)
-      && !TREE_STATIC (var)
+  /* First argument should be a pointer expression.  */
+  ptr = TREE_VALUE (args);
+  args = TREE_CHAIN (args);
+
+  if (ptr == error_mark_node)
+    return;
+
+  if (!POINTER_TYPE_P (TREE_TYPE (ptr))
+      && TREE_CODE (TREE_TYPE (ptr)) != ARRAY_TYPE)
+    {
+      error_at (loc, "%qE is neither a pointer nor an array", ptr);
+      return;
+    }
+
+  TREE_USED (ptr) = true;
+  if (DECL_P (ptr))
+    DECL_READ_P (ptr) = true;
+
+  if (TREE_CODE (TREE_TYPE (ptr)) == ARRAY_TYPE
+      && !DECL_EXTERNAL (ptr)
+      && !TREE_STATIC (ptr)
       && !MAIN_NAME_P (DECL_NAME (current_function_decl)))
       && !MAIN_NAME_P (DECL_NAME (current_function_decl)))
     warning_at (loc, 0, "using an on-stack array as a task input "
     warning_at (loc, 0, "using an on-stack array as a task input "
 		"considered unsafe");
 		"considered unsafe");
@@ -417,9 +448,9 @@ handle_pragma_register (struct cpp_reader *reader)
   /* Determine the number of elements in the vector.  */
   /* Determine the number of elements in the vector.  */
   tree count = NULL_TREE;
   tree count = NULL_TREE;
 
 
-  if (TREE_CODE (TREE_TYPE (var)) == ARRAY_TYPE)
+  if (TREE_CODE (TREE_TYPE (ptr)) == ARRAY_TYPE)
     {
     {
-      tree domain = TYPE_DOMAIN (TREE_TYPE (var));
+      tree domain = TYPE_DOMAIN (TREE_TYPE (ptr));
 
 
       if (domain != NULL_TREE)
       if (domain != NULL_TREE)
 	{
 	{
@@ -435,52 +466,44 @@ handle_pragma_register (struct cpp_reader *reader)
 	}
 	}
     }
     }
 
 
+  /* Second argument is optional but should be an integer.  */
+  count_arg = (args == NULL_TREE) ? NULL_TREE : TREE_VALUE (args);
+  if (args != NULL_TREE)
+    {
+      args = TREE_CHAIN (args);
+      TREE_CHAIN (count_arg) = NULL_TREE;
+    }
 
 
-  type = pragma_lex (&token);
-  if (type == CPP_EOF)
+  if (count_arg == NULL_TREE)
     {
     {
-      /* End of line reached: don't consume TOKEN and check whether the array
-	 size was determined.  */
+      /* End of line reached: check whether the array size was
+	 determined.  */
       if (count == NULL_TREE)
       if (count == NULL_TREE)
 	{
 	{
-	  error_at (loc, "cannot determine size of array %qE", DECL_NAME (var));
+	  error_at (loc, "cannot determine size of array %qE", ptr);
 	  return;
 	  return;
 	}
 	}
     }
     }
+  else if (count_arg == error_mark_node)
+    /* COUNT_ARG could not be parsed and an error was already reported.  */
+    return;
+  else if (!INTEGRAL_TYPE_P (TREE_TYPE (count_arg)))
+    {
+      error_at (loc, "%qE is not an integer", count_arg);
+      return;
+    }
   else
   else
     {
     {
-      /* TOKEN may be a number or a integer variable.  */
-
-      tree count_arg;
-
-      if (TREE_CODE (token) == IDENTIFIER_NODE)
-	{
-	  count_arg = lookup_name (token);
-	  if (count_arg == NULL_TREE)
-	    {
-	      error_at (loc, "unbound variable %qE", token);
-	      return;
-	    }
-	  else if (!INTEGRAL_TYPE_P (TREE_TYPE (count_arg)))
-	    {
-	      error_at (loc, "integer expected");
-	      return;
-	    }
-
-	  TREE_USED (count_arg) = true;
-	  DECL_READ_P (count_arg) = true;
-	}
-      else if (TREE_CODE (token) != INTEGER_CST)
-	error_at (loc, "integer expected");
-      else
-	count_arg = token;
+      TREE_USED (count_arg) = true;
+      if (DECL_P (count_arg))
+	DECL_READ_P (count_arg) = true;
 
 
       if (count != NULL_TREE)
       if (count != NULL_TREE)
 	{
 	{
 	  /* The number of elements of this array was already determined.  */
 	  /* The number of elements of this array was already determined.  */
 	  inform (loc,
 	  inform (loc,
 		  "element count can be omitted for bounded array %qE",
 		  "element count can be omitted for bounded array %qE",
-		  DECL_NAME (var));
+		  ptr);
 
 
 	  if (count_arg != NULL_TREE)
 	  if (count_arg != NULL_TREE)
 	    {
 	    {
@@ -489,7 +512,7 @@ handle_pragma_register (struct cpp_reader *reader)
 		  if (!tree_int_cst_equal (count, count_arg))
 		  if (!tree_int_cst_equal (count, count_arg))
 		    error_at (loc, "specified element count differs "
 		    error_at (loc, "specified element count differs "
 			      "from actual size of array %qE",
 			      "from actual size of array %qE",
-			      DECL_NAME (var));
+			      ptr);
 		}
 		}
 	      else
 	      else
 		/* Using a variable to determine the array size whereas the
 		/* Using a variable to determine the array size whereas the
@@ -501,19 +524,24 @@ handle_pragma_register (struct cpp_reader *reader)
 	}
 	}
       else
       else
 	count = count_arg;
 	count = count_arg;
-
-      if (pragma_lex (&token) != CPP_EOF)
-	error_at (loc, "junk after %<starpu register%> pragma");
     }
     }
 
 
-  /* If VAR is an array, take its address.  */
+  /* Any remaining args?  */
+  if (args != NULL_TREE)
+    error_at (loc, "junk after %<starpu register%> pragma");
+
+  /* If PTR is an array, take its address.  */
   tree pointer =
   tree pointer =
-    POINTER_TYPE_P (TREE_TYPE (var))
-    ? var
-    : build_addr (var, current_function_decl);
+    POINTER_TYPE_P (TREE_TYPE (ptr))
+    ? ptr
+    : build_addr (ptr, current_function_decl);
 
 
   /* Introduce a local variable to hold the handle.  */
   /* Introduce a local variable to hold the handle.  */
-  tree handle_var = create_tmp_var (ptr_type_node, ".handle");
+  tree handle_var = build_decl (loc, VAR_DECL, create_tmp_var_name (".handle"),
+				ptr_type_node);
+  DECL_CONTEXT (handle_var) = current_function_decl;
+  DECL_ARTIFICIAL (handle_var) = true;
+  DECL_INITIAL (handle_var) = NULL_TREE;
 
 
   tree register_fn =
   tree register_fn =
     lookup_name (get_identifier ("starpu_vector_data_register"));
     lookup_name (get_identifier ("starpu_vector_data_register"));
@@ -525,9 +553,13 @@ handle_pragma_register (struct cpp_reader *reader)
 		     build_addr (handle_var, current_function_decl),
 		     build_addr (handle_var, current_function_decl),
 		     build_zero_cst (uintptr_type_node), /* home node */
 		     build_zero_cst (uintptr_type_node), /* home node */
 		     pointer, count,
 		     pointer, count,
-		     size_in_bytes (TREE_TYPE (TREE_TYPE (var))));
+		     size_in_bytes (TREE_TYPE (TREE_TYPE (ptr))));
 
 
-  add_stmt (call);
+  tree bind;
+  bind = build3 (BIND_EXPR, void_type_node, handle_var, call,
+		 NULL_TREE);
+
+  add_stmt (bind);
 }
 }
 
 
 /* Process `#pragma starpu acquire VAR' and emit the corresponding
 /* Process `#pragma starpu acquire VAR' and emit the corresponding
@@ -539,16 +571,26 @@ handle_pragma_acquire (struct cpp_reader *reader)
   static tree acquire_fn;
   static tree acquire_fn;
   LOOKUP_STARPU_FUNCTION (acquire_fn, "starpu_data_acquire");
   LOOKUP_STARPU_FUNCTION (acquire_fn, "starpu_data_acquire");
 
 
-  tree token, var;
+  tree args, var;
   location_t loc;
   location_t loc;
 
 
   loc = cpp_peek_token (reader, 0)->src_loc;
   loc = cpp_peek_token (reader, 0)->src_loc;
 
 
-  var = read_pragma_pointer_variable ("acquire", loc);
-  if (var == NULL_TREE)
+  args = read_pragma_expressions ("acquire", loc);
+  if (args == NULL_TREE)
     return;
     return;
 
 
-  if (pragma_lex (&token) != CPP_EOF)
+  var = TREE_VALUE (args);
+
+  if (var == error_mark_node)
+    return;
+  else if (TREE_CODE (TREE_TYPE (var)) != POINTER_TYPE
+	   && TREE_CODE (TREE_TYPE (var)) != ARRAY_TYPE)
+    {
+      error_at (loc, "%qE is neither a pointer nor an array", var);
+      return;
+    }
+  else if (TREE_CHAIN (var) != NULL_TREE)
     error_at (loc, "junk after %<starpu acquire%> pragma");
     error_at (loc, "junk after %<starpu acquire%> pragma");
 
 
   /* If VAR is an array, take its address.  */
   /* If VAR is an array, take its address.  */
@@ -573,16 +615,26 @@ handle_pragma_unregister (struct cpp_reader *reader)
   static tree unregister_fn;
   static tree unregister_fn;
   LOOKUP_STARPU_FUNCTION (unregister_fn, "starpu_data_unregister");
   LOOKUP_STARPU_FUNCTION (unregister_fn, "starpu_data_unregister");
 
 
-  tree token, var;
+  tree args, var;
   location_t loc;
   location_t loc;
 
 
   loc = cpp_peek_token (reader, 0)->src_loc;
   loc = cpp_peek_token (reader, 0)->src_loc;
 
 
-  var = read_pragma_pointer_variable ("unregister", loc);
-  if (var == NULL_TREE)
+  args = read_pragma_expressions ("unregister", loc);
+  if (args == NULL_TREE)
     return;
     return;
 
 
-  if (pragma_lex (&token) != CPP_EOF)
+  var = TREE_VALUE (args);
+
+  if (var == error_mark_node)
+    return;
+  else if (TREE_CODE (TREE_TYPE (var)) != POINTER_TYPE
+	   && TREE_CODE (TREE_TYPE (var)) != ARRAY_TYPE)
+    {
+      error_at (loc, "%qE is neither a pointer nor an array", var);
+      return;
+    }
+  else if (TREE_CHAIN (args) != NULL_TREE)
     error_at (loc, "junk after %<starpu unregister%> pragma");
     error_at (loc, "junk after %<starpu unregister%> pragma");
 
 
   /* If VAR is an array, take its address.  */
   /* If VAR is an array, take its address.  */

+ 1 - 0
gcc-plugin/tests/Makefile.am

@@ -28,6 +28,7 @@ gcc_tests =					\
   pointer-tasks.c				\
   pointer-tasks.c				\
   no-initialize.c				\
   no-initialize.c				\
   lib-user.c					\
   lib-user.c					\
+  wait-errors.c					\
   shutdown-errors.c
   shutdown-errors.c
 
 
 dist_noinst_HEADERS = mocks.h
 dist_noinst_HEADERS = mocks.h

+ 2 - 2
gcc-plugin/tests/acquire-errors.c

@@ -28,8 +28,8 @@ main (int argc, char *argv[])
 
 
 #pragma starpu register x
 #pragma starpu register x
 
 
-#pragma starpu acquire /* (error "unterminated") */
-#pragma starpu acquire 123 /* (error "identifier expected") */
+#pragma starpu acquire /* (error "parse error") */
+#pragma starpu acquire 123 /* (error "neither a pointer nor an array") */
 #pragma starpu acquire does_not_exit /* (error "unbound variable") */
 #pragma starpu acquire does_not_exit /* (error "unbound variable") */
 
 
 #pragma starpu acquire argc /* (error "neither a pointer nor an array") */
 #pragma starpu acquire argc /* (error "neither a pointer nor an array") */

+ 4 - 1
gcc-plugin/tests/lib-user.c

@@ -28,7 +28,10 @@ main (int argc, char *argv[])
 {
 {
 #pragma starpu initialize
 #pragma starpu initialize
 
 
-  static const char x[] = { 0, 1, 2, 3, 4, 5 };
+  /* Align X so that the assumptions behind `dummy_pointer_to_handle'
+     hold.  */
+  static const char x[] __attribute__ ((aligned (8))) = { 0, 1, 2, 3, 4, 5 };
+
   float y[sizeof x];
   float y[sizeof x];
 
 
   static const char forty_two = 42;
   static const char forty_two = 42;

+ 8 - 2
gcc-plugin/tests/register-errors.c

@@ -23,9 +23,9 @@ main (int argc, char *argv[])
 {
 {
 #pragma starpu initialize
 #pragma starpu initialize
 
 
-#pragma starpu register /* (error "unterminated") */
+#pragma starpu register /* (error "parse error") */
 
 
-#pragma starpu register argv 234 junk right here /* (error "junk after") */
+#pragma starpu register argv 234 junk here /* (error "junk after") *//* (error "unbound") *//* (error "unbound") */
 
 
   static int x[123] __attribute__ ((unused));
   static int x[123] __attribute__ ((unused));
 #pragma starpu register x 234 /* (note "can be omitted") *//* (error "differs from actual size") */
 #pragma starpu register x 234 /* (note "can be omitted") *//* (error "differs from actual size") */
@@ -37,9 +37,15 @@ main (int argc, char *argv[])
 #pragma starpu register argv does_not_exit /* (error "unbound variable") */
 #pragma starpu register argv does_not_exit /* (error "unbound variable") */
 
 
 #pragma starpu register argv /* (error "cannot determine size") */
 #pragma starpu register argv /* (error "cannot determine size") */
+#pragma starpu register &argv[2] /* (error "cannot determine size") */
+#pragma starpu register &x[2] /* (error "cannot determine size") */
 
 
 #pragma starpu register argc /* (error "neither a pointer nor an array") */
 #pragma starpu register argc /* (error "neither a pointer nor an array") */
 
 
+#pragma starpu register argv[2][3] 3 /* (error "neither a pointer nor an array") */
+
+#pragma starpu register argv[does_not_exist] 3 /* (error "unbound variable") */
+
   char **p = argv;
   char **p = argv;
   size_t ps = argc;
   size_t ps = argc;
 #pragma starpu register p ps  /* No unused variable warning, please! */
 #pragma starpu register p ps  /* No unused variable warning, please! */

+ 39 - 1
gcc-plugin/tests/register.c

@@ -31,6 +31,15 @@ foo (void)
 #pragma starpu register x /* (warning "considered unsafe") */
 #pragma starpu register x /* (warning "considered unsafe") */
 }
 }
 
 
+static void
+bar (float *p, int s)
+{
+  expected_register_arguments.pointer = p;
+  expected_register_arguments.elements = s;
+  expected_register_arguments.element_size = sizeof *p;
+#pragma starpu register p s
+}
+
 int
 int
 main (int argc, char *argv[])
 main (int argc, char *argv[])
 {
 {
@@ -39,6 +48,7 @@ main (int argc, char *argv[])
   int x[123];
   int x[123];
   double *y;
   double *y;
   static char z[345];
   static char z[345];
+  static float m[7][42];
   short w[] = { 1, 2, 3 };
   short w[] = { 1, 2, 3 };
   size_t y_size = 234;
   size_t y_size = 234;
 
 
@@ -84,8 +94,36 @@ main (int argc, char *argv[])
 #undef N
 #undef N
 
 
   foo ();
   foo ();
+  bar ((float *) argv, argc);
+
+  expected_register_arguments.pointer = argv;
+  expected_register_arguments.elements = argc;
+  expected_register_arguments.element_size = sizeof argv[0];
+
+  int chbouib = argc;
+#pragma starpu register argv chbouib
+
+  expected_register_arguments.pointer = &argv[2];
+  expected_register_arguments.elements = 3;
+  expected_register_arguments.element_size = sizeof argv[0];
+#pragma starpu register &argv[2] 3
+
+  expected_register_arguments.pointer = &argv[argc + 3 / 2];
+  expected_register_arguments.elements = argc * 4;
+  expected_register_arguments.element_size = sizeof argv[0];
+#pragma starpu register &argv[argc + 3 / 2] (argc * 4)
+
+  expected_register_arguments.pointer = &y[y_size / 2];
+  expected_register_arguments.elements = (y_size / 2 - 7);
+  expected_register_arguments.element_size = sizeof y[0];
+#pragma starpu register &y[y_size / 2] (y_size / 2 - 7)
+
+  expected_register_arguments.pointer = m[6];
+  expected_register_arguments.elements = 42;
+  expected_register_arguments.element_size = sizeof m[0][0];
+#pragma starpu register m[6]
 
 
-  assert (data_register_calls == 8);
+  assert (data_register_calls == 14);
 
 
   free (y);
   free (y);
 
 

+ 11 - 5
gcc-plugin/tests/run-test.in

@@ -85,11 +85,11 @@ exec "${GUILE-@GUILE@}" -l "$0"    \
     ;; the real file name.
     ;; the real file name.
     ,(string-append "-fplugin=" %builddir "/../src/.libs/starpu.so")
     ,(string-append "-fplugin=" %builddir "/../src/.libs/starpu.so")
 
 
-    "-g" "-O2"
+    "-g"
     "-fdump-tree-gimple" "-Wall"))
     "-fdump-tree-gimple" "-Wall"))
 
 
 (define %default-ldflags
 (define %default-ldflags
-  `(,(string-append "-L" %srcdir "/../../src")))
+  `(,(string-append "-L" %builddir "/../../src")))
 
 
 (define %libtool
 (define %libtool
   (string-append %builddir "/../../libtool"))
   (string-append %builddir "/../../libtool"))
@@ -105,7 +105,7 @@ compiler status and the list of lines printed on stdout/stderr."
          (mode     (if compile?
          (mode     (if compile?
                        "compile"
                        "compile"
                        "link"))
                        "link"))
-         (command  (format #f "LANG=C ~a --mode=~a ~a ~{~a ~} \"~a\" ~{~a ~} 2>&1"
+         (command  (format #f "LC_ALL=C ~a --mode=~a ~a ~{~a ~} \"~a\" ~{~a ~} 2>&1"
                            %libtool mode cc cflags file ldflags))
                            %libtool mode cc cflags file ldflags))
          (pipe     (begin
          (pipe     (begin
                      (log "running `~a'" command)
                      (log "running `~a'" command)
@@ -311,7 +311,7 @@ otherwise."
                 file (length dependencies) dependencies))
                 file (length dependencies) dependencies))
 
 
     (and (every (cut compile/match <> cc cflags ldflags)
     (and (every (cut compile/match <> cc cflags ldflags)
-                dependencies)
+                (map (cut string-append %srcdir "/" <>) dependencies))
          (let*-values (((goal)
          (let*-values (((goal)
                         (if error-expected?
                         (if error-expected?
                             'compile
                             'compile
@@ -421,6 +421,12 @@ otherwise."
 ;;;
 ;;;
 
 
 (define (build/run . file)
 (define (build/run . file)
-  (exit (every (cut compile/match <> %gcc %default-cflags %default-ldflags) file)))
+  (exit (every (lambda (file)
+                 ;; For each file, check that everything works both with and
+                 ;; without optimizations.
+                 (every (cut compile/match file %gcc <> %default-ldflags)
+                        `((,"-O0" ,@%default-cflags)
+                          (,"-O2" ,@%default-cflags))))
+               file)))
 
 
 ;;; run-test.in ends here
 ;;; run-test.in ends here

+ 2 - 2
gcc-plugin/tests/unregister-errors.c

@@ -28,8 +28,8 @@ main (int argc, char *argv[])
 
 
 #pragma starpu register x
 #pragma starpu register x
 
 
-#pragma starpu unregister /* (error "unterminated") */
-#pragma starpu unregister 123 /* (error "identifier expected") */
+#pragma starpu unregister /* (error "parse error") */
+#pragma starpu unregister 123 /* (error "neither a pointer nor an array") */
 #pragma starpu unregister does_not_exit /* (error "unbound variable") */
 #pragma starpu unregister does_not_exit /* (error "unbound variable") */
 
 
 #pragma starpu unregister argc /* (error "neither a pointer nor an array") */
 #pragma starpu unregister argc /* (error "neither a pointer nor an array") */

+ 27 - 0
gcc-plugin/tests/wait-errors.c

@@ -0,0 +1,27 @@
+/* GCC-StarPU
+   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+
+   GCC-StarPU is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   GCC-StarPU is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC-StarPU.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* The task under test.  */
+
+void task (int x, char y, int z) __attribute__ ((task));
+static void task_cpu (int x, char y, int z)
+  __attribute__ ((task_implementation ("cpu", task)));
+
+static void
+task_cpu (int x, char y, int z)
+{
+#pragma starpu wait /* (error "not allowed") */
+}

+ 1 - 0
include/starpu_config.h.in

@@ -46,6 +46,7 @@
 #undef STARPU_MAXCUDADEVS
 #undef STARPU_MAXCUDADEVS
 #undef STARPU_MAXOPENCLDEVS
 #undef STARPU_MAXOPENCLDEVS
 #undef STARPU_NMAXWORKERS
 #undef STARPU_NMAXWORKERS
+#undef STARPU_MAXIMPLEMENTATIONS
 
 
 #undef STARPU_HAVE_LIBNUMA
 #undef STARPU_HAVE_LIBNUMA
 
 

+ 2 - 0
include/starpu_data_filters.h

@@ -19,6 +19,8 @@
 #ifndef __STARPU_DATA_FILTERS_H__
 #ifndef __STARPU_DATA_FILTERS_H__
 #define __STARPU_DATA_FILTERS_H__
 #define __STARPU_DATA_FILTERS_H__
 
 
+#include <stdarg.h>
+
 #include <starpu.h>
 #include <starpu.h>
 #include <starpu_config.h>
 #include <starpu_config.h>
 
 

+ 5 - 4
include/starpu_perfmodel.h

@@ -2,6 +2,7 @@
  *
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -104,7 +105,7 @@ struct starpu_perfmodel_t {
 	double (*cost_model)(struct starpu_buffer_descr_t *);
 	double (*cost_model)(struct starpu_buffer_descr_t *);
 
 
 	/* per-architecture model */
 	/* per-architecture model */
-	struct starpu_per_arch_perfmodel_t per_arch[STARPU_NARCH_VARIATIONS];
+	struct starpu_per_arch_perfmodel_t per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS];
 
 
 	/* Name of the performance model, this is used as a file name when saving history-based performance models */
 	/* Name of the performance model, this is used as a file name when saving history-based performance models */
 	const char *symbol;
 	const char *symbol;
@@ -126,12 +127,12 @@ enum starpu_perf_archtype starpu_worker_get_perf_archtype(int workerid);
  * performance model files */
  * performance model files */
 int starpu_load_history_debug(const char *symbol, struct starpu_perfmodel_t *model);
 int starpu_load_history_debug(const char *symbol, struct starpu_perfmodel_t *model);
 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel_t *model,
 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel_t *model,
-		enum starpu_perf_archtype arch, char *path, size_t maxlen);
-void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch,
-		char *archname, size_t maxlen);
+		enum starpu_perf_archtype arch, char *path, size_t maxlen, unsigned nimpl);
+void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch,	char *archname, size_t maxlen, unsigned nimpl);
 int starpu_list_models(void);
 int starpu_list_models(void);
 
 
 void starpu_force_bus_sampling(void);
 void starpu_force_bus_sampling(void);
+void starpu_print_bus_bandwidth(FILE *f);
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }

+ 6 - 4
include/starpu_scheduler.h

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2011  Télécom-SudParis
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -127,7 +128,7 @@ void starpu_worker_set_sched_condition(int workerid, pthread_cond_t *sched_cond,
 #endif
 #endif
 
 
 /* Check if the worker specified by workerid can execute the codelet. */
 /* Check if the worker specified by workerid can execute the codelet. */
-int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task);
+int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
 
 
 /* The scheduling policy may put tasks directly into a worker's local queue so
 /* The scheduling policy may put tasks directly into a worker's local queue so
  * that it is not always necessary to create its own queue when the local queue
  * that it is not always necessary to create its own queue when the local queue
@@ -147,6 +148,7 @@ int starpu_push_local_task(int workerid, struct starpu_task *task, int back);
 /* By convention, the default priority level should be 0 so that we can
 /* By convention, the default priority level should be 0 so that we can
  * statically allocate tasks with a default priority. */
  * statically allocate tasks with a default priority. */
 #define STARPU_DEFAULT_PRIO	0
 #define STARPU_DEFAULT_PRIO	0
+
 int starpu_sched_get_min_priority(void);
 int starpu_sched_get_min_priority(void);
 int starpu_sched_get_max_priority(void);
 int starpu_sched_get_max_priority(void);
 
 
@@ -164,7 +166,7 @@ void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *to
 /* Get the description of a combined worker */
 /* Get the description of a combined worker */
 int starpu_combined_worker_get_description(int workerid, int *worker_size, int **combined_workerid);
 int starpu_combined_worker_get_description(int workerid, int *worker_size, int **combined_workerid);
 /* Variant of starpu_worker_may_execute_task compatible with combined workers */
 /* Variant of starpu_worker_may_execute_task compatible with combined workers */
-int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task);
+int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
 
 
 /*
 /*
  *	Data prefetching
  *	Data prefetching
@@ -182,7 +184,7 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, uint32_t node);
 /* Return the current date */
 /* Return the current date */
 double starpu_timing_now(void);
 double starpu_timing_now(void);
 /* Returns expected task duration in µs */
 /* Returns expected task duration in µs */
-double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch);
+double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 /* Returns an estimated speedup factor relative to CPU speed */
 /* Returns an estimated speedup factor relative to CPU speed */
 double starpu_worker_get_relative_speedup(enum starpu_perf_archtype perf_archtype);
 double starpu_worker_get_relative_speedup(enum starpu_perf_archtype perf_archtype);
 /* Returns expected data transfer time in µs */
 /* Returns expected data transfer time in µs */
@@ -190,7 +192,7 @@ double starpu_task_expected_data_transfer_time(uint32_t memory_node, struct star
 /* Predict the transfer time (in µs) to move a handle to a memory node */
 /* Predict the transfer time (in µs) to move a handle to a memory node */
 double starpu_data_expected_transfer_time(starpu_data_handle handle, unsigned memory_node, starpu_access_mode mode);
 double starpu_data_expected_transfer_time(starpu_data_handle handle, unsigned memory_node, starpu_access_mode mode);
 /* Returns expected power consumption in J */
 /* Returns expected power consumption in J */
-double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch);
+double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 
 
 /* Waits until all the tasks of a worker, already submitted, have been executed */
 /* Waits until all the tasks of a worker, already submitted, have been executed */
 int starpu_wait_for_all_tasks_of_worker(int workerid);
 int starpu_wait_for_all_tasks_of_worker(int workerid);

+ 19 - 1
include/starpu_task.h

@@ -2,6 +2,7 @@
  *
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,7 +24,7 @@
 #include <starpu_config.h>
 #include <starpu_config.h>
 
 
 #if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
 #if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
-#include <cuda.h>
+# include <cuda.h>
 #endif
 #endif
 
 
 #include <starpu_data.h>
 #include <starpu_data.h>
@@ -56,6 +57,18 @@ extern "C" {
 
 
 typedef uint64_t starpu_tag_t;
 typedef uint64_t starpu_tag_t;
 
 
+
+typedef void (*starpu_cpu_func_t)(void **, void*);    /* CPU core */
+typedef void (*starpu_cuda_func_t)(void **, void*);   /* NVIDIA CUDA device */
+typedef void (*starpu_opencl_func_t)(void **, void*); /* OpenCL CUDA device */
+typedef uint8_t starpu_gordon_func_t; /* Cell SPU */
+
+#define STARPU_MULTIPLE_CPU_IMPLEMENTATIONS    (starpu_cpu_func_t) -1
+#define STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS   (starpu_cuda_func_t) -1
+#define STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS (starpu_opencl_func_t) -1
+#define STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS 255
+
+
 /*
 /*
  * A codelet describes the various function 
  * A codelet describes the various function 
  * that may be called from a worker
  * that may be called from a worker
@@ -72,6 +85,11 @@ typedef struct starpu_codelet_t {
 	void (*opencl_func)(void **, void *);
 	void (*opencl_func)(void **, void *);
 	uint8_t gordon_func;
 	uint8_t gordon_func;
 
 
+	starpu_cpu_func_t cpu_funcs[STARPU_MAXIMPLEMENTATIONS];
+	starpu_cuda_func_t cuda_funcs[STARPU_MAXIMPLEMENTATIONS];
+	starpu_opencl_func_t opencl_funcs[STARPU_MAXIMPLEMENTATIONS];
+	starpu_gordon_func_t gordon_funcs[STARPU_MAXIMPLEMENTATIONS];
+
 	/* how many buffers do the codelet takes as argument ? */
 	/* how many buffers do the codelet takes as argument ? */
 	unsigned nbuffers;
 	unsigned nbuffers;
 
 

+ 3 - 2
include/starpu_task_bundle.h

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2011  Télécom-SudParis
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -71,10 +72,10 @@ int starpu_task_bundle_remove(struct starpu_task_bundle *bundle, struct starpu_t
 void starpu_task_bundle_close(struct starpu_task_bundle *bundle);
 void starpu_task_bundle_close(struct starpu_task_bundle *bundle);
 
 
 /* Return the expected duration of the entire task bundle in µs. */
 /* Return the expected duration of the entire task bundle in µs. */
-double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle, enum starpu_perf_archtype arch);
+double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle, enum starpu_perf_archtype arch, unsigned nimpl);
 /* Return the time (in µs) expected to transfer all data used within the bundle */
 /* Return the time (in µs) expected to transfer all data used within the bundle */
 double starpu_task_bundle_expected_data_transfer_time(struct starpu_task_bundle *bundle, unsigned memory_node);
 double starpu_task_bundle_expected_data_transfer_time(struct starpu_task_bundle *bundle, unsigned memory_node);
 /* Return the expected power consumption of the entire task bundle in J. */
 /* Return the expected power consumption of the entire task bundle in J. */
-double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch);
+double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch, unsigned nimpl);
 
 
 #endif // __STARPU_TASK_BUNDLE_H__
 #endif // __STARPU_TASK_BUNDLE_H__

+ 2 - 0
m4/.gitignore

@@ -0,0 +1,2 @@
+/libtool.m4
+/lt*.m4

+ 9 - 1
m4/gcc.m4

@@ -51,8 +51,16 @@ AC_DEFUN([STARPU_GCC_PLUGIN_SUPPORT], [
     dnl Reason:
     dnl Reason:
     dnl   build_call_expr_loc_array -- not in GCC 4.5.x; appears in 4.6
     dnl   build_call_expr_loc_array -- not in GCC 4.5.x; appears in 4.6
     dnl   build_call_expr_loc_vec   -- likewise
     dnl   build_call_expr_loc_vec   -- likewise
+    dnl   build_array_ref           -- present but undeclared in 4.6.1
     _STARPU_WITH_GCC_PLUGIN_API([
     _STARPU_WITH_GCC_PLUGIN_API([
-      AC_CHECK_DECLS([build_call_expr_loc_array, build_call_expr_loc_vec],
+      AC_CHECK_DECLS([build_call_expr_loc_array, build_call_expr_loc_vec,
+                      build_array_ref],
+        [], [], [#include <gcc-plugin.h>
+	         #include <tree.h>])
+
+      dnl Work around header naming issues introduced upstream and in Debian
+      dnl (see <http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=631082>).
+      AC_CHECK_HEADERS([c-common.h c-pragma.h c-family/c-common.h c-family/c-pragma.h],
         [], [], [#include <gcc-plugin.h>
         [], [], [#include <gcc-plugin.h>
 	         #include <tree.h>])
 	         #include <tree.h>])
     ])
     ])

+ 1 - 0
mpi/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 1 - 0
mpi/tests/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 8 - 6
socl/src/Makefile.am

@@ -24,16 +24,18 @@ SUBDIRS =
 lib_LTLIBRARIES = libsocl.la
 lib_LTLIBRARIES = libsocl.la
 
 
 libsocl_la_SOURCES = 						\
 libsocl_la_SOURCES = 						\
-  socl.c \
-  gc.c \
-  graph.c \
+  command.c \
+  command_list.c \
+  command_queue.c \
+  debug.c \
+  devices.c \
   event.c \
   event.c \
+  gc.c \
   init.c \
   init.c \
-  task.c \
-  command_queue.c \
   mem_objects.c \
   mem_objects.c \
+  socl.c \
+  task.c \
   util.c \
   util.c \
-  devices.c \
   cl_getplatformids.c \
   cl_getplatformids.c \
   cl_getplatforminfo.c \
   cl_getplatforminfo.c \
   cl_getdeviceids.c \
   cl_getdeviceids.c \

+ 3 - 4
socl/src/cl_createcommandqueue.c

@@ -30,8 +30,7 @@ static void release_callback_command_queue(void * e) {
   gc_entity_unstore(&cq->context);
   gc_entity_unstore(&cq->context);
 
 
   /* Destruct object */
   /* Destruct object */
-  pthread_spin_destroy(&cq->spin);
-  free(cq->events);
+  pthread_mutex_destroy(&cq->mutex);
 }
 }
 
 
 
 
@@ -68,9 +67,9 @@ soclCreateCommandQueue(cl_context                   context,
       profiling_queue_count += 1;
       profiling_queue_count += 1;
    }
    }
 
 
-   cq->events = NULL;
+   cq->commands = NULL;
    cq->barrier = NULL;
    cq->barrier = NULL;
-   pthread_spin_init(&cq->spin, 0);
+   pthread_mutex_init(&cq->mutex, NULL);
 
 
    if (errcode_ret != NULL)
    if (errcode_ret != NULL)
       *errcode_ret = CL_SUCCESS;
       *errcode_ret = CL_SUCCESS;

+ 11 - 11
socl/src/cl_createkernel.c

@@ -32,23 +32,23 @@ static void soclCreateKernel_task(void *data) {
    }
    }
 
 
    /* One worker creates argument structures */
    /* One worker creates argument structures */
-   if (__sync_bool_compare_and_swap(&k->arg_count, 0, 666)) {
+   if (__sync_bool_compare_and_swap(&k->num_args, 0, 666)) {
       unsigned int i;
       unsigned int i;
-      cl_uint arg_count;
+      cl_uint num_args;
 
 
-      err = clGetKernelInfo(k->cl_kernels[range], CL_KERNEL_NUM_ARGS, sizeof(arg_count), &arg_count, NULL);
+      err = clGetKernelInfo(k->cl_kernels[range], CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL);
       if (err != CL_SUCCESS) {
       if (err != CL_SUCCESS) {
          DEBUG_CL("clGetKernelInfo", err);
          DEBUG_CL("clGetKernelInfo", err);
          ERROR_STOP("Unable to get kernel argument count. Aborting.\n");
          ERROR_STOP("Unable to get kernel argument count. Aborting.\n");
       }
       }
-      k->arg_count = arg_count;
-      DEBUG_MSG("Kernel has %d arguments\n", arg_count);
+      k->num_args = num_args;
+      DEBUG_MSG("Kernel has %d arguments\n", num_args);
 
 
-      k->arg_size = (size_t*)malloc(sizeof(size_t) * arg_count);
-      k->arg_value = (void**)malloc(sizeof(void*) * arg_count);
-      k->arg_type = (enum kernel_arg_type*)malloc(sizeof(enum kernel_arg_type) * arg_count);
+      k->arg_size = (size_t*)malloc(sizeof(size_t) * num_args);
+      k->arg_value = (void**)malloc(sizeof(void*) * num_args);
+      k->arg_type = (enum kernel_arg_type*)malloc(sizeof(enum kernel_arg_type) * num_args);
       /* Settings default type to NULL */
       /* Settings default type to NULL */
-      for (i=0; i<arg_count; i++) {
+      for (i=0; i<num_args; i++) {
          k->arg_value[i] = NULL;
          k->arg_value[i] = NULL;
          k->arg_type[i] = Null;
          k->arg_type[i] = Null;
       }
       }
@@ -70,7 +70,7 @@ static void release_callback_kernel(void * e) {
 
 
   //Free args
   //Free args
   unsigned int j;
   unsigned int j;
-  for (j=0; j<kernel->arg_count; j++) {
+  for (j=0; j<kernel->num_args; j++) {
     switch (kernel->arg_type[j]) {
     switch (kernel->arg_type[j]) {
       case Null:
       case Null:
         break;
         break;
@@ -125,7 +125,7 @@ soclCreateKernel(cl_program    program,
    
    
    gc_entity_store(&k->program, program);
    gc_entity_store(&k->program, program);
    k->kernel_name = strdup(kernel_name);
    k->kernel_name = strdup(kernel_name);
-   k->arg_count = 0;
+   k->num_args = 0;
    k->arg_value = NULL;
    k->arg_value = NULL;
    k->arg_size = NULL;
    k->arg_size = NULL;
 
 

+ 2 - 2
socl/src/cl_createprogramwithsource.c

@@ -40,11 +40,11 @@ static void soclCreateProgramWithSource_task(void *data) {
 }
 }
 
 
 static void rp_task(void *data) {
 static void rp_task(void *data) {
-   struct _cl_program *d = (struct _cl_program*)data;
+   cl_program program = (cl_program)data;
 
 
    int range = starpu_worker_get_range();
    int range = starpu_worker_get_range();
 
 
-   cl_int err = clReleaseProgram(d->cl_programs[range]);
+   cl_int err = clReleaseProgram(program->cl_programs[range]);
    if (err != CL_SUCCESS)
    if (err != CL_SUCCESS)
       DEBUG_CL("clReleaseProgram", err);
       DEBUG_CL("clReleaseProgram", err);
 }
 }

+ 4 - 3
socl/src/cl_enqueuebarrier.c

@@ -19,8 +19,9 @@
 CL_API_ENTRY cl_int CL_API_CALL
 CL_API_ENTRY cl_int CL_API_CALL
 soclEnqueueBarrier(cl_command_queue cq) CL_API_SUFFIX__VERSION_1_0
 soclEnqueueBarrier(cl_command_queue cq) CL_API_SUFFIX__VERSION_1_0
 {
 {
-   cl_event ev = enqueueBarrier(cq);   
-   gc_entity_release(ev);
+	command_marker cmd = command_barrier_create();
 
 
-   return CL_SUCCESS;
+	command_queue_enqueue(cq, cmd, 0, NULL);
+
+	return CL_SUCCESS;
 }
 }

+ 40 - 28
socl/src/cl_enqueuecopybuffer.c

@@ -66,6 +66,42 @@ static starpu_codelet codelet_copybuffer = {
    .nbuffers = 2
    .nbuffers = 2
 };
 };
 
 
+cl_int command_copy_buffer_submit(command_copy_buffer cmd) {
+	/* Aliases */
+	cl_mem src_buffer = cmd->src_buffer;
+	cl_mem dst_buffer = cmd->dst_buffer;
+	size_t src_offset = cmd->src_offset;
+	size_t dst_offset = cmd->dst_offset;
+	size_t cb = cmd->cb;
+
+	struct starpu_task *task;
+	struct arg_copybuffer *arg;
+
+	task = task_create(CL_COMMAND_COPY_BUFFER);
+
+	task->buffers[0].handle = src_buffer->handle;
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = dst_buffer->handle;
+	task->buffers[1].mode = STARPU_RW;
+	task->cl = &codelet_copybuffer;
+
+	arg = (struct arg_copybuffer*)malloc(sizeof(struct arg_copybuffer));
+	arg->src_offset = src_offset;
+	arg->dst_offset = dst_offset;
+	arg->cb = cb;
+	gc_entity_store(&arg->src_buffer, src_buffer);
+	gc_entity_store(&arg->dst_buffer, dst_buffer);
+	task->cl_arg = arg;
+	task->cl_arg_size = sizeof(struct arg_copybuffer);
+
+	dst_buffer->scratch = 0;
+
+	task_submit(task, cmd);
+
+	return CL_SUCCESS;
+}
+
+
 CL_API_ENTRY cl_int CL_API_CALL
 CL_API_ENTRY cl_int CL_API_CALL
 soclEnqueueCopyBuffer(cl_command_queue  cq, 
 soclEnqueueCopyBuffer(cl_command_queue  cq, 
                     cl_mem              src_buffer,
                     cl_mem              src_buffer,
@@ -77,35 +113,11 @@ soclEnqueueCopyBuffer(cl_command_queue  cq,
                     const cl_event *    events,
                     const cl_event *    events,
                     cl_event *          event) CL_API_SUFFIX__VERSION_1_0
                     cl_event *          event) CL_API_SUFFIX__VERSION_1_0
 {
 {
-   struct starpu_task *task;
-   struct arg_copybuffer *arg;
-   cl_event ev;
-
-   task = task_create(CL_COMMAND_COPY_BUFFER);
-   ev = task_event(task);
-
-   task->buffers[0].handle = src_buffer->handle;
-   task->buffers[0].mode = STARPU_R;
-   task->buffers[1].handle = dst_buffer->handle;
-   task->buffers[1].mode = STARPU_RW;
-   task->cl = &codelet_copybuffer;
-
-   arg = (struct arg_copybuffer*)malloc(sizeof(struct arg_copybuffer));
-   arg->src_offset = src_offset;
-   arg->dst_offset = dst_offset;
-   arg->cb = cb;
-   gc_entity_store(&arg->src_buffer, src_buffer);
-   gc_entity_store(&arg->dst_buffer, dst_buffer);
-   task->cl_arg = arg;
-   task->cl_arg_size = sizeof(struct arg_copybuffer);
-
-   dst_buffer->scratch = 0;
-
-   DEBUG_MSG("Submitting CopyBuffer task (event %d)\n", ev->id);
+	command_copy_buffer cmd = command_copy_buffer_create(src_buffer, dst_buffer, src_offset, dst_offset, cb);
 
 
-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events, events);
+	command_queue_enqueue(cq, cmd, num_events, events);
 
 
-   RETURN_EVENT(ev, event);
+	RETURN_EVENT(cmd, event);
 
 
-   return ret;
+	return CL_SUCCESS;
 }
 }

+ 25 - 48
socl/src/cl_enqueuemapbuffer.c

@@ -16,76 +16,53 @@
 
 
 #include "socl.h"
 #include "socl.h"
 
 
-struct mb_data {
-  cl_event ev;
-  cl_mem buffer;
-  cl_map_flags map_flags;
-};
-
 static void mapbuffer_callback(void *args) {
 static void mapbuffer_callback(void *args) {
-  struct mb_data *arg = (struct mb_data*)args;
-
-  starpu_tag_notify_from_apps(arg->ev->id);
-  arg->ev->status = CL_COMPLETE;
+	command_map_buffer cmd = (command_map_buffer)args;
 
 
-  gc_entity_unstore(&arg->ev);
-  gc_entity_unstore(&arg->buffer);
-  free(args);
+	starpu_tag_notify_from_apps(cmd->event->id);
+	cmd->event->status = CL_COMPLETE;
 }
 }
 
 
 static void mapbuffer_task(void *args) {
 static void mapbuffer_task(void *args) {
-  struct mb_data *arg = (struct mb_data*)args;
+	command_map_buffer cmd = (command_map_buffer)args;
+
+	starpu_access_mode mode = (cmd->map_flags == CL_MAP_READ ? STARPU_R : STARPU_RW);
+
+	starpu_data_acquire_cb(cmd->buffer->handle, mode, mapbuffer_callback, cmd);
+}
 
 
-  starpu_access_mode mode = (arg->map_flags == CL_MAP_READ ? STARPU_R : STARPU_RW);
+cl_int command_map_buffer_submit(command_map_buffer cmd) {
+	starpu_task task = task_create_cpu(mapbuffer_task, cmd, 0);
 
 
-  starpu_data_acquire_cb(arg->buffer->handle, mode, mapbuffer_callback, arg);
+	task_submit(task, cmd);
+
+	return CL_SUCCESS;
 }
 }
 
 
 CL_API_ENTRY void * CL_API_CALL
 CL_API_ENTRY void * CL_API_CALL
 soclEnqueueMapBuffer(cl_command_queue cq,
 soclEnqueueMapBuffer(cl_command_queue cq,
                    cl_mem           buffer,
                    cl_mem           buffer,
-                   cl_bool          blocking_map, 
+                   cl_bool          blocking, 
                    cl_map_flags     map_flags,
                    cl_map_flags     map_flags,
                    size_t           offset, 
                    size_t           offset, 
-                   size_t           UNUSED(cb),
+                   size_t           cb,
                    cl_uint          num_events,
                    cl_uint          num_events,
                    const cl_event * events,
                    const cl_event * events,
                    cl_event *       event,
                    cl_event *       event,
                    cl_int *         errcode_ret) CL_API_SUFFIX__VERSION_1_0
                    cl_int *         errcode_ret) CL_API_SUFFIX__VERSION_1_0
 {
 {
-   struct starpu_task *task;
-   struct mb_data *arg;
-   cl_event ev;
-   cl_int err;
-
-   /* Create custom event that will be triggered when map is complete */
-   ev = event_create();
-
-   /* Store arguments */
-   arg = (struct mb_data*)malloc(sizeof(struct mb_data));
-   arg->map_flags = map_flags;
-   gc_entity_store(&arg->ev, ev);
-   gc_entity_store(&arg->buffer, buffer);
-
-   /* Create StarPU task */
-   task = task_create_cpu(CL_COMMAND_MAP_BUFFER, mapbuffer_task, arg, 0);
-   cl_event map_event = task_event(task);
-
-   /* Enqueue task */
-   DEBUG_MSG("Submitting MapBuffer task (event %d)\n", ev->id);
-   err = command_queue_enqueue_fakeevent(cq, task, 0, num_events, events, ev);
-   gc_entity_release(map_event);
+	cl_event ev = event_create();
 
 
-   if (errcode_ret != NULL)
-      *errcode_ret = err;
+	command_map_buffer cmd = command_map_buffer_create(buffer, map_flags, offset, cb, ev);
 
 
-   if (err != CL_SUCCESS)
-      return NULL;
+	command_queue_enqueue(cq, cmd, num_events, events);
 
 
-   if (blocking_map == CL_TRUE)
-      soclWaitForEvents(1, &ev);
+	if (errcode_ret != NULL)
+		*errcode_ret = CL_SUCCESS;
 
 
-   RETURN_EVENT(ev, event);
+	RETURN_CUSTOM_EVENT(ev,event);
 
 
-   return (void*)(starpu_variable_get_local_ptr(buffer->handle) + offset);
+	MAY_BLOCK_CUSTOM(blocking,ev);
+	
+	return (void*)(starpu_variable_get_local_ptr(buffer->handle) + offset);
 }
 }

+ 15 - 5
socl/src/cl_enqueuemarker.c

@@ -20,11 +20,21 @@ CL_API_ENTRY cl_int CL_API_CALL
 soclEnqueueMarker(cl_command_queue  cq,
 soclEnqueueMarker(cl_command_queue  cq,
                 cl_event *          event) CL_API_SUFFIX__VERSION_1_0
                 cl_event *          event) CL_API_SUFFIX__VERSION_1_0
 {
 {
-   if (event == NULL)
-      return CL_INVALID_VALUE;
+	if (event == NULL)
+		return CL_INVALID_VALUE;
+	
+	command_marker cmd = command_marker_create();
 
 
-   starpu_task * task = task_create(CL_COMMAND_MARKER);
-   *event = task_event(task);
+	command_queue_enqueue(cq, cmd, 0, NULL);
 
 
-   return command_queue_enqueue(cq, task, 0, 0, NULL);
+	RETURN_EVENT(cmd, event);
+
+	return CL_SUCCESS;
+}
+
+cl_int command_marker_submit(command_marker cmd) {
+	struct starpu_task *task;
+	task = task_create(CL_COMMAND_MARKER);
+
+	task_submit(task, cmd);
 }
 }

+ 124 - 250
socl/src/cl_enqueuendrangekernel.c

@@ -16,81 +16,64 @@
 
 
 #include "socl.h"
 #include "socl.h"
 
 
-typedef struct running_kernel * running_kernel;
-
-struct running_kernel {
-  cl_kernel kernel;
-  cl_mem *buffers;
-  unsigned int buffer_count;
-  starpu_codelet *codelet;
-  cl_uint work_dim;
-  size_t * global_work_offset;
-  size_t * global_work_size;
-  size_t * local_work_size;
-  /* Arguments */
-  unsigned int arg_count;
-  size_t *arg_size;
-  enum kernel_arg_type  *arg_type;
-  void  **arg_value;
-};
 
 
-static void soclEnqueueNDRangeKernel_task(void *descr[], void *args) {
-   running_kernel d;
+void soclEnqueueNDRangeKernel_task(void *descr[], void *args) {
+	command_ndrange_kernel cmd = (command_ndrange_kernel)args;
+
    cl_command_queue cq;
    cl_command_queue cq;
    int wid;
    int wid;
    cl_int err;
    cl_int err;
 
 
-   d = (running_kernel)args;
    wid = starpu_worker_get_id();
    wid = starpu_worker_get_id();
    starpu_opencl_get_queue(wid, &cq);
    starpu_opencl_get_queue(wid, &cq);
 
 
-   DEBUG_MSG("[worker %d] [kernel %d] Executing kernel...\n", wid, d->kernel->id);
+   DEBUG_MSG("[worker %d] [kernel %d] Executing kernel...\n", wid, cmd->kernel->id);
 
 
    int range = starpu_worker_get_range();
    int range = starpu_worker_get_range();
 
 
    /* Set arguments */
    /* Set arguments */
    {
    {
-      unsigned int i;
-      int buf = 0;
-      for (i=0; i<d->arg_count; i++) {
-         switch (d->arg_type[i]) {
-            case Null:
-               err = clSetKernelArg(d->kernel->cl_kernels[range], i, d->arg_size[i], NULL);
-               break;
-            case Buffer: {
-                  cl_mem mem;  
-                  mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[buf]);
-                  err = clSetKernelArg(d->kernel->cl_kernels[range], i, d->arg_size[i], &mem);
-                  buf++;
-               }
-               break;
-            case Immediate:
-               err = clSetKernelArg(d->kernel->cl_kernels[range], i, d->arg_size[i], d->arg_value[i]);
-               break;
-         }
-         if (err != CL_SUCCESS) {
-            DEBUG_CL("clSetKernelArg", err);
-            DEBUG_ERROR("Aborting\n");
-         }
-      }
+	   unsigned int i;
+	   int buf = 0;
+	   for (i=0; i<cmd->num_args; i++) {
+		   switch (cmd->arg_types[i]) {
+			   case Null:
+				   err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], NULL);
+				   break;
+			   case Buffer: {
+						cl_mem mem;  
+						mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[buf]);
+						err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], &mem);
+						buf++;
+					}
+					break;
+			   case Immediate:
+					err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], cmd->args[i]);
+					break;
+		   }
+		   if (err != CL_SUCCESS) {
+			   DEBUG_CL("clSetKernelArg", err);
+			   DEBUG_ERROR("Aborting\n");
+		   }
+	   }
    }
    }
 
 
    /* Calling Kernel */
    /* Calling Kernel */
    cl_event event;
    cl_event event;
-   err = clEnqueueNDRangeKernel(cq, d->kernel->cl_kernels[range], d->work_dim, d->global_work_offset, d->global_work_size, d->local_work_size, 0, NULL, &event);
+   err = clEnqueueNDRangeKernel(cq, cmd->kernel->cl_kernels[range], cmd->work_dim, cmd->global_work_offset, cmd->global_work_size, cmd->local_work_size, 0, NULL, &event);
 
 
    if (err != CL_SUCCESS) {
    if (err != CL_SUCCESS) {
-      ERROR_MSG("Worker[%d] Unable to Enqueue kernel (error %d)\n", wid, err);
-      DEBUG_CL("clEnqueueNDRangeKernel", err);
-      DEBUG_MSG("Workdim %d, global_work_offset %p, global_work_size %p, local_work_size %p\n",
-                d->work_dim, d->global_work_offset, d->global_work_size, d->local_work_size);
-      DEBUG_MSG("Global work size: %ld %ld %ld\n", d->global_work_size[0],
-            (d->work_dim > 1 ? d->global_work_size[1] : 1), (d->work_dim > 2 ? d->global_work_size[2] : 1)); 
-      if (d->local_work_size != NULL)
-         DEBUG_MSG("Local work size: %ld %ld %ld\n", d->local_work_size[0],
-               (d->work_dim > 1 ? d->local_work_size[1] : 1), (d->work_dim > 2 ? d->local_work_size[2] : 1)); 
-      ERROR_MSG("Aborting.\n");
-      exit(1);
+	   ERROR_MSG("Worker[%d] Unable to Enqueue kernel (error %d)\n", wid, err);
+	   DEBUG_CL("clEnqueueNDRangeKernel", err);
+	   DEBUG_MSG("Workdim %d, global_work_offset %p, global_work_size %p, local_work_size %p\n",
+			   cmd->work_dim, cmd->global_work_offset, cmd->global_work_size, cmd->local_work_size);
+	   DEBUG_MSG("Global work size: %ld %ld %ld\n", cmd->global_work_size[0],
+			   (cmd->work_dim > 1 ? cmd->global_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->global_work_size[2] : 1)); 
+	   if (cmd->local_work_size != NULL)
+		   DEBUG_MSG("Local work size: %ld %ld %ld\n", cmd->local_work_size[0],
+				   (cmd->work_dim > 1 ? cmd->local_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->local_work_size[2] : 1)); 
+	   ERROR_MSG("Aborting.\n");
+	   exit(1);
    }
    }
 
 
    /* Waiting for kernel to terminate */
    /* Waiting for kernel to terminate */
@@ -99,219 +82,110 @@ static void soclEnqueueNDRangeKernel_task(void *descr[], void *args) {
 }
 }
 
 
 static void cleaning_task_callback(void *args) {
 static void cleaning_task_callback(void *args) {
-   running_kernel arg = (running_kernel)args;
+	command_ndrange_kernel cmd = (command_ndrange_kernel)args;
 
 
-   free(arg->arg_size);
-   free(arg->arg_type);
-
-   unsigned int i;
-   for (i=0; i<arg->arg_count; i++) {
-      free(arg->arg_value[i]);
-   }
-   free(arg->arg_value);
+	free(cmd->arg_sizes);
+	free(cmd->arg_types);
 
 
-   for (i=0; i<arg->buffer_count; i++)
-      gc_entity_unstore(&arg->buffers[i]);
+	unsigned int i;
+	for (i=0; i<cmd->num_args; i++) {
+		free(cmd->args[i]);
+	}
+	free(cmd->args);
 
 
-   gc_entity_unstore(&arg->kernel);
+	for (i=0; i<cmd->num_buffers; i++)
+		gc_entity_unstore(&cmd->buffers[i]);
 
 
-   free(arg->buffers);
-   free(arg->global_work_offset);
-   free(arg->global_work_size);
-   free(arg->local_work_size);
-   void * co = arg->codelet;
-   arg->codelet = NULL;
-   free(co);
+	free(cmd->buffers);
+	void * co = cmd->codelet;
+	cmd->codelet = NULL;
+	free(co);
 }
 }
 
 
 static struct starpu_perfmodel_t perf_model = {
 static struct starpu_perfmodel_t perf_model = {
-  .type = STARPU_HISTORY_BASED,
-  .symbol = "perf_model"
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "perf_model"
 };
 };
 
 
 /**
 /**
  * Real kernel enqueuing command
  * Real kernel enqueuing command
  */
  */
-cl_int node_play_enqueue_kernel(node_enqueue_kernel n) {
-
-   struct starpu_task *task;
-   running_kernel arg;
-   starpu_codelet *codelet;
-   cl_event ev;
-   
-   /* Alias struc fields */
-   cl_command_queue cq = n->cq;
-   cl_kernel        kernel = n->kernel;
-   cl_uint          work_dim = n->work_dim;
-   const size_t *   global_work_offset = n->global_work_offset;
-   const size_t *   global_work_size = n->global_work_size;
-   const size_t *   local_work_size = n->local_work_size;
-   cl_uint          num_events = n->num_events;
-   const cl_event * events = n->events;
-   cl_event *       event = n->event;
-   char 	    is_task = n->is_task;
-
-
-   /* Allocate structures */
-
-   /* Codelet */
-   codelet = (starpu_codelet*)malloc(sizeof(starpu_codelet));
-   if (codelet == NULL)
-      return CL_OUT_OF_HOST_MEMORY;
-
-   /* Codelet arguments */
-   arg = (running_kernel)malloc(sizeof(struct running_kernel));
-   if (arg == NULL) {
-      free(codelet);
-      return CL_OUT_OF_HOST_MEMORY;
-   }
-
-   /* StarPU task */
-   task = task_create(is_task ? CL_COMMAND_TASK : CL_COMMAND_NDRANGE_KERNEL);
-   ev = task_event(task);
-
-   /*******************
-    * Initializations *
-    *******************/
-
-   /* ------- *
-    * Codelet *
-    * ------- */
-   codelet->where = STARPU_OPENCL;
-   codelet->power_model = NULL;
-   codelet->opencl_func = &soclEnqueueNDRangeKernel_task;
-   //codelet->model = NULL;
-   codelet->model = &perf_model;
-
-   /* ---------------- *
-    * Codelet argument *
-    * ---------------- */
-   gc_entity_store(&arg->kernel, kernel);
-   arg->work_dim = work_dim;
-   arg->codelet = codelet;
-
-   /* Global work offset */
-   if (global_work_offset != NULL) {
-      arg->global_work_offset = (size_t*)malloc(sizeof(size_t)*work_dim);
-      memcpy(arg->global_work_offset, global_work_offset, work_dim*sizeof(size_t));
-   }
-   else arg->global_work_offset = NULL;
-
-   /* Global work size */
-   arg->global_work_size = (size_t*)malloc(sizeof(size_t)*work_dim);
-   memcpy(arg->global_work_size, global_work_size, work_dim*sizeof(size_t));
-
-   /* Local work size */
-   if (local_work_size != NULL) {
-      arg->local_work_size = (size_t*)malloc(sizeof(size_t)*work_dim);
-      memcpy(arg->local_work_size, local_work_size, work_dim*sizeof(size_t));
-   }
-   else arg->local_work_size = NULL;
-
-   /* ----------- *
-    * StarPU task *
-    * ----------- */
-   task->cl = codelet;
-   task->cl_arg = arg;
-   task->cl_arg_size = sizeof(struct running_kernel);
-
-   /* Convert OpenCL's memory objects to StarPU buffers */
-   codelet->nbuffers = 0;
-   {
-      arg->buffers = malloc(sizeof(cl_mem) * kernel->arg_count);
-      arg->buffer_count = 0;
-
-      unsigned int i;
-      for (i=0; i<kernel->arg_count; i++) {
-         if (kernel->arg_type[i] == Buffer) {
-
-            cl_mem buf = (cl_mem)kernel->arg_value[i];
-
-            /* We save cl_mem references in order to properly release them after kernel termination */
-            gc_entity_store(&arg->buffers[arg->buffer_count], buf);
-            arg->buffer_count += 1;
-
-            codelet->nbuffers++;
-            task->buffers[codelet->nbuffers-1].handle = buf->handle;
-
-            /* Determine best StarPU buffer access mode */
-            int mode;
-            if (buf->mode == CL_MEM_READ_ONLY)
-               mode = STARPU_R;
-            else if (buf->mode == CL_MEM_WRITE_ONLY) {
-               mode = STARPU_W;
-               buf->scratch = 0;
-            }
-            else if (buf->scratch) { //RW but never accessed in RW or W mode
-               mode = STARPU_W;
-               buf->scratch = 0;
-            }
-            else {
-               mode = STARPU_RW;
-               buf->scratch = 0;
-            }
-            task->buffers[codelet->nbuffers-1].mode = mode; 
-         }
-      }
-   }
-
-   /* Copy arguments as kernel args can be modified by the time we launch the kernel */
-   {
-      arg->arg_count = kernel->arg_count;
-      arg->arg_size = malloc(sizeof(size_t) * kernel->arg_count);
-      memcpy(arg->arg_size, kernel->arg_size, sizeof(size_t) * kernel->arg_count);
-      arg->arg_type = malloc(sizeof(enum kernel_arg_type) * kernel->arg_count);
-      memcpy(arg->arg_type, kernel->arg_type, sizeof(enum kernel_arg_type) * kernel->arg_count);
-      arg->arg_value = malloc(sizeof(void*) * kernel->arg_count);
-      unsigned int i;
-      for (i=0; i<kernel->arg_count; i++) {
-         if (kernel->arg_value[i] != NULL) {
-           arg->arg_value[i] = malloc(arg->arg_size[i]);
-           memcpy(arg->arg_value[i], kernel->arg_value[i], arg->arg_size[i]);
-         }
-         else arg->arg_value[i] = NULL;
-      }
-   }
-
-   DEBUG_MSG("Submitting NDRange task (event %d)\n", ev->id);
-
-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events, events);
-
-   /* Enqueue a cleaning task */
-   starpu_task * cleaning_task = task_create_cpu(0, cleaning_task_callback, arg,1);
-   cl_event cleaning_event = task_event(cleaning_task);
-   command_queue_enqueue(cq, cleaning_task, 0, 1, &ev);
+cl_int command_ndrange_kernel_submit(command_ndrange_kernel cmd) {
+
+	starpu_task task = task_create();
+	task->cl = cmd->codelet;
+	task->cl_arg = cmd;
+	task->cl_arg_size = sizeof(cmd);
+
+	starpu_codelet * codelet = cmd->codelet;
+
+	/* We need to detect which parameters are OpenCL's memory objects and
+	 * we retrieve their corresponding StarPU buffers */
+	cmd->num_buffers = 0;
+	cmd->buffers = malloc(sizeof(cl_mem) * cmd->num_args);
+
+	unsigned int i;
+	for (i=0; i<cmd->num_args; i++) {
+		if (cmd->arg_types[i] == Buffer) {
+
+			cl_mem buf = *(cl_mem*)cmd->args[i];
+
+			gc_entity_store(&cmd->buffers[cmd->num_buffers], buf);
+			task->buffers[cmd->num_buffers].handle = buf->handle;
+
+			/* Determine best StarPU buffer access mode */
+			int mode;
+			if (buf->mode == CL_MEM_READ_ONLY)
+				mode = STARPU_R;
+			else if (buf->mode == CL_MEM_WRITE_ONLY) {
+				mode = STARPU_W;
+				buf->scratch = 0;
+			}
+			else if (buf->scratch) { //RW but never accessed in RW or W mode
+				mode = STARPU_W;
+				buf->scratch = 0;
+			}
+			else {
+				mode = STARPU_RW;
+				buf->scratch = 0;
+			}
+			task->buffers[cmd->num_buffers].mode = mode; 
+
+			cmd->num_buffers += 1;
+		}
+	}
+	codelet->nbuffers = cmd->num_buffers;
+
+	task_submit(task, cmd);
+
+	/* Enqueue a cleaning task */
+	//FIXME: execute this in the callback?
+	starpu_task cleaning_task = task_create_cpu(cleaning_task_callback, cmd,1);
+	cl_event ev = command_event_get(cmd);
+	task_depends_on(cleaning_task, 1, &ev);
+	task_submit(cleaning_task, cmd);
 
 
-   gc_entity_release(cleaning_event);
-  
-   RETURN_EVENT(ev, event);
-
-   return ret;
+	return CL_SUCCESS;
 }
 }
 
 
-/**
- * Virtual kernel enqueueing command
- */
+
 CL_API_ENTRY cl_int CL_API_CALL
 CL_API_ENTRY cl_int CL_API_CALL
 soclEnqueueNDRangeKernel(cl_command_queue cq,
 soclEnqueueNDRangeKernel(cl_command_queue cq,
-                       cl_kernel        kernel,
-                       cl_uint          work_dim,
-                       const size_t *   global_work_offset,
-                       const size_t *   global_work_size,
-                       const size_t *   local_work_size,
-                       cl_uint          num_events,
-                       const cl_event * events,
-                       cl_event *       event) CL_API_SUFFIX__VERSION_1_1
+		cl_kernel        kernel,
+		cl_uint          work_dim,
+		const size_t *   global_work_offset,
+		const size_t *   global_work_size,
+		const size_t *   local_work_size,
+		cl_uint          num_events,
+		const cl_event * events,
+		cl_event *       event) CL_API_SUFFIX__VERSION_1_1
 {
 {
-	node_enqueue_kernel n;
+	command_ndrange_kernel cmd = command_ndrange_kernel_create(kernel, work_dim,
+			global_work_offset, global_work_size, local_work_size);
+
+	command_queue_enqueue(cq, cmd, num_events, events);
 
 
-	n = graph_create_enqueue_kernel(0, cq, kernel, work_dim, global_work_offset, global_work_size,
-		local_work_size, num_events, events, event, kernel->arg_count, kernel->arg_size,
-		kernel->arg_type, kernel->arg_value);
-	
-	//FIXME: temporarily, we execute the node directly. In the future, we will postpone this.
-	node_play_enqueue_kernel(n);
+	RETURN_EVENT(cmd, event);
 
 
-	//graph_store(n);
 	return CL_SUCCESS;
 	return CL_SUCCESS;
 }
 }

+ 36 - 24
socl/src/cl_enqueuereadbuffer.c

@@ -66,6 +66,37 @@ static starpu_codelet codelet_readbuffer = {
    .nbuffers = 1
    .nbuffers = 1
 };
 };
 
 
+cl_int command_read_buffer_submit(command_read_buffer cmd) {
+	/* Aliases */
+	cl_mem buffer = cmd->buffer;
+	size_t offset = cmd->offset;
+	size_t cb = cmd->cb;
+	void * ptr = cmd->ptr;
+
+	struct starpu_task *task;
+	struct arg_readbuffer *arg;
+
+	task = task_create(CL_COMMAND_READ_BUFFER);
+
+	task->buffers[0].handle = buffer->handle;
+	task->buffers[0].mode = STARPU_R;
+	task->cl = &codelet_readbuffer;
+
+	arg = (struct arg_readbuffer*)malloc(sizeof(struct arg_readbuffer));
+	arg->offset = offset;
+	arg->cb = cb;
+	arg->ptr = ptr;
+	task->cl_arg = arg;
+	task->cl_arg_size = sizeof(struct arg_readbuffer);
+
+	gc_entity_store(&arg->buffer, buffer);
+
+	task_submit(task, cmd);
+
+	return CL_SUCCESS;
+}
+
+
 CL_API_ENTRY cl_int CL_API_CALL
 CL_API_ENTRY cl_int CL_API_CALL
 soclEnqueueReadBuffer(cl_command_queue  cq,
 soclEnqueueReadBuffer(cl_command_queue  cq,
                     cl_mem              buffer,
                     cl_mem              buffer,
@@ -77,33 +108,14 @@ soclEnqueueReadBuffer(cl_command_queue  cq,
                     const cl_event *    events,
                     const cl_event *    events,
                     cl_event *          event) CL_API_SUFFIX__VERSION_1_0
                     cl_event *          event) CL_API_SUFFIX__VERSION_1_0
 { 
 { 
-   struct starpu_task *task;
-   struct arg_readbuffer *arg;
-   cl_event ev;
-
-   task = task_create(CL_COMMAND_READ_BUFFER);
-   ev = task_event(task);
-
-   task->buffers[0].handle = buffer->handle;
-   task->buffers[0].mode = STARPU_R;
-   task->cl = &codelet_readbuffer;
-
-   arg = (struct arg_readbuffer*)malloc(sizeof(struct arg_readbuffer));
-   arg->offset = offset;
-   arg->cb = cb;
-   arg->ptr = ptr;
-   task->cl_arg = arg;
-   task->cl_arg_size = sizeof(struct arg_readbuffer);
-
-   gc_entity_store(&arg->buffer, buffer);
 
 
-   task->synchronous = (blocking == CL_TRUE);
+	command_read_buffer cmd = command_read_buffer_create(buffer, offset, cb, ptr);
 
 
-   DEBUG_MSG("Submitting EnqueueRWBuffer task (event %d)\n", ev->id);
+	command_queue_enqueue(cq, cmd, num_events, events);
 
 
-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events, events);
+	RETURN_EVENT(cmd, event);
 
 
-   RETURN_EVENT(ev, event);
+	MAY_BLOCK(blocking);
 
 
-   return ret;
+	return CL_SUCCESS;
 }
 }

+ 4 - 16
socl/src/cl_enqueuetask.c

@@ -16,14 +16,6 @@
 
 
 #include "socl.h"
 #include "socl.h"
 
 
-static cl_uint work_dim = 3;
-static const size_t global_work_offset[3] = {0,0,0};
-static const size_t global_work_size[3] = {1,1,1};
-static const size_t * local_work_size = NULL;
-
-CL_API_ENTRY cl_int CL_API_CALL
-soclEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) CL_API_SUFFIX__VERSION_1_0;
-
 CL_API_ENTRY cl_int CL_API_CALL
 CL_API_ENTRY cl_int CL_API_CALL
 soclEnqueueTask(cl_command_queue cq,
 soclEnqueueTask(cl_command_queue cq,
               cl_kernel         kernel,
               cl_kernel         kernel,
@@ -31,15 +23,11 @@ soclEnqueueTask(cl_command_queue cq,
               const cl_event *  events,
               const cl_event *  events,
               cl_event *        event) CL_API_SUFFIX__VERSION_1_0
               cl_event *        event) CL_API_SUFFIX__VERSION_1_0
 {
 {
-	node_enqueue_kernel n;
-
-	n = graph_create_enqueue_kernel(1, cq, kernel, work_dim, global_work_offset, global_work_size,
-		local_work_size, num_events, events, event, kernel->arg_count, kernel->arg_size,
-		kernel->arg_type, kernel->arg_value);
+	command_ndrange_kernel cmd = command_task_create(kernel);
 	
 	
-	//FIXME: temporarily, we execute the node directly. In the future, we will postpone this.
-	node_play_enqueue_kernel(n);
+	command_queue_enqueue(cq, cmd, num_events, events);
+
+	RETURN_EVENT(cmd, event);
 
 
-	//graph_store(n);
 	return CL_SUCCESS;
 	return CL_SUCCESS;
 }
 }

+ 18 - 14
socl/src/cl_enqueueunmapmemobject.c

@@ -16,27 +16,31 @@
 
 
 #include "socl.h"
 #include "socl.h"
 
 
+cl_int command_unmap_mem_object_submit(command_unmap_mem_object cmd) {
+	/* Aliases */
+	cl_mem buffer = cmd->buffer;
+
+	//FIXME: use a callback
+	starpu_task task = task_create_cpu((void(*)(void*))starpu_data_release, buffer->handle, 0);
+
+	task_submit(task, cmd);
+
+	return CL_SUCCESS;
+}
+
 CL_API_ENTRY cl_int CL_API_CALL
 CL_API_ENTRY cl_int CL_API_CALL
 soclEnqueueUnmapMemObject(cl_command_queue cq,
 soclEnqueueUnmapMemObject(cl_command_queue cq,
-                        cl_mem            memobj,
-                        void *            UNUSED(mapped_ptr),
+                        cl_mem            buffer,
+                        void *            ptr,
                         cl_uint           num_events,
                         cl_uint           num_events,
                         const cl_event *  events,
                         const cl_event *  events,
                         cl_event *        event) CL_API_SUFFIX__VERSION_1_0
                         cl_event *        event) CL_API_SUFFIX__VERSION_1_0
 {
 {
-   struct starpu_task *task;
-   cl_int err;
-   cl_event ev;
-
-   /* Create StarPU task */
-   task = task_create_cpu(CL_COMMAND_UNMAP_MEM_OBJECT, (void(*)(void*))starpu_data_release, memobj->handle, 0);
-   ev = task_event(task);
-
-   DEBUG_MSG("Submitting UnmapBuffer task (event %d)\n", task->tag_id);
+	command_unmap_mem_object cmd = command_unmap_mem_object_create(buffer, ptr);
 
 
-   err = command_queue_enqueue(cq, task, 0, num_events, events);
+	command_queue_enqueue(cq, cmd, num_events, events);
 
 
-   RETURN_EVENT(ev, event);
+	RETURN_EVENT(cmd, event);
 
 
-   return err;
+	return CL_SUCCESS;
 }
 }

+ 3 - 4
socl/src/cl_enqueuewaitforevents.c

@@ -22,10 +22,9 @@ soclEnqueueWaitForEvents(cl_command_queue cq,
                        const cl_event * events) CL_API_SUFFIX__VERSION_1_0
                        const cl_event * events) CL_API_SUFFIX__VERSION_1_0
 {
 {
 
 
-   //CL_COMMAND_MARKER has been chosen as CL_COMMAND_WAIT_FOR_EVENTS doesn't exist
-   starpu_task * task = task_create(CL_COMMAND_MARKER);
+	command_marker cmd = command_marker_create();
 
 
-   command_queue_enqueue(cq, task, 0, num_events, events);
+	command_queue_enqueue(cq, cmd, num_events, events);
 
 
-   return CL_SUCCESS;
+	return CL_SUCCESS;
 }
 }

+ 42 - 33
socl/src/cl_enqueuewritebuffer.c

@@ -68,6 +68,43 @@ static starpu_codelet codelet_writebuffer = {
    .nbuffers = 1
    .nbuffers = 1
 };
 };
 
 
+cl_int command_write_buffer_submit(command_write_buffer cmd) {
+	/* Aliases */
+	cl_mem buffer = cmd->buffer;
+	size_t offset = cmd->offset;
+	size_t cb = cmd->cb;
+	const void * ptr = cmd->ptr;
+
+	struct starpu_task *task;
+	struct arg_writebuffer *arg;
+
+	task = task_create(CL_COMMAND_WRITE_BUFFER);
+
+	task->buffers[0].handle = buffer->handle;
+	//If only a subpart of the buffer is written, RW access mode is required
+	if (cb != buffer->size)
+		task->buffers[0].mode = STARPU_RW;
+	else 
+		task->buffers[0].mode = STARPU_W;
+	task->cl = &codelet_writebuffer;
+
+	arg = (struct arg_writebuffer*)malloc(sizeof(struct arg_writebuffer));
+	arg->offset = offset;
+	arg->cb = cb;
+	arg->ptr = ptr;
+	task->cl_arg = arg;
+	task->cl_arg_size = sizeof(struct arg_writebuffer);
+
+	gc_entity_store(&arg->buffer, buffer);
+
+	//The buffer now contains meaningful data
+	arg->buffer->scratch = 0;
+
+	task_submit(task, cmd);
+
+	return CL_SUCCESS;
+}
+
 CL_API_ENTRY cl_int CL_API_CALL
 CL_API_ENTRY cl_int CL_API_CALL
 soclEnqueueWriteBuffer(cl_command_queue cq, 
 soclEnqueueWriteBuffer(cl_command_queue cq, 
                      cl_mem             buffer, 
                      cl_mem             buffer, 
@@ -79,41 +116,13 @@ soclEnqueueWriteBuffer(cl_command_queue cq,
                      const cl_event *   events, 
                      const cl_event *   events, 
                      cl_event *         event) CL_API_SUFFIX__VERSION_1_0
                      cl_event *         event) CL_API_SUFFIX__VERSION_1_0
 { 
 { 
-   struct starpu_task *task;
-   struct arg_writebuffer *arg;
-   cl_event ev;
-
-   task = task_create(CL_COMMAND_WRITE_BUFFER);
-   ev = task_event(task);
-
-   task->buffers[0].handle = buffer->handle;
-   //If only a subpart of the buffer is written, RW access mode is required
-   if (cb != buffer->size)
-      task->buffers[0].mode = STARPU_RW;
-   else 
-      task->buffers[0].mode = STARPU_W;
-   task->cl = &codelet_writebuffer;
-
-   arg = (struct arg_writebuffer*)malloc(sizeof(struct arg_writebuffer));
-   arg->offset = offset;
-   arg->cb = cb;
-   arg->ptr = ptr;
-   task->cl_arg = arg;
-   task->cl_arg_size = sizeof(struct arg_writebuffer);
-
-   gc_entity_store(&arg->buffer, buffer);
-
-   //The buffer now contains meaningful data
-   arg->buffer->scratch = 0;
-
-   task->synchronous = (blocking == CL_TRUE);
+	command_write_buffer cmd = command_write_buffer_create(buffer, offset, cb, ptr);
 
 
-   DEBUG_MSG("Submitting EnqueueRWBuffer task (event %d)\n", ev->id);
+	command_queue_enqueue(cq, cmd, num_events, events);
 
 
-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events,events);
+	RETURN_EVENT(cmd, event);
 
 
-   /* Return retained event if required by user */
-   RETURN_EVENT(ev,event);
+	MAY_BLOCK(blocking);
 
 
-   return ret;
+	return CL_SUCCESS;
 }
 }

+ 9 - 6
socl/src/cl_finish.c

@@ -17,11 +17,14 @@
 #include "socl.h"
 #include "socl.h"
 
 
 CL_API_ENTRY cl_int CL_API_CALL
 CL_API_ENTRY cl_int CL_API_CALL
-soclFinish(cl_command_queue cq) CL_API_SUFFIX__VERSION_1_0
-{
-   cl_event ev = enqueueBarrier(cq);
-   soclWaitForEvents(1, &ev);
-   gc_entity_release(ev);
+soclFinish(cl_command_queue cq) CL_API_SUFFIX__VERSION_1_0 {
 
 
-   return CL_SUCCESS;
+	command_marker cmd = command_barrier_create();
+
+	command_queue_enqueue(cq, cmd, 0, NULL);
+		cl_event ev = command_event_get(cmd);
+
+	MAY_BLOCK(CL_TRUE)
+
+	return CL_SUCCESS;
 }
 }

+ 2 - 1
socl/src/cl_geteventinfo.c

@@ -17,6 +17,7 @@
 #include "socl.h"
 #include "socl.h"
 #include "getinfo.h"
 #include "getinfo.h"
 
 
+
 CL_API_ENTRY cl_int CL_API_CALL
 CL_API_ENTRY cl_int CL_API_CALL
 soclGetEventInfo(cl_event       event,
 soclGetEventInfo(cl_event       event,
                cl_event_info    param_name,
                cl_event_info    param_name,
@@ -33,7 +34,7 @@ soclGetEventInfo(cl_event       event,
 
 
    switch (param_name) {
    switch (param_name) {
       INFO_CASE(CL_EVENT_COMMAND_QUEUE, event->cq);
       INFO_CASE(CL_EVENT_COMMAND_QUEUE, event->cq);
-      INFO_CASE(CL_EVENT_COMMAND_TYPE, event->type);
+      INFO_CASE(CL_EVENT_COMMAND_TYPE, event->command->typ);
       INFO_CASE(CL_EVENT_COMMAND_EXECUTION_STATUS, event->status);
       INFO_CASE(CL_EVENT_COMMAND_EXECUTION_STATUS, event->status);
       INFO_CASE(CL_EVENT_REFERENCE_COUNT, event->_entity.refs);
       INFO_CASE(CL_EVENT_REFERENCE_COUNT, event->_entity.refs);
       default:
       default:

+ 1 - 1
socl/src/cl_getkernelinfo.c

@@ -29,7 +29,7 @@ soclGetKernelInfo(cl_kernel       kernel,
 
 
    switch (param_name) {
    switch (param_name) {
       INFO_CASE_EX(CL_KERNEL_FUNCTION_NAME, kernel->kernel_name, strlen(kernel->kernel_name)+1)
       INFO_CASE_EX(CL_KERNEL_FUNCTION_NAME, kernel->kernel_name, strlen(kernel->kernel_name)+1)
-      INFO_CASE(CL_KERNEL_NUM_ARGS, kernel->arg_count)
+      INFO_CASE(CL_KERNEL_NUM_ARGS, kernel->num_args)
       INFO_CASE(CL_KERNEL_REFERENCE_COUNT, kernel->_entity.refs)
       INFO_CASE(CL_KERNEL_REFERENCE_COUNT, kernel->_entity.refs)
       INFO_CASE(CL_KERNEL_PROGRAM, kernel->program)
       INFO_CASE(CL_KERNEL_PROGRAM, kernel->program)
       INFO_CASE(CL_KERNEL_CONTEXT, kernel->program->context)
       INFO_CASE(CL_KERNEL_CONTEXT, kernel->program->context)

+ 5 - 3
socl/src/cl_setkernelarg.c

@@ -25,7 +25,7 @@ soclSetKernelArg(cl_kernel  kernel,
    if (kernel == NULL)
    if (kernel == NULL)
       return CL_INVALID_KERNEL;
       return CL_INVALID_KERNEL;
 
 
-   if (arg_index >= kernel->arg_count)
+   if (arg_index >= kernel->num_args)
       return CL_INVALID_ARG_INDEX;
       return CL_INVALID_ARG_INDEX;
 
 
    //FIXME: we don't return CL_INVALID_ARG_VALUE if "arg_value is NULL for an argument that is not declared with __local qualifier or vice-versa"
    //FIXME: we don't return CL_INVALID_ARG_VALUE if "arg_value is NULL for an argument that is not declared with __local qualifier or vice-versa"
@@ -38,7 +38,8 @@ soclSetKernelArg(cl_kernel  kernel,
          break;
          break;
       case Buffer:
       case Buffer:
          kernel->arg_type[arg_index] = Null;
          kernel->arg_type[arg_index] = Null;
-         gc_entity_unstore((cl_mem*)&kernel->arg_value[arg_index]);
+         gc_entity_unstore((cl_mem*)kernel->arg_value[arg_index]);
+	 free(kernel->arg_value[arg_index]);
          kernel->arg_value[arg_index] = NULL;
          kernel->arg_value[arg_index] = NULL;
          break;
          break;
       case Immediate:
       case Immediate:
@@ -60,7 +61,8 @@ soclSetKernelArg(cl_kernel  kernel,
       if ((arg_size == sizeof(cl_mem)) && ((buf = mem_object_fetch(arg_value)) != NULL)) {
       if ((arg_size == sizeof(cl_mem)) && ((buf = mem_object_fetch(arg_value)) != NULL)) {
          DEBUG_MSG("Found buffer %d \n", buf->id);
          DEBUG_MSG("Found buffer %d \n", buf->id);
          kernel->arg_type[arg_index] = Buffer;
          kernel->arg_type[arg_index] = Buffer;
-         gc_entity_store(&kernel->arg_value[arg_index], buf);
+         kernel->arg_value[arg_index] = malloc(sizeof(void*));
+	 gc_entity_store((cl_mem*)kernel->arg_value[arg_index], buf);
       }
       }
       else {
       else {
          /* Argument must be an immediate buffer  */
          /* Argument must be an immediate buffer  */

+ 6 - 1
socl/src/cl_waitforevents.c

@@ -23,7 +23,12 @@ soclWaitForEvents(cl_uint           num_events,
    unsigned int i;
    unsigned int i;
    DEBUG_MSG("Waiting for events: ");
    DEBUG_MSG("Waiting for events: ");
    for (i=0; i<num_events; i++) {
    for (i=0; i<num_events; i++) {
-      DEBUG_MSG_NOHEAD("%d ", event_list[i]->id);
+   	command_graph_dump(event_list[i]->command);
+
+   	/* We need to submit commands if it's not already done */
+	command_submit_deep(event_list[i]->command);
+
+      	DEBUG_MSG_NOHEAD("%d ", event_list[i]->id);
    }
    }
    DEBUG_MSG_NOHEAD("\n");
    DEBUG_MSG_NOHEAD("\n");
 
 

+ 235 - 0
socl/src/command.c

@@ -0,0 +1,235 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010,2011 University of Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "socl.h"
+
+void command_init_ex(cl_command cmd, cl_command_type typ) {
+	cmd->typ = typ;
+	cmd->num_events = 0;
+	cmd->events = NULL;
+	cmd->event = event_create();
+	cmd->event->command = cmd;
+	cmd->cq = NULL;
+	cmd->task = NULL;
+	cmd->submitted = 0;
+}
+
+
+void command_submit_ex(cl_command cmd) {
+#define SUBMIT(typ,name) case typ:\
+	name##_submit((name)cmd);\
+	break;
+
+	assert(cmd->submitted == 0);
+
+	switch(cmd->typ) {
+		SUBMIT(CL_COMMAND_NDRANGE_KERNEL, command_ndrange_kernel)
+		SUBMIT(CL_COMMAND_TASK, command_ndrange_kernel)
+		SUBMIT(CL_COMMAND_READ_BUFFER, command_read_buffer)
+		SUBMIT(CL_COMMAND_WRITE_BUFFER, command_write_buffer)
+		SUBMIT(CL_COMMAND_COPY_BUFFER, command_copy_buffer)
+		SUBMIT(CL_COMMAND_MAP_BUFFER, command_map_buffer)
+		SUBMIT(CL_COMMAND_UNMAP_MEM_OBJECT, command_unmap_mem_object)
+		SUBMIT(CL_COMMAND_MARKER, command_marker)
+		default:
+			ERROR_STOP("Trying to submit unknown command (type %x)", cmd->typ);
+	}
+
+	cmd->submitted = 1;
+#undef SUBMIT
+}
+
+cl_int command_submit_deep_ex(cl_command cmd) {
+	if (cmd->submitted == 1)
+		return CL_SUCCESS;
+	
+	/* We set this in order to avoid cyclic dependencies */
+	cmd->submitted = 1;
+
+	unsigned int i;
+	for (i=0; i<cmd->num_events; i++)
+		command_submit_deep(cmd->events[i]->command);
+	
+	cmd->submitted = 0;
+
+	command_submit_ex(cmd);
+
+	return CL_SUCCESS;
+}
+
+void command_graph_dump_ex(cl_command cmd) {
+
+	unsigned int i;
+	for (i=0; i<cmd->num_events; i++)
+		command_graph_dump_ex(cmd->events[i]->command);
+
+	printf("CMD %lx TYPE %d DEPS", cmd, cmd->typ);
+	for (i=0; i<cmd->num_events; i++)
+		printf(" %lx", cmd->events[i]->command);
+	printf("\n");
+
+}
+
+#define nullOrDup(name,size) cmd->name = memdup_safe(name,size)
+#define dup(name) cmd->name = name
+#define dupEntity(name) do { cmd->name = name; gc_entity_retain(name); } while (0);
+
+void soclEnqueueNDRangeKernel_task(void *descr[], void *args);
+
+command_ndrange_kernel command_ndrange_kernel_create (
+		cl_kernel        kernel,
+		cl_uint          work_dim,
+		const size_t *   global_work_offset,
+		const size_t *   global_work_size,
+		const size_t *   local_work_size)
+{
+	command_ndrange_kernel cmd = malloc(sizeof(struct command_ndrange_kernel_t));
+	command_init(cmd, CL_COMMAND_NDRANGE_KERNEL);
+
+	dupEntity(kernel);
+	dup(work_dim);
+	nullOrDup(global_work_offset, work_dim*sizeof(size_t));
+	nullOrDup(global_work_size, work_dim*sizeof(size_t));
+	nullOrDup(local_work_size, work_dim*sizeof(size_t));
+
+   	/* Codelet */
+   	cmd->codelet = (starpu_codelet*)malloc(sizeof(starpu_codelet));
+	starpu_codelet * codelet = cmd->codelet;
+	codelet->where = STARPU_OPENCL;
+	codelet->power_model = NULL;
+	codelet->opencl_func = &soclEnqueueNDRangeKernel_task;
+	codelet->model = NULL;
+
+   	/* Kernel is mutable, so we duplicate its parameters... */
+	cmd->num_args = kernel->num_args;
+	cmd->arg_sizes = memdup(kernel->arg_size, sizeof(size_t) * kernel->num_args);
+	cmd->arg_types = memdup(kernel->arg_type, sizeof(enum kernel_arg_type) * kernel->num_args);
+	cmd->args = memdup_deep_varsize_safe(kernel->arg_value, kernel->num_args, kernel->arg_size);
+
+	return cmd;
+}
+
+command_ndrange_kernel command_task_create (cl_kernel kernel) {
+
+	static cl_uint task_work_dim = 3;
+	static const size_t task_global_work_offset[3] = {0,0,0};
+	static const size_t task_global_work_size[3] = {1,1,1};
+	static const size_t * task_local_work_size = NULL;
+
+	command_ndrange_kernel cmd = command_ndrange_kernel_create(
+			kernel, task_work_dim, task_global_work_offset,
+			task_global_work_size, task_local_work_size);
+
+	/* This is the only difference with command_ndrange_kernel_create */
+	cmd->_command.typ = CL_COMMAND_TASK;
+
+	return cmd;
+}
+
+command_marker command_barrier_create () {
+
+	command_marker cmd = malloc(sizeof(struct command_marker_t));
+	command_init(cmd, CL_COMMAND_BARRIER);
+
+	return cmd;
+}
+
+command_marker command_marker_create () {
+
+	command_marker cmd = malloc(sizeof(struct command_marker_t));
+	command_init(cmd, CL_COMMAND_MARKER);
+
+	return cmd;
+}
+
+command_map_buffer command_map_buffer_create(
+		cl_mem buffer,
+		cl_map_flags map_flags,
+		size_t offset,
+		size_t cb,
+		cl_event event
+		) {
+
+	command_map_buffer cmd = malloc(sizeof(struct command_map_buffer_t));
+	command_init(cmd, CL_COMMAND_MAP_BUFFER);
+
+	dupEntity(buffer);
+	dup(map_flags);
+	dup(offset);
+	dup(cb);
+	dupEntity(event);
+
+	return cmd;
+}
+
+command_unmap_mem_object command_unmap_mem_object_create(cl_mem buffer, void * ptr) {
+	command_unmap_mem_object cmd = malloc(sizeof(struct command_unmap_mem_object_t));
+	command_init(cmd, CL_COMMAND_UNMAP_MEM_OBJECT);
+
+	dupEntity(buffer);
+	dup(ptr);
+
+	return cmd;
+}
+
+command_read_buffer command_read_buffer_create(cl_mem buffer, size_t offset, size_t cb, void * ptr) {
+
+	command_read_buffer cmd = malloc(sizeof(struct command_read_buffer_t));
+	command_init(cmd, CL_COMMAND_READ_BUFFER);
+
+	dupEntity(buffer);
+	dup(offset);
+	dup(cb);
+	dup(ptr);
+
+	return cmd;
+}
+
+command_write_buffer command_write_buffer_create(cl_mem buffer, size_t offset, size_t cb, const void * ptr) {
+
+	command_write_buffer cmd = malloc(sizeof(struct command_write_buffer_t));
+	command_init(cmd, CL_COMMAND_WRITE_BUFFER);
+
+	dupEntity(buffer);
+	dup(offset);
+	dup(cb);
+	dup(ptr);
+
+	return cmd;
+}
+
+command_copy_buffer command_copy_buffer_create( cl_mem src_buffer, cl_mem dst_buffer,
+		size_t src_offset, size_t dst_offset, size_t cb)
+{
+	command_copy_buffer cmd = malloc(sizeof(struct command_copy_buffer_t));
+	command_init(cmd, CL_COMMAND_COPY_BUFFER);
+
+	dupEntity(src_buffer);
+	dupEntity(dst_buffer);
+	dup(src_offset);
+	dup(dst_offset);
+	dup(cb);
+
+	return cmd;
+}
+
+#undef nullOrDup
+#undef nodeNullOrDup
+#undef dup
+#undef dupEntity
+#undef nodeDup
+#undef memdup
+

+ 198 - 0
socl/src/command.h

@@ -0,0 +1,198 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010,2011 University of Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef SOCL_COMMANDS_H
+#define SOCL_COMMANDS_H
+
+typedef struct cl_command_t * cl_command;
+
+/**
+ * Initialize a command structure
+ *
+ * Command constructors for each kind of command use this method
+ * Implicit and explicit dependencies must be passed as parameters
+ */
+void command_init_ex(cl_command cmd, cl_command_type typ);
+#define command_init(cmd,typ) \
+	command_init_ex((cl_command)cmd,typ)
+
+/** Submit a command for execution */
+void command_submit_ex(cl_command cmd);
+#define command_submit(cmd) \
+	command_submit_ex(&(cmd)->_command)
+
+/** Submit a command and its dependencies */
+cl_int command_submit_deep_ex(cl_command cmd);
+#define command_submit_deep(cmd) (command_submit_deep_ex((cl_command)cmd))
+
+void command_graph_dump_ex(cl_command cmd);
+#define command_graph_dump(cmd) (command_graph_dump_ex((cl_command)cmd))
+
+/**************************
+ * OpenCL Commands
+ **************************/
+struct cl_command_t {
+	cl_command_type	typ;	 	/* Command type */
+	cl_uint 	num_events;	/* Number of dependencies */
+	cl_event * 	events;		/* Dependencies */
+	cl_event  	event;		/* Event for this command */
+	cl_command_queue cq;		/* Command queue the command is enqueued in */
+	starpu_task	task;		/* Associated StarPU task, if any */
+	char		submitted;	/* True if the command has been submitted to StarPU */
+};
+
+#define command_type_get(cmd) (((cl_command)cmd)->typ)
+#define command_event_get(cmd) (((cl_command)cmd)->event)
+#define command_num_events_get(cmd) (((cl_command)cmd)->num_events)
+#define command_events_get(cmd) (((cl_command)cmd)->events)
+#define command_task_get(cmd) (((cl_command)cmd)->task)
+#define command_cq_get(cmd) (((cl_command)cmd)->cq)
+
+#define CL_COMMAND struct cl_command_t _command;
+
+typedef struct command_ndrange_kernel_t {
+	CL_COMMAND
+
+	cl_kernel        kernel;
+	cl_uint          work_dim;
+	const size_t *   global_work_offset;
+	const size_t *   global_work_size;
+	const size_t *   local_work_size;
+	cl_uint 	 num_args;
+	size_t *	 arg_sizes;
+	enum kernel_arg_type * arg_types;
+	void **		 args;
+	starpu_codelet * codelet;
+	cl_uint		 num_buffers;
+	cl_mem *	 buffers;
+} * command_ndrange_kernel;
+
+
+typedef struct command_read_buffer_t {
+	CL_COMMAND
+	
+	cl_mem buffer;
+	size_t offset;
+	size_t cb;
+	void * ptr;
+} * command_read_buffer;
+
+
+typedef struct command_write_buffer_t {
+	CL_COMMAND
+
+	cl_mem buffer;
+	size_t offset;
+	size_t cb;
+	const void * ptr;
+} * command_write_buffer;
+
+
+typedef struct command_copy_buffer_t {
+	CL_COMMAND
+	
+	cl_mem src_buffer;
+	cl_mem dst_buffer;
+	size_t src_offset;
+	size_t dst_offset;
+	size_t cb;
+} * command_copy_buffer;
+
+
+typedef struct command_map_buffer_t {
+	CL_COMMAND
+
+	cl_mem buffer;
+	cl_map_flags map_flags;
+	size_t offset;
+	size_t cb;
+	cl_event event;
+} * command_map_buffer;
+
+
+typedef struct command_unmap_mem_object_t {
+	CL_COMMAND
+
+	cl_mem buffer;
+	void * ptr;
+} * command_unmap_mem_object;
+
+
+typedef struct command_marker_t {
+	CL_COMMAND
+} * command_marker;
+
+/*************************
+ * Constructor functions
+ *************************/
+
+command_ndrange_kernel command_ndrange_kernel_create (
+		cl_kernel        kernel,
+		cl_uint          work_dim,
+		const size_t *   global_work_offset,
+		const size_t *   global_work_size,
+		const size_t *   local_work_size);
+
+command_ndrange_kernel command_task_create (cl_kernel kernel);
+
+command_marker command_barrier_create ();
+
+command_marker command_marker_create ();
+
+command_map_buffer command_map_buffer_create(
+		cl_mem buffer,
+		cl_map_flags map_flags,
+		size_t offset,
+		size_t cb,
+		cl_event event);
+
+command_unmap_mem_object command_unmap_mem_object_create(
+		cl_mem buffer,
+		void * ptr);
+
+command_read_buffer command_read_buffer_create(
+		cl_mem buffer,
+		size_t offset,
+		size_t cb,
+		void * ptr);
+
+command_write_buffer command_write_buffer_create(
+		cl_mem buffer,
+		size_t offset,
+		size_t cb,
+		const void * ptr);
+
+command_copy_buffer command_copy_buffer_create(
+		cl_mem src_buffer,
+		cl_mem dst_buffer,
+		size_t src_offset,
+		size_t dst_offset,
+		size_t cb);
+
+/*************************
+ * Submit functions
+ *************************/
+cl_int command_ndrange_kernel_submit(command_ndrange_kernel cmd);
+cl_int command_read_buffer_submit(command_read_buffer cmd);
+cl_int command_write_buffer_submit(command_write_buffer cmd);
+cl_int command_copy_buffer_submit(command_copy_buffer cmd);
+cl_int command_map_buffer_submit(command_map_buffer cmd);
+cl_int command_unmap_mem_object_submit(command_unmap_mem_object cmd);
+cl_int command_marker_submit(command_marker cmd);
+
+
+#endif /* SOCL_COMMANDS_H */
+

+ 40 - 0
socl/src/command_list.c

@@ -0,0 +1,40 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010,2011 University of Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "socl.h"
+
+command_list command_list_cons(cl_command cmd, command_list ls) {
+	command_list e = malloc(sizeof(struct command_list_t));
+	e->cmd = cmd;
+	e->next = ls;
+	if (ls != NULL)
+		ls->prev = e;
+	return e;
+}
+
+command_list command_list_remove(command_list l, cl_command cmd) {
+	command_list e = l;
+	while (e != NULL) {
+		if (e->cmd == cmd) {
+			if (e->prev != NULL) e->prev->next = e->next;
+			if (e->next != NULL) e->next->prev = e->prev;
+			command_list next = e->next;
+			free(e);
+			if (e == l) return next;
+		}
+	}
+	return l;
+}

+ 28 - 0
socl/src/command_list.h

@@ -0,0 +1,28 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010,2011 University of Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "socl.h"
+
+typedef struct command_list_t * command_list;
+
+struct command_list_t {
+	cl_command cmd;
+	command_list next;
+	command_list prev;
+};
+
+command_list command_list_cons(cl_command cmd, command_list ls);
+command_list command_list_remove(command_list l, cl_command cmd);

+ 124 - 53
socl/src/command_queue.c

@@ -24,74 +24,145 @@
  * its command queue.
  * its command queue.
  */
  */
 
 
+
 /**
 /**
- * Enqueue the given task but put fake_event into the command queue.
- * This is used when a tag notified by application is used (cf clEnqueueMapBuffer, etc.)
+ * Returned implicit dependencies for a task
+ * Command queue must be locked!
  */
  */
-cl_int command_queue_enqueue_fakeevent(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events, cl_event fake_event) {
-
-  int in_order = !(cq->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
-
-  /* Set explicit task dependencies */
-  task_dependency_add(task, num_events, events);
-
-  /* Lock command queue */
-  pthread_spin_lock(&cq->spin);
+void command_queue_dependencies_implicit(
+	cl_command_queue cq, 	/* Command queue */
+	char is_barrier,	/* Is the task a barrier */
+	cl_int * ret_num_events,	/* Returned number of dependencies */
+	cl_event ** ret_events	/* Returned dependencies */
+) {
+
+	/*********************
+	 * Count dependencies
+	 *********************/
+	int ndeps = 0;
+
+	/* Add dependency to last barrier if applicable */
+	if (cq->barrier != NULL)
+		ndeps++;
+
+	/* Add dependencies to out-of-order events (if any) */
+	if (is_barrier) {
+		command_list cl = cq->commands;
+		while (cl != NULL) {
+			ndeps++;
+			cl = cl->next;
+		}
+	}
+
+	/*********************
+	 * Return dependencies
+	 *********************/
+
+	cl_event * evs = malloc(ndeps * sizeof(cl_event));
+	int n = 0;
+
+	/* Add dependency to last barrier if applicable */
+	if (cq->barrier != NULL)
+		evs[n++] = cq->barrier->event;
+
+	/* Add dependencies to out-of-order events (if any) */
+	if (is_barrier) {
+		command_list cl = cq->commands;
+		while (cl != NULL) {
+			evs[n++] = cl->cmd->event;
+			cl = cl->next;
+		}
+	}
+
+	*ret_num_events = ndeps;
+	*ret_events = evs;
+}
+	
+/**
+ * Insert a command in the command queue
+ * The command queue must be locked!
+ */
+void command_queue_insert(
+	cl_command_queue 	cq, 	/* Command queue */
+	cl_command 		cmd,	/* Command */
+	int 			is_barrier		/* Is the task a barrier */
+) {
 
 
-  /* Add dependency to last barrier if applicable */
-  if (cq->barrier != NULL)
-    task_dependency_add(task, 1, &cq->barrier);
+	int in_order = !(cq->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
 
 
-  /* Add dependencies to out-of-order events (if any) */
-  if (barrier) {
-    while (cq->events != NULL) {
-      task_dependency_add(task, 1, &cq->events);
-      cq->events = cq->events->next;
-    }
-  }
+	if (is_barrier)
+		cq->commands = NULL;
 
 
-  cl_event ev = (fake_event == NULL ? task_event(task) : fake_event);
+	/* Add command to the list of out-of-order commands */
+	if (!in_order)
+		cq->commands = command_list_cons(cmd, cq->commands);
 
 
-  /* Add event to the list of out-of-order events */
-  if (!in_order) {
-    ev->next = cq->events;
-    ev->prev = NULL;
-    if (cq->events != NULL)
-      cq->events->prev = ev;
-    cq->events = ev;
-  }
+	/* Register this event as last barrier */
+	if (is_barrier || in_order)
+		cq->barrier = cmd;
 
 
-  /* Register this event as last barrier */
-  if (barrier || in_order)
-    cq->barrier = ev;
+	/* Add reference to the command queue */
+	gc_entity_store(&cmd->event->cq, cq);
+}
 
 
-   /* Unlock command queue */
-   pthread_spin_unlock(&cq->spin);
+/**
+ * Return implicit and explicit dependencies for a task
+ * The command queue must be locked!
+ */
+void command_queue_dependencies(
+	cl_command_queue 	cq,		/* Command queue */
+	int 			is_barrier,	/* Is the task a barrier */
+	cl_int 			num_events,	/* Number of explicit dependencies */
+	const cl_event *	events,		/* Explicit dependencies */
+	cl_int * 		ret_num_events,	/* Returned number of dependencies */
+	cl_event ** 		ret_events	/* Returned dependencies */
+) {
+	cl_int implicit_num_events;
+	cl_event * implicit_events;
+
+	/* Implicit dependencies */
+	command_queue_dependencies_implicit(cq, is_barrier, &implicit_num_events, &implicit_events);
+
+	/* Explicit dependencies */
+	cl_int ndeps = implicit_num_events + num_events;
+	cl_event * evs = malloc(sizeof(cl_event) * ndeps);
+	memcpy(evs, implicit_events, sizeof(cl_event) * implicit_num_events);
+	memcpy(&evs[implicit_num_events], events, sizeof(cl_event) * num_events);
+
+	*ret_num_events = ndeps;
+	*ret_events = evs;
+}
 
 
-   /* Add reference to the command queue */
-   gc_entity_store(&ev->cq, cq);
+void command_queue_enqueue_ex(cl_command_queue cq, cl_command cmd, cl_uint num_events, const cl_event * events) {
 
 
-   /* Submit task */
-   gc_entity_retain(task_event(task));
-   int ret = starpu_task_submit(task);
-   if (ret != 0)
-      DEBUG_ERROR("Unable to submit a task. Error %d\n", ret);
+	/* Check if the command is a barrier */
+	int is_barrier = 0;
+	if (cmd->typ == CL_COMMAND_BARRIER) {
+		is_barrier = 1;
+		/* OpenCL has no CL_COMMAND_BARRIER type, so we fall back on CL_COMMAND_MARKER */
+		cmd->typ = CL_COMMAND_MARKER;
+	}
 
 
-   return CL_SUCCESS;
-}
+	/* Set command queue field */
+	cmd->cq = cq;
 
 
-cl_int command_queue_enqueue(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events) {
-  return command_queue_enqueue_fakeevent(cq, task, barrier, num_events, events, NULL);
-}
+	/* Lock command queue */
+	pthread_mutex_lock(&cq->mutex);
 
 
+	//FIXME: crappy separation (command_queue_dependencies + command_queue_insert)
 
 
-cl_event enqueueBarrier(cl_command_queue cq) {
+	/* Get all (explicit + implicit) dependencies */
+	cl_int all_num_events;
+	cl_event * all_events;
+	command_queue_dependencies(cq, is_barrier, num_events, events, &all_num_events, &all_events);
 
 
-   //CL_COMMAND_MARKER has been chosen as CL_COMMAND_BARRIER doesn't exist
-   starpu_task * task = task_create(CL_COMMAND_MARKER);
+	/* Make all dependencies explicit for the command */
+	cmd->num_events = all_num_events;
+	cmd->events = all_events;
 
 
-   DEBUG_MSG("Submitting barrier task (event %d)\n", task->tag_id);
-   command_queue_enqueue(cq, task, 1, 0, NULL);
+	/* Insert command in the queue */
+	command_queue_insert(cq, cmd, is_barrier);
 
 
-   return task_event(task);
+	/* Unlock command queue */
+	pthread_mutex_unlock(&cq->mutex);
 }
 }

+ 8 - 4
socl/src/command_queue.h

@@ -17,10 +17,14 @@
 #ifndef SOCL_COMMAND_QUEUE_H
 #ifndef SOCL_COMMAND_QUEUE_H
 #define SOCl_COMMAND_QUEUE_H
 #define SOCl_COMMAND_QUEUE_H
 
 
-cl_int command_queue_enqueue(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events);
+void command_queue_enqueue_ex(
+	cl_command_queue 	cq,		/* Command queue */
+	cl_command		cmd,		/* Command to enqueue */
+	cl_uint			num_events,	/* Number of explicit dependencies */
+	const cl_event *	events		/* Explicit dependencies */
+	);
 
 
-cl_int command_queue_enqueue_fakeevent(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events, cl_event fake_event);
-
-cl_event enqueueBarrier(cl_command_queue cq);
+#define command_queue_enqueue(cq, cmd, num_events, events)\
+	command_queue_enqueue_ex(cq, (cl_command)cmd, num_events, events)
 
 
 #endif /* SOCl_COMMAND_QUEUE_H */
 #endif /* SOCl_COMMAND_QUEUE_H */

+ 76 - 0
socl/src/debug.c

@@ -0,0 +1,76 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010,2011 University of Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "socl.h"
+
+#ifdef STARPU_VERBOSE
+void DEBUG_CL(char *s, cl_int err) {
+   #define ERR_CASE(a) case a: DEBUG_MSG("[OpenCL] %s CL error: %s\n", s, #a); break;
+   switch(err) {
+      case CL_SUCCESS:
+         DEBUG_MSG("[OpenCL] %s SUCCESS.\n", s);
+         break;
+      ERR_CASE(CL_DEVICE_NOT_FOUND)
+      ERR_CASE(CL_DEVICE_NOT_AVAILABLE)
+      ERR_CASE(CL_COMPILER_NOT_AVAILABLE)
+      ERR_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE)
+      ERR_CASE(CL_OUT_OF_RESOURCES)
+      ERR_CASE(CL_OUT_OF_HOST_MEMORY)
+      ERR_CASE(CL_PROFILING_INFO_NOT_AVAILABLE)
+      ERR_CASE(CL_MEM_COPY_OVERLAP)
+      ERR_CASE(CL_IMAGE_FORMAT_MISMATCH)
+      ERR_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED)
+      ERR_CASE(CL_BUILD_PROGRAM_FAILURE)
+      ERR_CASE(CL_MAP_FAILURE)
+      ERR_CASE(CL_INVALID_VALUE)
+      ERR_CASE(CL_INVALID_DEVICE_TYPE)
+      ERR_CASE(CL_INVALID_PLATFORM)
+      ERR_CASE(CL_INVALID_DEVICE)
+      ERR_CASE(CL_INVALID_CONTEXT)
+      ERR_CASE(CL_INVALID_QUEUE_PROPERTIES)
+      ERR_CASE(CL_INVALID_COMMAND_QUEUE)
+      ERR_CASE(CL_INVALID_HOST_PTR)
+      ERR_CASE(CL_INVALID_MEM_OBJECT)
+      ERR_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
+      ERR_CASE(CL_INVALID_IMAGE_SIZE)
+      ERR_CASE(CL_INVALID_SAMPLER)
+      ERR_CASE(CL_INVALID_BINARY)
+      ERR_CASE(CL_INVALID_BUILD_OPTIONS)
+      ERR_CASE(CL_INVALID_PROGRAM)
+      ERR_CASE(CL_INVALID_PROGRAM_EXECUTABLE)
+      ERR_CASE(CL_INVALID_KERNEL_NAME)
+      ERR_CASE(CL_INVALID_KERNEL_DEFINITION)
+      ERR_CASE(CL_INVALID_KERNEL)
+      ERR_CASE(CL_INVALID_ARG_INDEX)
+      ERR_CASE(CL_INVALID_ARG_VALUE)
+      ERR_CASE(CL_INVALID_ARG_SIZE)
+      ERR_CASE(CL_INVALID_KERNEL_ARGS)
+      ERR_CASE(CL_INVALID_WORK_DIMENSION)
+      ERR_CASE(CL_INVALID_WORK_GROUP_SIZE)
+      ERR_CASE(CL_INVALID_WORK_ITEM_SIZE)
+      ERR_CASE(CL_INVALID_GLOBAL_OFFSET)
+      ERR_CASE(CL_INVALID_EVENT_WAIT_LIST)
+      ERR_CASE(CL_INVALID_EVENT)
+      ERR_CASE(CL_INVALID_OPERATION)
+      ERR_CASE(CL_INVALID_GL_OBJECT)
+      ERR_CASE(CL_INVALID_BUFFER_SIZE)
+      ERR_CASE(CL_INVALID_MIP_LEVEL)
+      ERR_CASE(CL_INVALID_GLOBAL_WORK_SIZE)
+      default:
+         DEBUG_MSG("%s CL error: Error message not supported by DEBUG_CL macro (%d).\n", s, err);
+   }
+}
+#endif

+ 4 - 57
socl/src/debug.h

@@ -17,10 +17,12 @@
 #ifndef SOCL_DEBUG_H
 #ifndef SOCL_DEBUG_H
 #define SOCL_DEBUG_H
 #define SOCL_DEBUG_H
 
 
+#include <../src/common/config.h>
+
 #ifdef STARPU_VERBOSE
 #ifdef STARPU_VERBOSE
 #define DEBUG
 #define DEBUG
 #include <stdio.h>
 #include <stdio.h>
-   #define DEBUG_MSG(...) do { fprintf(stderr, "[SOCL] [%s] ", __func__); fprintf(stderr, __VA_ARGS__); } while (0);
+   #define DEBUG_MSG(...) do { fprintf(stderr, "[SOCL] [%s] ", __func__); fprintf(stderr, __VA_ARGS__);} while (0);
    #define DEBUG_MSG_NOHEAD(...) fprintf(stderr, __VA_ARGS__)
    #define DEBUG_MSG_NOHEAD(...) fprintf(stderr, __VA_ARGS__)
    #define DEBUG_ERROR(...) do { fprintf(stderr, "[SOCL] ERROR: "__VA_ARGS__); exit(1); } while (0);
    #define DEBUG_ERROR(...) do { fprintf(stderr, "[SOCL] ERROR: "__VA_ARGS__); exit(1); } while (0);
 #else
 #else
@@ -35,62 +37,7 @@
 #define ERROR_STOP(...) do { ERROR_MSG(__VA_ARGS__); exit(1); } while(0);
 #define ERROR_STOP(...) do { ERROR_MSG(__VA_ARGS__); exit(1); } while(0);
 
 
 #ifdef STARPU_VERBOSE
 #ifdef STARPU_VERBOSE
-void DEBUG_CL(char *s, cl_int err) {
-   #define ERR_CASE(a) case a: DEBUG_MSG("[OpenCL] %s CL error: %s\n", s, #a); break;
-   switch(err) {
-      case CL_SUCCESS:
-         DEBUG_MSG("[OpenCL] %s SUCCESS.\n", s);
-         break;
-      ERR_CASE(CL_DEVICE_NOT_FOUND)
-      ERR_CASE(CL_DEVICE_NOT_AVAILABLE)
-      ERR_CASE(CL_COMPILER_NOT_AVAILABLE)
-      ERR_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE)
-      ERR_CASE(CL_OUT_OF_RESOURCES)
-      ERR_CASE(CL_OUT_OF_HOST_MEMORY)
-      ERR_CASE(CL_PROFILING_INFO_NOT_AVAILABLE)
-      ERR_CASE(CL_MEM_COPY_OVERLAP)
-      ERR_CASE(CL_IMAGE_FORMAT_MISMATCH)
-      ERR_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED)
-      ERR_CASE(CL_BUILD_PROGRAM_FAILURE)
-      ERR_CASE(CL_MAP_FAILURE)
-      ERR_CASE(CL_INVALID_VALUE)
-      ERR_CASE(CL_INVALID_DEVICE_TYPE)
-      ERR_CASE(CL_INVALID_PLATFORM)
-      ERR_CASE(CL_INVALID_DEVICE)
-      ERR_CASE(CL_INVALID_CONTEXT)
-      ERR_CASE(CL_INVALID_QUEUE_PROPERTIES)
-      ERR_CASE(CL_INVALID_COMMAND_QUEUE)
-      ERR_CASE(CL_INVALID_HOST_PTR)
-      ERR_CASE(CL_INVALID_MEM_OBJECT)
-      ERR_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
-      ERR_CASE(CL_INVALID_IMAGE_SIZE)
-      ERR_CASE(CL_INVALID_SAMPLER)
-      ERR_CASE(CL_INVALID_BINARY)
-      ERR_CASE(CL_INVALID_BUILD_OPTIONS)
-      ERR_CASE(CL_INVALID_PROGRAM)
-      ERR_CASE(CL_INVALID_PROGRAM_EXECUTABLE)
-      ERR_CASE(CL_INVALID_KERNEL_NAME)
-      ERR_CASE(CL_INVALID_KERNEL_DEFINITION)
-      ERR_CASE(CL_INVALID_KERNEL)
-      ERR_CASE(CL_INVALID_ARG_INDEX)
-      ERR_CASE(CL_INVALID_ARG_VALUE)
-      ERR_CASE(CL_INVALID_ARG_SIZE)
-      ERR_CASE(CL_INVALID_KERNEL_ARGS)
-      ERR_CASE(CL_INVALID_WORK_DIMENSION)
-      ERR_CASE(CL_INVALID_WORK_GROUP_SIZE)
-      ERR_CASE(CL_INVALID_WORK_ITEM_SIZE)
-      ERR_CASE(CL_INVALID_GLOBAL_OFFSET)
-      ERR_CASE(CL_INVALID_EVENT_WAIT_LIST)
-      ERR_CASE(CL_INVALID_EVENT)
-      ERR_CASE(CL_INVALID_OPERATION)
-      ERR_CASE(CL_INVALID_GL_OBJECT)
-      ERR_CASE(CL_INVALID_BUFFER_SIZE)
-      ERR_CASE(CL_INVALID_MIP_LEVEL)
-      ERR_CASE(CL_INVALID_GLOBAL_WORK_SIZE)
-      default:
-         DEBUG_MSG("%s CL error: Error message not supported by print_cl_error (%d).\n", s, err);
-   }
-}
+void DEBUG_CL(char *s, cl_int err);
 #else
 #else
    #define DEBUG_CL(...) while(0);
    #define DEBUG_CL(...) while(0);
 #endif
 #endif

+ 13 - 15
socl/src/event.c

@@ -20,21 +20,24 @@
 
 
 static void release_callback_event(void * e);
 static void release_callback_event(void * e);
 
 
+int event_unique_id() {
+   static int id = 1;
+
+   return __sync_fetch_and_add(&id,1);
+}
+
 /**
 /**
  * Create a new event
  * Create a new event
  *
  *
  * Events have one-to-one relation with tag. Tag number is event ID
  * Events have one-to-one relation with tag. Tag number is event ID
  */
  */
 cl_event event_create(void) {
 cl_event event_create(void) {
-   static int id = 1;
    cl_event ev;
    cl_event ev;
    ev = gc_entity_alloc(sizeof(struct _cl_event), release_callback_event);
    ev = gc_entity_alloc(sizeof(struct _cl_event), release_callback_event);
 
 
-   ev->next = NULL;
-   ev->prev = NULL;
-   ev->id = __sync_fetch_and_add(&id,1);
+   ev->id = event_unique_id();
    ev->status = CL_SUBMITTED;
    ev->status = CL_SUBMITTED;
-   ev->type = 0;
+   ev->command = NULL;
    ev->profiling_info = NULL;
    ev->profiling_info = NULL;
    ev->cq = NULL;
    ev->cq = NULL;
 
 
@@ -49,22 +52,17 @@ static void release_callback_event(void * e) {
   /* Remove from command queue */
   /* Remove from command queue */
   if (cq != NULL) {
   if (cq != NULL) {
     /* Lock command queue */
     /* Lock command queue */
-    pthread_spin_lock(&cq->spin);
+    pthread_mutex_lock(&cq->mutex);
 
 
     /* Remove barrier if applicable */
     /* Remove barrier if applicable */
-    if (cq->barrier == event)
+    if (cq->barrier == event->command)
       cq->barrier = NULL;
       cq->barrier = NULL;
 
 
-    /* Remove from the list of out-of-order events */
-    if (event->prev != NULL)
-      event->prev->next = event->next;
-    if (event->next != NULL)
-      event->next->prev = event->prev;
-    if (cq->events == event)
-      cq->events = event->next;
+    /* Remove from the list of out-of-order commands */
+    cq->commands = command_list_remove(cq->commands, event->command);
 
 
     /* Unlock command queue */
     /* Unlock command queue */
-    pthread_spin_unlock(&cq->spin);
+    pthread_mutex_unlock(&cq->mutex);
 
 
     gc_entity_unstore(&cq);
     gc_entity_unstore(&cq);
   }
   }

+ 5 - 0
socl/src/event.h

@@ -26,4 +26,9 @@
  */
  */
 cl_event event_create(void);
 cl_event event_create(void);
 
 
+/**
+ * Generate a unique tag id
+ */
+int event_unique_id();
+
 #endif /* SOCL_EVENT_H */
 #endif /* SOCL_EVENT_H */

+ 6 - 2
socl/src/gc.c

@@ -104,13 +104,15 @@ void gc_stop(void) {
   pthread_join(gc_thread, NULL);
   pthread_join(gc_thread, NULL);
 }
 }
 
 
-void gc_entity_release_ex(entity e) {
+int gc_entity_release_ex(entity e) {
 
 
   /* Decrement reference count */
   /* Decrement reference count */
   int refs = __sync_sub_and_fetch(&e->refs, 1);
   int refs = __sync_sub_and_fetch(&e->refs, 1);
 
 
   if (refs != 0)
   if (refs != 0)
-    return;
+    return 0;
+
+  DEBUG_MSG("Releasing entity %lx\n", e);
 
 
   GC_LOCK;
   GC_LOCK;
 
 
@@ -127,6 +129,8 @@ void gc_entity_release_ex(entity e) {
   gc_list = e;
   gc_list = e;
 
 
   GC_UNLOCK;
   GC_UNLOCK;
+
+  return 1;
 }
 }
 
 
 
 

+ 1 - 1
socl/src/gc.h

@@ -27,7 +27,7 @@ void * gc_entity_alloc(unsigned int size, void (*release_callback)(void*));
 void gc_entity_retain(void *arg);
 void gc_entity_retain(void *arg);
 
 
 /** Decrement reference counter and release entity if applicable */
 /** Decrement reference counter and release entity if applicable */
-void gc_entity_release_ex(entity e);
+int gc_entity_release_ex(entity e);
 
 
 int gc_active_entity_count(void);
 int gc_active_entity_count(void);
 
 

+ 0 - 123
socl/src/graph.c

@@ -1,123 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010,2011 University of Bordeaux
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "socl.h"
-#include "graph.h"
-#include "event.h"
-
-static pthread_spinlock_t graph_lock;
-static graph_node graph_nodes = NULL;
-
-
-/**
- * Initialize graph structure
- */
-void graph_init(void) {
-	pthread_spin_init(&graph_lock, PTHREAD_PROCESS_PRIVATE);
-}
-
-/**
- * Release graph structure
- */
-void graph_destroy(void) {
-	pthread_spin_destroy(&graph_lock);
-}
-
-/**
- * Initialize a graph node
- */
-void graph_node_init(graph_node node) {
-	node->id = -1;
-	node->next = NULL;
-}
-
-/**
- * Store a node in the graph
- */
-void graph_store(void * node) {
-	pthread_spin_lock(&graph_lock);
-
-	graph_node n = (graph_node)node;
-	n->next = graph_nodes;
-	graph_nodes = n;
-
-	pthread_spin_unlock(&graph_lock);
-}
-
-
-
-/**
- * Duplicate a memory area into a fresh allocated buffer
- */
-static void * memdupa(const void *p, size_t size) {
-	void * s = malloc(size);
-	memcpy(s,p,size);
-	return s;
-}
-
-#define memdup(p, size) ((typeof(p))memdupa(p,size))
-#define nullOrDup(name,size) s->name = (name == NULL ? NULL : memdup(name,size))
-#define dup(name) s->name = name
-
-
-node_enqueue_kernel graph_create_enqueue_kernel(char is_task,
-		cl_command_queue cq,
-		cl_kernel        kernel,
-		cl_uint          work_dim,
-		const size_t *   global_work_offset,
-		const size_t *   global_work_size,
-		const size_t *   local_work_size,
-		cl_uint          num_events,
-		const cl_event * events,
-		cl_event *       event,
-		cl_uint 		num_args,
-		size_t *		arg_sizes,
-		enum kernel_arg_type * arg_types,
-		void **		args)
-{
-	node_enqueue_kernel s = malloc(sizeof(struct node_enqueue_kernel_t));
-	graph_node_init(&s->node);
-	s->node.id = NODE_ENQUEUE_KERNEL;
-
-	dup(is_task);
-	dup(cq);
-	dup(kernel);
-	dup(work_dim);
-	nullOrDup(global_work_offset, work_dim*sizeof(size_t));
-	nullOrDup(global_work_size, work_dim*sizeof(size_t));
-	nullOrDup(local_work_size, work_dim*sizeof(size_t));
-	dup(num_events);
-	nullOrDup(events, num_events * sizeof(cl_event));
-	dup(num_args);
-	nullOrDup(arg_sizes, num_args * sizeof(size_t));
-	nullOrDup(arg_types, num_args * sizeof(enum kernel_arg_type));
-	nullOrDup(args, num_args * sizeof(void*));
-
-	
-	if (event != NULL) {
-		*event = event_create();
-		s->event = event;
-	}
-	else {
-		s->event = NULL;
-	}
-
-	return s;
-}
-
-#undef nullOrDup
-#undef memdup
-#undef dup

+ 0 - 73
socl/src/graph.h

@@ -1,73 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010,2011 University of Bordeaux
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#ifndef SOCL_GRAPH_H
-#define SOCL_GRAPH_H
-
-#include "socl.h"
-
-typedef struct graph_node_t * graph_node;
-
-struct graph_node_t {
-	int id; /* Kind of node */
-	graph_node next; /* Linked-list of nodes... */
-};
-
-void graph_init(void);
-void graph_destroy(void);
-void graph_node_init(graph_node node);
-void graph_store(void * node);
-
-#define NODE_ENQUEUE_KERNEL 1
-
-
-typedef struct node_enqueue_kernel_t {
-	struct graph_node_t node;
-
-	char 		 is_task; /* Set if clEnqueueTask is used */
-	cl_command_queue cq;
-	cl_kernel        kernel;
-	cl_uint          work_dim;
-	const size_t *   global_work_offset;
-	const size_t *   global_work_size;
-	const size_t *   local_work_size;
-	cl_uint          num_events;
-	const cl_event * events;
-	cl_event * 	 event;
-	cl_uint 	 num_args;
-	size_t *	 arg_sizes;
-	enum kernel_arg_type * arg_types;
-	void **		 args;
-} * node_enqueue_kernel;
-
-node_enqueue_kernel graph_create_enqueue_kernel(char is_task,
-		cl_command_queue cq,
-		cl_kernel        kernel,
-		cl_uint          work_dim,
-		const size_t *   global_work_offset,
-		const size_t *   global_work_size,
-		const size_t *   local_work_size,
-		cl_uint          num_events,
-		const cl_event * events,
-		cl_event *       event,
-		cl_uint 		num_args,
-		size_t *		arg_sizes,
-		enum kernel_arg_type * arg_types,
-		void **		args);
-
-cl_int node_play_enqueue_kernel(node_enqueue_kernel n);
-
-#endif /* SOCL_GRAPH_H */

+ 0 - 4
socl/src/init.c

@@ -15,7 +15,6 @@
  */
  */
 
 
 #include "socl.h"
 #include "socl.h"
-#include "graph.h"
 #include "gc.h"
 #include "gc.h"
 #include "mem_objects.h"
 #include "mem_objects.h"
 
 
@@ -25,7 +24,6 @@
 __attribute__((constructor)) static void socl_init() {
 __attribute__((constructor)) static void socl_init() {
   
   
   mem_object_init();
   mem_object_init();
-  graph_init();
 
 
   starpu_init(NULL);
   starpu_init(NULL);
   
   
@@ -51,7 +49,5 @@ __attribute__((destructor)) static void socl_shutdown() {
   if (active_entities != 0)
   if (active_entities != 0)
     fprintf(stderr, "Unreleased entities: %d\n", active_entities);
     fprintf(stderr, "Unreleased entities: %d\n", active_entities);
 
 
-  graph_destroy();
-
   starpu_shutdown();
   starpu_shutdown();
 }
 }

+ 39 - 18
socl/src/socl.h

@@ -23,6 +23,9 @@
 #include CL_HEADERS "CL/cl.h"
 #include CL_HEADERS "CL/cl.h"
 #endif
 #endif
 
 
+/* Additional command type */
+#define CL_COMMAND_BARRIER 0x99987
+
 #include <string.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <stdint.h>
@@ -35,7 +38,7 @@
 #include <starpu_profiling.h>
 #include <starpu_profiling.h>
 #include <starpu_task.h>
 #include <starpu_task.h>
 
 
-typedef struct starpu_task starpu_task;
+typedef struct starpu_task * starpu_task;
 
 
 #ifdef UNUSED
 #ifdef UNUSED
 #elif defined(__GNUC__)
 #elif defined(__GNUC__)
@@ -50,12 +53,13 @@ typedef struct starpu_task starpu_task;
  */
  */
 typedef struct entity * entity;
 typedef struct entity * entity;
 
 
+#include "command.h"
+#include "command_list.h"
 #include "command_queue.h"
 #include "command_queue.h"
 #include "debug.h"
 #include "debug.h"
 #include "devices.h"
 #include "devices.h"
 #include "event.h"
 #include "event.h"
 #include "gc.h"
 #include "gc.h"
-#include "graph.h"
 #include "mem_objects.h"
 #include "mem_objects.h"
 #include "task.h"
 #include "task.h"
 #include "util.h"
 #include "util.h"
@@ -79,11 +83,32 @@ struct entity {
 
 
 struct _cl_platform_id {};
 struct _cl_platform_id {};
 
 
-#define RETURN_EVENT(ev, event) \
-   if (event != NULL) \
-      *event = ev; \
-   else\
-      gc_entity_release(ev);
+#define RETURN_EVENT(cmd, event) \
+	if (event != NULL) { \
+		cl_event ev = command_event_get(cmd);\
+		gc_entity_retain(ev);\
+		*event = ev; \
+	}
+
+#define RETURN_CUSTOM_EVENT(src, tgt) \
+	if (tgt != NULL) { \
+		gc_entity_retain(src); \
+		*tgt = src; \
+	}
+
+#define MAY_BLOCK(blocking) \
+	if ((blocking) == CL_TRUE) {\
+		cl_event ev = command_event_get(cmd);\
+		soclWaitForEvents(1, &ev);\
+		gc_entity_release(ev);\
+	}
+
+#define MAY_BLOCK_CUSTOM(blocking,event) \
+	if ((blocking) == CL_TRUE) {\
+		cl_event ev = (event);\
+		soclWaitForEvents(1, &ev);\
+		gc_entity_release(ev);\
+	}
 
 
 /* Constants */
 /* Constants */
 struct _cl_platform_id socl_platform;
 struct _cl_platform_id socl_platform;
@@ -121,14 +146,14 @@ struct _cl_command_queue {
   cl_device_id device;
   cl_device_id device;
   cl_context context;
   cl_context context;
 
 
-  /* Stored command events */
-  cl_event events;
+  /* Stored commands */
+  command_list commands;
 
 
   /* Last enqueued barrier-like event */
   /* Last enqueued barrier-like event */
-  cl_event barrier;
+  cl_command barrier;
 
 
   /* Mutex */
   /* Mutex */
-  pthread_spinlock_t spin;
+  pthread_mutex_t mutex;
 
 
   /* ID  */
   /* ID  */
 #ifdef DEBUG
 #ifdef DEBUG
@@ -142,12 +167,8 @@ struct _cl_event {
   /* Command queue */
   /* Command queue */
   cl_command_queue cq;
   cl_command_queue cq;
 
 
-  /* Command type */
-  cl_command_type type;
-
-  /* Command queue list */
-  cl_event prev;
-  cl_event next;
+  /* Command */
+  cl_command command;
 
 
   /* Event status */
   /* Event status */
   cl_int status;
   cl_int status;
@@ -244,7 +265,7 @@ struct _cl_kernel {
   cl_int *errcodes;
   cl_int *errcodes;
 
 
   /* Arguments */
   /* Arguments */
-  unsigned int arg_count;
+  unsigned int num_args;
   size_t *arg_size;
   size_t *arg_size;
   enum kernel_arg_type  *arg_type;
   enum kernel_arg_type  *arg_type;
   void  **arg_value;
   void  **arg_value;

+ 61 - 35
socl/src/task.c

@@ -18,64 +18,90 @@
 #include "gc.h"
 #include "gc.h"
 #include "event.h"
 #include "event.h"
 
 
-cl_event task_event(starpu_task *task) {
-  return (cl_event)task->callback_arg;
-}
-
 static void task_release_callback(void *arg) {
 static void task_release_callback(void *arg) {
-  starpu_task *task = starpu_get_current_task();
-  cl_event ev = (cl_event)arg;
+  starpu_task task = starpu_get_current_task();
+  cl_command cmd = (cl_command)arg;
   
   
+  cl_event ev = command_event_get(cmd);
   ev->status = CL_COMPLETE;
   ev->status = CL_COMPLETE;
 
 
+  DEBUG_MSG("notifying tag %x as well as task tag %x\n", ev->id, task->tag_id);
+
+  /* Trigger the tag associated to the command event */
+  starpu_tag_notify_from_apps(ev->id);
+
   if (task->profiling_info != NULL && (intptr_t)task->profiling_info != -ENOSYS) {
   if (task->profiling_info != NULL && (intptr_t)task->profiling_info != -ENOSYS) {
     ev->profiling_info = malloc(sizeof(*task->profiling_info));
     ev->profiling_info = malloc(sizeof(*task->profiling_info));
     memcpy(ev->profiling_info, task->profiling_info, sizeof(*task->profiling_info));
     memcpy(ev->profiling_info, task->profiling_info, sizeof(*task->profiling_info));
   }
   }
 
 
   gc_entity_release(ev);
   gc_entity_release(ev);
+
+  /* Release the command */
+  //TODO
 }
 }
 
 
 
 
 /*
 /*
  * Create a StarPU task
  * Create a StarPU task
- *
- * Task's callback_arg is event
- * Task's tag is set to event ID
  */
  */
-starpu_task * task_create(cl_command_type type) {
-   cl_event event;
-   struct starpu_task * task;
+starpu_task task_create() {
+	struct starpu_task * task;
+
+	/* Create StarPU task */
+	task = starpu_task_create();
+
+	/* Set task common settings */
+	task->destroy = 1;
+	task->detach = 1;
+
+	task->use_tag = 1;
+	task->tag_id = event_unique_id();
 
 
-   /* Create event */
-   event = event_create();
-   event->type = type;
+	DEBUG_MSG("creating task with tag %x\n", task->tag_id);
 
 
-   /* Create StarPU task */
-   task = starpu_task_create();
+	return task;
+}
+
+
+void task_depends_on(starpu_task task, cl_uint num_events, cl_event *events) {
+
+	if (num_events != 0) {
+		cl_uint i;
+
+		starpu_tag_t * tags = malloc(num_events * sizeof(starpu_tag_t));	
 
 
-   /* Task tag is set to event id */
-   task->use_tag = 1;
-   task->tag_id = event->id;
+		if (num_events != 0)
+			DEBUG_MSG("Tag %d depends on %u tags:", task->tag_id, num_events);
 
 
-   /* Set task common settings */
-   task->destroy = 1;
-   task->detach = 1;
-   task->callback_func = task_release_callback;
-   task->callback_arg = event;
+		for (i=0; i<num_events; i++) {
+			tags[i] = events[i]->id;
+			DEBUG_MSG_NOHEAD(" %u", events[i]->id);
+		}
+		DEBUG_MSG_NOHEAD("\n");
 
 
-   return task;
+		starpu_tag_declare_deps_array(task->tag_id, num_events, tags);
+
+		free(tags);
+	}
 }
 }
 
 
+cl_int task_submit_ex(starpu_task task, cl_command cmd) {
+
+	/* Associated the task to the command */
+	cmd->task = task;
+
+	task_depends_on(task, command_num_events_get(cmd), command_events_get(cmd));
+
+	task->callback_func = task_release_callback;
+	task->callback_arg = cmd;
 
 
-void task_dependency_add(starpu_task * task, cl_uint num, const cl_event *events) {
-   unsigned int i;
+	/* Submit task */
+	int ret = starpu_task_submit(task);
+	if (ret != 0)
+		DEBUG_ERROR("Unable to submit a task. Error %d\n", ret);
 
 
-   for (i=0; i<num; i++) {
-      starpu_tag_t tag = events[i]->id;
-      DEBUG_MSG("Event %d depends on event %d\n", task->tag_id, events[i]->id);
-      starpu_tag_declare_deps_array(task->tag_id, 1, &tag);
-   }
+	return CL_SUCCESS;
 }
 }
 
 
 
 
@@ -106,14 +132,14 @@ static starpu_codelet cputask_codelet = {
    .cpu_func = &cputask_task
    .cpu_func = &cputask_task
 };
 };
 
 
-starpu_task * task_create_cpu(cl_command_type type, void (*callback)(void*), void *arg, int free_arg) {
+starpu_task task_create_cpu(void (*callback)(void*), void *arg, int free_arg) {
   
   
   struct cputask_arg * a = malloc(sizeof(struct cputask_arg));
   struct cputask_arg * a = malloc(sizeof(struct cputask_arg));
   a->callback = callback;
   a->callback = callback;
   a->arg = arg;
   a->arg = arg;
   a->free_arg = free_arg;
   a->free_arg = free_arg;
 
 
-  starpu_task *task = task_create(type);
+  starpu_task task = task_create();
   task->cl = &cputask_codelet;
   task->cl = &cputask_codelet;
   task->cl_arg = a;
   task->cl_arg = a;
 
 

+ 17 - 4
socl/src/task.h

@@ -19,9 +19,22 @@
 
 
 #include "socl.h"
 #include "socl.h"
 
 
-starpu_task * task_create(cl_command_type type);
-void task_dependency_add(starpu_task * task, cl_uint num, const cl_event *events);
-starpu_task * task_create_cpu(cl_command_type type, void (*callback)(void*), void *arg, int free_arg);
-cl_event task_event(starpu_task *task);
+starpu_task task_create();
+void task_dependency_add(starpu_task task, cl_uint num_events, cl_event *events);
+
+starpu_task task_create_cpu(void (*callback)(void*), void *arg, int free_arg);
+
+/**
+ * Associate a StarPU task to a command and submit it
+ *
+ * When the task terminates, the command is set as terminated too
+ */
+cl_int task_submit_ex(starpu_task task, cl_command cmd);
+#define task_submit(task,cmd) task_submit_ex(task, (cl_command)cmd)
+
+/**
+ * Add task dependencies
+ */
+void task_depends_on(starpu_task task, cl_uint num_events, cl_event *events);
 
 
 #endif /* SOCL_TASK_H */
 #endif /* SOCL_TASK_H */

+ 24 - 0
socl/src/util.c

@@ -24,3 +24,27 @@ int starpu_worker_get_range() {
 
 
    return oid;
    return oid;
 }
 }
+
+void * memdupa(const void *p, size_t size) {
+	void * s = malloc(size);
+	memcpy(s,p,size);
+	return s;
+}
+
+void ** memdup_deep_safea(const void **p, unsigned n, size_t size) {
+	void ** s = (void**)malloc(sizeof(void*) * n);
+	unsigned i;
+	for (i=0; i<n; i++) {
+		s[i] = memdup_safe((void*)p[i], size);
+	}
+	return s;
+}
+
+void ** memdup_deep_varsize_safea(const void **p, unsigned n, size_t * size) {
+	void ** s = (void**)malloc(sizeof(void*) * n);
+	unsigned i;
+	for (i=0; i<n; i++) {
+		s[i] = memdup_safe((void*)p[i], size[i]);
+	}
+	return s;
+}

+ 24 - 0
socl/src/util.h

@@ -19,4 +19,28 @@
 
 
 int starpu_worker_get_range();
 int starpu_worker_get_range();
 
 
+/**
+ * Duplicate a memory area into a fresh allocated buffer
+ * Consider using memdup or memdup_safe instead
+ */
+void * memdupa(const void *p, size_t size);
+
+#define memdup(p, size) ((typeof(p))memdupa((const void*)p,size))
+#define memdup_safe(p,size) (p == NULL ? NULL : memdup(p,size))
+
+/**
+ * Duplicate an array of pointers by performing a deep copy
+ */
+void ** memdup_deep_safea(const void **p, unsigned n, size_t size);
+
+#define memdup_deep_safe(p,n,size) ((typeof(p))memdup_deep_safea((const void **)p,n,size))
+
+/**
+ * Duplicate an array of pointers by performing a deep copy
+ * Sizes are different for each cell
+ */
+void ** memdup_deep_varsize_safea(const void **p, unsigned n, size_t * size);
+
+#define memdup_deep_varsize_safe(p,n,size) ((typeof(p))memdup_deep_varsize_safea((const void **)p,n,size))
+
 #endif /* SOCL_UTIL_H */
 #endif /* SOCL_UTIL_H */

+ 0 - 0
src/common/htable32.c


Some files were not shown because too many files changed in this diff