Browse Source

merged with trunk from 6552 to 6847

Benjamin Lorendeau 13 years ago
parent
commit
909991b0fd
100 changed files with 2012 additions and 553 deletions
  1. 9 0
      ChangeLog
  2. 1 1
      Makefile.am
  3. 15 0
      STARPU-VERSION
  4. 16 6
      configure.ac
  5. 16 2
      doc/Makefile.am
  6. 34 10
      doc/chapters/advanced-api.texi
  7. 86 8
      doc/chapters/advanced-examples.texi
  8. 107 23
      doc/chapters/basic-api.texi
  9. 4 2
      doc/chapters/basic-examples.texi
  10. 1 1
      doc/chapters/c-extensions.texi
  11. 4 4
      doc/chapters/configuration.texi
  12. 1 1
      doc/starpu.texi
  13. 2 2
      doc/tutorial/README
  14. 33 11
      examples/Makefile.am
  15. 2 1
      examples/basic_examples/vector_scal.c
  16. 119 0
      examples/binary/binary.c
  17. 1 1
      examples/gl_interop/gl_interop.c
  18. 1 1
      examples/lu/clu.c
  19. 1 1
      examples/lu/clu_implicit.c
  20. 1 1
      examples/lu/clu_implicit_pivot.c
  21. 1 1
      examples/lu/clu_kernels.c
  22. 1 1
      examples/lu/clu_pivot.c
  23. 1 1
      examples/lu/dlu.c
  24. 1 1
      examples/lu/dlu_implicit.c
  25. 1 1
      examples/lu/dlu_implicit_pivot.c
  26. 1 1
      examples/lu/dlu_kernels.c
  27. 1 1
      examples/lu/dlu_pivot.c
  28. 0 47
      examples/lu/double.h
  29. 0 49
      examples/lu/float.h
  30. 1 1
      examples/lu/lu_example_complex_double.c
  31. 1 1
      examples/lu/lu_example_complex_float.c
  32. 1 1
      examples/lu/lu_example_double.c
  33. 1 1
      examples/lu/lu_example_float.c
  34. 1 1
      examples/lu/slu.c
  35. 1 1
      examples/lu/slu_implicit.c
  36. 1 1
      examples/lu/slu_implicit_pivot.c
  37. 1 1
      examples/lu/slu_kernels.c
  38. 1 1
      examples/lu/slu_pivot.c
  39. 2 2
      examples/lu/xlu_pivot.c
  40. 1 1
      examples/lu/zlu.c
  41. 1 1
      examples/lu/zlu_implicit.c
  42. 1 1
      examples/lu/zlu_implicit_pivot.c
  43. 1 1
      examples/lu/zlu_kernels.c
  44. 1 1
      examples/lu/zlu_pivot.c
  45. 1 1
      examples/openmp/vector_scal.c
  46. 254 0
      examples/pipeline/pipeline.c
  47. 5 0
      examples/scheduler/schedulers.sh
  48. 24 2
      gcc-plugin/Makefile.am
  49. 336 42
      gcc-plugin/src/starpu.c
  50. 4 7
      gcc-plugin/tests/Makefile.am
  51. 137 2
      gcc-plugin/tests/mocks.h
  52. 9 8
      gcc-plugin/tests/opencl-errors.c
  53. 1 1
      gcc-plugin/tests/opencl-lacking.c
  54. 8 2
      gcc-plugin/tests/opencl.c
  55. 4 0
      gcc-plugin/tests/run-test.in
  56. 3 2
      include/starpu.h
  57. 5 0
      include/starpu_data.h
  58. 2 1
      include/starpu_data_filters.h
  59. 9 2
      include/starpu_opencl.h
  60. 1 0
      include/starpu_perfmodel.h
  61. 2 2
      include/starpu_util.h
  62. 2 2
      mpi/Makefile.am
  63. 0 42
      mpi/examples/mpi_lu/double.h
  64. 0 42
      mpi/examples/mpi_lu/float.h
  65. 1 1
      mpi/examples/mpi_lu/pdlu.c
  66. 1 1
      mpi/examples/mpi_lu/pdlu_kernels.c
  67. 1 1
      mpi/examples/mpi_lu/plu_example_double.c
  68. 1 1
      mpi/examples/mpi_lu/plu_example_float.c
  69. 1 1
      mpi/examples/mpi_lu/plu_solve_double.c
  70. 1 1
      mpi/examples/mpi_lu/plu_solve_float.c
  71. 1 1
      mpi/examples/mpi_lu/pslu.c
  72. 1 1
      mpi/examples/mpi_lu/pslu_kernels.c
  73. 1 1
      mpi/examples/mpi_lu/slu_kernels.c
  74. 3 3
      mpi/starpu_mpi_fxt.h
  75. 1 1
      mpi/starpu_mpi_insert_task_cache.c
  76. 5 2
      socl/examples/Makefile.am
  77. 477 0
      socl/examples/matmul/matmul.c
  78. 1 0
      socl/src/cl_createprogramwithsource.c
  79. 0 5
      socl/src/cl_enqueuendrangekernel.c
  80. 3 1
      socl/src/cl_getkernelworkgroupinfo.c
  81. 4 1
      socl/src/command_queue.c
  82. 1 1
      socl/src/gc.c
  83. 0 3
      socl/src/init.c
  84. 21 2
      src/common/fxt.c
  85. 37 36
      src/common/fxt.h
  86. 1 1
      src/common/htable32.h
  87. 1 1
      src/common/starpu_spinlock.c
  88. 41 1
      src/common/utils.c
  89. 2 0
      src/common/utils.h
  90. 12 16
      src/core/dependencies/data_concurrency.c
  91. 2 2
      src/core/dependencies/data_concurrency.h
  92. 2 2
      src/core/dependencies/htable.h
  93. 35 0
      src/core/dependencies/implicit_data_deps.c
  94. 1 0
      src/core/dependencies/implicit_data_deps.h
  95. 3 0
      src/core/jobs.c
  96. 7 86
      src/core/perfmodel/perfmodel.c
  97. 54 15
      src/core/perfmodel/perfmodel_bus.c
  98. 2 2
      src/core/perfmodel/perfmodel_history.c
  99. 3 10
      src/core/sched_policy.c
  100. 0 0
      src/core/task.c

+ 9 - 0
ChangeLog

@@ -19,6 +19,15 @@ StarPU 1.1.0 (svn revision xxxx)
 
 New features:
   * OpenGL interoperability support.
+  * Capability to store compiled OpenCL kernels on the file system
+  * Capability to load compiled OpenCL kernels
+
+Changes:
+  * The FxT code can now be used on systems other than Linux.
+
+Small changes:
+  * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is
+	still available for compatibility reasons.
 
 StarPU 1.0.0 (svn revision 6306)
 ==============================================

+ 1 - 1
Makefile.am

@@ -101,7 +101,7 @@ else
 txtdir = ${docdir}
 endif
 txt_DATA = AUTHORS COPYING.LGPL README
-EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION
+EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION build-aux/svn2cl.xsl
 
 include starpu-top/extradist
 

+ 15 - 0
STARPU-VERSION

@@ -2,6 +2,21 @@
 
 # Versioning (SONAMEs) for StarPU libraries.
 
+# http://www.gnu.org/software/libtool/manual/html_node/Updating-version-info.html#Updating-version-info
+# Here are a set of rules to help you update your library version information:
+# Start with version information of ‘0:0:0’ for each libtool library.
+# Update the version information only immediately before a public
+# release of your software. More frequent updates are unnecessary, and
+# only guarantee that the current interface number gets larger faster.
+# - If the library source code has changed at all since the last
+#   update, then increment revision (‘c:r:a’ becomes ‘c:r+1:a’).
+# - If any interfaces have been added, removed, or changed since the
+#   last update, increment current, and set revision to 0.
+# - If any interfaces have been added since the last public release,
+#   then increment age.
+# - If any interfaces have been removed or changed since the last
+#   public release, then set age to 0. change
+
 # Libtool interface versioning (info "(libtool) Versioning").
 LIBSTARPU_INTERFACE_CURRENT=1	# increment upon ABI change
 LIBSTARPU_INTERFACE_REVISION=0	# increment upon implementation change

+ 16 - 6
configure.ac

@@ -171,6 +171,9 @@ fi
 # Some systems do not define strerror_r
 AC_CHECK_FUNC([strerror_r], [AC_DEFINE([STARPU_HAVE_STRERROR_R], [1], [Define to 1 if the function strerro_r is available.])])
 
+# Some systems may not define setenv
+AC_CHECK_FUNC([setenv], [AC_DEFINE([STARPU_HAVE_SETENV], [1], [Define to 1 if the function setenv is available.])])
+
 # Some systems do not define unsetenv
 AC_CHECK_FUNC([unsetenv], [AC_DEFINE([STARPU_HAVE_UNSETENV], [1], [Define to 1 if the function unsetenv is available.])])
 
@@ -980,7 +983,7 @@ AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of worker
 AC_MSG_CHECKING(maximum number of implementations)
 AC_ARG_ENABLE(maximplementations, [AS_HELP_STRING([--enable-maximplementations=<number>],
 		[maximum number of implementations])],
-		maximplementations=$enableval, maximplementations=4)
+		maximplementations=$enableval, maximplementations=8)
 AC_MSG_RESULT($maximplementations)
 AC_DEFINE_UNQUOTED(STARPU_MAXIMPLEMENTATIONS, [$maximplementations],
 		[maximum number of implementations])
@@ -1259,7 +1262,8 @@ fi
 AM_MISSING_PROG([YACC], [bison])
 
 AM_CONDITIONAL([BUILD_GCC_PLUGIN], [test "x$build_gcc_plugin" = "xyes"])
-AM_CONDITIONAL([HAVE_GUILE], [test "x$GUILE" != "x"])
+AM_CONDITIONAL([RUN_GCC_PLUGIN_TESTS],
+  [test "x$run_gcc_plugin_test_suite" = "xyes"])
 
 ###############################################################################
 #                                                                             #
@@ -1496,7 +1500,10 @@ AM_CONDITIONAL(BUILD_STARPUFFT, [test x$fft_support = xyes])
 # hwloc                                  #
 ##########################################
 
-AC_ARG_WITH([hwloc], [AS_HELP_STRING([--without-hwloc], [Disable hwloc (enabled by default)])])
+AC_ARG_WITH([hwloc],
+	[AS_HELP_STRING([--without-hwloc],
+	[Disable hwloc (enabled by default)])],
+	[hwloc_dir="$withval"])
 SAVED_LDFLAGS="${LDFLAGS}"
 SAVED_CPPFLAGS="${CPPFLAGS}"
 AS_IF([test "x$with_hwloc" != "xno"], [
@@ -1506,10 +1513,9 @@ AS_IF([test "x$with_hwloc" != "xno"], [
 		have_valid_hwloc=no
 		have_pkgconfig_hwloc=no])
 	AS_IF([test "$have_valid_hwloc" = "no"], [
-		hwloc_dir="$withval"
-		CPPFLAGS="${SAVED_CPPFLAGS} -I$hwloc_dir/include"
+		if test "$hwloc_dir" != "" ; then CPPFLAGS="${SAVED_CPPFLAGS} -I$hwloc_dir/include" ; fi
 		AC_CHECK_HEADER([hwloc.h],[have_valid_hwloc=yes],[have_valid_hwloc=no])
-		LDFLAGS="${SAVED_LDFLAGS} -L$hwloc_dir/lib"
+		if test "$hwloc_dir" != "" ; then LDFLAGS="${SAVED_LDFLAGS} -L$hwloc_dir/lib" ; fi
 		AC_HAVE_LIBRARY([hwloc],[have_valid_hwloc=yes],[have_valid_hwloc=no])
 		])
     ],
@@ -1524,6 +1530,10 @@ AS_IF([test "$have_valid_hwloc" = "yes"], [
 	])
 LDFLAGS="${SAVED_LDFLAGS}"
 CPPFLAGS="${SAVED_CPPFLAGS}"
+
+if test "$have_valid_hwloc" = "no" -a "$hwloc_dir" != "no" ; then
+   AC_MSG_ERROR([hwloc was not found on your system. If the target machine is hyperthreaded the performance may be impacted a lot.  It is strongly recommended to install hwloc. However, if you really want to use StarPU without enabling hwloc, please restart configure by specifying the option '--without-hwloc'.])
+fi
 AC_MSG_CHECKING(whether hwloc should be used)
 AC_MSG_RESULT($have_valid_hwloc)
 AC_SUBST(HWLOC_REQUIRES)

+ 16 - 2
doc/Makefile.am

@@ -33,7 +33,8 @@ starpu_TEXINFOS = chapters/advanced-api.texi \
 	chapters/fft-support.texi \
 	chapters/using.texi \
 	chapters/vector_scal_opencl.texi \
-	chapters/socl.texi
+	chapters/socl.texi \
+	chapters/version.texi
 
 MAINTAINERCLEANFILES = starpu.pdf
 
@@ -45,6 +46,19 @@ AM_MAKEINFOHTMLFLAGS = --css-include=$(top_srcdir)/doc/starpu.css --no-headers -
 uninstall-local:
 	$(RM) $(DESTDIR)$(infodir)/dir
 
+#TODO: when stat is not available on the machine, insert "unknown date"
+chapters/version.texi:
+	@for f in $(starpu_TEXINFOS) ; do \
+                if test -f $(top_srcdir)/doc/$$f ; then stat --format=%Y $(top_srcdir)/doc/$$f ; fi \
+        done | sort -r | head -1 > timestamp
+	@LC_ALL=C date --date=@`cat timestamp` +"%d %B %Y" > timestamp_updated
+	@LC_ALL=C date --date=@`cat timestamp` +"%B %Y" > timestamp_updated_month
+	@echo "@set UPDATED " `cat timestamp_updated` > $(top_srcdir)/doc/chapters/version.texi
+	@echo "@set UPDATED-MONTH" `cat timestamp_updated_month` >> $(top_srcdir)/doc/chapters/version.texi
+	@echo "@set EDITION $(VERSION)" >> $(top_srcdir)/doc/chapters/version.texi
+	@echo "@set VERSION $(VERSION)" >> $(top_srcdir)/doc/chapters/version.texi
+	@$(RM) timestamp timestamp_updated timestamp_updated_month
+
 #$(top_srcdir)/doc/starpu.texi: vector_scal_c.texi vector_scal_cuda.texi vector_scal_opencl.texi vector_scal_opencl_codelet.texi
 #vector_scal_c.texi: $(top_srcdir)/examples/basic_examples/vector_scal.c
 #	cat $< | sed 's/{/@{/g' | sed 's/}/@}/g' | sed 's/\t/    /g' > $@
@@ -59,7 +73,7 @@ uninstall-local:
 #	vector_scal_c.texi vector_scal_cuda.texi vector_scal_opencl.texi vector_scal_opencl_codelet.texi
 
 # Rule to update documentation on web server. Should only be used locally.
-PUBLISHHOST	= sync
+PUBLISHHOST	?= sync
 update-web: starpu.html
 	sed -i 's/gcc\.html#Attribute-Syntax/http:\/\/gcc.gnu.org\/onlinedocs\/gcc\/Attribute-Syntax.html#Attribute-Syntax/' starpu.html
 	scp starpu.pdf starpu.html $(PUBLISHHOST):/web/runtime/html/StarPU

+ 34 - 10
doc/chapters/advanced-api.texi

@@ -160,7 +160,7 @@ struct starpu_complex_interface
 @end cartouche
 
 Registering such a data to StarPU is easily done using the function
-@code{starpu_data_register} (@pxref{Basic Data Library API}). The last
+@code{starpu_data_register} (@pxref{Basic Data Management API}). The last
 parameter of the function, @code{interface_complex_ops}, will be
 described below.
 
@@ -358,6 +358,17 @@ This function mustn't be called if @var{bundle} is already closed and/or @var{ta
 Inform the runtime that the user won't modify @var{bundle} anymore, it means no more inserting or removing task. Thus the runtime can destroy it when possible.
 @end deftypefun
 
+@deftypefun double starpu_task_bundle_expected_length (starpu_task_bundle_t @var{bundle}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{nimpl})
+Return the expected duration of the entire task bundle in µs.
+@end deftypefun
+
+@deftypefun double starpu_task_bundle_expected_power (starpu_task_bundle_t @var{bundle}, enum starpu_perf_archtype @var{arch}, unsigned @var{nimpl})
+Return the expected power consumption of the entire task bundle in J.
+@end deftypefun
+
+@deftypefun double starpu_task_bundle_expected_data_transfer_time (starpu_task_bundle_t @var{bundle}, unsigned @var{memory_node})
+Return the time (in µs) expected to transfer all data used within the bundle.
+@end deftypefun
 
 @node Task Lists
 @section Task Lists
@@ -695,19 +706,32 @@ static struct starpu_sched_policy dummy_sched_policy = @{
 @node Driver API
 @subsection Driver API
 
-@deftypefun int starpu_driver_init(struct starpu_driver *@var{d})
-Initialize the given driver. Returns 0 on success, -EINVAL if d->type is not
-STARPU_CUDA_WORKER.
+@deftypefun int starpu_driver_init (struct starpu_driver *@var{d})
+Initialize the given driver. Returns 0 on success, -EINVAL if
+@code{d->type} is not a valid StarPU device type (STARPU_CPU_WORKER,
+STARPU_CUDA_WORKER or STARPU_OPENCL_WORKER).
+@end deftypefun
+
+@deftypefun int starpu_driver_run ({struct starpu_driver *}@var{d})
+Run the driver until it receives a request to terminate, then returns 0 on success, -EINVAL if
+@code{d->type} is not a valid StarPU device type (STARPU_CPU_WORKER,
+STARPU_CUDA_WORKER or STARPU_OPENCL_WORKER).
+@end deftypefun
+
+@deftypefun int starpu_driver_run_once (struct starpu_driver *@var{d})
+Run the driver once, then returns 0 on success, -EINVAL if
+@code{d->type} is not a valid StarPU device type (STARPU_CPU_WORKER,
+STARPU_CUDA_WORKER or STARPU_OPENCL_WORKER).
 @end deftypefun
 
-@deftypefun int starpu_driver_run_once(struct starpu_driver *@var{d})
-Runs the driver for a while, then returns 0 on success, -EINVAL if d->type is
-not STARPU_CUDA_WORKER.
+@deftypefun int starpu_driver_deinit (struct starpu_driver *@var{d})
+Deinitialize the given driver. Returns 0 on success, -EINVAL if
+@code{d->type} is not a valid StarPU device type (STARPU_CPU_WORKER,
+STARPU_CUDA_WORKER or STARPU_OPENCL_WORKER).
 @end deftypefun
 
-@deftypefun int starpu_driver_deinit(struct starpu_driver *@var{d})
-Deinitialize the given driver. Returns 0 on success, -EINVAL if d->type is not
-STARPU_CUDA_WORKER.
+@deftypefun void starpu_drivers_request_termination (void)
+Notify all running drivers they should terminate.
 @end deftypefun
 
 @node Example

+ 86 - 8
doc/chapters/advanced-examples.texi

@@ -15,6 +15,7 @@
 * Theoretical lower bound on execution time::  
 * Insert Task Utility::          
 * Data reduction::  
+* Temporary buffers::  
 * Parallel Tasks::
 * Debugging::
 * The multiformat interface::
@@ -235,7 +236,7 @@ starpu_data_partition(handle, &f);
 @end smallexample
 @end cartouche
 
-The task submission then uses @code{starpu_data_get_sub_data} to retrive the
+The task submission then uses @code{starpu_data_get_sub_data} to retrieve the
 sub-handles to be passed as tasks parameters.
 
 @cartouche
@@ -302,7 +303,9 @@ a performance model, by defining a @code{starpu_perfmodel} structure and
 providing its address in the @code{model} field of the @code{struct starpu_codelet}
 structure. The @code{symbol} and @code{type} fields of @code{starpu_perfmodel}
 are mandatory, to give a name to the model, and the type of the model, since
-there are several kinds of performance models.
+there are several kinds of performance models. For compatibility, make sure to
+initialize the whole structure to zero, either by using explicit memset, or by
+letting the compiler implicitly do it as examplified below.
 
 @itemize
 @item
@@ -317,7 +320,8 @@ and ouput sizes as an index.
 It will also save it in @code{~/.starpu/sampling/codelets}
 for further executions, and can be observed by using the
 @code{starpu_perfmodel_display} command, or drawn by using
-the @code{starpu_perfmodel_plot}.  The models are indexed by machine name. To
+the @code{starpu_perfmodel_plot} (@pxref{Performance model calibration}).  The
+models are indexed by machine name. To
 share the models between machines (e.g. for a homogeneous cluster), use
 @code{export STARPU_HOSTNAME=some_global_name}. Measurements are only done when using a task scheduler which makes use of it, such as @code{heft} or @code{dmda}.
 
@@ -353,15 +357,31 @@ model type). This still assumes performance regularity, but works
 with various data input sizes, by applying regression over observed
 execution times. STARPU_REGRESSION_BASED uses an a*n^b regression
 form, STARPU_NL_REGRESSION_BASED uses an a*n^b+c (more precise than
-STARPU_REGRESSION_BASED, but costs a lot more to compute). For instance,
+STARPU_REGRESSION_BASED, but costs a lot more to compute).
+
+For instance,
 @code{tests/perfmodels/regression_based.c} uses a regression-based performance
-model for the @code{memset} operation. Of course, the application has to issue
+model for the @code{memset} operation.
+
+Of course, the application has to issue
 tasks with varying size so that the regression can be computed. StarPU will not
 trust the regression unless there is at least 10% difference between the minimum
-and maximum observed input size. For non-linear regression, since computing it
+and maximum observed input size. It can be useful to set the
+@code{STARPU_CALIBRATE} environment variable to @code{1} and run the application
+on varying input sizes, so as to feed the performance model for a variety of
+inputs. The @code{starpu_perfmodel_display} and @code{starpu_perfmodel_plot}
+tools can be used to observe how much the performance model is calibrated (@pxref{Performance model calibration}); when
+their output look good, @code{STARPU_CALIBRATE} can be reset to @code{0} to let
+StarPU use the resulting performance model without recording new measures. If
+the data input sizes vary a lot, it is really important to set
+@code{STARPU_CALIBRATE} to @code{0}, otherwise StarPU will continue adding the
+measures, and result with a very big performance model, which will take time a
+lot of time to load and save.
+
+For non-linear regression, since computing it
 is quite expensive, it is only done at termination of the application. This
-means that the first execution uses history-based performance model to perform
-scheduling.
+means that the first execution of the application will use only history-based
+performance model to perform scheduling, without using regression.
 
 @item
 Provided as an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_function} field),
@@ -668,6 +688,64 @@ int dots(starpu_data_handle_t v1, starpu_data_handle_t v2,
 The @code{cg} example also uses reduction for the blocked gemv kernel, leading
 to yet more relaxed dependencies and more parallelism.
 
+@node Temporary buffers
+@section Temporary buffers
+
+There are two kinds of temporary buffers: temporary data which just pass results
+from a task to another, and scratch data which are needed only internally by
+tasks.
+
+@subsection Temporary data
+
+Data can sometimes be entirely produced by a task, and entirely consumed by
+another task, without the need for other parts of the application to access
+it. In such case, registration can be done without prior allocation, by using
+the special -1 memory node number, and passing a zero pointer. StarPU will
+actually allocate memory only when the task creating the content gets scheduled,
+and destroy it on unregistration.
+
+In addition to that, it can be tedious for the application to have to unregister
+the data, since it will not use its content anyway. The unregistration can be
+done lazily by using the @code{starpu_data_unregister_lazy(handle)} function,
+which will record that no more tasks accessing the handle will be submitted, so
+that it can be freed as soon as the last task accessing it is over.
+
+The following code examplifies both points: it registers the temporary
+data, submits three tasks accessing it, and records the data for automatic
+unregistration.
+
+@smallexample
+starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
+starpu_insert_task(&produce_data, STARPU_W, handle, 0);
+starpu_insert_task(&compute_data, STARPU_RW, handle, 0);
+starpu_insert_task(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
+starpu_data_unregister_lazy(handle);
+@end smallexample
+
+@subsection Scratch data
+
+Some kernels sometimes need temporary data to achieve the computations, i.e. a
+workspace. The application could allocate it at the start of the codelet
+function, and free it at the end, but that would be costly. It could also
+allocate one buffer per worker (similarly to @ref{Per-worker library
+initialization }), but that would make them systematic and permanent. A more
+optimized way is to use the SCRATCH data access mode, as examplified below,
+which provides per-worker buffers without content consistency.
+
+@smallexample
+starpu_vector_data_register(&workspace, -1, 0, sizeof(float));
+for (i = 0; i < N; i++)
+    starpu_insert_task(&compute, STARPU_R, input[i], STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
+@end smallexample
+
+StarPU will make sure that the buffer is allocated before executing the task,
+and make this allocation per-worker: for CPU workers, notably, each worker has
+its own buffer. This means that each task submitted above will actually have its
+own workspace, which will actually be the same for all tasks running one after
+the other on the same worker. Also, if for instance GPU memory becomes scarce,
+StarPU will notice that it can free such buffers easily, since the content does
+not matter.
+
 @node Parallel Tasks
 @section Parallel Tasks
 

+ 107 - 23
doc/chapters/basic-api.texi

@@ -9,7 +9,7 @@
 @menu
 * Initialization and Termination::  Initialization and Termination methods
 * Workers' Properties::         Methods to enumerate workers' properties
-* Data Library::                Methods to manipulate data
+* Data Management::                Methods to manipulate data
 * Data Interfaces::
 * Data Partition::
 * Codelets and Tasks::          Methods to construct tasks
@@ -39,10 +39,12 @@ indicates that no worker was available (so that StarPU was not initialized).
 @deftp {Data Type} {struct starpu_driver}
 @table @asis
 @item @code{enum starpu_archtype type}
-The type of the driver. Only STARPU_CUDA_DRIVER and STARPU_OPENCL_DRIVER are
-currently supported.
+The type of the driver. Only STARPU_CPU_DRIVER, STARPU_CUDA_DRIVER and
+STARPU_OPENCL_DRIVER are currently supported.
 @item @code{union id} Anonymous union
 @table @asis
+@item @code{unsigned cpu_id}
+Should only be used if type is STARPU_CPU_WORKER.
 @item @code{unsigned cuda_id}
 Should only be used if type is STARPU_CUDA_WORKER.
 @item @code{cl_device_id opencl_id}
@@ -70,7 +72,7 @@ if @code{sched_policy_name} is set.
 
 @item @code{int ncpus} (default = -1)
 This is the number of CPU cores that StarPU can use. This can also be
-specified with the @code{STARPU_NCPUS} environment variable.
+specified with the @code{STARPU_NCPU} environment variable.
 
 @item @code{int ncuda} (default = -1)
 This is the number of CUDA devices that StarPU can use. This can also
@@ -125,7 +127,7 @@ executing tasks. If this value is equal to -1, the default value is used. This
 can also be specified with the @code{STARPU_CALIBRATE} environment variable.
 
 @item @code{int single_combined_worker} (default = 0)
-By default, StarPU parallel tasks concurrently.
+By default, StarPU executes parallel tasks concurrently.
 Some parallel libraries (e.g. most OpenMP implementations) however do
 not support concurrent calls to parallel code. In such case, setting this flag
 makes StarPU only start one parallel task at a time.
@@ -296,12 +298,12 @@ this function should be used in the allocation function to determine
 on which device the memory needs to be allocated.
 @end deftypefun
 
-@node Data Library
-@section Data Library
+@node Data Management
+@section Data Management
 
 @menu
-* Introduction to Data Library::
-* Basic Data Library API::
+* Introduction to Data Management::
+* Basic Data Management API::
 * Access registered data from the application::
 @end menu
 
@@ -310,7 +312,7 @@ This section describes the data management facilities provided by StarPU.
 We show how to use existing data interfaces in @ref{Data Interfaces}, but developers can
 design their own data interfaces if required.
 
-@node Introduction to Data Library
+@node Introduction to Data Management
 @subsection Introduction
 Data management is done at a high-level in StarPU: rather than accessing a mere
 list of contiguous buffers, the tasks may manipulate data that are described by
@@ -338,8 +340,8 @@ to StarPU, the specified memory node indicates where the piece of data
 initially resides (we also call this memory node the home node of a piece of
 data).
 
-@node Basic Data Library API
-@subsection Basic Data Library API
+@node Basic Data Management API
+@subsection Basic Data Management API
 
 @deftypefun int starpu_malloc (void **@var{A}, size_t @var{dim})
 This function allocates data of the given size in main memory. It will also try to pin it in
@@ -610,12 +612,12 @@ starpu_block_data_register(&block_handle, 0, (uintptr_t)block,
 @deftypefun void starpu_bcsr_data_register (starpu_data_handle_t *@var{handle}, uint32_t @var{home_node}, uint32_t @var{nnz}, uint32_t @var{nrow}, uintptr_t @var{nzval}, uint32_t *@var{colind}, uint32_t *@var{rowptr}, uint32_t @var{firstentry}, uint32_t @var{r}, uint32_t @var{c}, size_t @var{elemsize})
 This variant of @code{starpu_data_register} uses the BCSR (Blocked
 Compressed Sparse Row Representation) sparse matrix interface.
-Register the sparse matrix made of @var{nnz} non-zero values of size
+Register the sparse matrix made of @var{nnz} non-zero blocks of elements of size
 @var{elemsize} stored in @var{nzval} and initializes @var{handle} to represent
 it. Blocks have size @var{r} * @var{c}. @var{nrow} is the number of rows (in
-terms of blocks), @var{colind} is the list of positions of the non-zero entries
-on the row, @var{rowptr} is the index (in nzval) of the first entry of the row.
-@var{fristentry} is the index of the first entry of the given arrays (usually 0
+terms of blocks), @code{colind[i]} is the block-column index for block @code{i}
+in @code{nzval}, @code{rowptr[i]} is the block-index (in nzval) of the first block of row @code{i}.
+@var{firstentry} is the index of the first entry of the given arrays (usually 0
 or 1).
 @end deftypefun
 
@@ -1146,6 +1148,18 @@ vector represented by @var{father_interface} once partitioned in
 @var{nparts} chunks of equal size.
 @end deftypefun
 
+@deftypefun void starpu_block_shadow_filter_func_vector (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+Return in @code{*@var{child_interface}} the @var{id}th element of the
+vector represented by @var{father_interface} once partitioned in
+@var{nparts} chunks of equal size with a shadow border @code{filter_arg_ptr}
+
+The @code{filter_arg_ptr} field must be the shadow size casted into @code{void*}.
+
+IMPORTANT: This can only be used for read-only access, as no coherency is
+enforced for the shadowed parts.
+
+A usage example is available in examples/filters/shadow.c
+@end deftypefun
 
 @deftypefun void starpu_vector_list_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 Return in @code{*@var{child_interface}} the @var{id}th element of the
@@ -1245,7 +1259,9 @@ always only define the field @code{opencl_funcs}.
 
 @deftp {Data Type} {struct starpu_codelet}
 The codelet structure describes a kernel that is possibly implemented on various
-targets. For compatibility, make sure to initialize the whole structure to zero.
+targets. For compatibility, make sure to initialize the whole structure to zero,
+either by using explicit memset, or by letting the compiler implicitly do it in
+e.g. static storage case.
 
 @table @asis
 @item @code{uint32_t where} (optional)
@@ -1343,11 +1359,13 @@ option when configuring StarPU.
 
 @item @code{struct starpu_perfmodel *model} (optional)
 This is a pointer to the task duration performance model associated to this
-codelet. This optional field is ignored when set to @code{NULL}.
+codelet. This optional field is ignored when set to @code{NULL} or
+when its @code{symbol} field is not set.
 
 @item @code{struct starpu_perfmodel *power_model} (optional)
 This is a pointer to the task power consumption performance model associated
-to this codelet. This optional field is ignored when set to @code{NULL}.
+to this codelet. This optional field is ignored when set to
+@code{NULL} or when its @code{symbol} field is not set.
 In the case of parallel codelets, this has to account for all processing units
 involved in the parallel execution.
 
@@ -1810,7 +1828,11 @@ The possible values are:
 @anchor{struct starpu_perfmodel}
 contains all information about a performance model. At least the
 @code{type} and @code{symbol} fields have to be filled when defining a
-performance model for a codelet. If not provided, other fields have to be zero.
+performance model for a codelet. For compatibility, make sure to initialize the
+whole structure to zero, either by using explicit memset, or by letting the
+compiler implicitly do it in e.g. static storage case.
+
+If not provided, other fields have to be zero.
 
 @table @asis
 @item @code{type}
@@ -1825,7 +1847,8 @@ archs will be determined by multiplying by an arch-specific factor.
 
 @item @code{const char *symbol}
 is the symbol name for the performance model, which will be used as
-file name to store the model.
+file name to store the model. It must be set otherwise the model will
+be ignored.
 
 @item @code{double (*cost_model)(struct starpu_buffer_descr *)}
 This field is deprecated. Use instead the @code{cost_function} field.
@@ -1935,6 +1958,10 @@ prints a list of all performance models on @var{output}.
 prints a matrix of bus bandwidths on @var{f}.
 @end deftypefun
 
+@deftypefun void starpu_bus_print_affinity ({FILE *}@var{f})
+prints the affinity devices on @var{f}.
+@end deftypefun
+
 @node Profiling API
 @section Profiling API
 
@@ -2240,8 +2267,23 @@ Return the computation kernel command queue of the current worker.
 Sets the arguments of a given kernel. The list of arguments must be given as
 (size_t @var{size_of_the_argument}, cl_mem * @var{pointer_to_the_argument}).
 The last argument must be 0. Returns the number of arguments that were
-successfully set. In case of failure, @var{err} is set to the error returned by
-OpenCL.
+successfully set. In case of failure, returns the id of the argument
+that could not be set and @var{err} is set to the error returned by
+OpenCL. Otherwise, returns the number of arguments that were set.
+
+@cartouche
+@smallexample
+int n;
+cl_int err;
+cl_kernel kernel;
+n = starpu_opencl_set_kernel_args(&err, 2, &kernel,
+                                  sizeof(foo), &foo,
+                                  sizeof(bar), &bar,
+                                  0);
+if (n != 2)
+   fprintf(stderr, "Error : %d\n", err);
+@end smallexample
+@end cartouche
 @end deftypefun
 
 @node Compiling OpenCL kernels
@@ -2277,6 +2319,43 @@ This function compiles an OpenCL source code stored in a string.
 This function unloads an OpenCL compiled code.
 @end deftypefun
 
+@deftypefun void starpu_opencl_load_program_source ({const char *}@var{source_file_name}, char *@var{located_file_name}, char *@var{located_dir_name}, char *@var{opencl_program_source})
+Store the contents of the file @var{source_file_name} in the buffer
+@var{opencl_program_source}. The file @var{source_file_name} can be
+located in the current directory, or in the directory specified by the
+environment variable @code{STARPU_OPENCL_PROGRAM_DIR}, or in the
+directory @code{share/starpu/opencl} of the installation directory of
+StarPU, or in the source directory of StarPU.
+When the file is found, @code{located_file_name} is the full name of
+the file as it has been located on the system, @code{located_dir_name}
+the directory where it has been located. Otherwise, they are both set
+to the empty string.
+@end deftypefun
+
+@deftypefun int starpu_opencl_compile_opencl_from_file ({const char *}@var{source_file_name}, {const char*} @var{build_options})
+Compile the OpenCL kernel stored in the file @code{source_file_name}
+with the given options @code{build_options} and stores the result in
+the directory @code{$STARPU_HOME/.starpu/opencl} with the same
+filename as @code{source_file_name}. The compilation is done for every
+OpenCL device, and the filename is suffixed with the vendor id and the
+device id of the OpenCL device.
+@end deftypefun
+
+@deftypefun int starpu_opencl_compile_opencl_from_string ({const char *}@var{opencl_program_source}, {const char *}@var{file_name}, {const char* }@var{build_options})
+Compile the OpenCL kernel in the string @code{opencl_program_source}
+with the given options @code{build_options} and stores the result in
+the directory @code{$STARPU_HOME/.starpu/opencl} with the filename
+@code{file_name}. The compilation is done for every
+OpenCL device, and the filename is suffixed with the vendor id and the
+device id of the OpenCL device.
+@end deftypefun
+
+@deftypefun int starpu_opencl_load_binary_opencl ({const char *}@var{kernel_id}, {struct starpu_opencl_program *}@var{opencl_programs})
+Compile the binary OpenCL kernel identified with @var{id}. For every
+OpenCL device, the binary OpenCL kernel will be loaded from the file
+@code{$STARPU_HOME/.starpu/opencl/<kernel_id>.<device_type>.vendor_id_<vendor_id>_device_id_<device_id>}.
+@end deftypefun
+
 @node Loading OpenCL kernels
 @subsection Loading OpenCL kernels
 
@@ -2303,6 +2382,11 @@ collect statistics about the kernel execution (used cycles, consumed power).
 @node OpenCL utilities
 @subsection OpenCL utilities
 
+@deftypefun {const char *}starpu_opencl_error_string (cl_int @var{status})
+Return the error message in English corresponding to @var{status}, an
+OpenCL error code.
+@end deftypefun
+
 @deftypefun void starpu_opencl_display_error ({const char *}@var{func}, {const char *}@var{file}, int @var{line}, {const char *}@var{msg}, cl_int @var{status})
 Given a valid error @var{status}, prints the corresponding error message on
 stdout, along with the given function name @var{func}, the given filename

+ 4 - 2
doc/chapters/basic-examples.texi

@@ -161,7 +161,9 @@ struct starpu_codelet cl =
 
 A codelet is a structure that represents a computational kernel. Such a codelet
 may contain an implementation of the same kernel on different architectures
-(e.g. CUDA, Cell's SPU, x86, ...).
+(e.g. CUDA, Cell's SPU, x86, ...). For compatibility, make sure that the whole
+structure is initialized to zero, either by using memset, or by letting the
+compiler implicitly do it as examplified above.
 
 The @code{nbuffers} field specifies the number of data buffers that are
 manipulated by the codelet: here the codelet does not access or modify any data
@@ -970,7 +972,7 @@ and to execute it, with the default configuration:
 or for example, by disabling CPU devices:
 
 @smallexample
-% STARPU_NCPUS=0 ./vector_scal
+% STARPU_NCPU=0 ./vector_scal
 0.000000 3.000000 6.000000 9.000000 12.000000
 @end smallexample
 

+ 1 - 1
doc/chapters/c-extensions.texi

@@ -321,7 +321,7 @@ simple way to allocate storage for arrays on the heap:
 This attributes applies to local variables with an array type.  Its
 effect is to automatically allocate the array's storage on
 the heap, using @code{starpu_malloc} under the hood (@pxref{Basic Data
-Library API, starpu_malloc}).  The heap-allocated array is automatically
+Management API, starpu_malloc}).  The heap-allocated array is automatically
 freed when the variable's scope is left, as with
 automatic variables.
 

+ 4 - 4
doc/chapters/configuration.texi

@@ -214,7 +214,7 @@ is enabled when the required dependencies are found.
 @subsection Configuring workers
 
 @menu
-* STARPU_NCPUS::                	Number of CPU workers
+* STARPU_NCPU::                	Number of CPU workers
 * STARPU_NCUDA::                	Number of CUDA workers
 * STARPU_NOPENCL::              	Number of OpenCL workers
 * STARPU_NGORDON::              	Number of SPU workers (Cell)
@@ -227,8 +227,8 @@ is enabled when the required dependencies are found.
 * STARPU_MAX_WORKERSIZE:: 		Maximum size of the combined workers
 @end menu
 
-@node STARPU_NCPUS
-@subsubsection @code{STARPU_NCPUS} -- Number of CPU workers
+@node STARPU_NCPU
+@subsubsection @code{STARPU_NCPU} -- Number of CPU workers
 
 Specify the number of CPU workers (thus not including workers dedicated to control acceleratores). Note that by default, StarPU will not allocate
 more CPU workers than there are physical CPUs, and that some CPUs are used to control
@@ -272,7 +272,7 @@ available.
 
 Note that the first workers correspond to the CUDA workers, then come the
 OpenCL and the SPU, and finally the CPU workers. For example if
-we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPUS=2}
+we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPU=2}
 and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
 by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
 the logical CPUs #1 and #3 will be used by the CPU workers.

+ 1 - 1
doc/starpu.texi

@@ -5,7 +5,7 @@
 @settitle StarPU Handbook
 @c %**end of header
 
-@include version.texi
+@include chapters/version.texi
 
 @copying
 Copyright @copyright{} 2009--2011  Universit@'e de Bordeaux 1

+ 2 - 2
doc/tutorial/README

@@ -41,6 +41,6 @@ Instructions on how to compile and run StarPU examples
 % make vector_scal
 % ./vector_scal
 
-% STARPU_NCPUS=0 ./vector_scal
-% STARPU_NCPUS=0 STARPU_NCUDA=0 ./vector_scal
+% STARPU_NCPU=0 ./vector_scal
+% STARPU_NCPU=0 STARPU_NCUDA=0 ./vector_scal
 

+ 33 - 11
examples/Makefile.am

@@ -114,8 +114,8 @@ noinst_HEADERS = 				\
 	heat/dw_factolu.h			\
 	lu/xlu.h				\
 	lu/xlu_kernels.h			\
-	lu/float.h				\
-	lu/double.h				\
+	lu/lu-float.h				\
+	lu/lu-double.h				\
 	lu/complex_float.h			\
 	lu/complex_double.h			\
 	lu/blas_complex.h			\
@@ -179,6 +179,7 @@ examplebin_PROGRAMS +=				\
 	filters/fvector				\
 	filters/fblock				\
 	filters/fmatrix				\
+	filters/shadow				\
 	tag_example/tag_example			\
 	tag_example/tag_example2		\
 	tag_example/tag_example3		\
@@ -188,13 +189,13 @@ examplebin_PROGRAMS +=				\
 	spmv/spmv				\
 	callback/callback			\
 	incrementer/incrementer			\
+	binary/binary				\
 	interface/complex			\
 	matvecmult/matvecmult			\
 	profiling/profiling			\
 	scheduler/dummy_sched			\
 	reductions/dot_product			\
 	reductions/minmax_reduction		\
-	mandelbrot/mandelbrot			\
 	ppm_downscaler/ppm_downscaler		\
 	ppm_downscaler/yuv_downscaler
 
@@ -217,7 +218,8 @@ examplebin_PROGRAMS +=				\
 	lu/lu_implicit_example_float		\
 	lu/lu_implicit_example_double		\
 	heat/heat				\
-	cg/cg
+	cg/cg					\
+	pipeline/pipeline
 endif
 
 if MKL_BLAS_LIB
@@ -253,6 +255,7 @@ STARPU_EXAMPLES +=				\
 	spmv/spmv				\
 	callback/callback			\
 	incrementer/incrementer			\
+	binary/binary				\
 	interface/complex			\
 	matvecmult/matvecmult			\
 	profiling/profiling			\
@@ -279,7 +282,8 @@ STARPU_EXAMPLES +=				\
 	lu/lu_implicit_example_float		\
 	lu/lu_implicit_example_double		\
 	heat/heat				\
-	cg/cg
+	cg/cg					\
+	pipeline/pipeline
 endif
 
 if MKL_BLAS_LIB
@@ -730,6 +734,17 @@ nobase_STARPU_OPENCL_DATA_DATA += \
 	incrementer/incrementer_kernels_opencl_kernel.cl
 endif
 
+##################
+# Binary example #
+##################
+
+binary_binary_SOURCES =	\
+	binary/binary.c
+if STARPU_USE_OPENCL
+binary_binary_SOURCES +=	\
+	incrementer/incrementer_kernels_opencl.c
+endif
+
 #####################
 # interface example #
 #####################
@@ -777,12 +792,6 @@ endif
 # Mandelbrot Set #
 ##################
 
-mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
-if HAVE_X11
-mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
-mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) $(X_LIBS) $(X_EXTRA_LIBS) -lX11
-endif
-
 ################
 # Top Examples #
 ################
@@ -841,6 +850,19 @@ gl_interop_gl_interop_LDADD =			\
 	$(STARPU_OPENGL_RENDER_LDFLAGS)
 endif
 
+####################
+# pipeline example #
+####################
+
+if !NO_BLAS_LIB
+pipeline_pipeline_SOURCES	=	\
+	pipeline/pipeline.c		\
+	common/blas.c
+
+pipeline_pipeline_LDADD =		\
+	$(STARPU_BLAS_LDFLAGS)
+endif
+
 showcheck:
 	-cat $(TEST_LOGS) /dev/null
 	for i in $(SUBDIRS) ; do \

+ 2 - 1
examples/basic_examples/vector_scal.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -66,6 +66,7 @@ static struct starpu_codelet cl =
 		, scal_sse_func_icc
 #endif
 #endif
+		, NULL
 	},
 #ifdef STARPU_USE_CUDA
 	/* CUDA implementation of the codelet */

+ 119 - 0
examples/binary/binary.c

@@ -0,0 +1,119 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <pthread.h>
+#include <sys/time.h>
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
+struct starpu_opencl_program opencl_program;
+#endif
+
+struct starpu_codelet cl =
+{
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {opencl_codelet, NULL},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+int compute(char *file_name, int load_as_file)
+{
+	float float_array[4] __attribute__ ((aligned (16))) = { 0.0f, 0.0f, 0.0f, 0.0f};
+	starpu_data_handle_t float_array_handle;
+	unsigned i;
+	int ret = 0;
+	unsigned niter = 500;
+
+	starpu_vector_data_register(&float_array_handle, 0, (uintptr_t)&float_array, 4, sizeof(float));
+
+#ifdef STARPU_USE_OPENCL
+	if (load_as_file)
+	{
+		ret = starpu_opencl_compile_opencl_from_file(file_name, NULL);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_compile_opencl_from_file");
+		ret = starpu_opencl_load_binary_opencl(file_name, &opencl_program);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_binary_opencl");
+	}
+	else
+	{
+		char located_file_name[1024];
+		char located_dir_name[1024];
+		char opencl_program_source[16384];
+		starpu_opencl_load_program_source(file_name, located_file_name, located_dir_name, opencl_program_source);
+		ret = starpu_opencl_compile_opencl_from_string(opencl_program_source, "incrementer", NULL);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_compile_opencl_from_file");
+		ret = starpu_opencl_load_binary_opencl("incrementer", &opencl_program);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_binary_opencl");
+	}
+#endif
+
+	for (i = 0; i < niter; i++)
+	{
+		ret = starpu_insert_task(&cl, STARPU_RW, float_array_handle, 0);
+		if (STARPU_UNLIKELY(ret == -ENODEV))
+		{
+			FPRINTF(stderr, "No worker may execute this task\n");
+			exit(0);
+		}
+	}
+
+	starpu_task_wait_for_all();
+
+	/* update the array in RAM */
+	starpu_data_unregister(float_array_handle);
+
+	FPRINTF(stderr, "array -> %f, %f, %f, %f\n", float_array[0], float_array[1], float_array[2], float_array[3]);
+
+	if (float_array[0] != niter || float_array[0] != float_array[1] + float_array[2] + float_array[3])
+	{
+		FPRINTF(stderr, "Incorrect result\n");
+		ret = 1;
+	}
+	return ret;
+}
+
+int main(int argc, char **argv)
+{
+	int ret = 0;
+	struct starpu_conf conf;
+
+	starpu_conf_init(&conf);
+	conf.ncpus = 0;
+	conf.ncuda = 0;
+
+        ret = starpu_init(&conf);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+                FPRINTF(stderr, "This application requires an OpenCL worker.\n");
+		starpu_shutdown();
+		return 77;
+	}
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	ret = compute("examples/incrementer/incrementer_kernels_opencl_kernel.cl", 1);
+	if (ret == 0)
+		ret = compute("examples/incrementer/incrementer_kernels_opencl_kernel.cl", 0);
+
+	starpu_shutdown();
+	return ret;
+}

+ 1 - 1
examples/gl_interop/gl_interop.c

@@ -70,7 +70,7 @@ void callback_func(void *foo) {
 	printf("rendering done\n");
 
 	/* Tell it was already the last submitted task */
-	starpu_set_end_of_submissions();
+	starpu_drivers_request_termination();
 }
 
 int main(int argc, char **argv)

+ 1 - 1
examples/lu/clu.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
examples/lu/clu_implicit.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
examples/lu/clu_implicit_pivot.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
examples/lu/clu_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
examples/lu/clu_pivot.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
examples/lu/dlu.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "lu-double.h"
 #include "xlu.c"

+ 1 - 1
examples/lu/dlu_implicit.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "lu-double.h"
 #include "xlu_implicit.c"

+ 1 - 1
examples/lu/dlu_implicit_pivot.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "lu-double.h"
 #include "xlu_implicit_pivot.c"

+ 1 - 1
examples/lu/dlu_kernels.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "lu-double.h"
 #include "xlu_kernels.c"

+ 1 - 1
examples/lu/dlu_pivot.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "lu-double.h"
 #include "xlu_pivot.c"

+ 0 - 47
examples/lu/double.h

@@ -1,47 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#define TYPE double
-#define CUBLAS_TYPE TYPE
-
-#define STARPU_LU(name)       starpu_dlu_##name
-
-#ifdef STARPU_HAVE_MAGMA
-#include <magmablas.h>
-#define CUBLAS_GEMM	magmablas_dgemm
-#define CUBLAS_TRSM	magmablas_dtrsm
-#else
-#define CUBLAS_GEMM	cublasDgemm
-#define CUBLAS_TRSM	cublasDtrsm
-#endif
-#define CUBLAS_SCAL	cublasDscal
-#define CUBLAS_GER	cublasDger
-#define CUBLAS_SWAP	cublasDswap
-#define CUBLAS_IAMAX	cublasIdamax
-
-#define CPU_GEMM	DGEMM
-#define CPU_TRSM	DTRSM
-#define CPU_SCAL	DSCAL
-#define CPU_GER		DGER
-#define CPU_SWAP	DSWAP
-
-#define CPU_TRMM	DTRMM
-#define CPU_AXPY	DAXPY
-#define CPU_ASUM	DASUM
-#define CPU_IAMAX	IDAMAX
-
-#define PIVOT_THRESHHOLD	10e-10

+ 0 - 49
examples/lu/float.h

@@ -1,49 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-
-#define TYPE float
-#define CUBLAS_TYPE TYPE
-
-#define STARPU_LU(name)       starpu_slu_##name
-
-#ifdef STARPU_HAVE_MAGMA
-#include <magmablas.h>
-#define CUBLAS_GEMM	magmablas_sgemm
-#define CUBLAS_TRSM	magmablas_strsm
-#else
-#define CUBLAS_GEMM	cublasSgemm
-#define CUBLAS_TRSM	cublasStrsm
-#endif
-
-#define CUBLAS_SCAL	cublasSscal
-#define CUBLAS_GER	cublasSger
-#define CUBLAS_SWAP	cublasSswap
-#define CUBLAS_IAMAX	cublasIsamax
-
-#define CPU_GEMM	SGEMM
-#define CPU_TRSM	STRSM
-#define CPU_SCAL	SSCAL
-#define CPU_GER		SGER
-#define CPU_SWAP	SSWAP
-
-#define CPU_TRMM	STRMM
-#define CPU_AXPY	SAXPY
-#define CPU_ASUM	SASUM
-#define CPU_IAMAX	ISAMAX
-
-#define PIVOT_THRESHHOLD	10e-5

+ 1 - 1
examples/lu/lu_example_complex_double.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
examples/lu/lu_example_complex_float.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
examples/lu/lu_example_double.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "lu-double.h"
 #include "lu_example.c"

+ 1 - 1
examples/lu/lu_example_float.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "lu-float.h"
 #include "lu_example.c"

+ 1 - 1
examples/lu/slu.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "lu-float.h"
 #include "xlu.c"

+ 1 - 1
examples/lu/slu_implicit.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "lu-float.h"
 #include "xlu_implicit.c"

+ 1 - 1
examples/lu/slu_implicit_pivot.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "lu-float.h"
 #include "xlu_implicit_pivot.c"

+ 1 - 1
examples/lu/slu_kernels.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "lu-float.h"
 #include "xlu_kernels.c"

+ 1 - 1
examples/lu/slu_pivot.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "lu-float.h"
 #include "xlu_pivot.c"

+ 2 - 2
examples/lu/xlu_pivot.c

@@ -380,7 +380,7 @@ int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size,
 	}
 #endif
 
-	double timing;
+	double timing=0.0;
 	int ret = dw_codelet_facto_pivot(&dataA, piv_description, nblocks, get_block_with_striding, &timing);
 
 	FPRINTF(stderr, "Computation took (in ms)\n");
@@ -435,7 +435,7 @@ int STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, uns
 		piv_description[block].last = (block + 1) * (size / nblocks);
 	}
 
-	double timing;
+	double timing=0.0;
 	int ret = dw_codelet_facto_pivot(dataAp, piv_description, nblocks, get_block_with_no_striding, &timing);
 
 	FPRINTF(stderr, "Computation took (in ms)\n");

+ 1 - 1
examples/lu/zlu.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
examples/lu/zlu_implicit.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
examples/lu/zlu_implicit_pivot.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
examples/lu/zlu_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
examples/lu/zlu_pivot.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
examples/openmp/vector_scal.c

@@ -83,7 +83,7 @@ int main(int argc, char **argv)
 	starpu_conf_init(&conf);
 
 	/* Most OpenMP implementations do not support concurrent parallel
-	 * sections, so only create one big worker */
+	 * sections, so only enable one combined worker at a time.  */
 	conf.single_combined_worker = 1;
 	conf.sched_policy_name = "pheft";
 

+ 254 - 0
examples/pipeline/pipeline.c

@@ -0,0 +1,254 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This examples shows how to submit a pipeline to StarPU with limited buffer
+ * use, and avoiding submitted all the tasks at once.
+ *
+ * This is a dumb example pipeline, depicted here:
+ *
+ * x--\
+ *     >==axpy-->sum
+ * y--/
+ *
+ * x and y produce vectors full of x and y values, axpy multiplies them, and sum
+ * sums it up. We thus have 3 temporary buffers
+ */
+
+#include <starpu.h>
+#include <stdint.h>
+#include <semaphore.h>
+#include <common/blas.h>
+
+#ifdef STARPU_USE_CUDA
+#include <cublas.h>
+#endif
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+/* Vector size */
+#ifdef STARPU_SLOW_MACHINE
+#define N 16
+#else
+#define N 1048576
+#endif
+
+/* Number of iteration buffers, and thus overlapped pipeline iterations */
+#define K 16
+
+/* Number of concurrently submitted pipeline iterations */
+#define C 64
+
+/* Number of iterations */
+#define L 256
+
+/* X / Y codelets */
+void pipeline_cpu_x(void *descr[], void *args)
+{
+	float x;
+	float *val = (float *) STARPU_VECTOR_GET_PTR(descr[0]);
+	int n = STARPU_VECTOR_GET_NX(descr[0]);
+	int i;
+
+	starpu_codelet_unpack_args(args, &x);
+	for (i = 0; i < n ; i++)
+		val[i] = x;
+}
+
+static struct starpu_perfmodel pipeline_model_x =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "pipeline_model_x"
+};
+
+static struct starpu_codelet pipeline_codelet_x =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {pipeline_cpu_x, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_W},
+	.model = &pipeline_model_x
+};
+
+/* axpy codelets */
+void pipeline_cpu_axpy(void *descr[], void *arg)
+{
+	float *x = (float *) STARPU_VECTOR_GET_PTR(descr[0]);
+	float *y = (float *) STARPU_VECTOR_GET_PTR(descr[1]);
+	int n = STARPU_VECTOR_GET_NX(descr[0]);
+
+	SAXPY(n, 1., x, 1, y, 1);
+}
+
+#ifdef STARPU_USE_CUDA
+void pipeline_cublas_axpy(void *descr[], void *arg)
+{
+	float *x = (float *) STARPU_VECTOR_GET_PTR(descr[0]);
+	float *y = (float *) STARPU_VECTOR_GET_PTR(descr[1]);
+	int n = STARPU_VECTOR_GET_NX(descr[0]);
+
+	cublasSaxpy(n, 1., x, 1, y, 1);
+}
+#endif
+
+static struct starpu_perfmodel pipeline_model_axpy =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "pipeline_model_axpy"
+};
+
+static struct starpu_codelet pipeline_codelet_axpy =
+{
+	.where = STARPU_CPU
+#ifdef STARPU_USE_CUDA
+		| STARPU_CUDA
+#endif
+		,
+	.cpu_funcs = {pipeline_cpu_axpy, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {pipeline_cublas_axpy, NULL},
+#endif
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
+	.model = &pipeline_model_axpy
+};
+
+/* sum codelet */
+void pipeline_cpu_sum(void *descr[], void *_args)
+{
+	float *x = (float *) STARPU_VECTOR_GET_PTR(descr[0]);
+	int n = STARPU_VECTOR_GET_NX(descr[0]);
+	float y;
+
+	y = SASUM(n, x, 1);
+
+	FPRINTF(stderr,"CPU finished with %f\n", y);
+}
+
+#ifdef STARPU_USE_CUDA
+void pipeline_cublas_sum(void *descr[], void *arg)
+{
+	float *x = (float *) STARPU_VECTOR_GET_PTR(descr[0]);
+	int n = STARPU_VECTOR_GET_NX(descr[0]);
+	float y;
+
+	y = cublasSasum(n, x, 1);
+
+	FPRINTF(stderr,"CUBLAS finished with %f\n", y);
+}
+#endif
+
+static struct starpu_perfmodel pipeline_model_sum =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "pipeline_model_sum"
+};
+
+static struct starpu_codelet pipeline_codelet_sum =
+{
+	.where = STARPU_CPU
+#ifdef STARPU_USE_CUDA
+		| STARPU_CUDA
+#endif
+		,
+	.cpu_funcs = {pipeline_cpu_sum, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {pipeline_cublas_sum, NULL},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_R},
+	.model = &pipeline_model_sum
+};
+
+int main(void)
+{
+	int ret = 0;
+	int k, l, c;
+	starpu_data_handle_t buffersX[K], buffersY[K], buffersP[K];
+	sem_t sems[C];
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		exit(77);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_helper_cublas_init();
+
+	/* Initialize the K temporary buffers. No need to allocate it ourselves
+	 * Since it's the X and Y kernels which will fill the initial values. */
+	for (k = 0; k < K; k++)
+	{
+		starpu_vector_data_register(&buffersX[k], -1, 0, N, sizeof(float));
+		starpu_vector_data_register(&buffersY[k], -1, 0, N, sizeof(float));
+		starpu_vector_data_register(&buffersP[k], -1, 0, N, sizeof(float));
+	}
+
+	/* Initialize way to wait for the C previous concurrent stages */
+	for (c = 0; c < C; c++)
+		sem_init(&sems[c], 0, 0);
+
+	/* Submits the l pipeline stages */
+	for (l = 0; l < L; l++)
+	{
+		float x = l;
+		float y = 2*l;
+		/* First wait for the C previous concurrent stages */
+		if (l >= C)
+			sem_wait(&sems[l%C]);
+
+		/* Now submit the next stage */
+		ret = starpu_insert_task(&pipeline_codelet_x,
+				STARPU_W, buffersX[l%K],
+				STARPU_VALUE, &x, sizeof(x),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task x");
+
+		ret = starpu_insert_task(&pipeline_codelet_x,
+				STARPU_W, buffersY[l%K],
+				STARPU_VALUE, &y, sizeof(y),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task y");
+
+		ret = starpu_insert_task(&pipeline_codelet_axpy,
+				STARPU_R, buffersX[l%K],
+				STARPU_RW, buffersY[l%K],
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task axpy");
+
+		ret = starpu_insert_task(&pipeline_codelet_sum,
+				STARPU_R, buffersY[l%K],
+				STARPU_CALLBACK_WITH_ARG, (void (*)(void*))sem_post, &sems[l%C],
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task sum");
+	}
+	starpu_task_wait_for_all();
+
+enodev:
+	for (k = 0; k < K; k++)
+	{
+		starpu_data_unregister(buffersX[k]);
+		starpu_data_unregister(buffersY[k]);
+		starpu_data_unregister(buffersP[k]);
+	}
+	starpu_shutdown();
+
+	return (ret == -ENODEV ? 77 : 0);
+}

+ 5 - 0
examples/scheduler/schedulers.sh

@@ -29,6 +29,11 @@ SCHEDULERS=`STARPU_SCHED="help" ./basic_examples/hello_world 2>&1 | awk '/->/ {p
 
 for sched in $SCHEDULERS
 do
+    # XXX pgreedy often hangs, we have to fix it.
+    # Let's just disable it for now.
+    if [ "$sched" == "pgreedy" ] ; then
+        continue
+    fi
     echo "cholesky.$sched"
     STARPU_SCHED=$sched ./cholesky/cholesky_tag
     check_success $?

+ 24 - 2
gcc-plugin/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+# Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -15,9 +15,31 @@
 
 SUBDIRS = src tests examples
 
-EXTRA_DIST = COPYING README
+EXTRA_DIST = COPYING README ChangeLog
 
 showcheck:
 	for i in $(SUBDIRS) ; do \
 		make -C $$i showcheck ; \
 	done
+
+# Generate a GNU-style ChangeLog for inclusion in the tarball.
+# It requires network access and may be slow.
+gen-ChangeLog:
+	if test "x$$CHANGELOG" = "xno"; then					\
+	   echo "ChangeLog not built, per user request" >&2;			\
+	elif ! xsltproc --version > /dev/null 2>&1; then			\
+	   echo "xsltproc not found, ChangeLog not generated" >&2;		\
+	elif ! test -d "$(srcdir)/.svn"; then					\
+	   echo "Subversion meta-data not found, ChangeLog not generated" >&2;	\
+	elif ! svn --version > /dev/null 2>&1; then				\
+	   echo "Subversion not found, ChangeLog not generated" >&2;		\
+	else									\
+	   ( cd "$(srcdir)";							\
+	     svn log --xml --verbose ) |					\
+	   xsltproc "$(top_srcdir)/build-aux/svn2cl.xsl" - > "ChangeLog.tmp";	\
+	   mv "ChangeLog.tmp" "$(distdir)/ChangeLog";				\
+	fi
+
+dist-hook: gen-ChangeLog
+
+.PHONY: showcheck dist-hook gen-ChangeLog

+ 336 - 42
gcc-plugin/src/starpu.c

@@ -102,6 +102,11 @@ extern int yydebug;
 
 extern tree xref_tag (enum tree_code, tree);
 
+#ifndef STRINGIFY
+# define STRINGIFY_(x) # x
+# define STRINGIFY(x)  STRINGIFY_ (x)
+#endif
+
 
 #ifdef __cplusplus
 extern "C" {
@@ -116,7 +121,8 @@ static const char plugin_name[] = "starpu";
 /* Whether to enable verbose output.  */
 static bool verbose_output_p = false;
 
-/* Search path for OpenCL source files, for the `opencl' pragma.  */
+/* Search path for OpenCL source files for the `opencl' pragma, as a
+   `TREE_LIST'.  */
 static tree opencl_include_dirs = NULL_TREE;
 
 /* Names of public attributes.  */
@@ -412,13 +418,18 @@ build_hello_world (void)
 }
 
 /* Given ERROR_VAR, an integer variable holding a StarPU error code, return
-   statements that print out an error message and abort.  */
+   statements that print out the error message returned by
+   BUILD_ERROR_MESSAGE (ERROR_VAR) and abort.  */
 
-static tree build_error_statements (location_t, tree, const char *, ...)
-  __attribute__ ((format (printf, 3, 4)));
+static tree build_error_statements (location_t, tree,
+				    function_parm (tree, f, (tree)),
+				    const char *, ...)
+  __attribute__ ((format (printf, 4, 5)));
 
 static tree
-build_error_statements (location_t loc, tree error_var, const char *fmt, ...)
+build_error_statements (location_t loc, tree error_var,
+			function_parm (tree, build_error_message, (tree)),
+			const char *fmt, ...)
 {
   expanded_location xloc = expand_location (loc);
 
@@ -436,23 +447,17 @@ build_error_statements (location_t loc, tree error_var, const char *fmt, ...)
   if (error_var != NULL_TREE)
     {
       /* ERROR_VAR is an error code.  */
-
-      static tree strerror_fn;
-      LOOKUP_STARPU_FUNCTION (strerror_fn, "strerror");
-
       gcc_assert (TREE_CODE (error_var) == VAR_DECL
 		  && TREE_TYPE (error_var) == integer_type_node);
 
       asprintf (&fmt_long, "%s:%d: error: %s: %%s\n",
 		xloc.file, xloc.line, str);
 
-      tree error_code =
-	build1 (NEGATE_EXPR, TREE_TYPE (error_var), error_var);
       print =
 	build_call_expr (builtin_decl_explicit (BUILT_IN_PRINTF), 2,
 			 build_string_literal (strlen (fmt_long) + 1,
 					       fmt_long),
-			 build_call_expr (strerror_fn, 1, error_code));
+			 build_error_message (error_var));
     }
   else
     {
@@ -480,6 +485,20 @@ build_error_statements (location_t loc, tree error_var, const char *fmt, ...)
   return stmts;
 }
 
+/* Build an error string for the StarPU return value in ERROR_VAR.  */
+
+static tree
+build_starpu_error_string (tree error_var)
+{
+  static tree strerror_fn;
+  LOOKUP_STARPU_FUNCTION (strerror_fn, "strerror");
+
+  tree error_code =
+    build1 (NEGATE_EXPR, TREE_TYPE (error_var), error_var);
+
+  return build_call_expr (strerror_fn, 1, error_code);
+}
+
 
 /* List and vector utilities, à la SRFI-1.  */
 
@@ -620,7 +639,8 @@ handle_pragma_initialize (struct cpp_reader *reader)
 		      build2 (NE_EXPR, boolean_type_node,
 			      error_var, integer_zero_node),
 		      build_error_statements (loc, error_var,
-					      "failed to initialize StarPU"), 
+					      build_starpu_error_string,
+					      "failed to initialize StarPU"),
 		      NULL_TREE);
 
   tree stmts = NULL_TREE;
@@ -1055,7 +1075,7 @@ build_variable_from_file_contents (location_t loc,
 				   const_tree search_path)
 {
   gcc_assert (search_path != NULL_TREE
-	      && TREE_CODE (search_path) == STRING_CST);
+	      && TREE_CODE (search_path) == TREE_LIST);
 
   int err, dir_fd;
   struct stat st;
@@ -1064,11 +1084,14 @@ build_variable_from_file_contents (location_t loc,
 
   /* Look for FILE in each directory in SEARCH_PATH, and pick the first one
      that matches.  */
-  for (err = ENOENT, dir_fd = -1, dirs = opencl_include_dirs;
+  for (err = ENOENT, dir_fd = -1, dirs = search_path;
        (err != 0 || err == ENOENT) && dirs != NULL_TREE;
        dirs = TREE_CHAIN (dirs))
     {
-      dir_fd = open (TREE_STRING_POINTER (dirs),
+      gcc_assert (TREE_VALUE (dirs) != NULL_TREE
+		  && TREE_CODE (TREE_VALUE (dirs)) == STRING_CST);
+
+      dir_fd = open (TREE_STRING_POINTER (TREE_VALUE (dirs)),
 		     O_DIRECTORY | O_RDONLY);
       if (dir_fd < 0)
 	err = ENOENT;
@@ -1077,6 +1100,10 @@ build_variable_from_file_contents (location_t loc,
 	  err = fstatat (dir_fd, file, &st, 0);
 	  if (err != 0)
 	    close (dir_fd);
+	  else
+	    /* Leave DIRS unchanged so it can be referred to in diagnostics
+	       below.  */
+	    break;
 	}
     }
 
@@ -1091,7 +1118,7 @@ build_variable_from_file_contents (location_t loc,
     {
       if (verbose_output_p)
 	inform (loc, "found file %qs in %qs",
-		file, TREE_STRING_POINTER (dirs));
+		file, TREE_STRING_POINTER (TREE_VALUE (dirs)));
 
       int fd;
 
@@ -1140,22 +1167,149 @@ opencl_program_type (void)
   return t;
 }
 
+static tree
+opencl_kernel_type (void)
+{
+  tree t = lookup_name (get_identifier ("cl_kernel"));
+  gcc_assert (t != NULL_TREE);
+  if (TREE_CODE (t) == TYPE_DECL)
+    t = TREE_TYPE (t);
+  gcc_assert (TYPE_P (t));
+  return t;
+}
+
+static tree
+opencl_command_queue_type (void)
+{
+  tree t = lookup_name (get_identifier ("cl_command_queue"));
+  gcc_assert (t != NULL_TREE);
+  if (TREE_CODE (t) == TYPE_DECL)
+    t = TREE_TYPE (t);
+  gcc_assert (TYPE_P (t));
+  return t;
+}
+
+static tree
+opencl_event_type (void)
+{
+  tree t = lookup_name (get_identifier ("cl_event"));
+  gcc_assert (t != NULL_TREE);
+  if (TREE_CODE (t) == TYPE_DECL)
+    t = TREE_TYPE (t);
+  gcc_assert (TYPE_P (t));
+  return t;
+}
+
+/* Return an expression that, given the OpenCL error code in ERROR_VAR,
+   returns a string.  */
+
+static tree
+build_opencl_error_string (tree error_var)
+{
+  static tree clstrerror_fn;
+  LOOKUP_STARPU_FUNCTION (clstrerror_fn, "starpu_opencl_error_string");
+
+  return build_call_expr (clstrerror_fn, 1, error_var);
+}
+
+/* Return an error-checking `clSetKernelArg' call for argument ARG, at
+   index IDX, of KERNEL.  */
+
+static tree
+build_opencl_set_kernel_arg_call (location_t loc, tree fn,
+				  tree kernel, unsigned int idx,
+				  tree arg)
+{
+  gcc_assert (TREE_CODE (fn) == FUNCTION_DECL
+	      && TREE_TYPE (kernel) == opencl_kernel_type ());
+
+  static tree setkernarg_fn;
+  LOOKUP_STARPU_FUNCTION (setkernarg_fn, "clSetKernelArg");
+
+  tree call = build_call_expr (setkernarg_fn, 4, kernel,
+			       build_int_cst (integer_type_node, idx),
+			       size_in_bytes (TREE_TYPE (arg)),
+			       build_addr (arg, fn));
+  tree error_var = build_decl (loc, VAR_DECL,
+			       create_tmp_var_name ("setkernelarg_error"),
+			       integer_type_node);
+  DECL_ARTIFICIAL (error_var) = true;
+  DECL_CONTEXT (error_var) = fn;
+
+  tree assignment = build2 (INIT_EXPR, TREE_TYPE (error_var),
+			    error_var, call);
+
+  /* Build `if (ERROR_VAR != 0) error ();'.  */
+  tree cond;
+  cond = build3 (COND_EXPR, void_type_node,
+		 build2 (NE_EXPR, boolean_type_node,
+			 error_var, integer_zero_node),
+		 build_error_statements (loc, error_var,
+					 build_opencl_error_string,
+					 "failed to set OpenCL kernel "
+					 "argument %d", idx),
+		 NULL_TREE);
+
+  tree stmts = NULL_TREE;
+  append_to_statement_list (assignment, &stmts);
+  append_to_statement_list (cond, &stmts);
+
+  return build4 (TARGET_EXPR, void_type_node, error_var,
+		 stmts, NULL_TREE, NULL_TREE);
+}
+
+/* Return the sequence of `clSetKernelArg' calls for KERNEL.  */
+
+static tree
+build_opencl_set_kernel_arg_calls (location_t loc, tree task_impl,
+				   tree kernel)
+{
+  gcc_assert (task_implementation_p (task_impl));
+
+  size_t n;
+  tree arg, stmts = NULL_TREE;
+
+  for (arg = DECL_ARGUMENTS (task_impl), n = 0;
+       arg != NULL_TREE;
+       arg = TREE_CHAIN (arg), n++)
+    {
+      tree call = build_opencl_set_kernel_arg_call (loc, task_impl,
+						    kernel, n, arg);
+      append_to_statement_list (call, &stmts);
+    }
+
+  return stmts;
+}
+
 /* Define a body for TASK_IMPL that loads OpenCL source from FILE and calls
    KERNEL.  */
 
 static void
 define_opencl_task_implementation (location_t loc, tree task_impl,
-				   const char *file, const_tree kernel)
+				   const char *file, const_tree kernel,
+				   tree groupsize)
 {
   gcc_assert (task_implementation_p (task_impl)
 	      && task_implementation_where (task_impl) == STARPU_OPENCL);
   gcc_assert (TREE_CODE (kernel) == STRING_CST);
+  gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (groupsize)));
+
+  local_define (tree, local_var, (tree type))
+  {
+    tree var = build_decl (loc, VAR_DECL,
+			   create_tmp_var_name ("opencl_var"),
+			   type);
+    DECL_ARTIFICIAL (var) = true;
+    DECL_CONTEXT (var) = task_impl;
+    return var;
+  };
 
   if (!verbose_output_p)
     /* No further warnings for this node.  */
     TREE_NO_WARNING (task_impl) = true;
 
-  static tree load_fn;
+  static tree load_fn, load_kern_fn, enqueue_kern_fn, wid_fn, devid_fn, clfinish_fn,
+    collect_stats_fn, release_ev_fn;
 
   if (load_fn == NULL_TREE)
     {
@@ -1169,6 +1323,14 @@ define_opencl_task_implementation (location_t loc, tree task_impl,
 	}
     }
 
+  LOOKUP_STARPU_FUNCTION (load_kern_fn, "starpu_opencl_load_kernel");
+  LOOKUP_STARPU_FUNCTION (wid_fn, "starpu_worker_get_id");
+  LOOKUP_STARPU_FUNCTION (devid_fn, "starpu_worker_get_devid");
+  LOOKUP_STARPU_FUNCTION (enqueue_kern_fn, "clEnqueueNDRangeKernel");
+  LOOKUP_STARPU_FUNCTION (clfinish_fn, "clFinish");
+  LOOKUP_STARPU_FUNCTION (collect_stats_fn, "starpu_opencl_collect_stats");
+  LOOKUP_STARPU_FUNCTION (release_ev_fn, "clReleaseEvent");
+
   if (verbose_output_p)
     inform (loc, "defining %qE, with OpenCL kernel %qs from file %qs",
 	    DECL_NAME (task_impl), TREE_STRING_POINTER (kernel), file);
@@ -1178,6 +1340,9 @@ define_opencl_task_implementation (location_t loc, tree task_impl,
 						  file, opencl_include_dirs);
   if (source_var != NULL_TREE)
     {
+      /* Give TASK_IMPL an actual argument list.  */
+      DECL_ARGUMENTS (task_impl) = build_function_arguments (task_impl);
+
       tree prog_var, prog_loaded_var;
 
       /* Global variable to hold the `starpu_opencl_program' object.  */
@@ -1219,24 +1384,115 @@ define_opencl_task_implementation (location_t loc, tree task_impl,
 
       /* Build `if (!PROG_LOADED_VAR) { ...; PROG_LOADED_VAR = true; }'.  */
 
-      tree cond = build3 (COND_EXPR, void_type_node,
-			  prog_loaded_var,
-			  NULL_TREE,
-			  load_stmts);
-
-      /* TODO: Build the kernel invocation.  */
+      tree load_cond = build3 (COND_EXPR, void_type_node,
+			       prog_loaded_var,
+			       NULL_TREE,
+			       load_stmts);
+
+      /* Local variables.  */
+      tree kernel_var, queue_var, event_var, group_size_var, ngroups_var,
+	error_var;
+
+      kernel_var = local_var (opencl_kernel_type ());
+      queue_var = local_var (opencl_command_queue_type ());
+      event_var = local_var (opencl_event_type ());
+      group_size_var = local_var (size_type_node);
+      ngroups_var = local_var (size_type_node);
+      error_var = local_var (integer_type_node);
+
+      /* Build `starpu_opencl_load_kernel (...)'.
+         TODO: Check return value.  */
+      tree devid =
+	build_call_expr (devid_fn, 1, build_call_expr (wid_fn, 0));
+
+      tree load_kern = build_call_expr (load_kern_fn, 5,
+					build_addr (kernel_var, task_impl),
+					build_addr (queue_var, task_impl),
+					build_addr (prog_var, task_impl),
+					build_string_literal
+					(TREE_STRING_LENGTH (kernel) + 1,
+					 TREE_STRING_POINTER (kernel)),
+					devid);
+
+      tree enqueue_kern =
+	build_call_expr (enqueue_kern_fn, 9,
+			 queue_var, kernel_var,
+			 build_int_cst (integer_type_node, 1),
+			 null_pointer_node,
+			 build_addr (group_size_var, task_impl),
+			 build_addr (ngroups_var, task_impl),
+			 integer_zero_node,
+			 null_pointer_node,
+			 build_addr (event_var, task_impl));
+      tree enqueue_err =
+	build2 (INIT_EXPR, TREE_TYPE (error_var), error_var, enqueue_kern);
+
+      tree enqueue_cond =
+	build3 (COND_EXPR, void_type_node,
+		build2 (NE_EXPR, boolean_type_node,
+			error_var, integer_zero_node),
+		build_error_statements (loc, error_var,
+					build_opencl_error_string,
+					"failed to enqueue kernel"),
+		NULL_TREE);
+
+      tree clfinish =
+	build_call_expr (clfinish_fn, 1, queue_var);
+
+      tree collect_stats =
+	build_call_expr (collect_stats_fn, 1, event_var);
+
+      tree release_ev =
+	build_call_expr (release_ev_fn, 1, event_var);
+
+      tree enqueue_stmts = NULL_TREE;
+      append_to_statement_list (enqueue_err, &enqueue_stmts);
+      append_to_statement_list (enqueue_cond, &enqueue_stmts);
+
+
+      /* TODO: Build `clFinish', `clReleaseEvent', & co.  */
+      /* Put it all together.  */
+      tree stmts = NULL_TREE;
+      append_to_statement_list (load_cond, &stmts);
+      append_to_statement_list (load_kern, &stmts);
+      append_to_statement_list (build_opencl_set_kernel_arg_calls (loc,
+								   task_impl,
+								   kernel_var),
+				&stmts);
+
+      /* TODO: Support user-provided values.  */
+      append_to_statement_list (build2 (INIT_EXPR, TREE_TYPE (group_size_var),
+					group_size_var,
+					fold_convert (TREE_TYPE (group_size_var),
+						      groupsize)),
+				&stmts);
+      append_to_statement_list (build2 (INIT_EXPR, TREE_TYPE (ngroups_var),
+					ngroups_var,
+					build_int_cst (TREE_TYPE (ngroups_var),
+						       1)),
+				&stmts);
+      append_to_statement_list (build4 (TARGET_EXPR, void_type_node,
+					error_var, enqueue_stmts,
+					NULL_TREE, NULL_TREE),
+				&stmts);
+      append_to_statement_list (clfinish, &stmts);
+      append_to_statement_list (collect_stats, &stmts);
+      append_to_statement_list (release_ev, &stmts);
+
+      /* Bind the local vars.  */
+      tree vars = chain_trees (kernel_var, queue_var, event_var,
+			       group_size_var, ngroups_var, NULL_TREE);
+      tree bind = build3 (BIND_EXPR, void_type_node, vars, stmts,
+			  build_block (vars, NULL_TREE, task_impl, NULL_TREE));
 
       TREE_USED (task_impl) = true;
       TREE_STATIC (task_impl) = true;
       DECL_EXTERNAL (task_impl) = false;
       DECL_ARTIFICIAL (task_impl) = true;
-      DECL_SAVED_TREE (task_impl) = cond;
-      DECL_INITIAL (task_impl) =
-	build_block (NULL_TREE, NULL_TREE, task_impl, NULL_TREE);
+      DECL_SAVED_TREE (task_impl) = bind;
+      DECL_INITIAL (task_impl) = BIND_EXPR_BLOCK (bind);
       DECL_RESULT (task_impl) =
 	build_decl (loc, RESULT_DECL, NULL_TREE, void_type_node);
-      DECL_ARGUMENTS (task_impl) =
-	build_function_arguments (task_impl);
 
       /* Compile TASK_IMPL.  */
       rest_of_decl_compilation (task_impl, true, 0);
@@ -1288,8 +1544,8 @@ handle_pragma_opencl (struct cpp_reader *reader)
   if (args == NULL_TREE)
     return;
 
-  /* TODO: Add "group size" and "number of groups" arguments.  */
-  if (list_length (args) < 3)
+  /* TODO: Add "number of groups" arguments.  */
+  if (list_length (args) < 4)
     {
       error_at (loc, "wrong number of arguments for %<starpu opencl%> pragma");
       return;
@@ -1308,13 +1564,21 @@ handle_pragma_opencl (struct cpp_reader *reader)
   	      if (TREE_CODE (TREE_VALUE (args)) == STRING_CST)
   		{
   		  tree kernel = TREE_VALUE (args);
+		  args = TREE_CHAIN (args);
 
-  		  if (TREE_CHAIN (args) == NULL_TREE)
-		    define_opencl_task_implementation (loc, task_impl,
-						       TREE_STRING_POINTER (file),
-						       kernel);
-  		  else
-  		    error_at (loc, "junk after %<starpu opencl%> pragma");
+		  if (TREE_TYPE (TREE_VALUE (args)) != NULL_TREE &&
+		      INTEGRAL_TYPE_P (TREE_TYPE (TREE_VALUE (args))))
+		    {
+		      tree groupsize = TREE_VALUE (args);
+		      if (TREE_CHAIN (args) == NULL_TREE)
+			define_opencl_task_implementation (loc, task_impl,
+							   TREE_STRING_POINTER (file),
+							   kernel, groupsize);
+		      else
+			error_at (loc, "junk after %<starpu opencl%> pragma");
+		    }
+		  else
+		    error_at (loc, "%<groupsize%> argument must be an integral type");
   		}
   	      else
   		error_at (loc, "%<kernel%> argument must be a string constant");
@@ -2712,6 +2976,7 @@ build_pointer_lookup (tree pointer)
 		      build2 (EQ_EXPR, boolean_type_node,
 			      result_var, null_pointer_node),
 		      build_error_statements (loc, NULL_TREE,
+					      build_starpu_error_string,
 					      "attempt to use unregistered "
 					      "pointer"),
 		      NULL_TREE);
@@ -2835,6 +3100,7 @@ define_task (tree task_decl)
 		      build2 (NE_EXPR, boolean_type_node,
 			      error_var, integer_zero_node),
 		      build_error_statements (loc, error_var,
+					      build_starpu_error_string,
 					      "failed to insert task `%s'",
 					      IDENTIFIER_POINTER (name)),
 		      NULL_TREE);
@@ -3044,8 +3310,31 @@ int
 plugin_init (struct plugin_name_args *plugin_info,
 	     struct plugin_gcc_version *version)
 {
-  if (!plugin_default_version_check (version, &gcc_version))
-    return 1;
+  /* `plugin_default_version_check' happens to be stricter than necessary
+     (for instance, it fails when the `buildstamp' field of the plug-in
+     doesn't match that of GCC), so write our own check and make more relax
+     and more verbose.  */
+
+#define VERSION_CHECK(field)						\
+  do									\
+    {									\
+      if (strcmp (gcc_version. field, version-> field) != 0)		\
+	{								\
+	  error_at (UNKNOWN_LOCATION, "plug-in version check for `"	\
+		    STRINGIFY (field) "' failed: expected `%s', "	\
+		    "got `%s'",						\
+		    gcc_version. field, version-> field);		\
+	  return 1;							\
+	}								\
+    }									\
+  while (0)
+
+  VERSION_CHECK (basever);			  /* e.g., "4.6.2" */
+  VERSION_CHECK (devphase);
+  VERSION_CHECK (revision);
+  VERSION_CHECK (configuration_arguments);
+
+#undef VERSION_CHECK
 
   register_callback (plugin_name, PLUGIN_START_UNIT,
 		     define_cpp_macros, NULL);
@@ -3071,7 +3360,8 @@ plugin_init (struct plugin_name_args *plugin_info,
 		     NULL, &pass_info);
 
   include_dir = getenv ("STARPU_GCC_INCLUDE_DIR");
-  opencl_include_dirs = build_string (1, ".");
+  opencl_include_dirs = tree_cons (NULL_TREE, build_string (1, "."),
+				   NULL_TREE);
 
   int arg;
   for (arg = 0; arg < plugin_info->argc; arg++)
@@ -3094,7 +3384,8 @@ plugin_init (struct plugin_name_args *plugin_info,
 	    {
 	      tree dir = build_string (strlen (plugin_info->argv[arg].value),
 				       plugin_info->argv[arg].value);
-	      opencl_include_dirs = chainon (opencl_include_dirs, dir);
+	      opencl_include_dirs = tree_cons (NULL_TREE, dir,
+					       opencl_include_dirs);
 	    }
 	}
       else if (strcmp (plugin_info->argv[arg].key, "verbose") == 0)
@@ -3104,6 +3395,9 @@ plugin_init (struct plugin_name_args *plugin_info,
 		  plugin_info->argv[arg].key);
     }
 
+  /* Keep the directories in the order in which they appear.  */
+  opencl_include_dirs = nreverse (opencl_include_dirs);
+
   return 0;
 }
 

+ 4 - 7
gcc-plugin/tests/Makefile.am

@@ -85,20 +85,17 @@ EXTRA_DIST += ./run-test.in			\
   $(gcc_tests)
 
 # The test suite assumes that the CPU back-end is available.
-if STARPU_USE_CPU
+if RUN_GCC_PLUGIN_TESTS
 
 TESTS = $(gcc_tests)
-
-endif
-
 TESTS_ENVIRONMENT = ./run-test
 
-if !HAVE_GUILE
+else !RUN_GCC_PLUGIN_TESTS
 
 check-hook:
-	-@echo "GNU Guile not available, test suite not run."
+	-@echo "GNU Guile or CPU back-end not available, test suite not run."
 
-endif !HAVE_GUILE
+endif !RUN_GCC_PLUGIN_TESTS
 
 showcheck:
 	-cat $(TEST_LOGS) /dev/null

+ 137 - 2
gcc-plugin/tests/mocks.h

@@ -434,11 +434,31 @@ struct starpu_opencl_program
   /* Nothing.  */
 };
 
+typedef int cl_event;
+typedef int cl_kernel;
+typedef int cl_command_queue;
+
+extern cl_int clSetKernelArg (cl_kernel, cl_uint, size_t, const void *);
+
+extern cl_int
+clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
+                       cl_kernel        /* kernel */,
+                       cl_uint          /* work_dim */,
+                       const size_t *   /* global_work_offset */,
+                       const size_t *   /* global_work_size */,
+                       const size_t *   /* local_work_size */,
+                       cl_uint          /* num_events_in_wait_list */,
+                       const cl_event * /* event_wait_list */,
+                       cl_event *       /* event */);
+
 #endif
 
 
-/* Number of `load_opencl_from_string' calls.  */
-static unsigned int load_opencl_calls;
+/* Number of `load_opencl_from_string', `load_kernel', and `clSetKernelArg'
+   calls.  */
+static unsigned int load_opencl_calls, load_opencl_kernel_calls,
+  opencl_set_kernel_arg_calls, opencl_enqueue_calls, opencl_finish_calls,
+  opencl_collect_stats_calls, opencl_release_event_calls;
 
 struct load_opencl_arguments
 {
@@ -449,6 +469,15 @@ struct load_opencl_arguments
 /* Expected arguments.  */
 static struct load_opencl_arguments expected_load_opencl_arguments;
 
+struct cl_enqueue_kernel_arguments
+{
+  size_t * global_work_size;
+};
+
+/* Variable describing the expected `clEnqueueNDRangeKernel' arguments. */
+static struct cl_enqueue_kernel_arguments expected_cl_enqueue_kernel_arguments;
+
+
 int
 starpu_opencl_load_opencl_from_string (const char *source,
 				       struct starpu_opencl_program *program,
@@ -460,6 +489,112 @@ starpu_opencl_load_opencl_from_string (const char *source,
   return 0;
 }
 
+int
+starpu_opencl_load_kernel (cl_kernel *kernel,
+			   cl_command_queue *queue,
+			   struct starpu_opencl_program *programs,
+			   const char *kernel_name, int devid)
+{
+  assert (kernel != NULL && queue != NULL && programs != NULL
+	  && kernel_name != NULL && devid == -42);
+  load_opencl_kernel_calls++;
+  return 0;
+}
+
+int
+starpu_worker_get_id (void)
+{
+  return 42;
+}
+
+int
+starpu_worker_get_devid (int id)
+{
+  return -id;
+}
+
+/* Set the INDEXth argument to KERNEL to the SIZE bytes pointed to by
+   VALUE.  */
+cl_int
+clSetKernelArg (cl_kernel kernel, cl_uint index, size_t size,
+		const void *value)
+{
+  size_t n;
+  const struct insert_task_argument *arg;
+
+  for (n = 0, arg = expected_insert_task_arguments;
+       n < index;
+       n++, arg++)
+    assert (arg->pointer != NULL);
+
+  switch (arg->type)
+    {
+    case STARPU_VALUE:
+      assert (size == arg->size);
+      assert (memcmp (arg->pointer, value, size) == 0);
+      break;
+
+    case STARPU_RW:
+    case STARPU_R:
+    case STARPU_W:
+      assert (size == sizeof (void *));
+      assert (* (void **) value == arg->pointer);
+      break;
+
+    default:
+      abort ();
+    }
+
+  opencl_set_kernel_arg_calls++;
+  return 0;
+}
+
+cl_int
+clEnqueueNDRangeKernel(cl_command_queue command_queue,
+                       cl_kernel        kernel,
+                       cl_uint          work_dim,
+                       const size_t *   global_work_offset,
+                       const size_t *   global_work_size,
+                       const size_t *   local_work_size,
+                       cl_uint          num_events_in_wait_list,
+                       const cl_event * event_wait_list,
+                       cl_event *       event)
+{
+  assert (*local_work_size == 1);
+  assert (*global_work_size == *expected_cl_enqueue_kernel_arguments.global_work_size);
+
+  opencl_enqueue_calls++;
+  return 0;
+}
+
+cl_int
+clFinish (cl_command_queue command_queue)
+{
+  opencl_finish_calls++;
+  return 0;
+}
+
+cl_int
+starpu_opencl_collect_stats (cl_event event)
+{
+  opencl_collect_stats_calls++;
+  return 0;
+}
+
+cl_int
+clReleaseEvent (cl_event event)
+{
+  opencl_release_event_calls++;
+  return 0;
+}
+
+
+const char *
+starpu_opencl_error_string (cl_int s)
+{
+  return "mock";
+}
+
 
 /* Initialization.  */
 

+ 9 - 8
gcc-plugin/tests/opencl-errors.c

@@ -35,19 +35,20 @@ my_task_cpu (int x, float a[x])
 }
 
 
-#pragma starpu opencl my_task "test.cl" "kern" /* (error "not a.* task impl") */
+#pragma starpu opencl my_task "test.cl" "kern" 1 /* (error "not a.* task impl") */
 #pragma starpu opencl my_task_cpu  /* (error "not a.* task impl") */	\
-                      "test.cl" "kern"
-#pragma starpu opencl my_task_opencl "/dev/null" "kern" /* (error "empty") */
-#pragma starpu opencl my_task_opencl "/does-not-exist/" "kern" /* (error "failed to access") */
+                      "test.cl" "kern" 1
+#pragma starpu opencl my_task_opencl "/dev/null" "kern" 1 /* (error "empty") */
+#pragma starpu opencl my_task_opencl "/does-not-exist/" "kern" 1 /* (error "failed to access") */
 
 #pragma starpu opencl my_task_opencl	  /* (error "wrong number of arg") */
-#pragma starpu opencl my_task_opencl 123 "kern" /* (error "string constant") */
-#pragma starpu opencl my_task_opencl "test.cl" 123 /* (error "string constant") */
-#pragma starpu opencl my_task_opencl "test.cl" "kern" "foo" /* (error "junk after") */
+#pragma starpu opencl my_task_opencl 123 "kern" 1 /* (error "string constant") */
+#pragma starpu opencl my_task_opencl "test.cl" 123 1 /* (error "string constant") */
+#pragma starpu opencl my_task_opencl "test.cl" "kern" "a" /* (error "integral type") */
+#pragma starpu opencl my_task_opencl "test.cl" "kern" 1 "foo" /* (error "junk after") */
 
 void
 foo (void)
 {
-#pragma starpu opencl my_task_opencl "test.cl" "kern" /* (error "top-level") */
+#pragma starpu opencl my_task_opencl "test.cl" "kern" 1 /* (error "top-level") */
 }

+ 1 - 1
gcc-plugin/tests/opencl-lacking.c

@@ -23,4 +23,4 @@ static void my_task_opencl (int x, float a[x])
   __attribute__ ((task_implementation ("opencl", my_task)));
 
 #pragma starpu opencl my_task_opencl  /* (note "not generated") */	\
-               "test.cl" "kern"
+               "test.cl" "kern" 8

+ 8 - 2
gcc-plugin/tests/opencl.c

@@ -29,7 +29,7 @@ static void my_task (int x, float a[x])
 static void my_task_opencl (int x, float a[x])
   __attribute__ ((task_implementation ("opencl", my_task)));
 
-#pragma starpu opencl my_task_opencl "test.cl" "kern"
+#pragma starpu opencl my_task_opencl "test.cl" "kern" 8
 
 int
 main ()
@@ -55,6 +55,7 @@ main ()
 
   expected_insert_task_arguments = expected;
   expected_insert_task_targets = STARPU_OPENCL;
+  size_t y = 8; expected_cl_enqueue_kernel_arguments.global_work_size = &y;
 
   my_task (123, a);
   my_task (123, a);
@@ -62,6 +63,11 @@ main ()
 
   assert (tasks_submitted == 3);
   assert (load_opencl_calls == 1);
-
+  assert (load_opencl_kernel_calls == 3);
+  assert (opencl_set_kernel_arg_calls == 3 * 2);
+  assert (opencl_enqueue_calls == 3);
+  assert (opencl_finish_calls == 3);
+  assert (opencl_release_event_calls == 3);
+  assert (opencl_collect_stats_calls == 3);
   return EXIT_SUCCESS;
 }

+ 4 - 0
gcc-plugin/tests/run-test.in

@@ -73,6 +73,9 @@ exec "${GUILE-@GUILE@}" -l "$0"    \
 (define %cuda-cppflags
   (string-tokenize "@STARPU_CUDA_CPPFLAGS@"))
 
+(define %opencl-cppflags
+  (string-tokenize "@STARPU_OPENCL_CPPFLAGS@"))
+
 (define %default-cflags
   `("-I" ,%srcdir
     "-I" ,(string-append %srcdir "/../../src")    ; for <common/uthash.h>
@@ -81,6 +84,7 @@ exec "${GUILE-@GUILE@}" -l "$0"    \
     "-I" ,(string-append %builddir "/../..")
 
     ,@%cuda-cppflags
+    ,@%opencl-cppflags
 
     ;; Unfortunately `libtool --mode=execute' doesn't help here, so hard-code
     ;; the real file name.

+ 3 - 2
include/starpu.h

@@ -68,6 +68,7 @@ struct starpu_driver
 	enum starpu_archtype type;
 	union
 	{
+		unsigned cpu_id;
 		unsigned cuda_id;
 #if defined(STARPU_USE_OPENCL) && !defined(__CUDACC__)
 		cl_device_id opencl_id;
@@ -201,8 +202,8 @@ void starpu_worker_get_name(int id, char *dst, size_t maxlen);
  */
 int starpu_worker_get_devid(int id);
 
-int starpu_driver_run(struct starpu_driver *);
-void starpu_set_end_of_submissions(void);
+int starpu_driver_run(struct starpu_driver *d);
+void starpu_drivers_request_termination(void);
 
 int starpu_driver_init(struct starpu_driver *d);
 int starpu_driver_run_once(struct starpu_driver *d);

+ 5 - 0
include/starpu_data.h

@@ -52,6 +52,11 @@ struct starpu_data_interface_ops;
 void starpu_data_unregister(starpu_data_handle_t handle);
 void starpu_data_unregister_no_coherency(starpu_data_handle_t handle);
 
+/* Destroy the data handle once it is not needed anymore by any submitted task.
+ * No coherency is assumed.
+ */
+void starpu_data_unregister_lazy(starpu_data_handle_t handle);
+
 /* Destroy all data replicates. After data invalidation, the first access to
  * the handle must be performed in write-only mode. */
 void starpu_data_invalidate(starpu_data_handle_t handle);

+ 2 - 1
include/starpu_data_filters.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -66,6 +66,7 @@ void starpu_vertical_block_filter_func(void *father_interface, void *child_inter
 
 /* for vector */
 void starpu_block_filter_func_vector(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_block_shadow_filter_func_vector(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_vector_list_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_vector_divide_in_2_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 

+ 9 - 2
include/starpu_opencl.h

@@ -31,6 +31,7 @@ extern "C"
 {
 #endif
 
+const char *starpu_opencl_error_string(cl_int status);
 void starpu_opencl_display_error(const char *func, const char *file, int line, const char *msg, cl_int status);
 #define STARPU_OPENCL_DISPLAY_ERROR(status) \
 	starpu_opencl_display_error(__starpu_func__, __FILE__, __LINE__, NULL, status)
@@ -58,6 +59,12 @@ void starpu_opencl_get_queue(int devid, cl_command_queue *queue);
 void starpu_opencl_get_current_context(cl_context *context);
 void starpu_opencl_get_current_queue(cl_command_queue *queue);
 
+void starpu_opencl_load_program_source(const char *source_file_name, char *located_file_name, char *located_dir_name, char *opencl_program_source);
+int starpu_opencl_compile_opencl_from_file(const char *source_file_name, const char* build_options);
+int starpu_opencl_compile_opencl_from_string(const char *opencl_program_source, const char *file_name, const char* build_options);
+
+int starpu_opencl_load_binary_opencl(const char *kernel_id, struct starpu_opencl_program *opencl_programs);
+
 int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs, const char* build_options);
 int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs, const char* build_options);
 int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs);
@@ -70,13 +77,13 @@ int starpu_opencl_collect_stats(cl_event event);
 /*
  * Sets the arguments of an OpenCL kernel.
  * Arguments to pass to the kernel should be given as follows :
- * 
+ *
  * 	size of the argument,  pointer to the argument
  *
  * 0 must be passed to this function after the kernel arguments.
  *
  * In case of failure, returns the id of the argument that could not be set,
- * and sets "error" to the error returned. Otherwise, returns the number of 
+ * and sets "error" to the error returned. Otherwise, returns the number of
  * arguments that were set.
  *
  * Example :

+ 1 - 0
include/starpu_perfmodel.h

@@ -211,6 +211,7 @@ int starpu_list_models(FILE *output);
 
 void starpu_force_bus_sampling(void);
 void starpu_bus_print_bandwidth(FILE *f);
+void starpu_bus_print_affinity(FILE *f);
 
 #ifdef __cplusplus
 }

+ 2 - 2
include/starpu_util.h

@@ -54,7 +54,7 @@ extern "C"
 } while(0)
 
 #if defined(STARPU_HAVE_STRERROR_R)
-#  define STARPU_CHECK_RETURN_VALUE(err, message) {if (err < 0) { \
+#  define STARPU_CHECK_RETURN_VALUE(err, message) {if (err != 0) { \
 			char xmessage[256]; strerror_r(-err, xmessage, 256); \
 			fprintf(stderr, "StarPU function <%s> returned unexpected value: <%d:%s>\n", message, err, xmessage); \
 			STARPU_ASSERT(0); }}
@@ -63,7 +63,7 @@ extern "C"
 			fprintf(stderr, "StarPU function <%s> returned unexpected value: <%d:%s>\n", message, err, xmessage); \
 			STARPU_ASSERT(0); }}
 #else
-#  define STARPU_CHECK_RETURN_VALUE(err, message) {if (err < 0) {		\
+#  define STARPU_CHECK_RETURN_VALUE(err, message) {if (err != 0) {		\
 			fprintf(stderr, "StarPU function <%s> returned unexpected value: <%d>\n", message, err); \
 			STARPU_ASSERT(0); }}
 #  define STARPU_CHECK_RETURN_VALUE_IS(err, value, message) {if (err != value) { \

+ 2 - 2
mpi/Makefile.am

@@ -29,8 +29,8 @@ BUILT_SOURCES =
 CLEANFILES = *.gcno *.gcda *.linkinfo
 
 EXTRA_DIST = 					\
-	examples/mpi_lu/float.h			\
-	examples/mpi_lu/double.h		\
+	examples/mpi_lu/mpi_lu-float.h		\
+	examples/mpi_lu/mpi_lu-double.h		\
 	examples/mpi_lu/plu_example.c		\
 	examples/mpi_lu/plu_solve.c		\
 	examples/mpi_lu/pxlu.h			\

+ 0 - 42
mpi/examples/mpi_lu/double.h

@@ -1,42 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#define TYPE double
-#define MPI_TYPE	MPI_DOUBLE
-
-#define STARPU_PLU(name)       starpu_pdlu_##name
-
-#define CUBLAS_GEMM	cublasDgemm
-#define CUBLAS_TRSM	cublasDtrsm
-#define CUBLAS_SCAL	cublasDscal
-#define CUBLAS_GER	cublasDger
-#define CUBLAS_SWAP	cublasDswap
-#define CUBLAS_IAMAX	cublasIdamax
-
-#define CPU_GEMM	DGEMM
-#define CPU_GEMV	DGEMV
-#define CPU_TRSM	DTRSM
-#define CPU_SCAL	DSCAL
-#define CPU_GER		DGER
-#define CPU_SWAP	DSWAP
-
-#define CPU_TRMM	DTRMM
-#define CPU_AXPY	DAXPY
-#define CPU_ASUM	DASUM
-#define CPU_IAMAX	IDAMAX
-
-#define PIVOT_THRESHHOLD	10e-10

+ 0 - 42
mpi/examples/mpi_lu/float.h

@@ -1,42 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#define TYPE float
-#define MPI_TYPE	MPI_FLOAT
-
-#define STARPU_PLU(name)       starpu_pslu_##name
-
-#define CUBLAS_GEMM	cublasSgemm
-#define CUBLAS_TRSM	cublasStrsm
-#define CUBLAS_SCAL	cublasSscal
-#define CUBLAS_GER	cublasSger
-#define CUBLAS_SWAP	cublasSswap
-#define CUBLAS_IAMAX	cublasIsamax
-
-#define CPU_GEMM	SGEMM
-#define CPU_GEMV	SGEMV
-#define CPU_TRSM	STRSM
-#define CPU_SCAL	SSCAL
-#define CPU_GER		SGER
-#define CPU_SWAP	SSWAP
-
-#define CPU_TRMM	STRMM
-#define CPU_AXPY	SAXPY
-#define CPU_ASUM	SASUM
-#define CPU_IAMAX	ISAMAX
-
-#define PIVOT_THRESHHOLD	10e-5

+ 1 - 1
mpi/examples/mpi_lu/pdlu.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "mpi_lu-double.h"
 #include "pxlu.c"

+ 1 - 1
mpi/examples/mpi_lu/pdlu_kernels.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "mpi_lu-double.h"
 #include "pxlu_kernels.c"

+ 1 - 1
mpi/examples/mpi_lu/plu_example_double.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "mpi_lu-double.h"
 #include "plu_example.c"

+ 1 - 1
mpi/examples/mpi_lu/plu_example_float.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "mpi_lu-float.h"
 #include "plu_example.c"

+ 1 - 1
mpi/examples/mpi_lu/plu_solve_double.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "mpi_lu-double.h"
 #include "plu_solve.c"

+ 1 - 1
mpi/examples/mpi_lu/plu_solve_float.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "mpi_lu-float.h"
 #include "plu_solve.c"

+ 1 - 1
mpi/examples/mpi_lu/pslu.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "mpi_lu-float.h"
 #include "pxlu.c"

+ 1 - 1
mpi/examples/mpi_lu/pslu_kernels.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "mpi_lu-float.h"
 #include "pxlu_kernels.c"

+ 1 - 1
mpi/examples/mpi_lu/slu_kernels.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "mpi_lu-float.h"
 #include "xlu_kernels.c"

+ 3 - 3
mpi/starpu_mpi_fxt.h

@@ -28,11 +28,11 @@
 
 #ifdef STARPU_USE_FXT
 #define TRACE_MPI_BARRIER(rank, worldsize, key)	\
-	FUT_DO_PROBE4(FUT_MPI_BARRIER, (rank), (worldsize), (key), syscall(SYS_gettid));
+	FUT_DO_PROBE4(FUT_MPI_BARRIER, (rank), (worldsize), (key), _starpu_gettid());
 #define TRACE_MPI_ISEND(dest, mpi_tag, size)	\
-	FUT_DO_PROBE4(FUT_MPI_ISEND, (dest), (mpi_tag), (size), syscall(SYS_gettid));
+	FUT_DO_PROBE4(FUT_MPI_ISEND, (dest), (mpi_tag), (size), _starpu_gettid());
 #define TRACE_MPI_IRECV_END(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_IRECV_END, (src), (mpi_tag), syscall(SYS_gettid));
+	FUT_DO_PROBE3(FUT_MPI_IRECV_END, (src), (mpi_tag), _starpu_gettid());
 #define TRACE
 #else
 #define TRACE_MPI_BARRIER(a, b, c)	do {} while(0);

+ 1 - 1
mpi/starpu_mpi_insert_task_cache.c

@@ -17,7 +17,7 @@
 
 #include <starpu_mpi_private.h>
 #include <starpu_mpi_insert_task_cache.h>
-#include <common/htable32.h>
+#include <common/htable64.h>
 
 typedef struct _starpu_mpi_clear_cache_s {
         starpu_data_handle_t data;

+ 5 - 2
socl/examples/Makefile.am

@@ -43,18 +43,21 @@ examplebin_PROGRAMS =
 
 examplebin_PROGRAMS +=		\
 	basic/basic		\
-	clinfo/clinfo
+	clinfo/clinfo \
+  matmul/matmul
 
 #	mandelbrot/mandelbrot
 
 SOCL_EXAMPLES +=		\
 	basic/basic		\
-	clinfo/clinfo
+	clinfo/clinfo\
+  matmul/matmul
 
 #	mandelbrot/mandelbrot
 
 basic_basic_SOURCES = basic/basic.c
 clinfo_clinfo_SOURCES = clinfo/clinfo.c
+matmul_matmul_SOURCES = matmul/matmul.c
 #mandelbrot_mandelbrot_SOURCES = mandelbrot/mandelbrot.c
 
 #mandelbrot_mandelbrot_CPPFLAGS = $(AM_CPPFLAGS) $(AM_CFLAGS)

+ 477 - 0
socl/examples/matmul/matmul.c

@@ -0,0 +1,477 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010,2011 University of Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <CL/cl.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <assert.h>
+#include <math.h>
+#include <sys/time.h>
+
+#define error(...) do { fprintf(stderr, "Error: " __VA_ARGS__); exit(EXIT_FAILURE); } while(0)
+#define check(exp) do { cl_int err = exp; if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): " #exp "\n", err); exit(EXIT_FAILURE); }} while(0)
+#define check2(exp) exp; if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): " #exp "\n", err); exit(EXIT_FAILURE); }
+
+// Thread block size
+#define BLOCK_SIZE 16  // Kernel thread-block size
+#define WORK_SIZE 64  // Kernel global size in lines of A (or C)
+#define TYPE float
+
+// Basic Matrix dimensions
+#define WA (128L * BLOCK_SIZE) // Matrix A width
+#define HA (512L * BLOCK_SIZE) // Matrix A height
+#define WB (128L * BLOCK_SIZE) // Matrix B width
+#define HB WA  // Matrix B height
+#define WC WB  // Matrix C width 
+#define HC HA  // Matrix C height
+#define BLOCKS (HA / WORK_SIZE)
+
+////////////////////////////////////////////////////////////////////////////////
+// declaration, forward
+void printDiff(TYPE*, TYPE*, int, int, int, TYPE);
+void computeReference(TYPE*, const TYPE*, const TYPE*, unsigned int, unsigned int, unsigned int);
+
+#define str(x) #x
+
+#define CODE "\
+#define TYPE float\n\
+__kernel void sgemmNN(int wa, int ha, int wb,  __global TYPE* A, __global TYPE* B, __global TYPE* C) {\n\
+#define BS 16\n\
+#define BLOCK_SIZE 16\n\
+  int bx = get_group_id(0);\n\
+  int by = get_group_id(1);\n\
+  \n\
+  int tx = get_local_id(0);\n\
+  int ty = get_local_id(1);\n\
+  \n\
+  int gx = get_global_id(0);\n\
+  int gy = get_global_id(1);\n\
+    __local float As[BS][BS+1];\
+    __local float Bs[BS][BS+1];\
+  \n\
+  unsigned int block_w = min(wb - bx * BLOCK_SIZE, BLOCK_SIZE);\n\
+  unsigned int block_h = min(ha - by * BLOCK_SIZE, BLOCK_SIZE);\n\
+  \n\
+  int valid = (gx < wb && gy < ha);\n\
+  \n\
+  TYPE Csub = (TYPE)0.0;\n\
+  \n\
+  int pos = 0;\n\
+  while (pos < wa) {\n\
+    unsigned int size = min(wa-pos, BLOCK_SIZE);\n\
+    if (tx < size && gy < ha)\n\
+      As[tx][ty] = A[pos + tx + wa * gy];\n\
+    if (ty < size && gx < wb)\n\
+      Bs[tx][ty] = B[gx + wb * (pos+ty)];\n\
+    \n\
+    barrier(CLK_LOCAL_MEM_FENCE);\n\
+    \n\
+    if (valid) {\n\
+      for (int k = 0; k < size; ++k)\n\
+        Csub += As[k][ty] * Bs[tx][k];\n\
+    }\n\
+    pos += size;\n\
+    barrier(CLK_LOCAL_MEM_FENCE);\n\
+  }\n\
+  \n\
+  if (valid)\n\
+    C[wb * gy + gx] = Csub;\n\
+}"
+
+static char * code =  CODE;
+
+int check = 0;
+
+static void __attribute__((unused)) parse_args(int argc, const char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-check") == 0)
+		{
+			check = 1;
+		}
+
+		if (strcmp(argv[i], "-h") == 0)
+		{
+			printf("usage : %s [-check]\n", argv[0]);
+		}
+	}
+}
+
+// Round Up Division function
+size_t roundUp(int group_size, int global_size) {
+	int r = global_size % group_size;
+	if(r == 0) {
+		return global_size;
+	} else {
+		return global_size + group_size - r;
+	}
+}
+
+void fillArray(TYPE* data, int size) {
+	int i;
+	const TYPE fScale = (TYPE)(1.0f / (float)RAND_MAX);
+	for (i = 0; i < size; ++i) {
+		data[i] = fScale * rand();
+	}
+}
+
+void printArray(float* data, int size) {
+	int i;
+	for (i = 0; i < size; ++i) {
+		printf("%d: %.3f\n", i, data[i]);
+	}
+}
+
+/**
+ * Compare two float arrays using L2-norm with an epsilon tolerance for equality
+ * @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+ * @param reference  handle to the reference data / gold image
+ * @param data       handle to the computed data
+ * @param len        number of elements in reference and data
+ * @param epsilon    epsilon to use for the comparison
+*/
+int shrCompareL2fe( const float* reference, const float* data, const unsigned int len, const float epsilon ) {
+	assert(epsilon >= 0);
+
+	float error = 0;
+	float ref = 0;
+
+	unsigned int i;
+	for(i = 0; i < len; ++i) {
+		float diff = reference[i] - data[i];
+		error += diff * diff;
+		ref += reference[i] * reference[i];
+	}
+
+	float normRef = sqrtf(ref);
+	if (fabs(ref) < 1e-7) {
+#ifdef _DEBUG
+		fprintf(stderr, "ERROR, reference l2-norm is 0\n");
+#endif
+		return 0;
+	}
+	float normError = sqrtf(error);
+	error = normError / normRef;
+	int result = error < epsilon;
+#ifdef _DEBUG
+	if( !result) {
+		fprintf(stderr, "ERROR, l2-norm error %d is greater than epsilon %lf \n", error, epsilon);
+	}
+#endif
+
+	return result;
+}
+
+
+int main(int argc, const char** argv) {
+	cl_uint platform_count;
+	cl_platform_id platforms[5];
+
+	cl_int err = CL_SUCCESS;
+	unsigned int i, p;
+
+	cl_device_type dev_type = CL_DEVICE_TYPE_ALL;
+
+	void * ptrs[BLOCKS];
+	cl_mem d_A[BLOCKS];
+	cl_mem d_C[BLOCKS];
+	cl_mem d_B[BLOCKS];
+
+	cl_event GPUDone[BLOCKS];
+	cl_event GPUExecution[BLOCKS];
+	struct timeval start, end;
+
+	int workOffset[BLOCKS];
+	int workSize[BLOCKS];
+
+	unsigned int sizePerGPU = HC / BLOCKS;
+	unsigned int sizeMod = HC % BLOCKS;
+
+	size_t A_size = WA * HA;
+	size_t A_mem_size = sizeof(TYPE) * A_size;
+	TYPE* A_data;
+
+	size_t B_size = WB * HB;
+	size_t B_mem_size = sizeof(TYPE) * B_size;
+	TYPE* B_data;
+
+	size_t C_size = WC * HC;
+	size_t C_mem_size = sizeof(TYPE) * C_size;
+	TYPE* C_data;
+
+	parse_args(argc, argv);
+
+	check(clGetPlatformIDs(5, platforms, &platform_count));
+	if (platform_count == 0) {
+		printf("No platform found\n");
+		exit(77);
+	}
+
+	cl_uint device_count;
+	cl_uint devs[platform_count];
+	cl_device_id * devices[platform_count];
+	cl_context ctx[platform_count];
+	cl_command_queue * commandQueue[platform_count];
+
+	device_count = 0;
+	for (p=0; p<platform_count; p++) {
+		cl_platform_id platform = platforms[p];
+
+		cl_int err = clGetDeviceIDs(platform, dev_type, 0, NULL, &devs[p]);
+		if (err == CL_DEVICE_NOT_FOUND) {
+			devs[p] = 0;
+			continue;
+		}
+		check(err);
+		if (devs[p] == 0)
+			continue;
+
+		devices[p] = (cl_device_id*)malloc(sizeof(cl_device_id) * devs[p]);
+		commandQueue[p] = (cl_command_queue*)malloc(sizeof(cl_command_queue) * devs[p]);
+
+		check(clGetDeviceIDs(platform, dev_type, devs[p], devices[p], NULL));
+
+		cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
+		check2(ctx[p] = clCreateContext(properties, devs[p], devices[p], NULL, NULL, &err));
+
+		for(i = 0; i < devs[p]; ++i) 
+		{
+			cl_device_id device = devices[p][i];
+			char name[2048];
+			name[0] = '\0';
+			clGetDeviceInfo(device, CL_DEVICE_NAME, 2048, name, NULL);
+			printf("Device %d: %s\n", i, name);
+
+			check2(commandQueue[p][i] = clCreateCommandQueue(ctx[p], device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err));
+		}
+
+		device_count += devs[p];
+	}
+
+	if (device_count == 0)
+		error("No device found\n");
+
+
+
+	cl_kernel multiplicationKernel[platform_count];
+
+	printf("\nUsing Matrix Sizes: A(%lu x %lu), B(%lu x %lu), C(%lu x %lu)\n", 
+			(unsigned long)WA, (unsigned long)HA, (unsigned long)WB, (unsigned long)HB, (unsigned long)WC, (unsigned long)HC);
+
+	// allocate host memory for matrices A, B and C
+	A_data = (TYPE*)malloc(A_mem_size);
+	if (A_data == NULL) {
+		perror("malloc");
+	}
+
+	B_data = (TYPE*)malloc(B_mem_size);
+	if (B_data == NULL) {
+		perror("malloc");
+	}
+
+	C_data = (TYPE*) malloc(C_mem_size);
+	if (C_data == NULL) {
+		perror("malloc");
+	}
+
+	cl_program program[platform_count];
+
+	for (p=0; p<platform_count; p++) {
+		if (devs[p] == 0)
+			continue;
+
+		check2(program[p] = clCreateProgramWithSource(ctx[p], 1, (const char **)&code, NULL, &err));
+
+		check(clBuildProgram(program[p], 0, NULL, NULL, NULL, NULL));
+
+		check2(multiplicationKernel[p] = clCreateKernel(program[p], "sgemmNN", &err));
+	}
+
+	printf("Initializing data...\n");
+	srand(2008);
+	fillArray(A_data, A_size);
+	fillArray(B_data, B_size);
+	memset(C_data, 0, C_size);
+
+
+	printf("Computing...\n");
+	workOffset[0] = 0;
+	gettimeofday(&start, NULL);
+
+	size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
+	int c = 0;
+	for (p=0; p<platform_count;p++) {
+		for (i=0; i<devs[p]; i++) {
+			check2(d_B[c] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY  | CL_MEM_USE_HOST_PTR, HB * WB * sizeof(TYPE), B_data, &err));
+			c++;
+		}
+	}
+
+	for(i=0; i < BLOCKS; ++i) 
+	{
+		int d = i % device_count;
+		cl_uint p = 0;
+
+		// determine device platform
+		int dev = d;
+		for (p = 0; p < platform_count; p++) {
+			if ((cl_int)(dev - devs[p]) < 0)
+				break;
+			dev -= devs[p];
+		}
+
+		workSize[i] = (i < sizeMod) ? sizePerGPU+1 : sizePerGPU;        
+
+		check2(d_A[i] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY  | CL_MEM_USE_HOST_PTR, workSize[i] * WA * sizeof(TYPE), &A_data[workOffset[i] * WA], &err));
+		check2(d_C[i] = clCreateBuffer(ctx[p], CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WC * sizeof(TYPE), &C_data[workOffset[i] * WC], &err));
+
+		check(clSetKernelArg(multiplicationKernel[p], 0, sizeof(cl_int), &workSize[i]));
+		check(clSetKernelArg(multiplicationKernel[p], 1, sizeof(cl_int), &workSize[i]));
+		check(clSetKernelArg(multiplicationKernel[p], 2, sizeof(cl_int), &workSize[i]));
+		check(clSetKernelArg(multiplicationKernel[p], 3, sizeof(cl_mem), (void *) &d_A[i]));
+		check(clSetKernelArg(multiplicationKernel[p], 4, sizeof(cl_mem), (void *) &d_B[d]));
+		check(clSetKernelArg(multiplicationKernel[p], 5, sizeof(cl_mem), (void *) &d_C[i]));
+
+		size_t globalWorkSize[] = {roundUp(BLOCK_SIZE,WC), roundUp(BLOCK_SIZE,workSize[i])};
+
+		check(clEnqueueNDRangeKernel(commandQueue[p][dev], multiplicationKernel[p], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]));
+
+		// Non-blocking copy of result from device to host
+		check2(ptrs[i] = clEnqueueMapBuffer(commandQueue[p][dev], d_C[i], CL_FALSE, CL_MAP_READ, 0, WC * sizeof(TYPE) * workSize[i], 1, &GPUExecution[i], &GPUDone[i], &err));
+
+		if(i+1 < BLOCKS)
+			workOffset[i + 1] = workOffset[i] + workSize[i];
+	}
+
+
+	// CPU sync with GPU
+	for (p=0; p<platform_count;p++) {
+		cl_uint dev;
+		for (dev=0; dev<devs[p]; dev++) {
+			clFinish(commandQueue[p][dev]);
+		}
+	}
+
+	gettimeofday(&end, NULL);
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+	double dSeconds = timing/1000/1000;
+	double dNumOps = 2.0 * (double)WA * (double)HA * (double)WB;
+	double gflops = 1.0e-9 * dNumOps/dSeconds;
+
+	printf("Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Blocks = %ld, Workgroup = %zu\n", 
+			gflops, dSeconds, dNumOps, device_count, BLOCKS, localWorkSize[0] * localWorkSize[1]);
+
+	for (i=0; i<device_count; i++) {
+		clReleaseMemObject(d_B[i]);
+	}
+
+	for(i = 0; i < BLOCKS; i++) 
+	{
+		clReleaseMemObject(d_A[i]);
+		clReleaseMemObject(d_C[i]);
+		clReleaseEvent(GPUExecution[i]);
+		clReleaseEvent(GPUDone[i]);
+	}
+
+
+	// compute reference solution
+	if (check) {
+		printf("Comparing results with CPU computation... ");
+		TYPE* reference = (TYPE*)malloc(C_mem_size);
+		computeReference(reference, A_data, B_data, HA, WA, WB);
+
+		// check result
+		int res = shrCompareL2fe(reference, C_data, C_size, 1.0e-6f);
+		if (res == 0) {
+			printf("\n\n");
+			printDiff(reference, C_data, WC, HC, 100, 1.0e-5f);
+		}
+		else printf("PASSED\n\n");
+		free(reference);
+	}
+
+	for (p=0; p<platform_count;p++) {
+		if (devs[p] == 0)
+			continue;
+
+		check(clReleaseKernel(multiplicationKernel[p]));
+		check(clReleaseProgram(program[p]));
+		check(clReleaseContext(ctx[p]));
+		cl_uint k;
+		for(k = 0; k < devs[p]; ++k) 
+		{
+			check(clReleaseCommandQueue(commandQueue[p][k]));
+		}
+	}
+
+	free(A_data);
+	free(B_data);
+	free(C_data);
+
+	return 0;
+}
+
+void printDiff(TYPE *data1, TYPE *data2, int width, int height, int listLength, TYPE listTol) {
+	printf("Listing first %d Differences > %.6f...\n", listLength, listTol);
+	int i,j,k;
+	int error_count=0;
+	for (j = 0; j < height; j++) {
+		if (error_count < listLength) {
+			printf("\n  Row %d:\n", j);
+		}
+		for (i = 0; i < width; i++) {
+			k = j * width + i;
+			float diff = fabs(data1[k] - data2[k]);
+			if (diff > listTol) {                
+				if (error_count < listLength) {
+					printf("    Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], diff);
+				}
+				error_count++;
+			}
+		}
+	}
+	printf(" \n  Total Errors = %d\n\n", error_count);
+}
+
+/**
+ * Compute reference data set
+ * C = A * B
+ * @param C          reference data, computed but preallocated
+ * @param A          matrix A as provided to device
+ * @param B          matrix B as provided to device
+ * @param hA         height of matrix A
+ * @param wB         width of matrix B
+*/
+void computeReference(TYPE* C, const TYPE* A, const TYPE* B, unsigned int hA, unsigned int wA, unsigned int wB) {
+	unsigned int i,j,k;
+	for (i = 0; i < hA; ++i)
+		for (j = 0; j < wB; ++j) {
+			double sum = 0;
+			for (k = 0; k < wA; ++k) {
+				double a = A[i * wA + k];
+				double b = B[k * wB + j];
+				sum += a * b;
+			}
+			C[i * wB + j] = (TYPE)sum;
+		}
+}
+

+ 1 - 0
socl/src/cl_createprogramwithsource.c

@@ -80,6 +80,7 @@ soclCreateProgramWithSource(cl_context      context,
       *errcode_ret = CL_SUCCESS;
 
    device_count = starpu_opencl_worker_get_count();
+   assert(device_count > 0);
    DEBUG_MSG("Worker count: %d\n", device_count);
 
    /* Check arguments */

+ 0 - 5
socl/src/cl_enqueuendrangekernel.c

@@ -117,11 +117,6 @@ static void cleaning_task_callback(void *args) {
 
 }
 
-static struct starpu_perfmodel perf_model = {
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "perf_model"
-};
-
 /**
  * Real kernel enqueuing command
  */

+ 3 - 1
socl/src/cl_getkernelworkgroupinfo.c

@@ -117,7 +117,9 @@ soclGetKernelWorkGroupInfo(cl_kernel                kernel,
          task->cl_arg = &data;
          task->cl_arg_size = sizeof(data);
          task->synchronous = 1;
-         starpu_task_submit(task);
+         int ret = starpu_task_submit(task);
+	if (ret != 0)
+		DEBUG_ERROR("Unable to submit a task. Error %d\n", ret);
          INFO_CASE_EX2(data.compile_work_group_size);
          }
       case CL_KERNEL_LOCAL_MEM_SIZE:

+ 4 - 1
socl/src/command_queue.c

@@ -58,8 +58,11 @@ void command_queue_dependencies_implicit(
 	 * Return dependencies
 	 *********************/
 
-	cl_event * evs = malloc(ndeps * sizeof(cl_event));
 	int n = 0;
+	cl_event * evs = NULL;
+	if (ndeps > 0)
+		evs = malloc(ndeps * sizeof(cl_event));
+
 
 	/* Add dependency to last barrier if applicable */
 	if (cq->barrier != NULL)

+ 1 - 1
socl/src/gc.c

@@ -112,7 +112,7 @@ int gc_entity_release_ex(entity e) {
   if (refs != 0)
     return 0;
 
-  DEBUG_MSG("Releasing entity %lx\n", e);
+  DEBUG_MSG("Releasing entity %p\n", e);
 
   GC_LOCK;
 

+ 0 - 3
socl/src/init.c

@@ -28,9 +28,6 @@ __attribute__((constructor)) static void socl_init() {
   struct starpu_conf conf;
   starpu_conf_init(&conf);
   conf.ncuda = 0;
-  putenv("STARPU_NCUDA=0");
-  putenv("STARPU_NOPENCL=1");
-  putenv("STARPU_NCPUS=1");
 
   mem_object_init();
 

+ 21 - 2
src/common/fxt.c

@@ -29,6 +29,12 @@
 #include <windows.h>
 #endif
 
+#ifdef __linux__
+#include <sys/syscall.h>   /* for SYS_gettid */
+#elif defined(__FreeBSD__)
+#include <sys/thr.h>       /* for thr_self() */
+#endif
+
 #define _STARPU_PROF_BUFFER_SIZE  (8*1024*1024)
 
 static char _STARPU_PROF_FILE_USER[128];
@@ -38,6 +44,19 @@ static int _starpu_written = 0;
 
 static int _starpu_id;
 
+long _starpu_gettid()
+{
+#if defined(__linux__)
+	return syscall(SYS_gettid);
+#elif defined(__FreeBSD__)
+	long tid;
+	thr_self(&tid);
+	return tid;
+#else
+	return (long) pthread_self();
+#endif
+}
+
 static void _starpu_profile_set_tracefile(void *last, ...)
 {
 	va_list vl;
@@ -78,7 +97,7 @@ void _starpu_start_fxt_profiling(void)
 		_starpu_profile_set_tracefile(NULL);
 	}
 
-	threadid = syscall(SYS_gettid);
+	threadid = _starpu_gettid();
 
 	atexit(_starpu_stop_fxt_profiling);
 
@@ -140,7 +159,7 @@ void _starpu_stop_fxt_profiling(void)
 
 void _starpu_fxt_register_thread(unsigned cpuid)
 {
-	FUT_DO_PROBE2(FUT_NEW_LWP_CODE, cpuid, syscall(SYS_gettid));
+	FUT_DO_PROBE2(FUT_NEW_LWP_CODE, cpuid, _starpu_gettid());
 }
 
 #endif // STARPU_USE_FXT

+ 37 - 36
src/common/fxt.h

@@ -103,10 +103,11 @@
 #define _STARPU_FUT_TASK_WAIT_FOR_ALL	0x513b
 
 #ifdef STARPU_USE_FXT
-#include <sys/syscall.h> /* pour les définitions de SYS_xxx */
 #include <fxt/fxt.h>
 #include <fxt/fut.h>
 
+long _starpu_gettid(void);
+
 /* Initialize the FxT library. */
 void _starpu_start_fxt_profiling(void);
 
@@ -178,13 +179,13 @@ do {									\
 
 /* workerkind = _STARPU_FUT_CPU_KEY for instance */
 #define _STARPU_TRACE_NEW_MEM_NODE(nodeid)			\
-	FUT_DO_PROBE2(_STARPU_FUT_NEW_MEM_NODE, nodeid, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_NEW_MEM_NODE, nodeid, _starpu_gettid());
 
 #define _STARPU_TRACE_WORKER_INIT_START(workerkind, devid, memnode)	\
-	FUT_DO_PROBE4(_STARPU_FUT_WORKER_INIT_START, workerkind, devid, memnode, syscall(SYS_gettid));
+	FUT_DO_PROBE4(_STARPU_FUT_WORKER_INIT_START, workerkind, devid, memnode, _starpu_gettid());
 
 #define _STARPU_TRACE_WORKER_INIT_END				\
-	FUT_DO_PROBE1(_STARPU_FUT_WORKER_INIT_END, syscall(SYS_gettid));
+	FUT_DO_PROBE1(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid());
 
 #define _STARPU_TRACE_START_CODELET_BODY(job)				\
 do {									\
@@ -192,10 +193,10 @@ do {									\
 	if (model_name)                                                 \
 	{								\
 		/* we include the symbol name */			\
-		_STARPU_FUT_DO_PROBE3STR(_STARPU_FUT_START_CODELET_BODY, (job), syscall(SYS_gettid), 1, model_name); \
+		_STARPU_FUT_DO_PROBE3STR(_STARPU_FUT_START_CODELET_BODY, (job), _starpu_gettid(), 1, model_name); \
 	}								\
 	else {                                                          \
-		FUT_DO_PROBE3(_STARPU_FUT_START_CODELET_BODY, (job), syscall(SYS_gettid), 0); \
+		FUT_DO_PROBE3(_STARPU_FUT_START_CODELET_BODY, (job), _starpu_gettid(), 0); \
 	}								\
 } while(0);
 
@@ -203,35 +204,35 @@ do {									\
 do {									\
 	const size_t job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));	\
 	const uint32_t job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));\
-	FUT_DO_PROBE5(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), (archtype), syscall(SYS_gettid));	\
+	FUT_DO_PROBE5(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), (archtype), _starpu_gettid());	\
 } while(0);
 
 #define _STARPU_TRACE_START_CALLBACK(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_START_CALLBACK, job, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_START_CALLBACK, job, _starpu_gettid());
 
 #define _STARPU_TRACE_END_CALLBACK(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_END_CALLBACK, job, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_END_CALLBACK, job, _starpu_gettid());
 
 #define _STARPU_TRACE_JOB_PUSH(task, prio)	\
-	FUT_DO_PROBE3(_STARPU_FUT_JOB_PUSH, task, prio, syscall(SYS_gettid));
+	FUT_DO_PROBE3(_STARPU_FUT_JOB_PUSH, task, prio, _starpu_gettid());
 
 #define _STARPU_TRACE_JOB_POP(task, prio)	\
-	FUT_DO_PROBE3(_STARPU_FUT_JOB_POP, task, prio, syscall(SYS_gettid));
+	FUT_DO_PROBE3(_STARPU_FUT_JOB_POP, task, prio, _starpu_gettid());
 
 #define _STARPU_TRACE_UPDATE_TASK_CNT(counter)	\
-	FUT_DO_PROBE2(_STARPU_FUT_UPDATE_TASK_CNT, counter, syscall(SYS_gettid))
+	FUT_DO_PROBE2(_STARPU_FUT_UPDATE_TASK_CNT, counter, _starpu_gettid())
 
 #define _STARPU_TRACE_START_FETCH_INPUT(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_START_FETCH_INPUT, job, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_START_FETCH_INPUT, job, _starpu_gettid());
 
 #define _STARPU_TRACE_END_FETCH_INPUT(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_END_FETCH_INPUT, job, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_END_FETCH_INPUT, job, _starpu_gettid());
 
 #define _STARPU_TRACE_START_PUSH_OUTPUT(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_START_PUSH_OUTPUT, job, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_START_PUSH_OUTPUT, job, _starpu_gettid());
 
 #define _STARPU_TRACE_END_PUSH_OUTPUT(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_END_PUSH_OUTPUT, job, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_END_PUSH_OUTPUT, job, _starpu_gettid());
 
 #define _STARPU_TRACE_TAG(tag, job)	\
 	FUT_DO_PROBE2(_STARPU_FUT_TAG, tag, (job)->job_id)
@@ -251,10 +252,10 @@ do {										\
         const char *model_name = _starpu_get_model_name((job));                       \
 	if (model_name)					                        \
 	{									\
-		_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_TASK_DONE, (job)->job_id, syscall(SYS_gettid), (long unsigned)exclude_from_dag, 1, model_name);\
+		_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_TASK_DONE, (job)->job_id, _starpu_gettid(), (long unsigned)exclude_from_dag, 1, model_name);\
 	}									\
 	else {									\
-		FUT_DO_PROBE4(_STARPU_FUT_TASK_DONE, (job)->job_id, syscall(SYS_gettid), (long unsigned)exclude_from_dag, 0);\
+		FUT_DO_PROBE4(_STARPU_FUT_TASK_DONE, (job)->job_id, _starpu_gettid(), (long unsigned)exclude_from_dag, 0);\
 	}									\
 } while(0);
 
@@ -264,10 +265,10 @@ do {										\
         const char *model_name = _starpu_get_model_name((job));                       \
 	if (model_name)                                                         \
 	{									\
-          _STARPU_FUT_DO_PROBE3STR(_STARPU_FUT_TAG_DONE, (tag)->id, syscall(SYS_gettid), 1, model_name); \
+          _STARPU_FUT_DO_PROBE3STR(_STARPU_FUT_TAG_DONE, (tag)->id, _starpu_gettid(), 1, model_name); \
 	}									\
 	else {									\
-		FUT_DO_PROBE3(_STARPU_FUT_TAG_DONE, (tag)->id, syscall(SYS_gettid), 0);\
+		FUT_DO_PROBE3(_STARPU_FUT_TAG_DONE, (tag)->id, _starpu_gettid(), 0);\
 	}									\
 } while(0);
 
@@ -290,56 +291,56 @@ do {										\
 	FUT_DO_PROBE2(_STARPU_FUT_WORK_STEALING, empty_q, victim_q)
 
 #define _STARPU_TRACE_WORKER_DEINIT_START			\
-	FUT_DO_PROBE1(_STARPU_FUT_WORKER_DEINIT_START, syscall(SYS_gettid));
+	FUT_DO_PROBE1(_STARPU_FUT_WORKER_DEINIT_START, _starpu_gettid());
 
 #define _STARPU_TRACE_WORKER_DEINIT_END(workerkind)		\
-	FUT_DO_PROBE2(_STARPU_FUT_WORKER_DEINIT_END, workerkind, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_WORKER_DEINIT_END, workerkind, _starpu_gettid());
 
 #define _STARPU_TRACE_WORKER_SLEEP_START	\
-	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SLEEP_START, syscall(SYS_gettid));
+	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SLEEP_START, _starpu_gettid());
 
 #define _STARPU_TRACE_WORKER_SLEEP_END	\
-	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SLEEP_END, syscall(SYS_gettid));
+	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SLEEP_END, _starpu_gettid());
 
 #define _STARPU_TRACE_USER_DEFINED_START	\
-	FUT_DO_PROBE1(_STARPU_FUT_USER_DEFINED_START, syscall(SYS_gettid));
+	FUT_DO_PROBE1(_STARPU_FUT_USER_DEFINED_START, _starpu_gettid());
 
 #define _STARPU_TRACE_USER_DEFINED_END		\
-	FUT_DO_PROBE1(_STARPU_FUT_USER_DEFINED_END, syscall(SYS_gettid));
+	FUT_DO_PROBE1(_STARPU_FUT_USER_DEFINED_END, _starpu_gettid());
 
 #define _STARPU_TRACE_START_ALLOC(memnode)		\
-	FUT_DO_PROBE2(_STARPU_FUT_START_ALLOC, memnode, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_START_ALLOC, memnode, _starpu_gettid());
 	
 #define _STARPU_TRACE_END_ALLOC(memnode)		\
-	FUT_DO_PROBE2(_STARPU_FUT_END_ALLOC, memnode, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_END_ALLOC, memnode, _starpu_gettid());
 
 #define _STARPU_TRACE_START_ALLOC_REUSE(memnode)		\
-	FUT_DO_PROBE2(_STARPU_FUT_START_ALLOC_REUSE, memnode, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_START_ALLOC_REUSE, memnode, _starpu_gettid());
 	
 #define _STARPU_TRACE_END_ALLOC_REUSE(memnode)		\
-	FUT_DO_PROBE2(_STARPU_FUT_END_ALLOC_REUSE, memnode, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_END_ALLOC_REUSE, memnode, _starpu_gettid());
 	
 #define _STARPU_TRACE_START_MEMRECLAIM(memnode)		\
-	FUT_DO_PROBE2(_STARPU_FUT_START_MEMRECLAIM, memnode, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_START_MEMRECLAIM, memnode, _starpu_gettid());
 	
 #define _STARPU_TRACE_END_MEMRECLAIM(memnode)		\
-	FUT_DO_PROBE2(_STARPU_FUT_END_MEMRECLAIM, memnode, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_END_MEMRECLAIM, memnode, _starpu_gettid());
 	
 /* We skip these events becasue they are called so often that they cause FxT to
  * fail and make the overall trace unreadable anyway. */
 #define _STARPU_TRACE_START_PROGRESS(memnode)		\
 	do {} while (0);
-//	FUT_DO_PROBE2(_STARPU_FUT_START_PROGRESS, memnode, syscall(SYS_gettid));
+//	FUT_DO_PROBE2(_STARPU_FUT_START_PROGRESS, memnode, _starpu_gettid());
 
 #define _STARPU_TRACE_END_PROGRESS(memnode)		\
 	do {} while (0);
-	//FUT_DO_PROBE2(_STARPU_FUT_END_PROGRESS, memnode, syscall(SYS_gettid));
+	//FUT_DO_PROBE2(_STARPU_FUT_END_PROGRESS, memnode, _starpu_gettid());
 	
 #define _STARPU_TRACE_USER_EVENT(code)			\
-	FUT_DO_PROBE2(_STARPU_FUT_USER_EVENT, code, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_USER_EVENT, code, _starpu_gettid());
 
 #define _STARPU_TRACE_SET_PROFILING(status)		\
-	FUT_DO_PROBE2(_STARPU_FUT_SET_PROFILING, status, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_SET_PROFILING, status, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_WAIT_FOR_ALL			\
 	FUT_DO_PROBE0(_STARPU_FUT_TASK_WAIT_FOR_ALL)

+ 1 - 1
src/common/htable32.h

@@ -23,7 +23,7 @@
 #include <stdio.h>
 #include <assert.h>
 
-#define _STARPU_HTBL32_NODE_SIZE	16
+#define _STARPU_HTBL32_NODE_SIZE	8
 
 /* Hierarchical table: all nodes have a 2^16 arity . */
 /* Note: this struct is used in include/starpu_perfmodel.h */

+ 1 - 1
src/common/starpu_spinlock.c

@@ -26,7 +26,7 @@ int _starpu_spin_init(struct _starpu_spinlock *lock)
 //	memcpy(&lock->errcheck_lock, PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP, sizeof(PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP));
 	int ret;
 	ret = pthread_mutexattr_init(&lock->errcheck_attr);
-	STARPU_ASSERT(!ret);
+	STARPU_CHECK_RETURN_VALUE(ret, "pthread_mutexattr_init");
 
 	ret = pthread_mutexattr_settype(&lock->errcheck_attr, PTHREAD_MUTEX_ERRORCHECK);
 	STARPU_ASSERT(!ret);

+ 41 - 1
src/common/utils.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -75,6 +75,32 @@ out:
 	return rv;
 }
 
+void _starpu_mkpath_and_check(const char *path, mode_t mode)
+{
+	int ret;
+
+	ret = _starpu_mkpath(path, mode);
+
+	if (ret == -1)
+	{
+		if (errno != EEXIST)
+		{
+			fprintf(stderr,"Error making StarPU directory %s:\n", path);
+			perror("mkdir");
+			STARPU_ASSERT(0);
+		}
+
+		/* make sure that it is actually a directory */
+		struct stat sb;
+		stat(path, &sb);
+		if (!S_ISDIR(sb.st_mode))
+		{
+			fprintf(stderr,"Error: %s is not a directory:\n", path);
+			STARPU_ASSERT(0);
+		}
+	}
+}
+
 int _starpu_check_mutex_deadlock(pthread_mutex_t *mutex)
 {
 	int ret;
@@ -92,3 +118,17 @@ int _starpu_check_mutex_deadlock(pthread_mutex_t *mutex)
 
 	return 1;
 }
+
+char *_starpu_get_home_path()
+{
+	char *path = getenv("XDG_CACHE_HOME");
+	if (!path)
+		path = getenv("STARPU_HOME");
+	if (!path)
+		path = getenv("HOME");
+	if (!path)
+		path = getenv("USERPROFILE");
+	if (!path)
+		_STARPU_ERROR("couldn't find a home place to put starpu data\n");
+	return path;
+}

+ 2 - 0
src/common/utils.h

@@ -54,7 +54,9 @@
 #define _STARPU_IS_ZERO(a) (fpclassify(a) == FP_ZERO)
 
 int _starpu_mkpath(const char *s, mode_t mode);
+void _starpu_mkpath_and_check(const char *s, mode_t mode);
 int _starpu_check_mutex_deadlock(pthread_mutex_t *mutex);
+char *_starpu_get_home_path();
 
 /* If FILE is currently on a comment line, eat it.  */
 void _starpu_drop_comments(FILE *f);

+ 12 - 16
src/core/dependencies/data_concurrency.c

@@ -260,8 +260,11 @@ static unsigned unlock_one_requester(struct _starpu_data_requester *r)
 		return 0;
 }
 
-/* The header lock must already be taken by the caller */
-void _starpu_notify_data_dependencies(starpu_data_handle_t handle)
+/* The header lock must already be taken by the caller.
+ * This may free the handle if it was lazily unregistered (1 is returned in
+ * that case). The handle pointer thus becomes invalid for the caller.
+ */
+int _starpu_notify_data_dependencies(starpu_data_handle_t handle)
 {
 	_starpu_spin_checklocked(&handle->header_lock);
 	/* A data access has finished so we remove a reference. */
@@ -269,19 +272,9 @@ void _starpu_notify_data_dependencies(starpu_data_handle_t handle)
 	handle->refcnt--;
 	STARPU_ASSERT(handle->busy_count > 0);
 	handle->busy_count--;
-	_starpu_data_check_not_busy(handle);
-
-	/* The handle has been destroyed in between (eg. this was a temporary
-	 * handle created for a reduction.) */
-	if (handle->lazy_unregister && handle->refcnt == 0)
-	{
-		_starpu_spin_unlock(&handle->header_lock);
-		starpu_data_unregister_no_coherency(handle);
-		/* Warning: in case we unregister the handle, we must be sure
-		 * that the caller will not try to unlock the header after
-		 * !*/
-		return;
-	}
+	if (_starpu_data_check_not_busy(handle))
+		/* Handle was destroyed, nothing left to do.  */
+		return 1;
 
 	/* In case there is a pending reduction, and that this is the last
 	 * requester, we may go back to a "normal" coherency model. */
@@ -358,7 +351,10 @@ void _starpu_notify_data_dependencies(starpu_data_handle_t handle)
 			_starpu_spin_lock(&handle->header_lock);
 			STARPU_ASSERT(handle->busy_count > 0);
 			handle->busy_count--;
-			_starpu_data_check_not_busy(handle);
+			if (_starpu_data_check_not_busy(handle))
+				return 1;
 		}
 	}
+
+	return 0;
 }

+ 2 - 2
src/core/dependencies/data_concurrency.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -22,7 +22,7 @@
 
 unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j);
 
-void _starpu_notify_data_dependencies(starpu_data_handle_t handle);
+int _starpu_notify_data_dependencies(starpu_data_handle_t handle);
 
 unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle_t handle,
 							  enum starpu_access_mode mode,

+ 2 - 2
src/core/dependencies/htable.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -28,7 +28,7 @@
 #include <assert.h>
 #include <core/dependencies/tags.h>
 
-#define _STARPU_HTBL_NODE_SIZE	16
+#define _STARPU_HTBL_NODE_SIZE	8
 
 struct _starpu_htbl_node
 {

+ 35 - 0
src/core/dependencies/implicit_data_deps.c

@@ -405,6 +405,41 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 }
 
+/* This is the same as _starpu_release_data_enforce_sequential_consistency, but
+ * for all data of a task */
+void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j)
+{
+	struct starpu_task *task = j->task;
+        struct starpu_buffer_descr *descrs = j->ordered_buffers;
+
+	if (!task->cl)
+		return;
+
+        unsigned nbuffers = task->cl->nbuffers;
+
+	unsigned index;
+	for (index = 0; index < nbuffers; index++)
+	{
+		starpu_data_handle_t handle = descrs[index].handle;
+
+		if (index && descrs[index-1].handle == descrs[index].handle)
+			/* We have already released this data, skip it. This
+			 * depends on ordering putting writes before reads, see
+			 * _starpu_compar_handles */
+			continue;
+
+		_starpu_release_data_enforce_sequential_consistency(task, handle);
+		/* Release the reference acquired in _starpu_push_task_output */
+		_starpu_spin_lock(&handle->header_lock);
+		STARPU_ASSERT(handle->busy_count > 0);
+		handle->busy_count--;
+		if (!_starpu_data_check_not_busy(handle))
+			_starpu_spin_unlock(&handle->header_lock);
+
+	}
+}
+
+
 void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle_t handle)
 {
         _STARPU_LOG_IN();

+ 1 - 0
src/core/dependencies/implicit_data_deps.h

@@ -25,6 +25,7 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 						   starpu_data_handle_t handle, enum starpu_access_mode mode);
 void _starpu_detect_implicit_data_deps(struct starpu_task *task);
 void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *task, starpu_data_handle_t handle);
+void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j);
 
 void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle_t handle);
 void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle);

+ 3 - 0
src/core/jobs.c

@@ -162,6 +162,9 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 
+	/* Tell other tasks that we don't exist any more, thus no need for
+	 * implicit dependencies any more.  */
+	_starpu_release_task_enforce_sequential_consistency(j);
 	/* Task does not have a cl, but has explicit data dependencies, we need
 	 * to tell them that we will not exist any more before notifying the
 	 * tasks waiting for us */

+ 7 - 86
src/core/perfmodel/perfmodel.c

@@ -225,7 +225,7 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 	unsigned i;
 	int err;
 	double sum = 0.0;
-	int node, cpu_node;
+	int node;
 
 	/* We need to get one node per archtype. This is kinda ugly,
 	 * but it does the job.
@@ -233,7 +233,7 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 	 * (err != 1 && err != -ERANGE)
 	 */
 #ifdef STARPU_USE_CPU
-	int cpu_worker;
+	int cpu_worker, cpu_node;
 	err = starpu_worker_get_ids_by_type(STARPU_CPU_WORKER,
 					    &cpu_worker, 1);
 	if (err != 1 && err != -ERANGE)
@@ -462,16 +462,7 @@ void _starpu_get_perf_model_dir(char *path, size_t maxlen)
 	/* use the directory specified at configure time */
 	snprintf(path, maxlen, "%s", STARPU_PERF_MODEL_DIR);
 #else
-	const char *home_path = getenv("XDG_CACHE_HOME");
-	if (!home_path)
-		home_path = getenv("STARPU_HOME");
-	if (!home_path)
-		home_path = getenv("HOME");
-	if (!home_path)
-		home_path = getenv("USERPROFILE");
-	if (!home_path)
-		_STARPU_ERROR("couldn't find a home place to put starpu data\n");
-	snprintf(path, maxlen, "%s/.starpu/sampling/", home_path);
+	snprintf(path, maxlen, "%s/.starpu/sampling/", _starpu_get_home_path());
 #endif
 }
 
@@ -508,93 +499,23 @@ void _starpu_create_sampling_directory_if_needed(void)
 		   may not be safe: it is possible that the permission are
 		   changed in between. Instead, we create it and check if
 		   it already existed before */
-		int ret;
-		ret = _starpu_mkpath(perf_model_dir, S_IRWXU);
+		_starpu_mkpath_and_check(perf_model_dir, S_IRWXU);
 
-		if (ret == -1)
-		{
-			if (errno != EEXIST) {
-				fprintf(stderr,"Error making starpu directory %s:\n", perf_model_dir);
-				perror("mkdir");
-				STARPU_ASSERT(0);
-			}
-
-			/* make sure that it is actually a directory */
-			struct stat sb;
-			stat(perf_model_dir, &sb);
-			if (!S_ISDIR(sb.st_mode)) {
-				fprintf(stderr,"Error: %s is not a directory:\n", perf_model_dir);
-				STARPU_ASSERT(0);
-			}
-		}
 
 		/* Per-task performance models */
 		char perf_model_dir_codelets[256];
 		_starpu_get_perf_model_dir_codelets(perf_model_dir_codelets, 256);
-
-		ret = _starpu_mkpath(perf_model_dir_codelets, S_IRWXU);
-		if (ret == -1)
-		{
-			if (errno != EEXIST) {
-				fprintf(stderr,"Error making starpu directory %s:\n", perf_model_dir);
-				perror("mkdir");
-				STARPU_ASSERT(0);
-			}
-
-
-			/* make sure that it is actually a directory */
-			struct stat sb;
-			stat(perf_model_dir_codelets, &sb);
-			if (!S_ISDIR(sb.st_mode)) {
-				fprintf(stderr,"Error: %s is not a directory:\n", perf_model_dir);
-				STARPU_ASSERT(0);
-			}
-		}
+		_starpu_mkpath_and_check(perf_model_dir_codelets, S_IRWXU);
 
 		/* Performance of the memory subsystem */
 		char perf_model_dir_bus[256];
 		_starpu_get_perf_model_dir_bus(perf_model_dir_bus, 256);
-
-		ret = _starpu_mkpath(perf_model_dir_bus, S_IRWXU);
-		if (ret == -1)
-		{
-			if (errno != EEXIST) {
-				fprintf(stderr,"Error making starpu directory %s:\n", perf_model_dir);
-				perror("mkdir");
-				STARPU_ASSERT(0);
-			}
-
-			/* make sure that it is actually a directory */
-			struct stat sb;
-			stat(perf_model_dir_bus, &sb);
-			if (!S_ISDIR(sb.st_mode)) {
-				fprintf(stderr,"Error: %s is not a directory:\n", perf_model_dir);
-				STARPU_ASSERT(0);
-			}
-		}
+		_starpu_mkpath_and_check(perf_model_dir_bus, S_IRWXU);
 
 		/* Performance debug measurements */
 		char perf_model_dir_debug[256];
 		_starpu_get_perf_model_dir_debug(perf_model_dir_debug, 256);
-
-		ret = _starpu_mkpath(perf_model_dir_debug, S_IRWXU);
-		if (ret == -1)
-		{
-			if (errno != EEXIST) {
-				fprintf(stderr,"Error making starpu directory %s:\n", perf_model_dir);
-				perror("mkdir");
-				STARPU_ASSERT(0);
-			}
-
-
-			/* make sure that it is actually a directory */
-			struct stat sb;
-			stat(perf_model_dir_debug, &sb);
-			if (!S_ISDIR(sb.st_mode)) {
-				fprintf(stderr,"Error: %s is not a directory:\n", perf_model_dir);
-				STARPU_ASSERT(0);
-			}
-		}
+		_starpu_mkpath_and_check(perf_model_dir_debug, S_IRWXU);
 
 		directory_existence_was_tested = 1;
 	}

+ 54 - 15
src/core/perfmodel/perfmodel_bus.c

@@ -363,7 +363,7 @@ static int find_numa_node(hwloc_obj_t obj)
 }
 #endif
 
-static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *dev_timing_per_cpu, char type)
+static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *dev_timing_per_cpu, char *type)
 {
 	/* Either we have hwloc and we measure the bandwith between each GPU
 	 * and each NUMA node, or we don't have such NUMA information and we
@@ -423,11 +423,11 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
 #endif
 
 #ifdef STARPU_USE_CUDA
-                if (type == 'C')
+                if (strncmp(type, "CUDA", 4) == 0)
                         measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(dev, cpu, dev_timing_per_cpu);
 #endif
 #ifdef STARPU_USE_OPENCL
-                if (type == 'O')
+                if (strncmp(type, "OpenCL", 6) == 0)
                         measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(dev, cpu, dev_timing_per_cpu);
 #endif
 
@@ -456,7 +456,7 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
 }
 
 static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_htod, double *dev_timing_dtoh,
-                                                   struct dev_timing *dev_timing_per_cpu, char type)
+                                                   struct dev_timing *dev_timing_per_cpu, char *type)
 {
 	measure_bandwidth_between_cpus_and_dev(dev, dev_timing_per_cpu, type);
 
@@ -465,6 +465,9 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
               sizeof(struct dev_timing),
 			compar_dev_timing);
 
+#ifdef STARPU_DEVEL
+#  warning save timing_dtoh and timing_htod data to display them when calling starpu_machine_display ? (Brice would like that)
+#endif
 #ifdef STARPU_VERBOSE
         unsigned cpu;
 	for (cpu = 0; cpu < ncpus; cpu++)
@@ -475,12 +478,12 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
 
 		double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
 
-		_STARPU_DISP("BANDWIDTH GPU %d CPU %u - htod %f - dtoh %f - %f\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
+		_STARPU_DISP("(%10s) BANDWIDTH GPU %d CPU %u - htod %f - dtoh %f - %f\n", type, dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
 	}
 
 	unsigned best_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].cpu_id;
 
-	_STARPU_DISP("BANDWIDTH GPU %d BEST CPU %u\n", dev, best_cpu);
+	_STARPU_DISP("(%10s) BANDWIDTH GPU %d BEST CPU %u\n", type, dev, best_cpu);
 #endif
 
 	/* The results are sorted in a decreasing order, so that the best
@@ -529,16 +532,16 @@ static void benchmark_all_gpu_devices(void)
 	ncuda = _starpu_get_cuda_device_count();
 	for (i = 0; i < ncuda; i++)
 	{
-		fprintf(stderr," CUDA %d...", i);
+		_STARPU_DISP("CUDA %d...\n", i);
 		/* measure bandwidth between Host and Device i */
-		measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, 'C');
+		measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, "CUDA");
 	}
 #ifdef HAVE_CUDA_MEMCPY_PEER
 	for (i = 0; i < ncuda; i++)
 		for (j = 0; j < ncuda; j++)
 			if (i != j)
 			{
-				fprintf(stderr," CUDA %d -> %d...", i, j);
+				_STARPU_DISP("CUDA %d -> %d...\n", i, j);
 				/* measure bandwidth between Host and Device i */
 				measure_bandwidth_between_dev_and_dev_cuda(i, j);
 			}
@@ -548,9 +551,9 @@ static void benchmark_all_gpu_devices(void)
         nopencl = _starpu_opencl_get_device_count();
 	for (i = 0; i < nopencl; i++)
 	{
-		fprintf(stderr," OpenCL %d...", i);
+		_STARPU_DISP("OpenCL %d...\n", i);
 		/* measure bandwith between Host and Device i */
-		measure_bandwidth_between_host_and_dev(i, opencldev_timing_htod, opencldev_timing_dtoh, opencldev_timing_per_cpu, 'O');
+		measure_bandwidth_between_host_and_dev(i, opencldev_timing_htod, opencldev_timing_dtoh, opencldev_timing_per_cpu, "OpenCL");
 	}
 #endif
 
@@ -579,7 +582,6 @@ static void benchmark_all_gpu_devices(void)
 static void get_bus_path(const char *type, char *path, size_t maxlen)
 {
 	_starpu_get_perf_model_dir_bus(path, maxlen);
-	strncat(path, type, maxlen);
 
 	char hostname[32];
 	char *forced_hostname = getenv("STARPU_HOSTNAME");
@@ -587,8 +589,9 @@ static void get_bus_path(const char *type, char *path, size_t maxlen)
 		snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
 	else
 		gethostname(hostname, sizeof(hostname));
-	strncat(path, ".", maxlen);
 	strncat(path, hostname, maxlen);
+	strncat(path, ".", maxlen);
+	strncat(path, type, maxlen);
 }
 
 /*
@@ -766,6 +769,42 @@ int *_starpu_get_opencl_affinity_vector(unsigned gpuid)
 }
 #endif /* STARPU_USE_OPENCL */
 
+void starpu_bus_print_affinity(FILE *f)
+{
+	unsigned cpu;
+	int gpu;
+
+	fprintf(f, "# GPU\t");
+	for(cpu = 0 ; cpu < ncpus ; cpu++)
+	{
+		fprintf(f, "CPU%u\t", cpu);
+	}
+	fprintf(f, "\n");
+
+#ifdef STARPU_USE_CUDA
+	for(gpu = 0 ; gpu<ncuda ; gpu++)
+	{
+		fprintf(f, "%d\t", gpu);
+		for (cpu = 0; cpu < ncpus; cpu++)
+		{
+			fprintf(f, "%d\t", cuda_affinity_matrix[gpu][cpu]);
+		}
+		fprintf(f, "\n");
+	}
+#endif
+#ifdef STARPU_USE_OPENCL
+	for(gpu = 0 ; gpu<nopencl ; gpu++)
+	{
+		fprintf(f, "%d\t", gpu);
+		for (cpu = 0; cpu < ncpus; cpu++)
+		{
+			fprintf(f, "%d\t", opencl_affinity_matrix[gpu][cpu]);
+		}
+		fprintf(f, "\n");
+	}
+#endif
+}
+
 /*
  *	Latency
  */
@@ -1114,9 +1153,9 @@ static void check_bus_config_file()
         res = access(path, F_OK);
         if (res)
 	{
-		fprintf(stderr, "No performance model for the bus, calibrating...");
+		_STARPU_DISP("No performance model for the bus, calibrating...\n");
 		starpu_force_bus_sampling();
-		fprintf(stderr, "done\n");
+		_STARPU_DISP("... done\n");
         }
         else
 	{

+ 2 - 2
src/core/perfmodel/perfmodel_history.c

@@ -219,7 +219,7 @@ static void parse_arch(FILE *f, struct starpu_perfmodel *model, unsigned scan_hi
 		_STARPU_DEBUG("Parsing arch %u\n", arch);
 		_starpu_drop_comments(f);
 		ret = fscanf(f, "%d\n", &nimpls);
-		_STARPU_DEBUG("%u implementations\n", nimpls);
+		_STARPU_DEBUG("%d implementations\n", nimpls);
 		STARPU_ASSERT(ret == 1);
 		implmax = STARPU_MIN(nimpls, STARPU_MAXIMPLEMENTATIONS);
 		skipimpl = nimpls - STARPU_MAXIMPLEMENTATIONS;
@@ -243,7 +243,7 @@ static void parse_arch(FILE *f, struct starpu_perfmodel *model, unsigned scan_hi
 		{
 			_STARPU_DEBUG("skipping arch %u\n", arch);
 			ret = fscanf(f, "%d\n", &nimpls);
-			_STARPU_DEBUG("%u implementations\n", nimpls);
+			_STARPU_DEBUG("%d implementations\n", nimpls);
 			STARPU_ASSERT(ret == 1);
 			implmax = STARPU_MIN(nimpls, STARPU_MAXIMPLEMENTATIONS);
 			skipimpl = nimpls - STARPU_MAXIMPLEMENTATIONS;

+ 3 - 10
src/core/sched_policy.c

@@ -47,7 +47,7 @@ extern struct starpu_sched_policy _starpu_sched_dmda_sorted_policy;
 extern struct starpu_sched_policy _starpu_sched_eager_policy;
 extern struct starpu_sched_policy _starpu_sched_parallel_heft_policy;
 extern struct starpu_sched_policy _starpu_sched_pgreedy_policy;
-extern struct starpu_sched_policy heft_policy;
+extern struct starpu_sched_policy _starpu_sched_heft_policy;
 
 static struct starpu_sched_policy *predefined_policies[] =
 {
@@ -55,7 +55,7 @@ static struct starpu_sched_policy *predefined_policies[] =
 	&_starpu_sched_prio_policy,
 	&_starpu_sched_dm_policy,
 	&_starpu_sched_dmda_policy,
-	&heft_policy,
+	&_starpu_sched_heft_policy,
 	&_starpu_sched_dmda_ready_policy,
 	&_starpu_sched_dmda_sorted_policy,
 	&_starpu_sched_random_policy,
@@ -88,14 +88,7 @@ static void load_sched_policy(struct starpu_sched_policy *sched_policy)
 	}
 #endif
 
-	policy.init_sched = sched_policy->init_sched;
-	policy.deinit_sched = sched_policy->deinit_sched;
-	policy.push_task = sched_policy->push_task;
-	policy.push_task_notify = sched_policy->push_task_notify;
-	policy.pop_task = sched_policy->pop_task;
-        policy.pre_exec_hook = sched_policy->pre_exec_hook;
-        policy.post_exec_hook = sched_policy->post_exec_hook;
-	policy.pop_every_task = sched_policy->pop_every_task;
+	memcpy(&policy, sched_policy, sizeof(policy));
 }
 
 static struct starpu_sched_policy *find_sched_policy_from_name(const char *policy_name)

+ 0 - 0
src/core/task.c


Some files were not shown because too many files changed in this diff