Browse Source

Merge from trunk

Corentin Salingue 8 years ago
parent
commit
333e34a4b4
100 changed files with 2501 additions and 584 deletions
  1. 3 0
      ChangeLog
  2. 11 1
      Makefile.am
  3. 93 21
      configure.ac
  4. 143 3
      doc/doxygen/chapters/310_data_management.doxy
  5. 3 3
      doc/doxygen/chapters/330_scheduling_contexts.doxy
  6. 4 2
      doc/doxygen/chapters/401_out_of_core.doxy
  7. 31 20
      doc/doxygen/chapters/410_mpi_support.doxy
  8. 3 2
      doc/doxygen/chapters/501_environment_variables.doxy
  9. 14 0
      doc/doxygen/chapters/510_configure_options.doxy
  10. 85 12
      doc/doxygen/chapters/api/data_interfaces.doxy
  11. 12 2
      doc/doxygen/chapters/api/data_out_of_core.doxy
  12. 28 5
      doc/doxygen/chapters/api/data_partition.doxy
  13. 4 0
      doc/doxygen/chapters/api/initialization.doxy
  14. 1 1
      doc/doxygen/chapters/api/mpi.doxy
  15. 2 1
      doc/doxygen/chapters/api/scheduling_contexts.doxy
  16. 9 5
      doc/doxygen/chapters/api/scheduling_policy.doxy
  17. 7 0
      doc/doxygen/chapters/api/workers.doxy
  18. 4 0
      examples/Makefile.am
  19. 6 4
      examples/audio/starpu_audio_processing.c
  20. 2 4
      examples/basic_examples/variable.c
  21. 19 10
      examples/heat/dw_sparse_cg.c
  22. 20 17
      examples/mlr/mlr.c
  23. 2 2
      examples/sched_ctx/two_cpu_contexts.c
  24. 13 13
      examples/sched_ctx_utils/sched_ctx_utils.c
  25. 2 2
      examples/spmv/dw_block_spmv.c
  26. 6 5
      examples/spmv/spmv.c
  27. 11 0
      include/fstarpu_mod.f90
  28. 2 2
      include/starpu.h
  29. 2 0
      include/starpu_config.h.in
  30. 2 1
      include/starpu_data_filters.h
  31. 59 16
      include/starpu_data_interfaces.h
  32. 3 0
      include/starpu_disk.h
  33. 2 2
      include/starpu_mpi_ms.h
  34. 1 0
      include/starpu_perfmodel.h
  35. 0 3
      include/starpu_scheduler.h
  36. 11 1
      include/starpu_thread_util.h
  37. 35 4
      include/starpu_util.h
  38. 8 0
      include/starpu_worker.h
  39. 1 1
      mpi/examples/mpi_lu/pxlu.c
  40. 3 2
      mpi/examples/mpi_lu/pxlu.h
  41. 2 2
      mpi/src/load_balancer/policy/load_heat_propagation.c
  42. 13 2
      mpi/src/starpu_mpi.c
  43. 18 10
      mpi/src/starpu_mpi_datatype.c
  44. 24 0
      mpi/src/starpu_mpi_fxt.h
  45. 1 1
      mpi/src/starpu_mpi_private.h
  46. 2 2
      mpi/src/starpu_mpi_task_insert.c
  47. 1 1
      mpi/tests/block_interface.c
  48. 2 2
      mpi/tests/block_interface_pinned.c
  49. 360 117
      mpi/tests/datatypes.c
  50. 3 1
      mpi/tests/insert_task_compute.c
  51. 3 3
      mpi/tests/insert_task_owner.c
  52. 2 2
      mpi/tests/insert_task_sent_cache.c
  53. 3 1
      mpi/tests/mpi_reduction.c
  54. 1 1
      mpi/tests/mpi_reduction_kernels.c
  55. 9 9
      sc_hypervisor/examples/app_driven_test/app_driven_test.c
  56. 12 12
      sc_hypervisor/examples/hierarchical_ctxs/resize_hierarchical_ctxs.c
  57. 9 9
      sc_hypervisor/examples/lp_test/lp_resize_test.c
  58. 11 11
      sc_hypervisor/examples/lp_test/lp_test.c
  59. 14 14
      sc_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c
  60. 2 2
      sc_hypervisor/src/uthash.h
  61. 2 2
      socl/src/cl_createcommandqueue.c
  62. 1 1
      socl/src/cl_createkernel.c
  63. 3 3
      socl/src/cl_enqueuendrangekernel.c
  64. 2 2
      socl/src/command.c
  65. 3 3
      socl/src/command_queue.c
  66. 7 7
      socl/src/gc.c
  67. 4 0
      src/Makefile.am
  68. 2 2
      src/common/barrier.c
  69. 16 3
      src/common/fxt.h
  70. 8 6
      src/common/rbtree.c
  71. 3 3
      src/common/uthash.h
  72. 1 1
      src/common/utils.c
  73. 1 1
      src/common/utils.h
  74. 134 27
      src/core/disk.c
  75. 0 2
      src/core/disk.h
  76. 739 0
      src/core/disk_ops/disk_hdf5.c
  77. 6 6
      src/core/disk_ops/disk_leveldb.cpp
  78. 35 20
      src/core/disk_ops/disk_stdio.c
  79. 2 2
      src/core/disk_ops/disk_unistd_o_direct.c
  80. 224 28
      src/core/disk_ops/unistd/disk_unistd_global.c
  81. 1 5
      src/core/jobs.h
  82. 12 12
      src/core/perfmodel/multiple_regression.c
  83. 1 1
      src/core/perfmodel/perfmodel.c
  84. 1 3
      src/core/perfmodel/perfmodel.h
  85. 24 2
      src/core/perfmodel/perfmodel_bus.c
  86. 3 3
      src/core/perfmodel/perfmodel_history.c
  87. 3 3
      src/core/perfmodel/perfmodel_print.c
  88. 5 10
      src/core/sched_ctx.c
  89. 12 10
      src/core/sched_policy.c
  90. 5 7
      src/core/simgrid.c
  91. 16 9
      src/core/task.c
  92. 10 6
      src/core/topology.c
  93. 11 2
      src/core/workers.c
  94. 6 4
      src/core/workers.h
  95. 15 0
      src/datawizard/copy_driver.c
  96. 6 1
      src/datawizard/copy_driver.h
  97. 0 17
      src/datawizard/data_request.c
  98. 9 0
      src/datawizard/data_request.h
  99. 9 1
      src/datawizard/filters.c
  100. 0 0
      src/datawizard/interfaces/bcsr_interface.c

+ 3 - 0
ChangeLog

@@ -54,6 +54,7 @@ Small changes:
   * Use asynchronous transfers for task data fetches with were not prefetched.
   * Allow to call starpu_sched_ctx_set_policy_data on the main
     scheduler context
+  * Fonction starpu_is_initialized() is moved to the public API.
 
 StarPU 1.2.2 (svn revision xxx)
 ==============================================
@@ -78,6 +79,8 @@ Small features:
   * MPI: Add mpi communications in dag.dot
   * Add STARPU_PERF_MODEL_HOMOGENEOUS_CPU environment variable to
     allow having one perfmodel per CPU core
+  * Add starpu_vector_filter_list_long filter.
+  * Add starpu_perfmodel_arch_comb_fetch function.
 
 Small changes:
   * Output generated through STARPU_MPI_COMM has been modified to

+ 11 - 1
Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2017  Université de Bordeaux
-# Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
+# Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016, 2017  CNRS
 # Copyright (C) 2014  INRIA
 # Copyright (C) 2016  Inria
 #
@@ -138,6 +138,16 @@ if STARPU_DEVEL
 		echo "Please do not use getenv, use starpu_getenv instead, which catches unsafe uses"; \
 		false ; \
 	fi
+# we count the number of files which include unistd.h
+# we count the number of files which properly include unistd.h i.e by first detecting if it's available
+# and then we check both numbers are the same ...a
+	@UNISTD_ALL_LINES=$(shell grep -B1 -rs "^#include <unistd.h>" $(srcdir)/src/ $(srcdir)/include/ $(srcdir)/mpi/src $(srcdir)/mpi/include  |grep -v dolib|grep -v -e "--" | tr '\012' '@' | sed 's/unistd.h>@/unistd.h>\n/g' | wc -l) ;\
+	UNISTD_CORRECT_LINES=$(shell grep -B1 -rs "^#include <unistd.h>" $(srcdir)/src/ $(srcdir)/include/ $(srcdir)/mpi/src $(srcdir)/mpi/include  |grep -v dolib|grep -v -e "--" | tr '\012' '@' | sed 's/unistd.h>@/unistd.h>\n/g' | grep '#ifdef .*HAVE_UNISTD_H.*:#include <unistd.h>' | wc -l) ;\
+	if test $$UNISTD_ALL_LINES -ne $$UNISTD_CORRECT_LINES ; \
+	then \
+		echo "Please do not unconditionally include unistd.h, it is not available on Windows, include config.h and test for HAVE_UNISTD_H" ; \
+		false ; \
+	fi
 endif
 
 if BUILD_STARPU_TOP

+ 93 - 21
configure.ac

@@ -771,12 +771,74 @@ AC_CHECK_FUNC([sched_yield], [AC_DEFINE([STARPU_HAVE_SCHED_YIELD], [1], [Define
 
 AC_CHECK_HEADERS([aio.h])
 AC_CHECK_LIB([rt], [aio_read])
+#AC_CHECK_HEADERS([libaio.h])
+#AC_CHECK_LIB([aio], [io_setup])
 
 AC_CHECK_FUNCS([mkostemp])
 AC_CHECK_FUNCS([mkdtemp])
 
 AC_CHECK_FUNCS([pread pwrite])
 
+
+AC_ARG_WITH(hdf5-include-dir,
+	[AS_HELP_STRING([--with-hdf5-include-dir=<path>],
+	[specify where HDF5 headers are installed])],
+	[
+		hdf5_include_dir="$withval"
+	], [hdf5_include_dir=""])
+
+hdf5_inc_dir="/usr/include/hdf5 /usr/include/hdf5/serial ${hdf5_include_dir}"
+
+enable_include_hdf5=no
+for f in $hdf5_inc_dir; do
+        if test -n "$f" ; then
+                SAVED_CFLAGS="${CFLAGS}"
+                CFLAGS=-I${f}
+                AC_CHECK_HEADERS([hdf5.h])
+                if test "$ac_cv_header_hdf5_h" = "yes" ; then
+                        CFLAGS="-I${f} ${SAVED_CFLAGS}"
+                        enable_include_hdf5=yes
+                        break
+                else
+                        CFLAGS=${SAVED_CFLAGS}
+                fi
+                unset ac_cv_header_hdf5_h
+        fi
+done
+
+
+AC_ARG_WITH(hdf5-lib-dir,
+	[AS_HELP_STRING([--with-hdf5-lib-dir=<path>],
+	[specify where HDF5 libraries are installed])],
+	[
+		hdf5_libraries_dir="$withval"
+	], [hdf5_libraries_dir=""])
+
+hdf5_lib_dir="/usr/lib/x86_64-linux-gnu/hdf5 /usr/lib/x86_64-linux-gnu/hdf5/serial ${hdf5_libraries_dir}"
+
+enable_libraries_hdf5=no
+for f in $hdf5_lib_dir; do
+        if test -n "$f" ; then
+                SAVED_LDFLAGS="${LDFLAGS}"
+                LDFLAGS=-L${f}
+                STARPU_HAVE_LIBRARY(HDF5, [hdf5])
+                if test "$ac_cv_lib_hdf5_main" = "yes" ; then
+                        LDFLAGS="-L${f} ${SAVED_LDFLAGS} ${STARPU_HDF5_LDFLAGS}"
+                        enable_libraries_hdf5=yes
+                        break
+                else
+                        LDFLAGS=${SAVED_LDFLAGS}
+                fi
+                unset ac_cv_lib_hdf5_main
+        fi
+done
+
+if test  "x$enable_libraries_hdf5" = "xyes" -a "x$enable_include_hdf5" = "xyes"; then
+        AC_DEFINE([STARPU_HAVE_HDF5], [1], [Define to 1 if you have the <hdf5.h> header file.])
+fi
+AM_CONDITIONAL(STARPU_HAVE_HDF5, test  "x$enable_libraries_hdf5" = "xyes" -a "x$enable_include_hdf5" = "xyes")
+
+
 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
 STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP
 
@@ -804,6 +866,18 @@ if test x$enable_libnuma = xyes; then
 	AC_DEFINE(STARPU_HAVE_LIBNUMA,[1],[libnuma is available])
 fi
 
+AC_MSG_CHECKING(whether statement expressions are available)
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+#define maxint(a,b) ({int _a = (a), _b = (b); _a > _b ? _a : _b; })
+]],
+		[[ int x=maxint(12,42); ]])],
+		[statement_expressions="yes"],
+		[statement_expressions="no"])
+AC_MSG_RESULT($statement_expressions)
+if test x$statement_expressions = xyes; then
+	AC_DEFINE(STARPU_HAVE_STATEMENT_EXPRESSIONS,[1],[statement expressions are available])
+fi
+
 ###############################################################################
 #									      #
 #				SCHED_CTX settings			      #
@@ -1945,15 +2019,19 @@ if test x$use_fxt = xyes; then
 	##########################################
 	# Poti is a library to generate paje trace files
 	##########################################
-	PKG_CHECK_MODULES([POTI], [poti], [
-		AC_DEFINE(STARPU_HAVE_POTI, [1], [Define to 1 if you have libpoti])
+	PKG_CHECK_MODULES([POTI], [poti], [have_valid_poti=yes], [have_valid_poti=no])
+	AC_ARG_ENABLE(poti, [AS_HELP_STRING([--enable-poti],
+				[Enable the use of the POTI library to generate Paje traces])],
+				enable_poti=$enableval, enable_poti=no)
+	if test x$enable_poti = xyes -a x$have_valid_poti = xyes ; then
+		AC_DEFINE(STARPU_HAVE_POTI, [1], [Define to 1 if you have libpoti and it is meant to be used])
 		save_LIBS="$LIBS"
 		LIBS="$LIBS $POTI_LIBS"
 		AC_CHECK_FUNCS([poti_init_custom])
 		LIBS="$save_LIBS"
-	], [:])
-	FXT_CFLAGS="$FXT_CFLAGS $POTI_CFLAGS"
-	FXT_LIBS="$FXT_LIBS $POTI_LIBS"
+		FXT_CFLAGS="$FXT_CFLAGS $POTI_CFLAGS"
+		FXT_LIBS="$FXT_LIBS $POTI_LIBS"
+	fi
 fi
 
 AC_MSG_CHECKING(whether additional locking systems FxT traces should be enabled)
@@ -2080,13 +2158,13 @@ if test x$maxnodes = x0 ; then
 		# we add nodes to use 2 memory disks
 		nodes=`expr $nmaxnumanodes + 2`
 		if test x$enable_cuda = xyes ; then
-			# we could have used nmaxcudadev + 1, but this would certainly give an
-			# odd number.
+			# we could have used nmaxcudadev + 1, but this would certainly give an
+			# odd number.
 			nodes=`expr $nodes + $nmaxcudadev`
 		fi
 		if test x$enable_opencl = xyes ; then
-			# we could have used nmaxcudadev + 1, but this would certainly give an
-			# odd number.
+			# we could have used nmaxcudadev + 1, but this would certainly give an
+			# odd number.
 			nodes=`expr $nodes + $nmaxopencldev`
 		fi
 		if test x$enable_mic = xyes ; then
@@ -3159,6 +3237,8 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
   mkdir -p tests/datawizard
   test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
+  mkdir -p tests/overlap
+  test -e tests/overlap/overlap.sh || ln -sf $ac_abs_top_srcdir/tests/overlap/overlap.sh tests/overlap/
   mkdir -p tests/model-checking
   test -e tests/model-checking/prio_list.sh || ln -sf $ac_abs_top_srcdir/tests/model-checking/prio_list.sh tests/model-checking/
   test -e tests/model-checking/barrier.sh || ln -sf $ac_abs_top_srcdir/tests/model-checking/barrier.sh tests/model-checking/
@@ -3166,6 +3246,10 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   test -e examples/heat/heat.sh || ln -sf $ac_abs_top_srcdir/examples/heat/heat.sh examples/heat/
   mkdir -p examples/lu
   test -e examples/lu/lu.sh || ln -sf $ac_abs_top_srcdir/examples/lu/lu.sh examples/lu/
+  test -e tools/starpu_paje_draw_histogram.R || ln -sf $ac_abs_top_srcdir/tools/starpu_paje_draw_histogram.R tools/starpu_paje_draw_histogram.R
+  test -e tools/starpu_paje_state_stats.R || ln -sf $ac_abs_top_srcdir/tools/starpu_paje_state_stats.R tools/starpu_paje_state_stats.R
+  test -e tools/starpu_trace_state_stats.py || ln -sf $ac_abs_top_srcdir/tools/starpu_trace_state_stats.py tools/starpu_trace_state_stats.py
+  chmod +x tools/starpu_trace_state_stats.py
 ])
 
 # Create links to ICD files in build/socl/vendors directory. SOCL will use this
@@ -3185,18 +3269,6 @@ AC_SUBST(SOCL_VENDORS)
 AC_CONFIG_FILES(tests/regression/regression.sh tests/regression/profiles tests/regression/profiles.build.only)
 AC_CONFIG_HEADER(src/common/config.h include/starpu_config.h gcc-plugin/include/starpu-gcc/config.h starpu-top/config.h)
 
-AH_BOTTOM([
-#if defined(STARPU_DEVEL) && defined(BUILDING_STARPU)
-#  ifndef STARPU_CHECKED_UNISTD_H
-#    define STARPU_CHECKED_UNISTD_H
-#    ifdef _UNISTD_H
-#      define _UNISTD_H PLEASE_DONT_INCLUDE_IT
-#      error Please do not unconditionally include unistd.h, it is not available on Windows, include config.h and test for HAVE_UNISTD_H
-#    endif
-#  endif
-#endif
-])
-
 SANITIZE=$(echo $CFLAGS | grep sanitize)
 AM_CONDITIONAL(STARPU_SANITIZE, test -n "$SANITIZE")
 

+ 143 - 3
doc/doxygen/chapters/310_data_management.doxy

@@ -10,6 +10,132 @@
 
 TODO: intro qui parle de coherency entre autres
 
+\section DataInterface Data Interface
+
+StarPU provides several data interfaces for programmers to describe the data layout of their application. There are predefined interfaces already available in StarPU. Users can define new data interfaces as explained in \ref DefiningANewDataInterface. All functions provided by StarPU are documented in \ref API_Data_Interfaces. You will find a short list below.
+
+\subsection VariableDataInterface Variable Data Interface
+
+A variable is a given size byte element, typically a scalar. Here an
+example of how to register a variable data to StarPU by using
+starpu_variable_data_register().
+
+
+\code{.c}
+float var = 42.0;
+starpu_data_handle_t var_handle;
+starpu_variable_data_register(&var_handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
+\endcode
+
+\subsection VectorDataInterface Vector Data Interface
+
+A vector is a fixed number of elements of a given size. Here an
+example of how to register a vector data to StarPU by using
+starpu_vector_data_register().
+
+\code{.c}
+float vector[NX];
+starpu_data_handle_t vector_handle;
+starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
+\endcode
+
+\subsection MatrixDataInterface Matrix Data Interface
+
+To register 2-D matrices with a potential padding, one can use the
+matrix data interface. Here an example of how to register a matrix
+data to StarPU by using starpu_matrix_data_register().
+
+\code{.c}
+float *matrix;
+starpu_data_handle_t matrix_handle;
+matrix = (float*)malloc(width * height * sizeof(float));
+starpu_matrix_data_register(&matrix_handle, STARPU_MAIN_RAM, (uintptr_t)matrix, width, width, height, sizeof(float));
+\endcode
+
+\subsection BlockDataInterface Block Data Interface
+
+To register 3-D blocks with potential paddings on Y and Z dimensions,
+one can use the block data interface. Here an example of how to
+register a block data to StarPU by using starpu_block_data_register().
+
+\code{.c}
+float *block;
+starpu_data_handle_t block_handle;
+block = (float*)malloc(nx*ny*nz*sizeof(float));
+starpu_block_data_register(&block_handle, STARPU_MAIN_RAM, (uintptr_t)block, nx, nx*ny, nx, ny, nz, sizeof(float));
+\endcode
+
+\subsection BCSRDataInterface BCSR Data Interface
+
+BCSR (Blocked Compressed Sparse Row Representation) sparse matrix data
+can be registered to StarPU using the bcsr data interface. Here an
+example on how to do so by using starpu_bcsr_data_register().
+
+\code{.c}
+/*
+ * We use the following matrix:
+ *
+ *   +----------------+
+ *   |  0   1   0   0 |
+ *   |  2   3   0   0 |
+ *   |  4   5   8   9 |
+ *   |  6   7  10  11 |
+ *   +----------------+
+ *
+ * nzval  = [0, 1, 2, 3] ++ [4, 5, 6, 7] ++ [8, 9, 10, 11]
+ * colind = [0, 0, 1]
+ * rowptr = [0, 1, 3]
+ * r = c = 2
+ */
+
+/* Size of the blocks */
+int R = 2;
+int C = 2;
+
+int NROWS = 2;
+int NNZ_BLOCKS = 3;    /* out of 4 */
+int NZVAL_SIZE = (R*C*NNZ_BLOCKS);
+
+int nzval[NZVAL_SIZE]  =
+{
+	0, 1, 2, 3,    /* First block  */
+	4, 5, 6, 7,    /* Second block */
+	8, 9, 10, 11   /* Third block  */
+};
+uint32_t colind[NNZ_BLOCKS] =
+{
+	0, /* block-column index for first block in nzval */
+	0, /* block-column index for second block in nzval */
+	1  /* block-column index for third block in nzval */
+};
+uint32_t rowptr[NROWS+1] =
+{
+	0, / * block-index in nzval of the first block of the first row. */
+	1, / * block-index in nzval of the first block of the second row. */
+	NNZ_BLOCKS /* number of blocks, to allow an easier element's access for the kernels */
+};
+
+starpu_data_handle_t bcsr_handle;
+starpu_bcsr_data_register(&bcsr_handle,
+			  STARPU_MAIN_RAM,
+			  NNZ_BLOCKS,
+			  NROWS,
+			  (uintptr_t) nzval,
+			  colind,
+			  rowptr,
+			  0, /* firstentry */
+			  R,
+			  C,
+			  sizeof(nzval[0]));
+\endcode
+
+StarPU provides an example on how to deal with such matrices in
+<c>examples/spmv</c>.
+
+\subsection CSRDataInterface CSR Data Interface
+
+TODO
+
 \section DataManagement Data Management
 
 When the application allocates data, whenever possible it should use
@@ -119,6 +245,8 @@ back to its home node, and evict it from GPUs when room is needed.
 An existing piece of data can be partitioned in sub parts to be used by different tasks, for instance:
 
 \code{.c}
+#define NX 1048576
+#define PARTS 16
 int vector[NX];
 starpu_data_handle_t handle;
 
@@ -162,7 +290,7 @@ Partitioning can be applied several times, see
 Wherever the whole piece of data is already available, the partitioning will
 be done in-place, i.e. without allocating new buffers but just using pointers
 inside the existing copy. This is particularly important to be aware of when
-using OpenCL, where the kernel parameters are not pointers, but handles. The
+using OpenCL, where the kernel parameters are not pointers, but cl_mem handles. The
 kernel thus needs to be also passed the offset within the OpenCL buffer:
 
 \code{.c}
@@ -190,7 +318,10 @@ __kernel void opencl_kernel(__global int *vector, unsigned offset)
 
 StarPU provides various interfaces and filters for matrices, vectors, etc.,
 but applications can also write their own data interfaces and filters, see
-<c>examples/interface</c> and <c>examples/filters/custom_mf</c> for an example.
+<c>examples/interface</c> and <c>examples/filters/custom_mf</c> for an example,
+and see \ref DefiningANewDataInterface and \ref DefiningANewDataFilter
+for documentation.
+
 
 \section AsynchronousPartitioning Asynchronous Partitioning
 
@@ -306,6 +437,14 @@ starpu_data_invalidate_submit(handle);
 
 And now we can start using vertical slices, etc.
 
+\section DefiningANewDataFilter Defining A New Data Filter
+
+StarPU provides a series of predefined filters in API_Data_Partition, but
+additional filters can be defined by the application. The principle is that the
+filter function just fills the memory location of the i-th subpart of a data.
+Examples are provided in <c>src/datawizard/interfaces/*_filters.c</c>,
+and see \ref starpu_data_filter::filter_func for the details.
+
 \section DataReduction Data Reduction
 
 In various cases, some piece of data is used to accumulate intermediate
@@ -634,7 +773,8 @@ Different operations need to be defined for a data interface through
 the type starpu_data_interface_ops. We only define here the basic
 operations needed to run simple applications. The source code for the
 different functions can be found in the file
-<c>examples/interface/complex_interface.c</c>.
+<c>examples/interface/complex_interface.c</c>, the details of the hooks to be
+provided are documented \ref starpu_data_interface_ops .
 
 \code{.c}
 static struct starpu_data_interface_ops interface_complex_ops =

+ 3 - 3
doc/doxygen/chapters/330_scheduling_contexts.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
 //  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * Copyright (C) 2016 Uppsala University
  * See the file version.doxy for copying conditions.
@@ -84,7 +84,7 @@ contexts in this traditional way.
 
 To create a <b>context</b> with the default scheduler, that is either
 controlled through the environment variable <c>STARPU_SCHED</c> or the
-StarPU default scheduler, one can explicitly use the option <c>STARPU_SCHED_CTX_POLICY_NAME, NULL</c> as in the following example:
+StarPU default scheduler, one can explicitly use the option <c>STARPU_SCHED_CTX_POLICY_NAME, ""</c> as in the following example:
 
 \code{.c}
 /* the list of resources the context will manage */
@@ -92,7 +92,7 @@ int workerids[3] = {1, 3, 10};
 
 /* indicate the list of workers assigned to it, the number of workers,
 and use the default scheduling policy. */
-int id_ctx = starpu_sched_ctx_create(workerids, 3, "my_ctx", STARPU_SCHED_CTX_POLICY_NAME, NULL, 0);
+int id_ctx = starpu_sched_ctx_create(workerids, 3, "my_ctx", STARPU_SCHED_CTX_POLICY_NAME, "", 0);
 
 /* .... */
 \endcode

+ 4 - 2
doc/doxygen/chapters/401_out_of_core.doxy

@@ -13,7 +13,7 @@ When using StarPU, one may need to store more data than what the main memory
 disk and to use it.
 
 The principle is that one first registers a disk location, seen by StarPU as
-a <c>void*</c>, which can be for instance a Unix path for the stdio or unistd case,
+a <c>void*</c>, which can be for instance a Unix path for the stdio, unistd or unistd_o_direct case,
 or a database file path for a leveldb case, etc. The disk backend opens this
 place with the plug method.
 
@@ -48,10 +48,12 @@ export STARPU_DISK_SWAP_BACKEND=unistd
 export STARPU_DISK_SWAP_SIZE=200
 \endverbatim
 
+The backend can be set to stdio, unistd, unistd_o_direct, or leveldb.
+
 When the register function is called, StarPU will benchmark the disk. This can
 take some time.
 
-<strong>Warning: the size thus has to be at least 1 MB!</strong> 
+<strong>Warning: the size thus has to be at least \ref STARPU_DISK_SIZE_MIN bytes ! </strong> 
 
 StarPU will automatically try to evict unused data to this new disk. One can
 also use the standard StarPU memory node API, see the \ref API_Standard_Memory_Library

+ 31 - 20
doc/doxygen/chapters/410_mpi_support.doxy

@@ -769,27 +769,38 @@ data transfers and supports data matrices which do not fit in memory (out-of-cor
 
 \section MPIMasterSlave MPI Master Slave Support
 
-StarPU includes an other way to execute the application across many nodes. The Master
-Slave support permits to use remote cores without thinking about data distribution. This
-support can be activated with the \ref enable-mpi-master-slave "--enable-mpi-master-slave". However, you should not activate
-both MPI support and MPI Master-Slave support.
+StarPU provides an other way to execute applications across many
+nodes. The Master Slave support permits to use remote cores without
+thinking about data distribution. This support can be activated with
+the configure option \ref enable-mpi-master-slave
+"--enable-mpi-master-slave". However, you should not activate both MPI
+support and MPI Master-Slave support.
 
 If a codelet contains a kernel for CPU devices, it is automatically eligible to be executed
-on a MPI Slave device. However, you can decide to execute the codelet on a MPI Slave by filling
-the \ref starpu_codelet::mpi_ms_funcs variable. The functions have to be globally-visible (i.e. not static ) for
-StarPU to be able to look them up, and <c>-rdynamic</c> must be passed to gcc (or <c>-export-dynamic</c> to ld)
-so that symbols of the main program are visible.
-
-By default, one core is dedicated on the master to manage the entire set of slaves. If MPI
-has a good multiple threads support, you can use \ref with-mpi-master-slave-multiple-thread "--with-mpi-master-slave-multiple-thread"  to
-dedicate one core per slave.
-
-If you want to chose the number of cores on the slave device, use the \ref STARPU_NMPIMSTHREADS "STARPU_NMPIMSTHREADS=\<number\>"
-with <c>\<number\></c> is the number of cores wanted. The default value is all the slave's cores. To select
-the number of slaves nodes, change the <c>-n</c> parameter when executing the application with mpirun
-or mpiexec.
-
-The node chosen by default is the with the MPI rank 0. To modify this, use the environment variable
-\ref STARPU_MPI_MASTER_NODE "STARPU_MPI_MASTER_NODE=\<number\>" with <c>\<number\></c> is the MPI rank wanted.
+on a MPI Slave device. Moreover, you can force the execution on a MPI Slave by setting
+the field \ref starpu_codelet::mpi_ms_funcs. Functions have to be
+globally-visible (i.e. not static) for StarPU to be able to look them
+up, and <c>-rdynamic</c> must be passed to gcc (or
+<c>-export-dynamic</c> to ld) so that symbols of the main program are
+visible.
+
+By default, one core is dedicated on the master node to manage the
+entire set of slaves. If the implementation of MPI you are using has a
+good multiple threads support, you can use the configure option
+\ref with-mpi-master-slave-multiple-thread "--with-mpi-master-slave-multiple-thread"
+to dedicate one core per slave.
+
+Choosing the number of cores on each slave device is done by setting
+the environment variable \ref STARPU_NMPIMSTHREADS "STARPU_NMPIMSTHREADS=\<number\>"
+with <c>\<number\></c> being the requested number of cores. By default
+all the slave's cores are used.
+
+Setting the number of slaves nodes is done by changing the <c>-n</c>
+parameter when executing the application with mpirun or mpiexec.
+
+The master node is by default the node with the MPI rank equal to 0.
+To select another node, use the environment variable \ref
+STARPU_MPI_MASTER_NODE "STARPU_MPI_MASTER_NODE=\<number\>" with
+<c>\<number\></c> being the requested MPI rank node.
 
 */

+ 3 - 2
doc/doxygen/chapters/501_environment_variables.doxy

@@ -842,7 +842,7 @@ that have a limited amount of memory.
 \addindex __env__STARPU_LIMIT_CPU_MEM
 This variable specifies the maximum number of megabytes that should be
 available to the application in the main CPU memory. Setting it enables allocation
-cache in main memory
+cache in main memory. Setting it to zero lets StarPU overflow memory.
 </dd>
 
 <dt>STARPU_LIMIT_CPU_NUMA_devid_MEM</dt>
@@ -913,7 +913,8 @@ full.
 This specifies then backend to be used by StarPU to push data when the main
 memory is getting full. The default is unistd (i.e. using read/write functions),
 other values are stdio (i.e. using fread/fwrite), unistd_o_direct (i.e. using
-read/write with O_DIRECT), and leveldb (i.e. using a leveldb database).
+read/write with O_DIRECT), leveldb (i.e. using a leveldb database), and hdf5 
+(i.e. using HDF5 library).
 </dd>
 
 <dt>STARPU_DISK_SWAP_SIZE</dt>

+ 14 - 0
doc/doxygen/chapters/510_configure_options.doxy

@@ -494,6 +494,20 @@ Specify the blas library to be used by some of the examples. Librairies availabl
 Enable linking with LevelDB if available
 </dd>
 
+<dt>--with-hdf5-include-dir=<path></dt>
+<dd>
+\anchor with-hdf5-include-dir
+\addindex __configure__--with-hdf5-include-dir
+Specify the directory where is stored the header hdf5.h.
+</dd>
+
+<dt>--with-hdf5-lib-dir=<path></dt>
+<dd>
+\anchor with-hdf5-lib-dir
+\addindex __configure__--with-hdf5-lib-dir
+Specify the directory where is stored the hdf5 library.
+</dd>
+
 <dt>--disable-starpufft</dt>
 <dd>
 \anchor disable-starpufft

+ 85 - 12
doc/doxygen/chapters/api/data_interfaces.doxy

@@ -14,14 +14,23 @@ Per-interface data transfer methods.
 \var void (*starpu_data_interface_ops::register_data_handle)(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
     Register an existing interface into a data handle.
 
+    This iterates over all memory nodes to initialize all fields of the data
+    interface on each of them. Since data is not allocated yet except on the
+    home node, pointers should be left as NULL except on the \p home_node, for
+    which the pointers should be copied from the given \p data_interface, which
+    was filled with the application's pointers.
+
 \var starpu_ssize_t (*starpu_data_interface_ops::allocate_data_on_node)(void *data_interface, unsigned node)
-    Allocate data for the interface on a given node.
+    Allocate data for the interface on a given node. This should use
+    starpu_malloc_on_node to perform the allocation(s), and fill the pointers
+    in the data interface. It should return the size of the allocated memory, or
+    -ENOMEM if memory could not be allocated.
 
 \var void (*starpu_data_interface_ops::free_data_on_node)(void *data_interface, unsigned node)
     Free data of the interface on a given node.
 
 \var const struct starpu_data_copy_methods *starpu_data_interface_ops::copy_methods
-    ram/cuda/opencl synchronous and asynchronous transfer methods.
+    This provides a series of methods for performing ram/cuda/opencl synchronous and asynchronous transfers.
 
 \var void *(*starpu_data_interface_ops::handle_to_pointer)(starpu_data_handle_t handle, unsigned node)
     Return the current pointer (if any) for the handle on the given node.
@@ -30,20 +39,25 @@ Per-interface data transfer methods.
     Return an estimation of the size of data, for performance models.
 
 \var uint32_t (*starpu_data_interface_ops::footprint)(starpu_data_handle_t handle)
-    Return a 32bit footprint which characterizes the data size.
+    Return a 32bit footprint which characterizes the data size and layout (nx, ny, ld, elemsize, etc.)
 
 \var int (*starpu_data_interface_ops::compare)(void *data_interface_a, void *data_interface_b)
-    Compare the data size of two interfaces.
+    Compare the data size and layout of two interfaces (nx, ny, ld, elemsize,
+    etc.). It should return 1 if the two interfaces size and layout match, and 0
+    otherwise.
 
 \var void (*starpu_data_interface_ops::display)(starpu_data_handle_t handle, FILE *f)
     Dump the sizes of a handle to a file.
 
 \var starpu_ssize_t (*starpu_data_interface_ops::describe)(void *data_interface, char *buf, size_t size)
-    Describe the data into a string.
+    Describe the data into a string in a brief way, such as one letter to describe the type of data, and the data dimensions.
 
 \var enum starpu_data_interface_id starpu_data_interface_ops::interfaceid
     An identifier that is unique to each interface.
 
+\var char *starpu_data_interface_ops::name
+    Name of the interface
+
 \var size_t starpu_data_interface_ops::interface_size
     The size of the interface data descriptor.
 
@@ -407,11 +421,70 @@ Compressed Sparse Row Representation) sparse matrix interface.
 Register the sparse matrix made of \p nnz non-zero blocks of elements of
 size \p elemsize stored in \p nzval and initializes \p handle to represent it.
 Blocks have size \p r * \p c. \p nrow is the number of rows (in terms of
-blocks), \p colind[i] is the block-column index for block i in \p nzval,
-\p rowptr[i] is the block-index (in \p nzval) of the first block of row i.
+blocks), \p colind is an array of nnz elements, colind[i] is the block-column index for block i in \p nzval,
+\p rowptr is an array of nrow+1 elements, rowptr[i] is the block-index (in \p nzval) of the first block of row i. By convention, rowptr[nrow] is the number of blocks, this allows an easier access of the matrix's elements for the kernels.
 \p firstentry is the index of the first entry of the given arrays
 (usually 0 or 1).
 
+Here an example of how to use the function.
+\code{.c}
+/*
+ * We use the following matrix:
+ *
+ *   +----------------+
+ *   |  0   1   0   0 |
+ *   |  2   3   0   0 |
+ *   |  4   5   8   9 |
+ *   |  6   7  10  11 |
+ *   +----------------+
+ *
+ * nzval  = [0, 1, 2, 3] ++ [4, 5, 6, 7] ++ [8, 9, 10, 11]
+ * colind = [0, 0, 1]
+ * rowptr = [0, 1, 3]
+ * r = c = 2
+ */
+
+/* Size of the blocks */
+int R = 2;
+int C = 2;
+
+int NROWS = 2;
+int NNZ_BLOCKS = 3;    /* out of 4 */
+int NZVAL_SIZE = (R*C*NNZ_BLOCKS);
+
+int nzval[NZVAL_SIZE]  =
+{
+	0, 1, 2, 3,    /* First block  */
+	4, 5, 6, 7,    /* Second block */
+	8, 9, 10, 11   /* Third block  */
+};
+uint32_t colind[NNZ_BLOCKS] =
+{
+	0, /* block-column index for first block in nzval */
+	0, /* block-column index for second block in nzval */
+	1  /* block-column index for third block in nzval */
+};
+uint32_t rowptr[NROWS+1] =
+{
+	0, / * block-index in nzval of the first block of the first row. */
+	1, / * block-index in nzval of the first block of the second row. */
+	NNZ_BLOCKS /* number of blocks, to allow an easier element's access for the kernels */
+};
+
+starpu_data_handle_t bcsr_handle;
+starpu_bcsr_data_register(&bcsr_handle,
+			  STARPU_MAIN_RAM,
+			  NNZ_BLOCKS,
+			  NROWS,
+			  (uintptr_t) nzval,
+			  colind,
+			  rowptr,
+			  0, /* firstentry */
+			  R,
+			  C,
+			  sizeof(nzval[0]));
+\endcode
+
 \fn void starpu_csr_data_register(starpu_data_handle_t *handle, int home_node, uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, size_t elemsize)
 \ingroup API_Data_Interfaces
 This variant of starpu_data_register() uses the CSR (Compressed
@@ -793,16 +866,16 @@ row representation)
 \var uintptr_t starpu_bcsr_interface::nzval
     non-zero values
 \var uint32_t *starpu_bcsr_interface::colind
-    position of non-zero entried on the row
+    array of nnz elements, colind[i] is the block-column index for block i in nzval
 \var uint32_t *starpu_bcsr_interface::rowptr
-    index (in nzval) of the first entry of the row
+    array of nrow+1 elements, rowptr[i] is the block-index (in nzval) of the first block of row i. By convention, rowptr[nrow] is the number of blocks, this allows an easier access of the matrix's elements for the kernels.
 \var starpu_bcsr_interface::firstentry
     k for k-based indexing (0 or 1 usually). Also useful when partitionning the matrix.
 \var uint32_t starpu_bcsr_interface::r
-    size of the blocks
+    height of the blocks
 \var uint32_t starpu_bcsr_interface::c
-    size of the blocks
-\var size_t starpu_bcsr_interface::elemsize;
+    width of the blocks
+\var size_t starpu_bcsr_interface::elemsize
     size of the elements of the matrix
 
 \fn uint32_t starpu_bcsr_get_nnz(starpu_data_handle_t handle)

+ 12 - 2
doc/doxygen/chapters/api/data_out_of_core.doxy

@@ -7,6 +7,10 @@
 
 /*! \defgroup API_Out_Of_Core Out Of Core
 
+\def STARPU_DISK_SIZE_MIN
+\ingroup API_Out_Of_Core
+Minimum size of a registered disk. The size of a disk is the last parameter of the function starpu_disk_register().
+
 \struct starpu_disk_ops
 \ingroup API_Out_Of_Core
 This is a set of functions to manipulate datas on disk.
@@ -88,8 +92,7 @@ Register a disk memory node with a set of functions to manipulate datas. The \c
 plug member of \p func will be passed \p parameter, and return a \c base which will be passed to all \p func methods. <br />
 SUCCESS: return the disk node. <br />
 FAIL: return an error code. <br />
-The \p size must be at least 1 MB !
-\p size being negative means infinite size.
+\p size must be at least \ref STARPU_DISK_SIZE_MIN bytes ! \p size being negative means infinite size.
 
 \fn void *starpu_disk_open(unsigned node, void *pos, size_t size)
 \ingroup API_Out_Of_Core
@@ -129,4 +132,11 @@ This set uses the leveldb created by Google <br />
 More information at https://code.google.com/p/leveldb/ <br />
 It doesn't support asynchronous transfers.
 
+\var starpu_disk_hdf5_ops
+\ingroup API_Out_Of_Core
+This set uses the HDF5 library.<br />
+<strong>It doesn't support multiple opening from different processes. </strong> <br />
+You may only allow one process to write in the HDF5 file. <br />
+<strong>If HDF5 library is not compiled with --thread-safe you can't open more than one HDF5 file at the same time. </strong>
+
 */

+ 28 - 5
doc/doxygen/chapters/api/data_partition.doxy

@@ -12,10 +12,24 @@
 The filter structure describes a data partitioning operation, to be
 given to the starpu_data_partition() function.
 \ingroup API_Data_Partition
-\var void (*starpu_data_filter::filter_func)(void *father_interface, void *child_interface, struct starpu_data_filter *, unsigned id, unsigned nparts)
+\var void (*starpu_data_filter::filter_func)(void *father_interface, void *child_interface, struct starpu_data_filter *filter, unsigned i, unsigned nparts)
     Fill the \p child_interface structure with interface information
-    for the \p id -th child of the parent \p father_interface (among
-    \p nparts).
+    for the \p i -th child of the parent \p father_interface (among
+    \p nparts). The \p filter structure is provided, allowing to inspect the
+    starpu_data_filter::filter_arg and starpu_data_filter::filter_arg_ptr
+    parameters.
+
+    The details of what needs to be filled in \p child_interface vary according
+    to the data interface, but generally speaking:
+    <ul>
+    <li> <c>id</c> is usually just copied over from the father, when the sub data has the same structure as the father, e.g. a subvector is a vector, a submatrix is a matrix, etc. This is however not the case for instance when dividing a BCSR matrix into its dense blocks, which then are matrices. </li>
+    <li> <c>nx</c>, <c>ny</c> and alike are usually divided by the number of subdata, depending how the subdivision is done (e.g. nx division vs ny division for vertical matrix division vs horizontal matrix division). </li>
+    <li> <c>ld</c> for matrix interfaces are usually just copied over: the leading dimension (ld) usually does not change. </li>
+    <li> <c>elemsize</c> is usually just copied over. </li>
+    <li> <c>ptr</c>, the pointer to the data, has to be computed according to \p i and the father's <c>ptr</c>, so as to point to the start of the sub data. This should however be done only if the father has <c>ptr</c> different from NULL: in the OpenCL case notably, the <c>dev_handle</c> and <c>offset</c> fields are used instead. </li>
+    <li> <c>dev_handle</c> should be just copied over from the parent. </li>
+    <li> <c>offset</c> has to be computed according to \p i and the father's <c>offset</c>, so as to provide the offset of the start of the sub data. This is notably used for the OpenCL case.
+    </ul>
 \var unsigned starpu_data_filter::nchildren
     Number of parts to partition the data into.
 \var unsigned (*starpu_data_filter::get_nchildren)(struct starpu_data_filter *, starpu_data_handle_t initial_handle)
@@ -27,9 +41,9 @@ given to the starpu_data_partition() function.
     this function returns which interface is used by child number \p
     id.
 \var unsigned starpu_data_filter::filter_arg
-    Allow to define an additional parameter for the filter function.
+    Additional parameter for the filter function
 \var void *starpu_data_filter::filter_arg_ptr
-    Allow to define an additional pointer parameter for the filter
+    Additional pointer parameter for the filter
     function, such as the sizes of the different parts.
 
 @name Basic API
@@ -220,6 +234,15 @@ of \p f must be the shadow size casted into \c void*.
 enforced for the shadowed parts. An usage example is available in
 examples/filters/shadow.c
 
+\fn void starpu_vector_filter_list_long(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\ingroup API_Data_Partition
+Return in \p child_interface the \p id th element of the vector
+represented by \p father_interface once partitioned into \p nparts chunks
+according to the <c>filter_arg_ptr</c> field of \p f. The
+<c>filter_arg_ptr</c> field must point to an array of \p nparts long
+elements, each of which specifies the number of elements in each chunk
+of the partition.
+
 \fn void starpu_vector_filter_list(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 Return in \p child_interface the \p id th element of the vector

+ 4 - 0
doc/doxygen/chapters/api/initialization.doxy

@@ -257,6 +257,10 @@ field starpu_conf::calibrate of \p conf. Upon successful
 completion, this function returns 0. Otherwise, <c>-EINVAL</c> indicates that
 the argument was <c>NULL</c>.
 
+\fn int starpu_is_initialized(void)
+\ingroup API_Initialization_and_Termination
+Return 1 if StarPU is already initialized.
+
 \fn void starpu_shutdown(void)
 \ingroup API_Initialization_and_Termination
 This is StarPU termination method. It must be called at the end of the

+ 1 - 1
doc/doxygen/chapters/api/mpi.doxy

@@ -503,7 +503,7 @@ with the argument \p rarg on the process root, the \p scallback
 function is called with the argument \p sarg on any other process.
 
 @name MPI Master Slave
-\anchor MPIMasterSlave
+\anchor MPIMasterSlaveSupport
 \ingroup API_MPI_Support
 
 \def STARPU_USE_MPI_MASTER_SLAVE

+ 2 - 1
doc/doxygen/chapters/api/scheduling_contexts.doxy

@@ -56,7 +56,8 @@ The arguments following the name of the scheduling context can be of
 the following types:
 <ul>
 <li> ::STARPU_SCHED_CTX_POLICY_NAME, followed by the name of a
-predefined scheduling policy
+predefined scheduling policy. Use an empty string to create the
+context with the default scheduling policy.
 </li>
 <li> ::STARPU_SCHED_CTX_POLICY_STRUCT, followed by a pointer to a
 custom scheduling policy (struct starpu_sched_policy *)

+ 9 - 5
doc/doxygen/chapters/api/scheduling_policy.doxy

@@ -117,12 +117,16 @@ condition variable. For instance, in the case of a scheduling strategy
 with a single task queue, the same condition variable would be used to
 block and wake up all workers.
 
-\fn int starpu_wake_worker(int workerid)
+\fn int starpu_wake_worker_no_relax(int workerid)
 \ingroup API_Scheduling_Policy
-In simgrid or blocking driver mode, 
-this should be called by push functions to wake the potential workers that are
-supposed to pick up the tasks which just have been pushed, otherwise they may
-remain sleeping.
+Must be called to wake up a worker that is sleeping on the cond.
+Return 0 whenever the worker is not in a sleeping state or has the
+state_keep_awake flag on.
+
+\fn int starpu_wake_worker_locked(int workerid)
+\ingroup API_Scheduling_Policy
+Version of starpu_wake_worker_no_relax() which assumes that the sched
+mutex is locked
 
 \fn int starpu_sched_set_min_priority(int min_prio)
 \ingroup API_Scheduling_Policy

+ 7 - 0
doc/doxygen/chapters/api/workers.doxy

@@ -330,4 +330,11 @@ if needed during the waiting process. Returns 1 if \p workerid has been woken
 up or its state_keep_awake flag has been set to 1, and 0 otherwise (if \p
 workerid was not in the STATE_SLEEPING or in the STATE_SCHEDULING).
 
+\fn hwloc_cpuset_t starpu_worker_get_hwloc_cpuset(int workerid)
+\ingroup API_Workers_Properties
+If StarPU was compiled with hwloc support, returns a duplicate of the
+hwloc cpuset associated with the worker \p workerid. The returned cpuset is obtained
+from a \c hwloc_bitmap_dup() function call. It must be freed by the caller
+using \c hwloc_bitmap_free().
+
 */

+ 4 - 0
examples/Makefile.am

@@ -375,10 +375,14 @@ basic_examples_vector_scal_SOURCES =		\
 
 if STARPU_HAVE_ICC
 if STARPU_CROSS_COMPILING
+basic_examples_vector_scal_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) $(basic_examples_vector_scal_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
 else
 basic_examples_vector_scal_SOURCES +=		\
 	basic_examples/vector_scal_cpu_icc.icc
+basic_examples_vector_scal_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(ICC) $(basic_examples_vector_scal_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
 endif
+else
+basic_examples_vector_scal_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) $(basic_examples_vector_scal_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
 endif
 
 if STARPU_USE_CUDA

+ 6 - 4
examples/audio/starpu_audio_processing.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2012, 2014-2015  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -217,7 +217,7 @@ static void band_filter_kernel_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 
 	/* FFTW does not normalize its output ! */
 	float scal = 1.0f/nsamples;
-	cublasStatus_t status = cublasSscal (starpu_cublas_local_handle(), nsamples, &scal, localA, 1);
+	cublasStatus_t status = cublasSscal (starpu_cublas_get_local_handle(), nsamples, &scal, localA, 1);
 	if (status != CUBLAS_STATUS_SUCCESS)
 		STARPU_CUBLAS_REPORT_ERROR(status);
 }
@@ -238,7 +238,7 @@ static void band_filter_kernel_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 		plans[workerid].Acopy = malloc(nsamples*sizeof(float));
 
 		/* create plans, only "fftwf_execute" is thread safe in FFTW ... */
-		starpu_pthread_mutex_lock(&fftw_mutex);
+		STARPU_PTHREAD_MUTEX_LOCK(&fftw_mutex);
 		plans[workerid].plan_cpu = fftwf_plan_dft_r2c_1d(nsamples,
 					plans[workerid].Acopy,
 					plans[workerid].localout_cpu,
@@ -247,7 +247,7 @@ static void band_filter_kernel_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 					plans[workerid].localout_cpu,
 					plans[workerid].Acopy,
 					FFTW_ESTIMATE);
-		starpu_pthread_mutex_unlock(&fftw_mutex);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&fftw_mutex);
 
 		plans[workerid].is_initialized = 1;
 	}
@@ -341,6 +341,8 @@ static void init_problem(void)
 	/* read length of input WAV's data */
 	/* each element is 2 bytes long (16bits)*/
 	length_data = get_wav_data_bytes_length(infile)/2;
+	while (nsamples > length_data)
+		nsamples /= 2;
 
 	/* allocate a buffer to store the content of input file */
 	if (use_pin)

+ 2 - 4
examples/basic_examples/variable.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2013, 2015-2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -51,9 +51,6 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) goto enodev;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-#ifdef STARPU_QUICK_CHECK
-	niter /= 100;
-#endif
         if (argc == 2) niter = atoi(argv[1]);
         foo = 0.0f;
 
@@ -104,6 +101,7 @@ int main(int argc, char **argv)
 	starpu_data_unregister(float_array_handle);
 
 	FPRINTF(stderr, "variable -> %f\n", foo);
+	FPRINTF(stderr, "result is %scorrect\n", foo==niter?"":"IN");
 
 	starpu_shutdown();
 

+ 19 - 10
examples/heat/dw_sparse_cg.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2011, 2015, 2017  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -127,21 +127,24 @@ void init_problem(void)
  *	cg initialization phase
  */
 
-static struct starpu_codelet cl1 = {
+static struct starpu_codelet cl1 =
+{
 	.cpu_funcs = { cpu_codelet_func_1 },
 	.cpu_funcs_name = { "cpu_codelet_func_1" },
 	.nbuffers = 4,
 	.modes = { STARPU_R, STARPU_R, STARPU_W, STARPU_R },
 };
 
-static struct starpu_codelet cl2 = {
+static struct starpu_codelet cl2 =
+{
 	.cpu_funcs = { cpu_codelet_func_2 },
 	.cpu_funcs_name = { "cpu_codelet_func_2" },
 	.nbuffers = 2,
 	.modes = { STARPU_W, STARPU_R },
 };
 
-static struct starpu_codelet cl3 = {
+static struct starpu_codelet cl3 =
+{
 	.cpu_funcs = { cpu_codelet_func_3 },
 	.cpu_funcs_name = { "cpu_codelet_func_3" },
 #ifdef STARPU_USE_CUDA
@@ -205,14 +208,16 @@ void init_cg(struct cg_problem *problem)
  *		the codelet code launcher is its own callback !
  */
 
-static struct starpu_codelet cl4 = {
+static struct starpu_codelet cl4 =
+{
 	.cpu_funcs = { cpu_codelet_func_4 },
 	.cpu_funcs_name = { "cpu_codelet_func_4" },
 	.nbuffers = 3,
 	.modes = { STARPU_R, STARPU_R, STARPU_W },
 };
 
-static struct starpu_codelet cl5 = {
+static struct starpu_codelet cl5 =
+{
 	.cpu_funcs = { cpu_codelet_func_5 },
 	.cpu_funcs_name = { "cpu_codelet_func_5" },
 #ifdef STARPU_USE_CUDA
@@ -222,7 +227,8 @@ static struct starpu_codelet cl5 = {
 	.modes = { STARPU_R, STARPU_R },
 };
 
-static struct starpu_codelet cl6 = {
+static struct starpu_codelet cl6 =
+{
 	.cpu_funcs = { cpu_codelet_func_6 },
 	.cpu_funcs_name = { "cpu_codelet_func_6" },
 #ifdef STARPU_USE_CUDA
@@ -233,7 +239,8 @@ static struct starpu_codelet cl6 = {
 	.modes = { STARPU_RW, STARPU_R },
 };
 
-static struct starpu_codelet cl7 = {
+static struct starpu_codelet cl7 =
+{
 	.cpu_funcs = { cpu_codelet_func_7 },
 	.cpu_funcs_name = { "cpu_codelet_func_7" },
 #ifdef STARPU_USE_CUDA
@@ -244,7 +251,8 @@ static struct starpu_codelet cl7 = {
 	.modes = { STARPU_RW, STARPU_R },
 };
 
-static struct starpu_codelet cl8 = {
+static struct starpu_codelet cl8 =
+{
 	.cpu_funcs = { cpu_codelet_func_8 },
 	.cpu_funcs_name = { "cpu_codelet_func_8" },
 #ifdef STARPU_USE_CUDA
@@ -254,7 +262,8 @@ static struct starpu_codelet cl8 = {
 	.modes = { STARPU_R },
 };
 
-static struct starpu_codelet cl9 = {
+static struct starpu_codelet cl9 =
+{
 	.cpu_funcs = { cpu_codelet_func_9 },
 	.cpu_funcs_name = { "cpu_codelet_func_9" },
 #ifdef STARPU_USE_CUDA

+ 20 - 17
examples/mlr/mlr.c

@@ -151,25 +151,28 @@ static struct starpu_codelet cl_final =
 int main(int argc, char **argv)
 {
 	/* Initialization */
-	unsigned i,j;
+	unsigned i;
 	int ret;
+
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV)
 		return 77;
 
 	sum=0;
-	int m,n,k;
-	int* vector_mn = calloc( 2, sizeof(int) );
+	int* vector_mn = calloc(2, sizeof(int));
 	starpu_data_handle_t vector_mn_handle;
 
-	starpu_vector_data_register( &vector_mn_handle,
-				     STARPU_MAIN_RAM,
-				     (uintptr_t)vector_mn, 2,
-				     sizeof(int) );
+	starpu_vector_data_register(&vector_mn_handle,
+				    STARPU_MAIN_RAM,
+				    (uintptr_t)vector_mn, 2,
+				    sizeof(int));
 
 	/* Giving pseudo-random values to the M,N,K parameters and inserting tasks */
-	for ( i = 0; i < 42; i++)
+	for (i = 0; i < 42; i++)
 	{
+		int j;
+		int m,n,k;
+
 		m = (int) ((rand() % 10)+1);
 		n = (int) ((rand() % 10)+1);
 		k = (int) ((rand() % 10)+1);
@@ -180,16 +183,16 @@ int main(int argc, char **argv)
 		vector_mn[1] = n;
 		starpu_data_release(vector_mn_handle);
 
-		for ( j = 0; j < 42; j++)
+		for (j = 0; j < 42; j++)
 		{
-			starpu_insert_task( &cl_init,
-					    STARPU_R, vector_mn_handle,
-					    STARPU_VALUE, &k, sizeof(int),
-					    0 );
-			starpu_insert_task( &cl_final,
-					    STARPU_R, vector_mn_handle,
-					    STARPU_VALUE, &k, sizeof(int),
-					    0 );
+			starpu_insert_task(&cl_init,
+					   STARPU_R, vector_mn_handle,
+					   STARPU_VALUE, &k, sizeof(int),
+					   0);
+			starpu_insert_task(&cl_final,
+					   STARPU_R, vector_mn_handle,
+					   STARPU_VALUE, &k, sizeof(int),
+					   0);
 		}
 	}
 

+ 2 - 2
examples/sched_ctx/two_cpu_contexts.c

@@ -72,8 +72,8 @@ int main(int argc, char **argv)
 		procs2[i] = procs[i+nprocs1];
 	}
 
-        /* create sched context 1 with default policy, by giving a NULL policy name */
-	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, NULL, 0);
+        /* create sched context 1 with default policy, by giving a empty policy name */
+	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "", 0);
         /* create sched context 2 with a user selected policy name */
 	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", STARPU_SCHED_CTX_POLICY_NAME, "eager", 0);
 

+ 13 - 13
examples/sched_ctx_utils/sched_ctx_utils.c

@@ -75,12 +75,12 @@ void init()
 
 	p1.id = 0;
 	p2.id = 1;
-	starpu_pthread_key_create(&key, NULL);
+	STARPU_PTHREAD_KEY_CREATE(&key, NULL);
 }
 
 void update_sched_ctx_timing_results(double flops, double avg_timing)
 {
-	unsigned *id = starpu_pthread_getspecific(key);
+	unsigned *id = STARPU_PTHREAD_GETSPECIFIC(key);
 	rv[*id].flops += flops;
 	rv[*id].avg_timing += avg_timing;
 }
@@ -90,7 +90,7 @@ void* start_bench(void *val)
 	struct params *p = (struct params*)val;
 	int i;
 
-	starpu_pthread_setspecific(key, &p->id);
+	STARPU_PTHREAD_SETSPECIFIC(key, &p->id);
 
 	if(p->ctx != 0)
 		starpu_sched_ctx_set_context(&p->ctx);
@@ -100,14 +100,14 @@ void* start_bench(void *val)
 
 	if(p->ctx != 0)
 	{
-		starpu_pthread_mutex_lock(&mut);
+		STARPU_PTHREAD_MUTEX_LOCK(&mut);
 		if(first)
 		{
 			starpu_sched_ctx_delete(p->ctx);
 		}
 
 		first = 0;
-		starpu_pthread_mutex_unlock(&mut);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&mut);
 	}
 
 	rv[p->id].flops /= NSAMPLES;
@@ -129,22 +129,22 @@ void start_2benchs(void (*bench)(unsigned, unsigned))
 	p2.nblocks = nblocks2;
 
 	starpu_pthread_t tid[2];
-	starpu_pthread_mutex_init(&mut, NULL);
+	STARPU_PTHREAD_MUTEX_INIT(&mut, NULL);
 
 	double start;
 	double end;
 
 	start = starpu_timing_now();
 
-	starpu_pthread_create(&tid[0], NULL, (void*)start_bench, (void*)&p1);
-	starpu_pthread_create(&tid[1], NULL, (void*)start_bench, (void*)&p2);
+	STARPU_PTHREAD_CREATE(&tid[0], NULL, (void*)start_bench, (void*)&p1);
+	STARPU_PTHREAD_CREATE(&tid[1], NULL, (void*)start_bench, (void*)&p2);
 
-	starpu_pthread_join(tid[0], NULL);
-	starpu_pthread_join(tid[1], NULL);
+	STARPU_PTHREAD_JOIN(tid[0], NULL);
+	STARPU_PTHREAD_JOIN(tid[1], NULL);
 
 	end = starpu_timing_now();
 
-	starpu_pthread_mutex_destroy(&mut);
+	STARPU_PTHREAD_MUTEX_DESTROY(&mut);
 
 	double timing = end - start;
 	timing /= 1000000;
@@ -169,7 +169,7 @@ void start_1stbench(void (*bench)(unsigned, unsigned))
 
 	end = starpu_timing_now();
 
-	starpu_pthread_mutex_destroy(&mut);
+	STARPU_PTHREAD_MUTEX_DESTROY(&mut);
 
 	double timing = end - start;
 	timing /= 1000000;
@@ -193,7 +193,7 @@ void start_2ndbench(void (*bench)(unsigned, unsigned))
 
 	end = starpu_timing_now();
 
-	starpu_pthread_mutex_destroy(&mut);
+	STARPU_PTHREAD_MUTEX_DESTROY(&mut);
 
 	double timing = end - start;
 	timing /= 1000000;

+ 2 - 2
examples/spmv/dw_block_spmv.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2012, 2014-2015  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -176,7 +176,6 @@ void launch_spmv_codelets(void)
 {
 	struct starpu_task *task_tab;
 	uint8_t *is_entry_tab;
-	int ret;
 
 	/* we call one codelet per block */
 	unsigned nblocks = starpu_bcsr_get_nnz(sparse_matrix);
@@ -263,6 +262,7 @@ void launch_spmv_codelets(void)
 	unsigned task;
 	for (task = 0; task < totaltasks; task++)
 	{
+		int ret;
 		if (is_entry_tab[task])
 		{
 			nchains++;

+ 6 - 5
examples/spmv/spmv.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009, 2010, 2011, 2013-2015  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -64,15 +64,16 @@ static void csr_filter_func(void *father_interface, void *child_interface, struc
 
 	uint32_t first_index = id*chunk_size - firstentry;
 	uint32_t local_firstentry = rowptr[first_index];
-	
+
 	uint32_t child_nrow = STARPU_MIN(chunk_size, nrow - id*chunk_size);
-	uint32_t local_nnz = rowptr[first_index + child_nrow] - rowptr[first_index]; 
-	
+	uint32_t local_nnz = rowptr[first_index + child_nrow] - rowptr[first_index];
+
+	csr_child->id = csr_father->id;
 	csr_child->nnz = local_nnz;
 	csr_child->nrow = child_nrow;
 	csr_child->firstentry = local_firstentry;
 	csr_child->elemsize = elemsize;
-	
+
 	if (csr_father->nzval)
 	{
 		csr_child->rowptr = &csr_father->rowptr[first_index];

+ 11 - 0
include/fstarpu_mod.f90

@@ -1298,6 +1298,17 @@ module fstarpu_mod
                         type(c_ptr), value, intent(in) :: nparts
                 end subroutine fstarpu_vector_filter_block_shadow
 
+                ! void starpu_vector_filter_list_long(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+                subroutine fstarpu_vector_filter_list_long (father_interface,child_interface,filter,id,nparts) &
+                                bind(C,name="starpu_vector_filter_list_long")
+                        use iso_c_binding, only: c_ptr
+                        type(c_ptr), value, intent(in) :: father_interface
+                        type(c_ptr), value, intent(in) :: child_interface
+                        type(c_ptr), value, intent(in) :: filter
+                        type(c_ptr), value, intent(in) :: id
+                        type(c_ptr), value, intent(in) :: nparts
+                end subroutine fstarpu_vector_filter_list_long
+
                 ! void starpu_vector_filter_list(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
                 subroutine fstarpu_vector_filter_list (father_interface,child_interface,filter,id,nparts) &
                                 bind(C,name="starpu_vector_filter_list")

+ 2 - 2
include/starpu.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2014, 2016-2017  Université de Bordeaux
- * Copyright (C) 2010-2015  CNRS
+ * Copyright (C) 2010-2015, 2017  CNRS
  * Copyright (C) 2014, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -145,8 +145,8 @@ struct starpu_conf
 int starpu_conf_init(struct starpu_conf *conf);
 
 int starpu_init(struct starpu_conf *conf) STARPU_WARN_UNUSED_RESULT;
-
 int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv);
+int starpu_is_initialized(void);
 
 void starpu_pause(void);
 void starpu_resume(void);

+ 2 - 0
include/starpu_config.h.in

@@ -108,6 +108,7 @@
 #undef STARPU_HAVE_SETENV
 #undef STARPU_HAVE_UNSETENV
 #undef STARPU_HAVE_UNISTD_H
+#undef STARPU_HAVE_HDF5
 
 #undef STARPU_FXT_LOCK_TRACES
 
@@ -152,5 +153,6 @@ typedef ssize_t starpu_ssize_t;
 
 #undef STARPU_HAVE_CXX11
 #undef STARPU_HAVE_STRERROR_R
+#undef STARPU_HAVE_STATEMENT_EXPRESSIONS
 
 #endif

+ 2 - 1
include/starpu_data_filters.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012, 2015  Université de Bordeaux
+ * Copyright (C) 2010-2012, 2015, 2017  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  CNRS
  *
@@ -69,6 +69,7 @@ void starpu_matrix_filter_vertical_block_shadow(void *father_interface, void *ch
 
 void starpu_vector_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_vector_filter_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_vector_filter_list_long(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_vector_filter_list(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_vector_filter_divide_in_2(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 

+ 59 - 16
include/starpu_data_interfaces.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2016  Université de Bordeaux
- * Copyright (C) 2010-2014  CNRS
+ * Copyright (C) 2010-2014, 2017  CNRS
  * Copyright (C) 2011-2012, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -137,6 +137,8 @@ struct starpu_data_interface_ops
 
 	int (*pack_data) (starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count);
 	int (*unpack_data) (starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
+
+	char *name;
 };
 
 int starpu_data_interface_get_next_id(void);
@@ -173,13 +175,24 @@ uint32_t starpu_matrix_get_local_ld(starpu_data_handle_t handle);
 uintptr_t starpu_matrix_get_local_ptr(starpu_data_handle_t handle);
 size_t starpu_matrix_get_elemsize(starpu_data_handle_t handle);
 
+#if defined(STARPU_HAVE_STATEMENT_EXPRESSIONS) && defined(STARPU_DEBUG)
+#define STARPU_MATRIX_CHECK(interface)          STARPU_ASSERT_MSG((((struct starpu_matrix_interface *)(interface))->id) == STARPU_MATRIX_INTERFACE_ID, "Error. The given data is not a matrix.")
+#define STARPU_MATRIX_GET_PTR(interface)	({ STARPU_MATRIX_CHECK(interface); (((struct starpu_matrix_interface *)(interface))->ptr) ; })
+#define STARPU_MATRIX_GET_DEV_HANDLE(interface)	({ STARPU_MATRIX_CHECK(interface); (((struct starpu_matrix_interface *)(interface))->dev_handle) ; })
+#define STARPU_MATRIX_GET_OFFSET(interface)	({ STARPU_MATRIX_CHECK(interface); (((struct starpu_matrix_interface *)(interface))->offset) ; })
+#define STARPU_MATRIX_GET_NX(interface)	        ({ STARPU_MATRIX_CHECK(interface); (((struct starpu_matrix_interface *)(interface))->nx) ; })
+#define STARPU_MATRIX_GET_NY(interface)	        ({ STARPU_MATRIX_CHECK(interface); (((struct starpu_matrix_interface *)(interface))->ny) ; })
+#define STARPU_MATRIX_GET_LD(interface)	        ({ STARPU_MATRIX_CHECK(interface); (((struct starpu_matrix_interface *)(interface))->ld) ; })
+#define STARPU_MATRIX_GET_ELEMSIZE(interface)	({ STARPU_MATRIX_CHECK(interface); (((struct starpu_matrix_interface *)(interface))->elemsize) ; })
+#else
 #define STARPU_MATRIX_GET_PTR(interface)	(((struct starpu_matrix_interface *)(interface))->ptr)
 #define STARPU_MATRIX_GET_DEV_HANDLE(interface)	(((struct starpu_matrix_interface *)(interface))->dev_handle)
 #define STARPU_MATRIX_GET_OFFSET(interface)	(((struct starpu_matrix_interface *)(interface))->offset)
-#define STARPU_MATRIX_GET_NX(interface)	(((struct starpu_matrix_interface *)(interface))->nx)
-#define STARPU_MATRIX_GET_NY(interface)	(((struct starpu_matrix_interface *)(interface))->ny)
-#define STARPU_MATRIX_GET_LD(interface)	(((struct starpu_matrix_interface *)(interface))->ld)
+#define STARPU_MATRIX_GET_NX(interface)	        (((struct starpu_matrix_interface *)(interface))->nx)
+#define STARPU_MATRIX_GET_NY(interface)	        (((struct starpu_matrix_interface *)(interface))->ny)
+#define STARPU_MATRIX_GET_LD(interface)	        (((struct starpu_matrix_interface *)(interface))->ld)
 #define STARPU_MATRIX_GET_ELEMSIZE(interface)	(((struct starpu_matrix_interface *)(interface))->elemsize)
+#endif
 
 extern struct starpu_data_interface_ops starpu_interface_coo_ops;
 
@@ -248,15 +261,28 @@ uint32_t starpu_block_get_local_ldz(starpu_data_handle_t handle);
 uintptr_t starpu_block_get_local_ptr(starpu_data_handle_t handle);
 size_t starpu_block_get_elemsize(starpu_data_handle_t handle);
 
-#define STARPU_BLOCK_GET_PTR(interface)	(((struct starpu_block_interface *)(interface))->ptr)
+#if defined(STARPU_HAVE_STATEMENT_EXPRESSIONS) && defined(STARPU_DEBUG)
+#define STARPU_BLOCK_CHECK(interface)           STARPU_ASSERT_MSG((((struct starpu_block_interface *)(interface))->id) == STARPU_BLOCK_INTERFACE_ID, "Error. The given data is not a block.")
+#define STARPU_BLOCK_GET_PTR(interface)	        ({ STARPU_BLOCK_CHECK(interface); (((struct starpu_block_interface *)(interface))->ptr) ; })
+#define STARPU_BLOCK_GET_DEV_HANDLE(interface)	({ STARPU_BLOCK_CHECK(interface); (((struct starpu_block_interface *)(interface))->dev_handle) ; })
+#define STARPU_BLOCK_GET_OFFSET(interface)	({ STARPU_BLOCK_CHECK(interface); (((struct starpu_block_interface *)(interface))->offset) ; })
+#define STARPU_BLOCK_GET_NX(interface)	        ({ STARPU_BLOCK_CHECK(interface); (((struct starpu_block_interface *)(interface))->nx) ; })
+#define STARPU_BLOCK_GET_NY(interface)	        ({ STARPU_BLOCK_CHECK(interface); (((struct starpu_block_interface *)(interface))->ny) ; })
+#define STARPU_BLOCK_GET_NZ(interface)	        ({ STARPU_BLOCK_CHECK(interface); (((struct starpu_block_interface *)(interface))->nz) ; })
+#define STARPU_BLOCK_GET_LDY(interface)	        ({ STARPU_BLOCK_CHECK(interface); (((struct starpu_block_interface *)(interface))->ldy) ; })
+#define STARPU_BLOCK_GET_LDZ(interface)	        ({ STARPU_BLOCK_CHECK(interface); (((struct starpu_block_interface *)(interface))->ldz) ; })
+#define STARPU_BLOCK_GET_ELEMSIZE(interface)	({ STARPU_BLOCK_CHECK(interface); (((struct starpu_block_interface *)(interface))->elemsize) ; })
+#else
+#define STARPU_BLOCK_GET_PTR(interface)	        (((struct starpu_block_interface *)(interface))->ptr)
 #define STARPU_BLOCK_GET_DEV_HANDLE(interface)	(((struct starpu_block_interface *)(interface))->dev_handle)
 #define STARPU_BLOCK_GET_OFFSET(interface)	(((struct starpu_block_interface *)(interface))->offset)
-#define STARPU_BLOCK_GET_NX(interface)	(((struct starpu_block_interface *)(interface))->nx)
-#define STARPU_BLOCK_GET_NY(interface)	(((struct starpu_block_interface *)(interface))->ny)
-#define STARPU_BLOCK_GET_NZ(interface)	(((struct starpu_block_interface *)(interface))->nz)
-#define STARPU_BLOCK_GET_LDY(interface)	(((struct starpu_block_interface *)(interface))->ldy)
-#define STARPU_BLOCK_GET_LDZ(interface)	(((struct starpu_block_interface *)(interface))->ldz)
+#define STARPU_BLOCK_GET_NX(interface)	        (((struct starpu_block_interface *)(interface))->nx)
+#define STARPU_BLOCK_GET_NY(interface)	        (((struct starpu_block_interface *)(interface))->ny)
+#define STARPU_BLOCK_GET_NZ(interface)	        (((struct starpu_block_interface *)(interface))->nz)
+#define STARPU_BLOCK_GET_LDY(interface)	        (((struct starpu_block_interface *)(interface))->ldy)
+#define STARPU_BLOCK_GET_LDZ(interface)	        (((struct starpu_block_interface *)(interface))->ldz)
 #define STARPU_BLOCK_GET_ELEMSIZE(interface)	(((struct starpu_block_interface *)(interface))->elemsize)
+#endif
 
 extern struct starpu_data_interface_ops starpu_interface_vector_ops;
 
@@ -279,12 +305,22 @@ uint32_t starpu_vector_get_nx(starpu_data_handle_t handle);
 size_t starpu_vector_get_elemsize(starpu_data_handle_t handle);
 uintptr_t starpu_vector_get_local_ptr(starpu_data_handle_t handle);
 
+#if defined(STARPU_HAVE_STATEMENT_EXPRESSIONS) && defined(STARPU_DEBUG)
+#define STARPU_VECTOR_CHECK(interface)          STARPU_ASSERT_MSG((((struct starpu_vector_interface *)(interface))->id) == STARPU_VECTOR_INTERFACE_ID, "Error. The given data is not a vector.")
+#define STARPU_VECTOR_GET_PTR(interface)	({ STARPU_VECTOR_CHECK(interface); (((struct starpu_vector_interface *)(interface))->ptr); })
+#define STARPU_VECTOR_GET_DEV_HANDLE(interface)	({ STARPU_VECTOR_CHECK(interface); (((struct starpu_vector_interface *)(interface))->dev_handle); })
+#define STARPU_VECTOR_GET_OFFSET(interface)	({ STARPU_VECTOR_CHECK(interface); (((struct starpu_vector_interface *)(interface))->offset); })
+#define STARPU_VECTOR_GET_NX(interface)	        ({ STARPU_VECTOR_CHECK(interface); (((struct starpu_vector_interface *)(interface))->nx); })
+#define STARPU_VECTOR_GET_ELEMSIZE(interface)	({ STARPU_VECTOR_CHECK(interface); (((struct starpu_vector_interface *)(interface))->elemsize); })
+#define STARPU_VECTOR_GET_SLICE_BASE(interface)	({ STARPU_VECTOR_CHECK(interface); (((struct starpu_vector_interface *)(interface))->slice_base); })
+#else
 #define STARPU_VECTOR_GET_PTR(interface)	(((struct starpu_vector_interface *)(interface))->ptr)
 #define STARPU_VECTOR_GET_DEV_HANDLE(interface)	(((struct starpu_vector_interface *)(interface))->dev_handle)
 #define STARPU_VECTOR_GET_OFFSET(interface)	(((struct starpu_vector_interface *)(interface))->offset)
-#define STARPU_VECTOR_GET_NX(interface)	(((struct starpu_vector_interface *)(interface))->nx)
+#define STARPU_VECTOR_GET_NX(interface)	        (((struct starpu_vector_interface *)(interface))->nx)
 #define STARPU_VECTOR_GET_ELEMSIZE(interface)	(((struct starpu_vector_interface *)(interface))->elemsize)
 #define STARPU_VECTOR_GET_SLICE_BASE(interface)	(((struct starpu_vector_interface *)(interface))->slice_base)
+#endif
 
 extern struct starpu_data_interface_ops starpu_interface_variable_ops;
 
@@ -303,11 +339,18 @@ void starpu_variable_ptr_register(starpu_data_handle_t handle, unsigned node, ui
 size_t starpu_variable_get_elemsize(starpu_data_handle_t handle);
 uintptr_t starpu_variable_get_local_ptr(starpu_data_handle_t handle);
 
-#define STARPU_VARIABLE_GET_PTR(interface)	(((struct starpu_variable_interface *)(interface))->ptr)
-#define STARPU_VARIABLE_GET_OFFSET(interface)	(((struct starpu_variable_interface *)(interface))->offset)
-#define STARPU_VARIABLE_GET_ELEMSIZE(interface)	(((struct starpu_variable_interface *)(interface))->elemsize)
-#define STARPU_VARIABLE_GET_DEV_HANDLE(interface) \
-	(((struct starpu_variable_interface *)(interface))->ptr)
+#if defined(STARPU_HAVE_STATEMENT_EXPRESSIONS) && defined(STARPU_DEBUG)
+#define STARPU_VARIABLE_CHECK(interface)          STARPU_ASSERT_MSG((((struct starpu_variable_interface *)(interface))->id) == STARPU_VARIABLE_INTERFACE_ID, "Error. The given data is not a variable.")
+#define STARPU_VARIABLE_GET_PTR(interface)	  ({ STARPU_VARIABLE_CHECK(interface); (((struct starpu_variable_interface *)(interface))->ptr) ; })
+#define STARPU_VARIABLE_GET_OFFSET(interface)	  ({ STARPU_VARIABLE_CHECK(interface); (((struct starpu_variable_interface *)(interface))->offset) ; })
+#define STARPU_VARIABLE_GET_ELEMSIZE(interface)	  ({ STARPU_VARIABLE_CHECK(interface); (((struct starpu_variable_interface *)(interface))->elemsize) ; })
+#define STARPU_VARIABLE_GET_DEV_HANDLE(interface) ({ STARPU_VARIABLE_CHECK(interface); (((struct starpu_variable_interface *)(interface))->ptr) ; })
+#else
+#define STARPU_VARIABLE_GET_PTR(interface)	  (((struct starpu_variable_interface *)(interface))->ptr)
+#define STARPU_VARIABLE_GET_OFFSET(interface)	  (((struct starpu_variable_interface *)(interface))->offset)
+#define STARPU_VARIABLE_GET_ELEMSIZE(interface)	  (((struct starpu_variable_interface *)(interface))->elemsize)
+#define STARPU_VARIABLE_GET_DEV_HANDLE(interface) (((struct starpu_variable_interface *)(interface))->ptr)
+#endif
 
 extern struct starpu_data_interface_ops starpu_interface_void_ops;
 

+ 3 - 0
include/starpu_disk.h

@@ -57,6 +57,7 @@ struct starpu_disk_ops
 
 /* Posix functions to use disk memory */
 extern struct starpu_disk_ops starpu_disk_stdio_ops;
+extern struct starpu_disk_ops starpu_disk_hdf5_ops;
 extern struct starpu_disk_ops starpu_disk_unistd_ops;
 extern struct starpu_disk_ops starpu_disk_unistd_o_direct_ops;
 extern struct starpu_disk_ops starpu_disk_leveldb_ops;
@@ -67,6 +68,8 @@ void *starpu_disk_open(unsigned node, void *pos, size_t size);
 
 int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, starpu_ssize_t size);
 
+#define STARPU_DISK_SIZE_MIN (64*1024*1024)
+
 extern int starpu_disk_swap_node;
 
 #endif /* __STARPU_DISK_H__ */

+ 2 - 2
include/starpu_mpi_ms.h

@@ -36,5 +36,5 @@ starpu_mpi_ms_kernel_t starpu_mpi_ms_get_kernel(starpu_mpi_ms_func_symbol_t symb
 }
 #endif
 
-#endif /* STARPU_USE_MIC */
-#endif /* __STARPU_MIC_H__ */
+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
+#endif /* __STARPU_MPI_MS_H__ */

+ 1 - 0
include/starpu_perfmodel.h

@@ -170,6 +170,7 @@ struct starpu_perfmodel_arch *starpu_worker_get_perf_archtype(int workerid, unsi
 int starpu_perfmodel_get_narch_combs();
 int starpu_perfmodel_arch_comb_add(int ndevices, struct starpu_perfmodel_device* devices);
 int starpu_perfmodel_arch_comb_get(int ndevices, struct starpu_perfmodel_device *devices);
+struct starpu_perfmodel_arch *starpu_perfmodel_arch_comb_fetch(int comb);
 
 struct starpu_perfmodel_per_arch *starpu_perfmodel_get_model_per_arch(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, unsigned impl);
 struct starpu_perfmodel_per_arch *starpu_perfmodel_get_model_per_devices(struct starpu_perfmodel *model, int impl, ...);

+ 0 - 3
include/starpu_scheduler.h

@@ -60,10 +60,7 @@ struct starpu_sched_policy **starpu_sched_get_predefined_policies();
 void starpu_worker_get_sched_condition(int workerid, starpu_pthread_mutex_t **sched_mutex, starpu_pthread_cond_t **sched_cond);
 unsigned long starpu_task_get_job_id(struct starpu_task *task);
 
-/* This function must be called to wake up a worker that is sleeping on the cond. 
- * It returns 0 whenever the worker is not in a sleeping state or has the state_keep_awake flag on */
 int starpu_wake_worker_no_relax(int workerid);
-/* This is a version of starpu_wake_worker which assumes that the sched mutex is locked */
 int starpu_wake_worker_locked(int workerid);
 
 int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);

+ 11 - 1
include/starpu_thread_util.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012-2014, 2016-2017  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -47,6 +47,16 @@
 	}									\
 } while (0)
 
+#define STARPU_PTHREAD_JOIN(thread, retval) do {		    	\
+	int p_ret =  starpu_pthread_join((thread), (retval)); \
+	if (STARPU_UNLIKELY(p_ret != 0)) {					\
+		fprintf(stderr,							\
+			"%s:%d starpu_pthread_join: %s\n",			\
+			__FILE__, __LINE__, strerror(p_ret));			\
+		STARPU_ABORT();							\
+	}									\
+} while (0)
+
 /*
  * Encapsulation of the starpu_pthread_mutex_* functions.
  */

+ 35 - 4
include/starpu_util.h

@@ -159,13 +159,44 @@ extern "C"
 } while(0)
 
 #if defined(STARPU_HAVE_STRERROR_R)
+#if (! defined(__GLIBC__) || !__GLIBC__) || ((_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && (! defined(_GNU_SOURCE)))
+/* XSI-compliant version of strerror_r returns an int */
+#define starpu_strerror_r(errnum, buf, buflen) \
+	do \
+	{ \
+		int _ret = strerror_r((errnum), (buf), (buflen)); \
+		STARPU_ASSERT(_ret == 0); \
+	} \
+	while (0)
+#else
+/* GNU-specific version of strerror_r returns a char * */
+#define starpu_strerror_r(errnum, buf, buflen) \
+	do \
+	{ \
+		char * const _user_buf = (buf); \
+		const size_t _user_buflen = (buflen); \
+		/* the GNU-specific behaviour when 'buf' == NULL cannot be emulated with the XSI-compliant version */ \
+		STARPU_ASSERT((buf) != NULL); \
+		char * _tmp_buf = strerror_r((errnum), _user_buf, _user_buflen); \
+		if (_tmp_buf != _user_buf) \
+		{ \
+			if (_user_buflen > 0) \
+			{ \
+				strncpy(_user_buf, _tmp_buf, _user_buflen); \
+				_user_buf[_user_buflen-1] = '\0'; \
+			} \
+		} \
+	} \
+	while (0)
+#endif /* strerror_r ABI version */
+
 #  define STARPU_CHECK_RETURN_VALUE(err, message, ...) {if (STARPU_UNLIKELY(err != 0)) { \
-			char xmessage[256]; char *_strerror = strerror_r(-err, xmessage, 256); \
-			fprintf(stderr, "[starpu] Unexpected value: <%d:%s> returned for " message "\n", err, _strerror==NULL?"":xmessage, ## __VA_ARGS__); \
+			char xmessage[256]; starpu_strerror_r(-err, xmessage, 256); \
+			fprintf(stderr, "[starpu] Unexpected value: <%d:%s> returned for " message "\n", err, xmessage, ## __VA_ARGS__); \
 			STARPU_ABORT(); }}
 #  define STARPU_CHECK_RETURN_VALUE_IS(err, value, message, ...) {if (STARPU_UNLIKELY(err != value)) { \
-			char xmessage[256]; char *_strerror=strerror_r(-err, xmessage, 256); \
-			fprintf(stderr, "[starpu] Unexpected value: <%d!=%d:%s> returned for " message "\n", err, value, _strerror==NULL?"":xmessage, ## __VA_ARGS__); \
+			char xmessage[256]; starpu_strerror_r(-err, xmessage, 256); \
+			fprintf(stderr, "[starpu] Unexpected value: <%d!=%d:%s> returned for " message "\n", err, value, xmessage, ## __VA_ARGS__); \
 			STARPU_ABORT(); }}
 #else
 #  define STARPU_CHECK_RETURN_VALUE(err, message, ...) {if (STARPU_UNLIKELY(err != 0)) { \

+ 8 - 0
include/starpu_worker.h

@@ -25,6 +25,10 @@
 #include <starpu_thread.h>
 #include <starpu_task.h>
 
+#ifdef STARPU_HAVE_HWLOC
+#include <hwloc.h>
+#endif
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -162,6 +166,10 @@ void starpu_worker_unlock_self(void);
 
 int starpu_wake_worker_relax(int workerid);
 
+#ifdef STARPU_HAVE_HWLOC
+hwloc_cpuset_t starpu_worker_get_hwloc_cpuset(int workerid);
+#endif
+
 #ifdef __cplusplus
 }
 #endif

+ 1 - 1
mpi/examples/mpi_lu/pxlu.c

@@ -302,7 +302,7 @@ static void create_task_11(unsigned k)
 	if (get_block_rank(k, k) == rank)
 	{
 #ifdef VERBOSE_INIT
-		fprintf(stderr, "CREATE real task 11(%u) (TAG11_SAVE(%u) = %lx) on node %d\n", k, k, TAG11_SAVE(k), rank);
+		fprintf(stderr, "CREATE real task 11(%u) (TAG11_SAVE(%u) = %llux) on node %d\n", k, k, TAG11_SAVE(k), rank);
 #endif
 		create_task_11_real(k);
 	}

+ 3 - 2
mpi/examples/mpi_lu/pxlu.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2014  Université de Bordeaux
- * Copyright (C) 2010, 2012, 2014  CNRS
+ * Copyright (C) 2010, 2012, 2014, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,7 +31,8 @@
 //#define SINGLE_TMP11	1
 //#define SINGLE_TMP1221	1
 
-struct debug_info {
+struct debug_info
+{
 	unsigned i;
 	unsigned j;
 	unsigned k;

+ 2 - 2
mpi/src/load_balancer/policy/load_heat_propagation.c

@@ -96,14 +96,14 @@ static void balance(starpu_data_handle_t load_data_cpy)
 {
 	int less_loaded = -1;
 	int n;
-	double elapsed_time, ref_elapsed_time;
+	double ref_elapsed_time;
 	double my_elapsed_time = load_data_get_elapsed_time(load_data_cpy);
 
 	/* Search for the less loaded neighbor */
 	ref_elapsed_time = my_elapsed_time;
 	for (n = 0; n < nneighbors; n++)
 	{
-		elapsed_time = load_data_get_elapsed_time(neighbor_load_data_handles[n]);
+		double elapsed_time = load_data_get_elapsed_time(neighbor_load_data_handles[n]);
 		if (ref_elapsed_time > elapsed_time)
 		{
 			//fprintf(stderr,"Node%d: ref local time %lf vs neighbour%d time %lf\n", my_rank, ref_elapsed_time, neighbor_ids[n], elapsed_time);

+ 13 - 2
mpi/src/starpu_mpi.c

@@ -1166,11 +1166,20 @@ static void _starpu_mpi_test_detached_requests(void)
 
 	STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
 
+	if (_starpu_mpi_req_list_empty(detached_requests))
+	{
+		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
+		//_STARPU_MPI_LOG_OUT();
+		return;
+	}
+
+	_STARPU_MPI_TRACE_TESTING_DETACHED_BEGIN();
 	req = _starpu_mpi_req_list_begin(detached_requests);
 	while (req != _starpu_mpi_req_list_end(detached_requests))
 	{
 		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 
+		_STARPU_MPI_TRACE_TEST_BEGIN(req->node_tag.rank, req->node_tag.data_tag);
 		//_STARPU_MPI_DEBUG(3, "Test detached request %p - mpitag %d - TYPE %s %d\n", &req->data_request, req->node_tag.data_tag, _starpu_mpi_request_type(req->request_type), req->node_tag.rank);
 #ifdef STARPU_SIMGRID
 		req->ret = _starpu_mpi_simgrid_mpi_test(&req->done, &flag);
@@ -1180,6 +1189,7 @@ static void _starpu_mpi_test_detached_requests(void)
 #endif
 
 		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
+		_STARPU_MPI_TRACE_TEST_END(req->node_tag.rank, req->node_tag.data_tag);
 
 		if (!flag)
 		{
@@ -1209,6 +1219,7 @@ static void _starpu_mpi_test_detached_requests(void)
 
 		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
 	}
+	_STARPU_MPI_TRACE_TESTING_DETACHED_END();
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 	//_STARPU_MPI_LOG_OUT();
@@ -1221,7 +1232,7 @@ static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req)
 		/* put the submitted request into the list of pending requests
 		 * so that it can be handled by the progression mechanisms */
 		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
-		_starpu_mpi_req_list_push_front(detached_requests, req);
+		_starpu_mpi_req_list_push_back(detached_requests, req);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 
 		starpu_wake_all_blocked_workers();
@@ -1691,7 +1702,7 @@ void _starpu_mpi_progress_shutdown(int *value)
 	(void) value;
 	MSG_process_sleep(1);
 #else
-	starpu_pthread_join(progress_thread, (void *)value);
+	STARPU_PTHREAD_JOIN(progress_thread, (void *)value);
 #endif
 
         /* free the request queues */

+ 18 - 10
mpi/src/starpu_mpi_datatype.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2011, 2015  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -148,8 +148,8 @@ static starpu_mpi_datatype_allocate_func_t handle_to_datatype_funcs[STARPU_MAX_I
 	[STARPU_MATRIX_INTERFACE_ID]	= handle_to_datatype_matrix,
 	[STARPU_BLOCK_INTERFACE_ID]	= handle_to_datatype_block,
 	[STARPU_VECTOR_INTERFACE_ID]	= handle_to_datatype_vector,
-	[STARPU_CSR_INTERFACE_ID]	= NULL,
-	[STARPU_BCSR_INTERFACE_ID]	= NULL,
+	[STARPU_CSR_INTERFACE_ID]	= NULL, /* Sent through pack/unpack operations */
+	[STARPU_BCSR_INTERFACE_ID]	= NULL, /* Sent through pack/unpack operations */
 	[STARPU_VARIABLE_INTERFACE_ID]	= handle_to_datatype_variable,
 	[STARPU_VOID_INTERFACE_ID]	= handle_to_datatype_void,
 	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
@@ -162,9 +162,17 @@ void _starpu_mpi_datatype_allocate(starpu_data_handle_t data_handle, struct _sta
 	if (id < STARPU_MAX_INTERFACE_ID)
 	{
 		starpu_mpi_datatype_allocate_func_t func = handle_to_datatype_funcs[id];
-		STARPU_ASSERT_MSG(func, "Handle To Datatype Function not defined for StarPU data interface %d", id);
-		func(data_handle, &req->datatype);
-		req->registered_datatype = 1;
+		if (func)
+		{
+			func(data_handle, &req->datatype);
+			req->registered_datatype = 1;
+		}
+		else
+		{
+			/* The datatype is predefined by StarPU but it will be sent as a memory area */
+			req->datatype = MPI_BYTE;
+			req->registered_datatype = 0;
+		}
 	}
 	else
 	{
@@ -236,8 +244,8 @@ static starpu_mpi_datatype_free_func_t handle_free_datatype_funcs[STARPU_MAX_INT
 	[STARPU_MATRIX_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
 	[STARPU_BLOCK_INTERFACE_ID]	= _starpu_mpi_handle_free_complex_datatype,
 	[STARPU_VECTOR_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
-	[STARPU_CSR_INTERFACE_ID]	= NULL,
-	[STARPU_BCSR_INTERFACE_ID]	= NULL,
+	[STARPU_CSR_INTERFACE_ID]	= NULL,  /* Sent through pack/unpack operations */
+	[STARPU_BCSR_INTERFACE_ID]	= NULL,  /* Sent through pack/unpack operations */
 	[STARPU_VARIABLE_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
 	[STARPU_VOID_INTERFACE_ID]      = _starpu_mpi_handle_free_simple_datatype,
 	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
@@ -250,8 +258,8 @@ void _starpu_mpi_datatype_free(starpu_data_handle_t data_handle, MPI_Datatype *d
 	if (id < STARPU_MAX_INTERFACE_ID)
 	{
 		starpu_mpi_datatype_free_func_t func = handle_free_datatype_funcs[id];
-		STARPU_ASSERT_MSG(func, "Handle free datatype function not defined for StarPU data interface %d", id);
-		func(datatype);
+		if (func)
+			func(datatype);
 	}
 	else
 	{

+ 24 - 0
mpi/src/starpu_mpi_fxt.h

@@ -49,6 +49,10 @@ extern "C"
 #define _STARPU_MPI_FUT_DATA_SET_RANK			0x521a
 #define _STARPU_MPI_FUT_IRECV_TERMINATED		0x521b
 #define _STARPU_MPI_FUT_ISEND_TERMINATED		0x521c
+#define _STARPU_MPI_FUT_TESTING_DETACHED_BEGIN		0x521d
+#define _STARPU_MPI_FUT_TESTING_DETACHED_END		0x521e
+#define _STARPU_MPI_FUT_TEST_BEGIN			0x521f
+#define _STARPU_MPI_FUT_TEST_END			0x5220
 
 #ifdef STARPU_USE_FXT
 #define _STARPU_MPI_TRACE_START(rank, worldsize)	\
@@ -98,6 +102,22 @@ extern "C"
 	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_END, (src), (mpi_tag), _starpu_gettid());
 #define _STARPU_MPI_TRACE_DATA_SET_RANK(handle, rank)	\
 	FUT_DO_PROBE3(_STARPU_MPI_FUT_DATA_SET_RANK, (handle), (rank), _starpu_gettid());
+#if 0
+/* This is very expensive in the trace, only enable for debugging */
+#define _STARPU_MPI_TRACE_TESTING_DETACHED_BEGIN()	\
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_TESTING_DETACHED_BEGIN, _starpu_gettid());
+#define _STARPU_MPI_TRACE_TESTING_DETACHED_END()	\
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_TESTING_DETACHED_END, _starpu_gettid());
+#define _STARPU_MPI_TRACE_TEST_BEGIN(peer, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_TEST_BEGIN, (peer), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_TEST_END(peer, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_TEST_END, (peer), (mpi_tag), _starpu_gettid());
+#else
+#define _STARPU_MPI_TRACE_TESTING_DETACHED_BEGIN()		do {} while(0)
+#define _STARPU_MPI_TRACE_TESTING_DETACHED_END()		do {} while(0)
+#define _STARPU_MPI_TRACE_TEST_BEGIN(peer, mpi_tag)		do {} while(0)
+#define _STARPU_MPI_TRACE_TEST_END(peer, mpi_tag)		do {} while(0)
+#endif
 #define TRACE
 #else
 #define _STARPU_MPI_TRACE_START(a, b)				do {} while(0);
@@ -123,6 +143,10 @@ extern "C"
 #define _STARPU_MPI_TRACE_UWAIT_BEGIN(a, b)			do {} while(0);
 #define _STARPU_MPI_TRACE_UWAIT_END(a, b)			do {} while(0);
 #define _STARPU_MPI_TRACE_DATA_SET_RANK(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_TESTING_DETACHED_BEGIN()		do {} while(0)
+#define _STARPU_MPI_TRACE_TESTING_DETACHED_END()		do {} while(0)
+#define _STARPU_MPI_TRACE_TEST_BEGIN(peer, mpi_tag)		do {} while(0)
+#define _STARPU_MPI_TRACE_TEST_END(peer, mpi_tag)		do {} while(0)
 #endif
 
 #ifdef __cplusplus

+ 1 - 1
mpi/src/starpu_mpi_private.h

@@ -91,7 +91,7 @@ int _starpu_debug_rank;
 
 #define _STARPU_MPI_MALLOC(ptr, size) do { ptr = malloc(size); STARPU_MPI_ASSERT_MSG(ptr != NULL, "Cannot allocate %ld bytes\n", (long) size); } while (0)
 #define _STARPU_MPI_CALLOC(ptr, nmemb, size) do { ptr = calloc(nmemb, size); STARPU_MPI_ASSERT_MSG(ptr != NULL, "Cannot allocate %ld bytes\n", (long) (nmemb*size)); } while (0)
-#define _STARPU_MPI_REALLOC(ptr, size) do { ptr = realloc(ptr, size); STARPU_MPI_ASSERT_MSG(ptr != NULL, "Cannot reallocate %ld bytes\n", (long) size); } while (0)
+#define _STARPU_MPI_REALLOC(ptr, size) do { void *_new_ptr = realloc(ptr, size); STARPU_MPI_ASSERT_MSG(_new_ptr != NULL, "Cannot reallocate %ld bytes\n", (long) size); ptr = _new_ptr; } while (0)
 
 #ifdef STARPU_VERBOSE
 #  define _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, node, tag, utag, comm, way) \

+ 2 - 2
mpi/src/starpu_mpi_task_insert.c

@@ -54,7 +54,7 @@ int starpu_mpi_pre_submit_hook_unregister()
 
 int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *xrank)
 {
-	if (mode & STARPU_W)
+	if (mode & STARPU_W || mode & STARPU_REDUX)
 	{
 		if (!data)
 		{
@@ -431,7 +431,7 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 	if (inconsistent_execute == 1 || *xrank == -1)
 	{
 		// We need to find out which node is going to execute the codelet.
-		_STARPU_MPI_DISP("Different nodes are owning W data. The node to execute the codelet is going to be selected with the current selection node policy. See starpu_mpi_node_selection_set_current_policy() to change the policy, or use STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA to specify the node\n");
+		_STARPU_MPI_DEBUG(100, "Different nodes are owning W data. The node to execute the codelet is going to be selected with the current selection node policy. See starpu_mpi_node_selection_set_current_policy() to change the policy, or use STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA to specify the node\n");
 		*xrank = _starpu_mpi_select_node(me, nb_nodes, descrs, nb_data, select_node_policy);
 		*do_execute = (me == *xrank);
 	}

+ 1 - 1
mpi/tests/block_interface.c

@@ -56,7 +56,7 @@ int main(int argc, char **argv)
 	 * their blocks. */
 
 	float *block = NULL;
-	starpu_data_handle_t block_handle;
+	starpu_data_handle_t block_handle = NULL;
 
 	if (rank == 0)
 	{

+ 2 - 2
mpi/tests/block_interface_pinned.c

@@ -56,8 +56,8 @@ int main(int argc, char **argv)
 	 * register it directly. Node 0 and 1 will then exchange the content of
 	 * their blocks. */
 
-	float *block;
-	starpu_data_handle_t block_handle;
+	float *block = NULL;
+	starpu_data_handle_t block_handle = NULL;
 
 	if (rank == 0)
 	{

+ 360 - 117
mpi/tests/datatypes.c

@@ -20,11 +20,65 @@
 
 typedef void (*check_func)(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error);
 
+void send_recv_and_check(int rank, int node, starpu_data_handle_t handle_s, int tag_s, starpu_data_handle_t handle_r, int tag_r, int *error, check_func func)
+{
+	int ret;
+	MPI_Status status;
+
+	if (rank == 0)
+	{
+		ret = starpu_mpi_send(handle_s, node, tag_s, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+		ret = starpu_mpi_recv(handle_r, node, tag_r, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		assert(func);
+		func(handle_s, handle_r, error);
+	}
+	else if (rank == 1)
+	{
+		ret = starpu_mpi_recv(handle_s, node, tag_s, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+		ret = starpu_mpi_send(handle_s, node, tag_r, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+	}
+}
+
+/*
+ * Void
+ */
 void check_void(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
 {
 	FPRINTF_MPI(stderr, "Success with void value\n");
 }
 
+void exchange_void(int rank, int *error)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	if (rank == 0)
+	{
+		starpu_data_handle_t void_handle[2];
+		starpu_void_data_register(&void_handle[0]);
+		starpu_void_data_register(&void_handle[1]);
+
+		send_recv_and_check(rank, 1, void_handle[0], 0x42, void_handle[1], 0x1337, error, check_void);
+
+		starpu_data_unregister(void_handle[0]);
+		starpu_data_unregister(void_handle[1]);
+	}
+	else if (rank == 1)
+	{
+		starpu_data_handle_t void_handle;
+		starpu_void_data_register(&void_handle);
+		send_recv_and_check(rank, 0, void_handle, 0x42, NULL, 0x1337, NULL, NULL);
+		starpu_data_unregister(void_handle);
+	}
+}
+
+/*
+ * Variable
+ */
 void check_variable(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
 {
 	float *v_s, *v_r;
@@ -45,6 +99,32 @@ void check_variable(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r
 	}
 }
 
+void exchange_variable(int rank, int *error)
+{
+	if (rank == 0)
+	{
+		float v = 42.12;
+		starpu_data_handle_t variable_handle[2];
+		starpu_variable_data_register(&variable_handle[0], STARPU_MAIN_RAM, (uintptr_t)&v, sizeof(v));
+		starpu_variable_data_register(&variable_handle[1], -1, (uintptr_t)NULL, sizeof(v));
+
+		send_recv_and_check(rank, 1, variable_handle[0], 0x42, variable_handle[1], 0x1337, error, check_variable);
+
+		starpu_data_unregister(variable_handle[0]);
+		starpu_data_unregister(variable_handle[1]);
+	}
+	else if (rank == 1)
+	{
+		starpu_data_handle_t variable_handle;
+		starpu_variable_data_register(&variable_handle, -1, (uintptr_t)NULL, sizeof(float));
+		send_recv_and_check(rank, 0, variable_handle, 0x42, NULL, 0x1337, NULL, NULL);
+		starpu_data_unregister(variable_handle);
+	}
+}
+
+/*
+ * Vector
+ */
 void check_vector(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
 {
 	int i;
@@ -72,6 +152,33 @@ void check_vector(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r,
 	}
 }
 
+void exchange_vector(int rank, int *error)
+{
+	if (rank == 0)
+	{
+		int vector[4] = {1, 2, 3, 4};
+		starpu_data_handle_t vector_handle[2];
+
+		starpu_vector_data_register(&vector_handle[0], STARPU_MAIN_RAM, (uintptr_t)vector, 4, sizeof(vector[0]));
+		starpu_vector_data_register(&vector_handle[1], -1, (uintptr_t)NULL, 4, sizeof(vector[0]));
+
+		send_recv_and_check(rank, 1, vector_handle[0], 0x43, vector_handle[1], 0x2337, error, check_vector);
+
+		starpu_data_unregister(vector_handle[0]);
+		starpu_data_unregister(vector_handle[1]);
+	}
+	else if (rank == 1)
+	{
+		starpu_data_handle_t vector_handle;
+		starpu_vector_data_register(&vector_handle, -1, (uintptr_t)NULL, 4, sizeof(int));
+		send_recv_and_check(rank, 0, vector_handle, 0x43, NULL, 0x2337, NULL, NULL);
+		starpu_data_unregister(vector_handle);
+	}
+}
+
+/*
+ * Matrix
+ */
 void check_matrix(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
 {
 	STARPU_ASSERT(starpu_matrix_get_elemsize(handle_s) == starpu_matrix_get_elemsize(handle_r));
@@ -106,6 +213,48 @@ void check_matrix(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r,
 	}
 }
 
+void exchange_matrix(int rank, int *error)
+{
+	int nx=3;
+	int ny=2;
+
+	if (rank == 0)
+	{
+		char *matrix, n='a';
+		int x, y;
+		starpu_data_handle_t matrix_handle[2];
+
+		matrix = (char*)malloc(nx*ny*sizeof(char));
+		assert(matrix);
+		for(y=0 ; y<ny ; y++)
+		{
+			for(x=0 ; x<nx ; x++)
+			{
+				matrix[(y*nx)+x] = n++;
+			}
+		}
+
+		starpu_matrix_data_register(&matrix_handle[0], STARPU_MAIN_RAM, (uintptr_t)matrix, nx, nx, ny, sizeof(char));
+		starpu_matrix_data_register(&matrix_handle[1], -1, (uintptr_t)NULL, nx, nx, ny, sizeof(char));
+
+		send_recv_and_check(rank, 1, matrix_handle[0], 0x75, matrix_handle[1], 0x8555, error, check_matrix);
+
+		starpu_data_unregister(matrix_handle[0]);
+		starpu_data_unregister(matrix_handle[1]);
+		free(matrix);
+	}
+	else if (rank == 1)
+	{
+		starpu_data_handle_t matrix_handle;
+		starpu_matrix_data_register(&matrix_handle, -1, (uintptr_t)NULL, nx, nx, ny, sizeof(char));
+		send_recv_and_check(rank, 0, matrix_handle, 0x75, NULL, 0x8555, NULL, NULL);
+		starpu_data_unregister(matrix_handle);
+	}
+}
+
+/*
+ * Block
+ */
 void check_block(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
 {
 	STARPU_ASSERT(starpu_block_get_elemsize(handle_s) == starpu_block_get_elemsize(handle_r));
@@ -152,180 +301,272 @@ void check_block(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, i
 	starpu_data_release(handle_r);
 }
 
-void send_recv_and_check(int rank, int node, starpu_data_handle_t handle_s, int tag_s, starpu_data_handle_t handle_r, int tag_r, int *error, check_func func)
+void exchange_block(int rank, int *error)
 {
-	int ret;
-	MPI_Status status;
+	int nx=3;
+	int ny=2;
+	int nz=4;
 
 	if (rank == 0)
 	{
-		ret = starpu_mpi_send(handle_s, node, tag_s, MPI_COMM_WORLD);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
-		ret = starpu_mpi_recv(handle_r, node, tag_r, MPI_COMM_WORLD, &status);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+		float *block, n=1.0;
+		int x, y, z;
+		starpu_data_handle_t block_handle[2];
 
-		assert(func);
-		func(handle_s, handle_r, error);
+		block = (float*)malloc(nx*ny*nz*sizeof(float));
+		assert(block);
+		for(z=0 ; z<nz ; z++)
+		{
+			for(y=0 ; y<ny ; y++)
+			{
+				for(x=0 ; x<nx ; x++)
+				{
+					block[(z*nx*ny)+(y*nx)+x] = n++;
+				}
+			}
+		}
+
+		starpu_block_data_register(&block_handle[0], STARPU_MAIN_RAM, (uintptr_t)block, nx, nx*ny, nx, ny, nz, sizeof(float));
+		starpu_block_data_register(&block_handle[1], -1, (uintptr_t)NULL, nx, nx*ny, nx, ny, nz, sizeof(float));
+
+		send_recv_and_check(rank, 1, block_handle[0], 0x73, block_handle[1], 0x8337, error, check_block);
+
+		starpu_data_unregister(block_handle[0]);
+		starpu_data_unregister(block_handle[1]);
+		free(block);
 	}
 	else if (rank == 1)
 	{
-		ret = starpu_mpi_recv(handle_s, node, tag_s, MPI_COMM_WORLD, &status);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
-		ret = starpu_mpi_send(handle_s, node, tag_r, MPI_COMM_WORLD);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+		starpu_data_handle_t block_handle;
+		starpu_block_data_register(&block_handle, -1, (uintptr_t)NULL, nx, nx*ny, nx, ny, nz, sizeof(float));
+		send_recv_and_check(rank, 0, block_handle, 0x73, NULL, 0x8337, NULL, NULL);
+		starpu_data_unregister(block_handle);
 	}
 }
 
-void exchange_void(int rank, int *error)
+/*
+ * BCSR
+ */
+void check_bcsr(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
 {
-	STARPU_SKIP_IF_VALGRIND;
+	STARPU_ASSERT(starpu_bcsr_get_elemsize(handle_s) == starpu_bcsr_get_elemsize(handle_r));
+	STARPU_ASSERT(starpu_bcsr_get_nnz(handle_s) == starpu_bcsr_get_nnz(handle_r));
+	STARPU_ASSERT(starpu_bcsr_get_nrow(handle_s) == starpu_bcsr_get_nrow(handle_r));
+	STARPU_ASSERT(starpu_bcsr_get_firstentry(handle_s) == starpu_bcsr_get_firstentry(handle_r));
+	STARPU_ASSERT(starpu_bcsr_get_r(handle_s) == starpu_bcsr_get_r(handle_r));
+	STARPU_ASSERT(starpu_bcsr_get_c(handle_s) == starpu_bcsr_get_c(handle_r));
 
-	if (rank == 0)
-	{
-		starpu_data_handle_t void_handle[2];
-		starpu_void_data_register(&void_handle[0]);
-		starpu_void_data_register(&void_handle[1]);
+	starpu_data_acquire(handle_s, STARPU_R);
+	starpu_data_acquire(handle_r, STARPU_R);
 
-		send_recv_and_check(rank, 1, void_handle[0], 0x42, void_handle[1], 0x1337, error, check_void);
+	uint32_t *colind_s = starpu_bcsr_get_local_colind(handle_s);
+	uint32_t *colind_r = starpu_bcsr_get_local_colind(handle_r);
+	uint32_t *rowptr_s = starpu_bcsr_get_local_rowptr(handle_s);
+	uint32_t *rowptr_r = starpu_bcsr_get_local_rowptr(handle_r);
 
-		starpu_data_unregister(void_handle[0]);
-		starpu_data_unregister(void_handle[1]);
-	}
-	else if (rank == 1)
+	int *bcsr_s = (int *)starpu_bcsr_get_local_nzval(handle_s);
+	int *bcsr_r = (int *)starpu_bcsr_get_local_nzval(handle_r);
+
+	int r = starpu_bcsr_get_r(handle_s);
+	int c = starpu_bcsr_get_c(handle_s);
+	int nnz = starpu_bcsr_get_nnz(handle_s);
+	int nrows = starpu_bcsr_get_nrow(handle_s);
+
+	int x;
+
+	for(x=0 ; x<nnz ; x++)
 	{
-		starpu_data_handle_t void_handle;
-		starpu_void_data_register(&void_handle);
-		send_recv_and_check(rank, 0, void_handle, 0x42, NULL, 0x1337, NULL, NULL);
-		starpu_data_unregister(void_handle);
+		if (colind_s[x] == colind_r[x])
+		{
+			FPRINTF_MPI(stderr, "Success with colind[%d] value: %u == %u\n", x, colind_s[x], colind_r[x]);
+		}
+		else
+		{
+			*error = 1;
+			FPRINTF_MPI(stderr, "Error with colind[%d] value: %u != %u\n", x, colind_s[x], colind_r[x]);
+		}
 	}
-}
 
-void exchange_variable(int rank, int *error)
-{
-	if (rank == 0)
+	for(x=0 ; x<nrows+1 ; x++)
 	{
-		float v = 42.12;
-		starpu_data_handle_t variable_handle[2];
-		starpu_variable_data_register(&variable_handle[0], STARPU_MAIN_RAM, (uintptr_t)&v, sizeof(v));
-		starpu_variable_data_register(&variable_handle[1], -1, (uintptr_t)NULL, sizeof(v));
-
-		send_recv_and_check(rank, 1, variable_handle[0], 0x42, variable_handle[1], 0x1337, error, check_variable);
-
-		starpu_data_unregister(variable_handle[0]);
-		starpu_data_unregister(variable_handle[1]);
+		if (rowptr_s[x] == rowptr_r[x])
+		{
+			FPRINTF_MPI(stderr, "Success with rowptr[%d] value: %u == %u\n", x, rowptr_s[x], rowptr_r[x]);
+		}
+		else
+		{
+			*error = 1;
+			FPRINTF_MPI(stderr, "Error with rowptr[%d] value: %u != %u\n", x, rowptr_s[x], rowptr_r[x]);
+		}
 	}
-	else if (rank == 1)
+
+	for(x=0 ; x<r*c*nnz ; x++)
 	{
-		starpu_data_handle_t variable_handle;
-		starpu_variable_data_register(&variable_handle, -1, (uintptr_t)NULL, sizeof(float));
-		send_recv_and_check(rank, 0, variable_handle, 0x42, NULL, 0x1337, NULL, NULL);
-		starpu_data_unregister(variable_handle);
+		if (bcsr_s[x] == bcsr_r[x])
+		{
+			FPRINTF_MPI(stderr, "Success with bcsr[%d] value: %d == %d\n", x, bcsr_s[x], bcsr_r[x]);
+		}
+		else
+		{
+			*error = 1;
+			FPRINTF_MPI(stderr, "Error with bcsr[%d] value: %d != %d\n", x, bcsr_s[x], bcsr_r[x]);
+		}
 	}
+
+	starpu_data_release(handle_s);
+	starpu_data_release(handle_r);
 }
 
-void exchange_vector(int rank, int *error)
+void exchange_bcsr(int rank, int *error)
 {
+	/*
+	 * We use the following matrix:
+	 *
+	 *   +----------------+
+	 *   |  0   1   0   0 |
+	 *   |  2   3   0   0 |
+	 *   |  4   5   8   9 |
+	 *   |  6   7  10  11 |
+	 *   +----------------+
+	 *
+	 * nzval  = [0, 1, 2, 3] ++ [4, 5, 6, 7] ++ [8, 9, 10, 11]
+	 * colind = [0, 0, 1]
+	 * rowptr = [0, 1, 3]
+	 * r = c = 2
+	 */
+
+	/* Size of the blocks */
+#define BCSR_R 2
+#define BCSR_C 2
+#define BCSR_NROWS 2
+#define BCSR_NNZ_BLOCKS 3     /* out of 4 */
+#define BCSR_NZVAL_SIZE (BCSR_R*BCSR_C*BCSR_NNZ_BLOCKS)
+
 	if (rank == 0)
 	{
-		int vector[4] = {1, 2, 3, 4};
-		starpu_data_handle_t vector_handle[2];
+		starpu_data_handle_t bcsr_handle[2];
+		uint32_t colind[BCSR_NNZ_BLOCKS] = {0, 0, 1};
+		uint32_t rowptr[BCSR_NROWS+1] = {0, 1, BCSR_NNZ_BLOCKS};
+		int nzval[BCSR_NZVAL_SIZE]  =
+		{
+			0, 1, 2, 3,    /* First block  */
+			4, 5, 6, 7,    /* Second block */
+			8, 9, 10, 11   /* Third block  */
+		};
 
-		starpu_vector_data_register(&vector_handle[0], STARPU_MAIN_RAM, (uintptr_t)vector, 4, sizeof(vector[0]));
-		starpu_vector_data_register(&vector_handle[1], -1, (uintptr_t)NULL, 4, sizeof(vector[0]));
+		starpu_bcsr_data_register(&bcsr_handle[0], STARPU_MAIN_RAM, BCSR_NNZ_BLOCKS, BCSR_NROWS, (uintptr_t) nzval, colind, rowptr, 0, BCSR_R, BCSR_C, sizeof(nzval[0]));
+		starpu_bcsr_data_register(&bcsr_handle[1], -1, BCSR_NNZ_BLOCKS, BCSR_NROWS, (uintptr_t) NULL, (uint32_t *) NULL, (uint32_t *) NULL, 0, BCSR_R, BCSR_C, sizeof(nzval[0]));
 
-		send_recv_and_check(rank, 1, vector_handle[0], 0x43, vector_handle[1], 0x2337, error, check_vector);
+		send_recv_and_check(rank, 1, bcsr_handle[0], 0x73, bcsr_handle[1], 0x8337, error, check_bcsr);
 
-		starpu_data_unregister(vector_handle[0]);
-		starpu_data_unregister(vector_handle[1]);
+		starpu_data_unregister(bcsr_handle[0]);
+		starpu_data_unregister(bcsr_handle[1]);
 	}
 	else if (rank == 1)
 	{
-		starpu_data_handle_t vector_handle;
-		starpu_vector_data_register(&vector_handle, -1, (uintptr_t)NULL, 4, sizeof(int));
-		send_recv_and_check(rank, 0, vector_handle, 0x43, NULL, 0x2337, NULL, NULL);
-		starpu_data_unregister(vector_handle);
+		starpu_data_handle_t bcsr_handle;
+		starpu_bcsr_data_register(&bcsr_handle, -1, BCSR_NNZ_BLOCKS, BCSR_NROWS, (uintptr_t) NULL, (uint32_t *) NULL, (uint32_t *) NULL, 0, BCSR_R, BCSR_C, sizeof(int));
+		send_recv_and_check(rank, 0, bcsr_handle, 0x73, NULL, 0x8337, NULL, NULL);
+		starpu_data_unregister(bcsr_handle);
 	}
 }
 
-void exchange_matrix(int rank, int *error)
+/*
+ * CSR
+ */
+void check_csr(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
 {
-	int nx=3;
-	int ny=2;
+	STARPU_ASSERT(starpu_csr_get_elemsize(handle_s) == starpu_csr_get_elemsize(handle_r));
+	STARPU_ASSERT(starpu_csr_get_nnz(handle_s) == starpu_csr_get_nnz(handle_r));
+	STARPU_ASSERT(starpu_csr_get_nrow(handle_s) == starpu_csr_get_nrow(handle_r));
+	STARPU_ASSERT(starpu_csr_get_firstentry(handle_s) == starpu_csr_get_firstentry(handle_r));
 
-	if (rank == 0)
-	{
-		char *matrix, n='a';
-		int x, y;
-		starpu_data_handle_t matrix_handle[2];
+	starpu_data_acquire(handle_s, STARPU_R);
+	starpu_data_acquire(handle_r, STARPU_R);
 
-		matrix = (char*)malloc(nx*ny*sizeof(char));
-		assert(matrix);
-		for(y=0 ; y<ny ; y++)
-		{
-			for(x=0 ; x<nx ; x++)
-			{
-				matrix[(y*nx)+x] = n++;
-			}
-		}
+	uint32_t *colind_s = starpu_csr_get_local_colind(handle_s);
+	uint32_t *colind_r = starpu_csr_get_local_colind(handle_r);
+	uint32_t *rowptr_s = starpu_csr_get_local_rowptr(handle_s);
+	uint32_t *rowptr_r = starpu_csr_get_local_rowptr(handle_r);
 
-		starpu_matrix_data_register(&matrix_handle[0], STARPU_MAIN_RAM, (uintptr_t)matrix, nx, nx, ny, sizeof(char));
-		starpu_matrix_data_register(&matrix_handle[1], -1, (uintptr_t)NULL, nx, nx, ny, sizeof(char));
+	int *csr_s = (int *)starpu_csr_get_local_nzval(handle_s);
+	int *csr_r = (int *)starpu_csr_get_local_nzval(handle_r);
 
-		send_recv_and_check(rank, 1, matrix_handle[0], 0x75, matrix_handle[1], 0x8555, error, check_matrix);
+	int nnz = starpu_csr_get_nnz(handle_s);
+	int nrows = starpu_csr_get_nrow(handle_s);
 
-		starpu_data_unregister(matrix_handle[0]);
-		starpu_data_unregister(matrix_handle[1]);
-		free(matrix);
+	int x;
+
+	for(x=0 ; x<nnz ; x++)
+	{
+		if (colind_s[x] == colind_r[x])
+		{
+			FPRINTF_MPI(stderr, "Success with colind[%d] value: %u == %u\n", x, colind_s[x], colind_r[x]);
+		}
+		else
+		{
+			*error = 1;
+			FPRINTF_MPI(stderr, "Error with colind[%d] value: %u != %u\n", x, colind_s[x], colind_r[x]);
+		}
 	}
-	else if (rank == 1)
+
+	for(x=0 ; x<nrows+1 ; x++)
 	{
-		starpu_data_handle_t matrix_handle;
-		starpu_matrix_data_register(&matrix_handle, -1, (uintptr_t)NULL, nx, nx, ny, sizeof(char));
-		send_recv_and_check(rank, 0, matrix_handle, 0x75, NULL, 0x8555, NULL, NULL);
-		starpu_data_unregister(matrix_handle);
+		if (rowptr_s[x] == rowptr_r[x])
+		{
+			FPRINTF_MPI(stderr, "Success with rowptr[%d] value: %u == %u\n", x, rowptr_s[x], rowptr_r[x]);
+		}
+		else
+		{
+			*error = 1;
+			FPRINTF_MPI(stderr, "Error with rowptr[%d] value: %u != %u\n", x, rowptr_s[x], rowptr_r[x]);
+		}
 	}
+
+	for(x=0 ; x<nnz ; x++)
+	{
+		if (csr_s[x] == csr_r[x])
+		{
+			FPRINTF_MPI(stderr, "Success with csr[%d] value: %d == %d\n", x, csr_s[x], csr_r[x]);
+		}
+		else
+		{
+			*error = 1;
+			FPRINTF_MPI(stderr, "Error with csr[%d] value: %d != %d\n", x, csr_s[x], csr_r[x]);
+		}
+	}
+
+	starpu_data_release(handle_s);
+	starpu_data_release(handle_r);
 }
 
-void exchange_block(int rank, int *error)
+void exchange_csr(int rank, int *error)
 {
-	int nx=3;
-	int ny=2;
-	int nz=4;
+	// the values are completely wrong, we just want to test that the communication is done correctly
+#define CSR_NROWS 2
+#define CSR_NNZ   5
 
 	if (rank == 0)
 	{
-		float *block, n=1.0;
-		int x, y, z;
-		starpu_data_handle_t block_handle[2];
+		starpu_data_handle_t csr_handle[2];
+		uint32_t colind[CSR_NNZ] = {0, 1, 2, 3, 4};
+		uint32_t rowptr[CSR_NROWS+1] = {0, 1, CSR_NNZ};
+		int nzval[CSR_NNZ] = { 11, 22, 33, 44, 55 };
 
-		block = (float*)malloc(nx*ny*nz*sizeof(float));
-		assert(block);
-		for(z=0 ; z<nz ; z++)
-		{
-			for(y=0 ; y<ny ; y++)
-			{
-				for(x=0 ; x<nx ; x++)
-				{
-					block[(z*nx*ny)+(y*nx)+x] = n++;
-				}
-			}
-		}
+		starpu_csr_data_register(&csr_handle[0], STARPU_MAIN_RAM, CSR_NNZ, CSR_NROWS, (uintptr_t) nzval, colind, rowptr, 0, sizeof(nzval[0]));
+		starpu_csr_data_register(&csr_handle[1], -1, CSR_NNZ, CSR_NROWS, (uintptr_t) NULL, (uint32_t *) NULL, (uint32_t *) NULL, 0, sizeof(nzval[0]));
 
-		starpu_block_data_register(&block_handle[0], STARPU_MAIN_RAM, (uintptr_t)block, nx, nx*ny, nx, ny, nz, sizeof(float));
-		starpu_block_data_register(&block_handle[1], -1, (uintptr_t)NULL, nx, nx*ny, nx, ny, nz, sizeof(float));
-
-		send_recv_and_check(rank, 1, block_handle[0], 0x73, block_handle[1], 0x8337, error, check_block);
+		send_recv_and_check(rank, 1, csr_handle[0], 0x84, csr_handle[1], 0x8765, error, check_csr);
 
-		starpu_data_unregister(block_handle[0]);
-		starpu_data_unregister(block_handle[1]);
-		free(block);
+		starpu_data_unregister(csr_handle[0]);
+		starpu_data_unregister(csr_handle[1]);
 	}
 	else if (rank == 1)
 	{
-		starpu_data_handle_t block_handle;
-		starpu_block_data_register(&block_handle, -1, (uintptr_t)NULL, nx, nx*ny, nx, ny, nz, sizeof(float));
-		send_recv_and_check(rank, 0, block_handle, 0x73, NULL, 0x8337, NULL, NULL);
-		starpu_data_unregister(block_handle);
+		starpu_data_handle_t csr_handle;
+		starpu_csr_data_register(&csr_handle, -1, CSR_NNZ, CSR_NROWS, (uintptr_t) NULL, (uint32_t *) NULL, (uint32_t *) NULL, 0, sizeof(int));
+		send_recv_and_check(rank, 0, csr_handle, 0x84, NULL, 0x8765, NULL, NULL);
+		starpu_data_unregister(csr_handle);
 	}
 }
 
@@ -362,6 +603,8 @@ int main(int argc, char **argv)
 	exchange_vector(rank, &error);
 	exchange_matrix(rank, &error);
 	exchange_block(rank, &error);
+	exchange_bcsr(rank, &error);
+	exchange_csr(rank, &error);
 
 	starpu_mpi_shutdown();
 	starpu_shutdown();

+ 3 - 1
mpi/tests/insert_task_compute.c

@@ -53,6 +53,7 @@ int test(int rank, int node, int *before, int *after, int task_insert, int data_
 	int ok, ret, i, x[2];
 	starpu_data_handle_t data_handles[2];
 	struct starpu_data_descr descrs[2];
+	int barrier_ret;
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
@@ -211,7 +212,8 @@ enodev:
 #endif
 
 nodata:
-	MPI_Barrier(MPI_COMM_WORLD);
+	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 

+ 3 - 3
mpi/tests/insert_task_owner.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -86,8 +86,8 @@ int main(int argc, char **argv)
 	int ret, rank, size, err, node;
 	long x0=32;
 	int x1=23;
-	starpu_data_handle_t data_handlesx0;
-	starpu_data_handle_t data_handlesx1;
+	starpu_data_handle_t data_handlesx0 = NULL;
+	starpu_data_handle_t data_handlesx1 = NULL;
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

+ 2 - 2
mpi/tests/insert_task_sent_cache.c

@@ -138,7 +138,7 @@ void test_cache(int rank, char *enabled, size_t *comm_amount)
 
 int main(int argc, char **argv)
 {
-	int dst, rank, size;
+	int rank, size;
 	int result=0;
 	size_t *comm_amount_with_cache;
 	size_t *comm_amount_without_cache;
@@ -157,7 +157,7 @@ int main(int argc, char **argv)
 
 	if (rank == 0 || rank == 1)
 	{
-		dst = (rank == 0) ? 1 : 0;
+		int dst = (rank == 0) ? 1 : 0;
 		result = (comm_amount_with_cache[dst] == comm_amount_without_cache[dst] * 5);
 		FPRINTF_MPI(stderr, "Communication cache mechanism is %sworking\n", result?"":"NOT ");
 	}

+ 3 - 1
mpi/tests/mpi_reduction.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2013, 2015  Université de Bordeaux
- * Copyright (C) 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -97,6 +97,8 @@ int main(int argc, char **argv)
 
 	int nb_elements, step, loops;
 
+	STARPU_SKIP_IF_VALGRIND_RETURN_SKIP;
+
 	/* Not supported yet */
 	if (starpu_get_env_number_default("STARPU_GLOBAL_ARBITER", 0) > 0)
 		return STARPU_TEST_SKIPPED;

+ 1 - 1
mpi/tests/mpi_reduction_kernels.c

@@ -66,7 +66,7 @@ void dot_cpu_func(void *descr[], void *cl_arg)
  */
 void display_cpu_func(void *descr[], void *cl_arg)
 {
-	long int *local_x = (long int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	long int *local_x = (long int *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	FPRINTF_MPI(stderr, "Local=%ld\n", *local_x);
 }

+ 9 - 9
sc_hypervisor/examples/app_driven_test/app_driven_test.c

@@ -30,7 +30,7 @@ struct params
 };
 
 unsigned val[2];
-pthread_mutex_t mut[2];
+starpu_pthread_mutex_t mut[2];
 
 /* Every implementation of a codelet must have this prototype, the first                                                                                                                                             * argument (buffers) describes the buffers/streams that are managed by the
  * DSM; the second arguments references read-only data that is passed as an
@@ -44,9 +44,9 @@ void cpu_func(__attribute__((unused))void *buffers[], void *cl_arg)
 	int i;
 	for(i = 0; i < NINCR; i++)
 	{
-		pthread_mutex_lock(&mut[params->sched_ctx - 1]);
+		STARPU_PTHREAD_MUTEX_LOCK(&mut[params->sched_ctx - 1]);
 		val[params->sched_ctx - 1]++;
-		pthread_mutex_unlock(&mut[params->sched_ctx - 1]);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&mut[params->sched_ctx - 1]);
 	}
 	if(params->task_tag != 0)
 		FPRINTF(stdout, "Task with tag %d executed in ctx = %u %u counter_tests\n", params->task_tag, params->sched_ctx, val[params->sched_ctx - 1]);
@@ -153,15 +153,15 @@ int main()
 
 	val[0] = 0;
 	val[1] = 0;
-	pthread_mutex_init(&mut[0], NULL);
-	pthread_mutex_init(&mut[1], NULL);
+	STARPU_PTHREAD_MUTEX_INIT(&mut[0], NULL);
+	STARPU_PTHREAD_MUTEX_INIT(&mut[1], NULL);
 
 	/* we create two threads to simulate simultaneous submission of tasks */
-	starpu_pthread_create(&tid[0], NULL, submit_tasks_thread, (void*)&sched_ctx1);
-	starpu_pthread_create(&tid[1], NULL, submit_tasks_thread, (void*)&sched_ctx2);
+	STARPU_PTHREAD_CREATE(&tid[0], NULL, submit_tasks_thread, (void*)&sched_ctx1);
+	STARPU_PTHREAD_CREATE(&tid[1], NULL, submit_tasks_thread, (void*)&sched_ctx2);
 
-	starpu_pthread_join(tid[0], NULL);
-	starpu_pthread_join(tid[1], NULL);
+	STARPU_PTHREAD_JOIN(tid[0], NULL);
+	STARPU_PTHREAD_JOIN(tid[1], NULL);
 
 	/* free starpu and hypervisor data */
 	starpu_shutdown();

+ 12 - 12
sc_hypervisor/examples/hierarchical_ctxs/resize_hierarchical_ctxs.c

@@ -25,7 +25,7 @@
 
 
 unsigned val[3];
-pthread_mutex_t mut[3];
+starpu_pthread_mutex_t mut[3];
 
 /* Every implementation of a codelet must have this prototype, the first                                                                                                                                             * argument (buffers) describes the buffers/streams that are managed by the
  * DSM; the second arguments references read-only data that is passed as an
@@ -39,9 +39,9 @@ void cpu_func(__attribute__((unused))void *buffers[], void *cl_arg)
 	int i;
 	for(i = 0; i < NINCR; i++)
 	{
-		pthread_mutex_lock(&mut[sched_ctx - 1]);
+		STARPU_PTHREAD_MUTEX_LOCK(&mut[sched_ctx - 1]);
 		val[sched_ctx - 1]++;
-		pthread_mutex_unlock(&mut[sched_ctx - 1]);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&mut[sched_ctx - 1]);
 	}
 }
 
@@ -139,18 +139,18 @@ int main()
 	val[0] = 0;
 	val[1] = 0;
 	val[2] = 0;
-	pthread_mutex_init(&mut[0], NULL);
-	pthread_mutex_init(&mut[1], NULL);
-	pthread_mutex_init(&mut[2], NULL);
+	STARPU_PTHREAD_MUTEX_INIT(&mut[0], NULL);
+	STARPU_PTHREAD_MUTEX_INIT(&mut[1], NULL);
+	STARPU_PTHREAD_MUTEX_INIT(&mut[2], NULL);
 
 	/* we create two threads to simulate simultaneous submission of tasks */
-	starpu_pthread_create(&tid[0], NULL, submit_tasks_thread, (void*)&sched_ctx1);
-	starpu_pthread_create(&tid[1], NULL, submit_tasks_thread, (void*)&sched_ctx2);
-	starpu_pthread_create(&tid[2], NULL, submit_tasks_thread, (void*)&sched_ctx3);
+	STARPU_PTHREAD_CREATE(&tid[0], NULL, submit_tasks_thread, (void*)&sched_ctx1);
+	STARPU_PTHREAD_CREATE(&tid[1], NULL, submit_tasks_thread, (void*)&sched_ctx2);
+	STARPU_PTHREAD_CREATE(&tid[2], NULL, submit_tasks_thread, (void*)&sched_ctx3);
 
-	starpu_pthread_join(tid[0], NULL);
-	starpu_pthread_join(tid[1], NULL);
-	starpu_pthread_join(tid[2], NULL);
+	STARPU_PTHREAD_JOIN(tid[0], NULL);
+	STARPU_PTHREAD_JOIN(tid[1], NULL);
+	STARPU_PTHREAD_JOIN(tid[2], NULL);
 
 	/* free starpu and hypervisor data */
 	starpu_shutdown();

+ 9 - 9
sc_hypervisor/examples/lp_test/lp_resize_test.c

@@ -25,7 +25,7 @@
 
 
 unsigned val[2];
-pthread_mutex_t mut[2];
+starpu_pthread_mutex_t mut[2];
 
 /* Every implementation of a codelet must have this prototype, the first                                                                                                                                             * argument (buffers) describes the buffers/streams that are managed by the
  * DSM; the second arguments references read-only data that is passed as an
@@ -39,9 +39,9 @@ void cpu_func(__attribute__((unused))void *buffers[], void *cl_arg)
 	int i;
 	for(i = 0; i < NINCR; i++)
 	{
-		pthread_mutex_lock(&mut[sched_ctx - 1]);
+		STARPU_PTHREAD_MUTEX_LOCK(&mut[sched_ctx - 1]);
 		val[sched_ctx - 1]++;
-		pthread_mutex_unlock(&mut[sched_ctx - 1]);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&mut[sched_ctx - 1]);
 	}
 }
 
@@ -117,15 +117,15 @@ int main()
 
 	val[0] = 0;
 	val[1] = 0;
-	pthread_mutex_init(&mut[0], NULL);
-	pthread_mutex_init(&mut[1], NULL);
+	STARPU_PTHREAD_MUTEX_INIT(&mut[0], NULL);
+	STARPU_PTHREAD_MUTEX_INIT(&mut[1], NULL);
 
 	/* we create two threads to simulate simultaneous submission of tasks */
-	starpu_pthread_create(&tid[0], NULL, submit_tasks_thread, (void*)&sched_ctx1);
-	starpu_pthread_create(&tid[1], NULL, submit_tasks_thread, (void*)&sched_ctx2);
+	STARPU_PTHREAD_CREATE(&tid[0], NULL, submit_tasks_thread, (void*)&sched_ctx1);
+	STARPU_PTHREAD_CREATE(&tid[1], NULL, submit_tasks_thread, (void*)&sched_ctx2);
 
-	starpu_pthread_join(tid[0], NULL);
-	starpu_pthread_join(tid[1], NULL);
+	STARPU_PTHREAD_JOIN(tid[0], NULL);
+	STARPU_PTHREAD_JOIN(tid[1], NULL);
 
 	/* free starpu and hypervisor data */
 	starpu_shutdown();

+ 11 - 11
sc_hypervisor/examples/lp_test/lp_test.c

@@ -25,7 +25,7 @@
 
 
 unsigned val[2];
-pthread_mutex_t mut[2];
+starpu_pthread_mutex_t mut[2];
 
 /* Every implementation of a codelet must have this prototype, the first                                                                                                                                             * argument (buffers) describes the buffers/streams that are managed by the
  * DSM; the second arguments references read-only data that is passed as an
@@ -39,9 +39,9 @@ void cpu_func(__attribute__((unused))void *buffers[], void *cl_arg)
 	int i;
 	for(i = 0; i < NINCR; i++)
 	{
-		pthread_mutex_lock(&mut[sched_ctx - 1]);
+		STARPU_PTHREAD_MUTEX_LOCK(&mut[sched_ctx - 1]);
 		val[sched_ctx - 1]++;
-		pthread_mutex_unlock(&mut[sched_ctx - 1]);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&mut[sched_ctx - 1]);
 	}
 }
 
@@ -91,12 +91,12 @@ int main()
 	struct sc_hypervisor_policy policy;
 	policy.custom = 0;
 	/* indicate which strategy to use
-	   in this particular case we use app_driven which allows the user to resize 
+	   in this particular case we use app_driven which allows the user to resize
 	   the ctxs dynamically at particular moments of the execution of the application */
 	policy.name = "feft_lp";
 	void *perf_counters = sc_hypervisor_init(&policy);
 
-	/* let starpu know which performance counters should use 
+	/* let starpu know which performance counters should use
 	   to inform the hypervisor how the application and the resources are executing */
 	starpu_sched_ctx_set_perf_counters(sched_ctx1, perf_counters);
 	starpu_sched_ctx_set_perf_counters(sched_ctx2, perf_counters);
@@ -116,15 +116,15 @@ int main()
 
 	val[0] = 0;
 	val[1] = 0;
-	pthread_mutex_init(&mut[0], NULL);
-	pthread_mutex_init(&mut[1], NULL);
+	STARPU_PTHREAD_MUTEX_INIT(&mut[0], NULL);
+	STARPU_PTHREAD_MUTEX_INIT(&mut[1], NULL);
 
 	/* we create two threads to simulate simultaneous submission of tasks */
-	starpu_pthread_create(&tid[0], NULL, submit_tasks_thread, (void*)&sched_ctx1);
-	starpu_pthread_create(&tid[1], NULL, submit_tasks_thread, (void*)&sched_ctx2);
+	STARPU_PTHREAD_CREATE(&tid[0], NULL, submit_tasks_thread, (void*)&sched_ctx1);
+	STARPU_PTHREAD_CREATE(&tid[1], NULL, submit_tasks_thread, (void*)&sched_ctx2);
 
-	starpu_pthread_join(tid[0], NULL);
-	starpu_pthread_join(tid[1], NULL);
+	STARPU_PTHREAD_JOIN(tid[0], NULL);
+	STARPU_PTHREAD_JOIN(tid[1], NULL);
 
 	/* free starpu and hypervisor data */
 	starpu_shutdown();

+ 14 - 14
sc_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c

@@ -79,12 +79,12 @@ void init()
 
 	p1.id = 0;
 	p2.id = 1;
-	starpu_pthread_key_create(&key, NULL);
+	STARPU_PTHREAD_KEY_CREATE(&key, NULL);
 }
 
 void update_sched_ctx_timing_results(double flops, double avg_timing)
 {
-	unsigned *id = starpu_pthread_getspecific(key);
+	unsigned *id = STARPU_PTHREAD_GETSPECIFIC(key);
 	rv[*id].flops += flops;
 	rv[*id].avg_timing += avg_timing;
 }
@@ -94,7 +94,7 @@ void* start_bench(void *val)
 	params *p = (params*)val;
 	int i;
 
-	starpu_pthread_setspecific(key, &p->id);
+	STARPU_PTHREAD_SETSPECIFIC(key, &p->id);
 
 	if(p->ctx != 0)
 		starpu_sched_ctx_set_context(&p->ctx);
@@ -104,14 +104,14 @@ void* start_bench(void *val)
 
 	/* if(p->ctx != 0) */
 	/* { */
-	/* 	starpu_pthread_mutex_lock(&mut); */
+	/* 	STARPU_PTHREAD_MUTEX_LOCK(&mut); */
 	/* 	if(first){ */
 	/* 		sc_hypervisor_unregiser_ctx(p->ctx); */
 	/* 		starpu_sched_ctx_delete(p->ctx, p->the_other_ctx); */
 	/* 	} */
 
 	/* 	first = 0; */
-	/* 	starpu_pthread_mutex_unlock(&mut); */
+	/* 	STARPU_PTHREAD_MUTEX_UNLOCK(&mut); */
 	/* } */
 	sc_hypervisor_stop_resize(p->the_other_ctx);
 	rv[p->id].flops /= NSAMPLES;
@@ -154,22 +154,22 @@ void start_2benchs(void (*bench)(float*, unsigned, unsigned))
 	}
 
 	starpu_pthread_t tid[2];
-	starpu_pthread_mutex_init(&mut, NULL);
+	STARPU_PTHREAD_MUTEX_INIT(&mut, NULL);
 
 	struct timeval start;
 	struct timeval end;
 
 	gettimeofday(&start, NULL);
 
-	starpu_pthread_create(&tid[0], NULL, (void*)start_bench, (void*)&p1);
-	starpu_pthread_create(&tid[1], NULL, (void*)start_bench, (void*)&p2);
+	STARPU_PTHREAD_CREATE(&tid[0], NULL, (void*)start_bench, (void*)&p1);
+	STARPU_PTHREAD_CREATE(&tid[1], NULL, (void*)start_bench, (void*)&p2);
 
-	starpu_pthread_join(tid[0], NULL);
-	starpu_pthread_join(tid[1], NULL);
+	STARPU_PTHREAD_JOIN(tid[0], NULL);
+	STARPU_PTHREAD_JOIN(tid[1], NULL);
 
 	gettimeofday(&end, NULL);
 
-	starpu_pthread_mutex_destroy(&mut);
+	STARPU_PTHREAD_MUTEX_DESTROY(&mut);
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	timing /= 1000000;
@@ -200,7 +200,7 @@ void start_1stbench(void (*bench)(float*, unsigned, unsigned))
 
 	gettimeofday(&end, NULL);
 
-	starpu_pthread_mutex_destroy(&mut);
+	STARPU_PTHREAD_MUTEX_DESTROY(&mut);
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	timing /= 1000000;
@@ -229,7 +229,7 @@ void start_2ndbench(void (*bench)(float*, unsigned, unsigned))
 
 	gettimeofday(&end, NULL);
 
-	starpu_pthread_mutex_destroy(&mut);
+	STARPU_PTHREAD_MUTEX_DESTROY(&mut);
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	timing /= 1000000;
@@ -332,7 +332,7 @@ void construct_contexts(void (*bench)(float*, unsigned, unsigned))
 
 void set_hypervisor_conf(int event, int task_tag)
 {
-/* 	unsigned *id = starpu_pthread_getspecific(key); */
+/* 	unsigned *id = STARPU_PTHREAD_GETSPECIFIC(key); */
 /* 	if(*id == 0) */
 /* 	{ */
 /* 		if(event == END_BENCH) */

+ 2 - 2
sc_hypervisor/src/uthash.h

@@ -269,7 +269,7 @@ do {
             }                                                                    \
             _count += _bkt_count;                                                \
             if ((head)->hh.tbl->buckets[_bkt_i].count !=  _bkt_count) {          \
-               HASH_OOPS("invalid bucket count %d, actual %d\n",                 \
+               HASH_OOPS("invalid bucket count %u, actual %u\n",                 \
                 (head)->hh.tbl->buckets[_bkt_i].count, _bkt_count);              \
             }                                                                    \
         }                                                                        \
@@ -292,7 +292,7 @@ do {
                                   (head)->hh.tbl->hho) : NULL );                 \
         }                                                                        \
         if (_count != (head)->hh.tbl->num_items) {                               \
-            HASH_OOPS("invalid app item count %d, actual %d\n",                  \
+            HASH_OOPS("invalid app item count %u, actual %u\n",                  \
                 (head)->hh.tbl->num_items, _count );                             \
         }                                                                        \
     }                                                                            \

+ 2 - 2
socl/src/cl_createcommandqueue.c

@@ -30,7 +30,7 @@ static void release_callback_command_queue(void * e) {
   gc_entity_unstore(&cq->context);
 
   /* Destruct object */
-  starpu_pthread_mutex_destroy(&cq->mutex);
+  STARPU_PTHREAD_MUTEX_DESTROY(&cq->mutex);
 }
 
 
@@ -73,7 +73,7 @@ soclCreateCommandQueue(cl_context                   context,
 
    cq->commands = NULL;
    cq->barrier = NULL;
-   starpu_pthread_mutex_init(&cq->mutex, NULL);
+   STARPU_PTHREAD_MUTEX_INIT(&cq->mutex, NULL);
 
    if (errcode_ret != NULL)
       *errcode_ret = CL_SUCCESS;

+ 1 - 1
socl/src/cl_createkernel.c

@@ -144,7 +144,7 @@ soclCreateKernel(cl_program    program,
    k->split_space = 0;
    k->split_data = NULL;
    k->split_perfs = NULL;
-   starpu_pthread_mutex_init(&k->split_lock, NULL);
+   STARPU_PTHREAD_MUTEX_INIT(&k->split_lock, NULL);
 
    #ifdef DEBUG
    static int id = 0;

+ 3 - 3
socl/src/cl_enqueuendrangekernel.c

@@ -160,7 +160,7 @@ soclEnqueueNDRangeKernel(cl_command_queue cq,
 		cl_event *       event) CL_API_SUFFIX__VERSION_1_1
 {
 
-   if (kernel->split_func != NULL && !starpu_pthread_mutex_trylock(&kernel->split_lock)) {
+   if (kernel->split_func != NULL && !STARPU_PTHREAD_MUTEX_TRYLOCK(&kernel->split_lock)) {
 
       cl_event beforeEvent, afterEvent, totalEvent;
 
@@ -200,7 +200,7 @@ soclEnqueueNDRangeKernel(cl_command_queue cq,
 
          kernel->split_perfs[iter] = end-start;
 
-         starpu_pthread_mutex_unlock(&kernel->split_lock);
+         STARPU_PTHREAD_MUTEX_UNLOCK(&kernel->split_lock);
 
          event_complete(totalEvent);
 
@@ -211,7 +211,7 @@ soclEnqueueNDRangeKernel(cl_command_queue cq,
 
          RETURN_EVENT(totalEvent,event);
       } else {
-         starpu_pthread_mutex_unlock(&kernel->split_lock);
+         STARPU_PTHREAD_MUTEX_UNLOCK(&kernel->split_lock);
          soclReleaseEvent(totalEvent);
       }
 

+ 2 - 2
socl/src/command.c

@@ -45,7 +45,7 @@ static void command_release_callback(void *a) {
   cl_command_queue cq = cmd->event->cq;
   if (cq != NULL) {
     /* Lock command queue */
-    starpu_pthread_mutex_lock(&cq->mutex);
+    STARPU_PTHREAD_MUTEX_LOCK(&cq->mutex);
 
     /* Remove barrier if applicable */
     if (cq->barrier == cmd)
@@ -55,7 +55,7 @@ static void command_release_callback(void *a) {
     cq->commands = command_list_remove(cq->commands, cmd);
 
     /* Unlock command queue */
-    starpu_pthread_mutex_unlock(&cq->mutex);
+    STARPU_PTHREAD_MUTEX_UNLOCK(&cq->mutex);
   }
 
   // Events may survive to commands that created them

+ 3 - 3
socl/src/command_queue.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2012 University of Bordeaux
- * Copyright (C) 2012 CNRS
+ * Copyright (C) 2012, 2017 CNRS
  * Copyright (C) 2012 Vincent Danjean <Vincent.Danjean@ens-lyon.org>
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -40,7 +40,7 @@ void command_queue_enqueue_ex(cl_command_queue cq, cl_command cmd, cl_uint num_e
 	gc_entity_store(&cmd->event->cq, cq);
 
 	/* Lock command queue */
-	starpu_pthread_mutex_lock(&cq->mutex);
+	STARPU_PTHREAD_MUTEX_LOCK(&cq->mutex);
 
 	/*** Number of dependencies ***/
 	int ndeps = num_events;
@@ -105,7 +105,7 @@ void command_queue_enqueue_ex(cl_command_queue cq, cl_command cmd, cl_uint num_e
 	command_submit_ex(cmd);
 
 	/* Unlock command queue */
-	starpu_pthread_mutex_unlock(&cq->mutex);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&cq->mutex);
 
 	gc_entity_release(cmd);
 }

+ 7 - 7
socl/src/gc.c

@@ -37,10 +37,10 @@ static starpu_pthread_cond_t  gc_cond = STARPU_PTHREAD_COND_INITIALIZER;
 /* Set to 1 to stop release thread execution */
 static volatile int gc_stop_required = 0;
 
-#define GC_LOCK starpu_pthread_mutex_lock(&gc_mutex)
-#define GC_UNLOCK { starpu_pthread_cond_signal(&gc_cond); \
-                    starpu_pthread_mutex_unlock(&gc_mutex);}
-#define GC_UNLOCK_NO_SIGNAL starpu_pthread_mutex_unlock(&gc_mutex)
+#define GC_LOCK STARPU_PTHREAD_MUTEX_LOCK(&gc_mutex)
+#define GC_UNLOCK { STARPU_PTHREAD_COND_SIGNAL(&gc_cond); \
+                    STARPU_PTHREAD_MUTEX_UNLOCK(&gc_mutex);}
+#define GC_UNLOCK_NO_SIGNAL STARPU_PTHREAD_MUTEX_UNLOCK(&gc_mutex)
 
 /* Thread routine */
 static void * gc_thread_routine(void *UNUSED(arg)) {
@@ -81,7 +81,7 @@ static void * gc_thread_routine(void *UNUSED(arg)) {
     }
 
     /* Otherwise we sleep */
-    starpu_pthread_cond_wait(&gc_cond, &gc_mutex);
+    STARPU_PTHREAD_COND_WAIT(&gc_cond, &gc_mutex);
 
   } while (1);
 
@@ -92,7 +92,7 @@ static starpu_pthread_t gc_thread;
 
 /* Start garbage collection */
 void gc_start(void) {
-  starpu_pthread_create(&gc_thread, NULL, gc_thread_routine, NULL);
+  STARPU_PTHREAD_CREATE(&gc_thread, NULL, gc_thread_routine, NULL);
 }
 
 /* Stop garbage collection */
@@ -103,7 +103,7 @@ void gc_stop(void) {
 
   GC_UNLOCK;
 
-  starpu_pthread_join(gc_thread, NULL);
+  STARPU_PTHREAD_JOIN(gc_thread, NULL);
 }
 
 int gc_entity_release_ex(entity e, const char * DEBUG_PARAM(caller)) {

+ 4 - 0
src/Makefile.am

@@ -305,6 +305,10 @@ if STARPU_HAVE_LEVELDB
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += core/disk_ops/disk_leveldb.cpp
 endif
 
+if STARPU_HAVE_HDF5
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += core/disk_ops/disk_hdf5.c
+endif
+
 if STARPU_USE_CPU
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cpu/driver_cpu.c
 endif

+ 2 - 2
src/common/barrier.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010,2011,2013,2014  CNRS
+ * Copyright (C) 2010,2011,2013,2014,2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -54,7 +54,7 @@ int _starpu_barrier_destroy(struct _starpu_barrier *barrier)
 	{
 		ret = _starpu_barrier_test(barrier);
 	}
-	_STARPU_DEBUG("reached_exit %d\n", barrier->reached_exit);
+	_STARPU_DEBUG("reached_exit %u\n", barrier->reached_exit);
 
 	STARPU_PTHREAD_MUTEX_DESTROY(&barrier->mutex);
 	STARPU_PTHREAD_MUTEX_DESTROY(&barrier->mutex_exit);

+ 16 - 3
src/common/fxt.h

@@ -21,7 +21,7 @@
 
 
 #ifndef _GNU_SOURCE
-#define _GNU_SOURCE  /* ou _BSD_SOURCE ou _SVID_SOURCE */
+#define _GNU_SOURCE  1 /* ou _BSD_SOURCE ou _SVID_SOURCE */
 #endif
 
 #include <string.h>
@@ -80,6 +80,7 @@
 
 #define _STARPU_FUT_DATA_NAME		0x511d
 #define _STARPU_FUT_DATA_COORDINATES	0x511e
+#define _STARPU_FUT_HANDLE_DATA_UNREGISTER	0x511f
 
 #define _STARPU_FUT_USER_DEFINED_START	0x5120
 #define _STARPU_FUT_USER_DEFINED_END	0x5121
@@ -1046,8 +1047,19 @@ do {										\
 #define _STARPU_TRACE_SCHED_COMPONENT_PULL(from, to, task)		\
 	FUT_DO_PROBE5(_STARPU_FUT_SCHED_COMPONENT_PULL, _starpu_gettid(), from, to, task, (task)->priority);
 
-#define _STARPU_TRACE_HANDLE_DATA_REGISTER(handle)		\
-	FUT_DO_PROBE1(_STARPU_FUT_HANDLE_DATA_REGISTER, handle)
+#define _STARPU_TRACE_HANDLE_DATA_REGISTER(handle)	do {	\
+	const size_t __data_size = handle->ops->get_size(handle); \
+	char __buf[(FXT_MAX_PARAMS-2)*sizeof(long)]; \
+	void *__interface = handle->per_node[0].data_interface; \
+	if (handle->ops->describe) \
+		handle->ops->describe(__interface, __buf, sizeof(__buf)); \
+	else \
+		__buf[0] = 0; \
+	FUT_DO_PROBE3STR(_STARPU_FUT_HANDLE_DATA_REGISTER, handle, __data_size, handle->home_node, __buf); \
+} while (0)
+
+#define _STARPU_TRACE_HANDLE_DATA_UNREGISTER(handle)	\
+	FUT_DO_PROBE1(_STARPU_FUT_HANDLE_DATA_UNREGISTER, handle)
 
 #if 0
 #define _STARPU_TRACE_DATA_INVALIDATE(handle, node)		\
@@ -1169,6 +1181,7 @@ do {										\
 #define _STARPU_TRACE_SCHED_COMPONENT_PUSH(from, to, task)	do {(void)(from); (void)(to); (void)(task);} while (0)
 #define _STARPU_TRACE_SCHED_COMPONENT_PULL(from, to, task)	do {(void)(from); (void)(to); (void)(task);} while (0)
 #define _STARPU_TRACE_HANDLE_DATA_REGISTER(handle)	do {(void)(handle);} while (0)
+#define _STARPU_TRACE_HANDLE_DATA_UNREGISTER(handle)	do {(void)(handle);} while (0)
 #define _STARPU_TRACE_DATA_INVALIDATE(handle, node)	do {(void)(handle); (void)(node);} while (0)
 #define _STARPU_TRACE_WORKER_START_FETCH_INPUT(job, id)	do {(void)(job); (void)(id);} while(0)
 #define _STARPU_TRACE_WORKER_END_FETCH_INPUT(job, id)	do {(void)(job); (void)(id);} while(0)

+ 8 - 6
src/common/rbtree.c

@@ -145,8 +145,7 @@ static void starpu_rbtree_rotate(struct starpu_rbtree *tree, struct starpu_rbtre
 void starpu_rbtree_insert_rebalance(struct starpu_rbtree *tree, struct starpu_rbtree_node *parent,
                              int index, struct starpu_rbtree_node *node)
 {
-    struct starpu_rbtree_node *grand_parent, *uncle, *tmp;
-    int left, right;
+    struct starpu_rbtree_node *grand_parent, *tmp;
 
     assert(starpu_rbtree_check_alignment(parent));
     assert(starpu_rbtree_check_alignment(node));
@@ -162,7 +161,10 @@ void starpu_rbtree_insert_rebalance(struct starpu_rbtree *tree, struct starpu_rb
 
     for (;;)
     {
-        if (parent == NULL)
+	struct starpu_rbtree_node *uncle;
+	int left, right;
+
+	if (parent == NULL)
 	{
             starpu_rbtree_set_black(node);
             break;
@@ -414,11 +416,11 @@ struct starpu_rbtree_node * starpu_rbtree_walk(struct starpu_rbtree_node *node,
     }
     else
     {
-        struct starpu_rbtree_node *parent;
-        int index;
-
         for (;;)
 	{
+            struct starpu_rbtree_node *parent;
+	    int index;
+
             parent = starpu_rbtree_parent(node);
 
             if (parent == NULL)

+ 3 - 3
src/common/uthash.h

@@ -282,12 +282,12 @@ do {
             }                                                                    \
             _count += _bkt_count;                                                \
             if ((head)->hh.tbl->buckets[_bkt_i].count !=  _bkt_count) {          \
-               HASH_OOPS("invalid bucket count %d, actual %d\n",                 \
+               HASH_OOPS("invalid bucket count %u, actual %u\n",                 \
                 (head)->hh.tbl->buckets[_bkt_i].count, _bkt_count);              \
             }                                                                    \
         }                                                                        \
         if (_count != (head)->hh.tbl->num_items) {                               \
-            HASH_OOPS("invalid hh item count %d, actual %d\n",                   \
+            HASH_OOPS("invalid hh item count %u, actual %u\n",                   \
                 (head)->hh.tbl->num_items, _count );                             \
         }                                                                        \
         /* traverse hh in app order; check next/prev integrity, count */         \
@@ -305,7 +305,7 @@ do {
                                   (head)->hh.tbl->hho) : NULL );                 \
         }                                                                        \
         if (_count != (head)->hh.tbl->num_items) {                               \
-            HASH_OOPS("invalid app item count %d, actual %d\n",                  \
+            HASH_OOPS("invalid app item count %u, actual %u\n",                  \
                 (head)->hh.tbl->num_items, _count );                             \
         }                                                                        \
     }                                                                            \

+ 1 - 1
src/common/utils.c

@@ -263,7 +263,7 @@ char *_starpu_mktemp_many(const char *directory, int depth, int flags, int *fd)
 		return NULL;
 	}
 
-	memcpy(path, directory, len);
+	memcpy(path, directory, len+1);
 	for (i = 0; i < depth; i++)
 	{
 		int r = starpu_lrand48();

+ 1 - 1
src/common/utils.h

@@ -118,7 +118,7 @@
 
 #define _STARPU_MALLOC(ptr, size) do { ptr = malloc(size); STARPU_ASSERT_MSG(ptr != NULL, "Cannot allocate %ld bytes\n", (long) size); } while (0)
 #define _STARPU_CALLOC(ptr, nmemb, size) do { ptr = calloc(nmemb, size); STARPU_ASSERT_MSG(ptr != NULL, "Cannot allocate %ld bytes\n", (long) (nmemb*size)); } while (0)
-#define _STARPU_REALLOC(ptr, size) do { ptr = realloc(ptr, size); STARPU_ASSERT_MSG(ptr != NULL, "Cannot reallocate %ld bytes\n", (long) size); } while (0)
+#define _STARPU_REALLOC(ptr, size) do { void *_new_ptr = realloc(ptr, size); STARPU_ASSERT_MSG(_new_ptr != NULL, "Cannot reallocate %ld bytes\n", (long) size); ptr = _new_ptr;} while (0)
 
 #ifdef _MSC_VER
 #define _STARPU_IS_ZERO(a) (a == 0.0)

+ 134 - 27
src/core/disk.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2013  Corentin Salingue
- * Copyright (C) 2015, 2016  CNRS
+ * Copyright (C) 2015, 2016, 2017  CNRS
  * Copyright (C) 2017  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -48,7 +48,7 @@ struct disk_register
 	int flag;
 };
 
-static void add_disk_in_list(unsigned node, struct starpu_disk_ops *func, void *base);
+static int add_disk_in_list(unsigned node, struct starpu_disk_ops *func, void *base);
 static int get_location_with_node(unsigned node);
 
 static struct disk_register **disk_register_list = NULL;
@@ -57,9 +57,26 @@ static int size_register_list = 2;
 
 int starpu_disk_swap_node = -1;
 
+static void add_async_event(struct _starpu_async_channel * channel, void * event)
+{
+        if (!event)
+                return; 
+
+        if (channel->event.disk_event.requests == NULL)
+        {
+                channel->event.disk_event.requests = _starpu_disk_backend_event_list_new();
+        }
+
+        struct _starpu_disk_backend_event * disk_event = _starpu_disk_backend_event_new();
+        disk_event->backend_event = event;
+
+        /* Store event at the end of the list */
+        _starpu_disk_backend_event_list_push_back(channel->event.disk_event.requests, disk_event);
+}
+
 int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, starpu_ssize_t size)
 {
-	STARPU_ASSERT_MSG(size < 0 || size >= SIZE_DISK_MIN,"Minimum disk size is %u Bytes ! (Here %u) \n", (int) SIZE_DISK_MIN, (int) size);
+	STARPU_ASSERT_MSG(size < 0 || size >= STARPU_DISK_SIZE_MIN,"Minimum disk size is %u Bytes ! (Here %u) \n", (int) STARPU_DISK_SIZE_MIN, (int) size);
 	/* register disk */
 	unsigned disk_memnode = _starpu_memory_node_register(STARPU_DISK_RAM, 0);
 
@@ -76,7 +93,15 @@ int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, starpu_s
 	void *base = func->plug(parameter, size);
 
 	/* remember it */
-	add_disk_in_list(disk_memnode,func,base);
+	int n STARPU_ATTRIBUTE_UNUSED = add_disk_in_list(disk_memnode, func, base);
+
+#ifdef STARPU_SIMGRID
+	char name[16];
+	snprintf(name, sizeof(name), "DISK%d", n);
+	msg_host_t host = _starpu_simgrid_get_host_by_name(name);
+	STARPU_ASSERT_MSG(host, "Could not find disk %s in platform file", name);
+	_starpu_simgrid_memory_node_set_host(memory_node, host);
+#endif
 
 	int ret = func->bandwidth(disk_memnode);
 	/* have a problem with the disk */
@@ -129,6 +154,7 @@ void _starpu_disk_free(unsigned node, void *obj, size_t size)
 /* src_node == disk node and dst_node == STARPU_MAIN_RAM */
 int _starpu_disk_read(unsigned src_node, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *obj, void *buf, off_t offset, size_t size, struct _starpu_async_channel *channel)
 {
+        void *event = NULL;
 	int pos = get_location_with_node(src_node);
 
         if (channel != NULL)
@@ -137,16 +163,17 @@ int _starpu_disk_read(unsigned src_node, unsigned dst_node STARPU_ATTRIBUTE_UNUS
 			channel = NULL;
 		else
 		{
-			channel->type = STARPU_DISK_RAM;
 			channel->event.disk_event.memory_node = src_node;
 
 			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-			channel->event.disk_event.backend_event = disk_register_list[pos]->functions->async_read(disk_register_list[pos]->base, obj, buf, offset, size);
+			event = disk_register_list[pos]->functions->async_read(disk_register_list[pos]->base, obj, buf, offset, size);
 			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+
+                        add_async_event(channel, event);
 		}
 	}
 	/* asynchronous request failed or synchronous request is asked */
-	if (channel == NULL || !channel->event.disk_event.backend_event)
+	if (channel == NULL || !event)
 	{
 		disk_register_list[pos]->functions->read(disk_register_list[pos]->base, obj, buf, offset, size);
 		return 0;
@@ -157,6 +184,7 @@ int _starpu_disk_read(unsigned src_node, unsigned dst_node STARPU_ATTRIBUTE_UNUS
 /* src_node == STARPU_MAIN_RAM and dst_node == disk node */
 int _starpu_disk_write(unsigned src_node STARPU_ATTRIBUTE_UNUSED, unsigned dst_node, void *obj, void *buf, off_t offset, size_t size, struct _starpu_async_channel *channel)
 {
+        void * event;
 	int pos = get_location_with_node(dst_node);
 
         if (channel != NULL)
@@ -165,16 +193,17 @@ int _starpu_disk_write(unsigned src_node STARPU_ATTRIBUTE_UNUSED, unsigned dst_n
 			channel = NULL;
 		else
                 {
-			channel->type = STARPU_DISK_RAM;
 			channel->event.disk_event.memory_node = dst_node;
 
 			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-			channel->event.disk_event.backend_event = disk_register_list[pos]->functions->async_write(disk_register_list[pos]->base, obj, buf, offset, size);
+			event = disk_register_list[pos]->functions->async_write(disk_register_list[pos]->base, obj, buf, offset, size);
         		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+
+                        add_async_event(channel, event);
 		}
         }
         /* asynchronous request failed or synchronous request is asked */
-	if (channel == NULL || !channel->event.disk_event.backend_event)
+	if (channel == NULL || !event)
         {
 		disk_register_list[pos]->functions->write(disk_register_list[pos]->base, obj, buf, offset, size);
         	return 0;
@@ -187,16 +216,20 @@ int _starpu_disk_copy(unsigned node_src, void *obj_src, off_t offset_src, unsign
 	int pos_src = get_location_with_node(node_src);
 	int pos_dst = get_location_with_node(node_dst);
 	/* both nodes have same copy function */
+        void * event;
 	channel->event.disk_event.memory_node = node_src;
-	channel->event.disk_event.backend_event = disk_register_list[pos_src]->functions->copy(disk_register_list[pos_src]->base, obj_src, offset_src,
+	event = disk_register_list[pos_src]->functions->copy(disk_register_list[pos_src]->base, obj_src, offset_src,
 											       disk_register_list[pos_dst]->base, obj_dst, offset_dst,
 											       size);
-	STARPU_ASSERT(channel->event.disk_event.backend_event);
+        add_async_event(channel, event);
+
+	STARPU_ASSERT(event);
 	return -EAGAIN;
 }
 
 int _starpu_disk_full_read(unsigned src_node, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *obj, void **ptr, size_t *size, struct _starpu_async_channel *channel)
 {
+        void * event;
 	int pos = get_location_with_node(src_node);
 
 	if (channel != NULL)
@@ -205,16 +238,17 @@ int _starpu_disk_full_read(unsigned src_node, unsigned dst_node STARPU_ATTRIBUTE
 			channel = NULL;
 		else
 		{
-			channel->type = STARPU_DISK_RAM;
 			channel->event.disk_event.memory_node = src_node;
 
 			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-			channel->event.disk_event.backend_event = disk_register_list[pos]->functions->async_full_read(disk_register_list[pos]->base, obj, ptr, size);
+			event = disk_register_list[pos]->functions->async_full_read(disk_register_list[pos]->base, obj, ptr, size);
 			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+
+                        add_async_event(channel, event);
 		}
 	}
 	/* asynchronous request failed or synchronous request is asked */
-	if (channel == NULL || !channel->event.disk_event.backend_event)
+	if (channel == NULL || !event)
 	{
 		disk_register_list[pos]->functions->full_read(disk_register_list[pos]->base, obj, ptr, size);
 		return 0;
@@ -224,6 +258,7 @@ int _starpu_disk_full_read(unsigned src_node, unsigned dst_node STARPU_ATTRIBUTE
 
 int _starpu_disk_full_write(unsigned src_node STARPU_ATTRIBUTE_UNUSED, unsigned dst_node, void *obj, void *ptr, size_t size, struct _starpu_async_channel *channel)
 {
+        void * event;
 	int pos = get_location_with_node(dst_node);
 
 	if (channel != NULL)
@@ -232,16 +267,17 @@ int _starpu_disk_full_write(unsigned src_node STARPU_ATTRIBUTE_UNUSED, unsigned
 			channel = NULL;
 		else
 		{
-			channel->type = STARPU_DISK_RAM;
 			channel->event.disk_event.memory_node = dst_node;
 
 			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-			channel->event.disk_event.backend_event = disk_register_list[pos]->functions->async_full_write(disk_register_list[pos]->base, obj, ptr, size);
+			event = disk_register_list[pos]->functions->async_full_write(disk_register_list[pos]->base, obj, ptr, size);
 			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+
+                        add_async_event(channel, event);
 		}
 	}
 	/* asynchronous request failed or synchronous request is asked */
-	if (channel == NULL || !channel->event.disk_event.backend_event)
+	if (channel == NULL || !event)
 	{
 		disk_register_list[pos]->functions->full_write(disk_register_list[pos]->base, obj, ptr, size);
 		return 0;
@@ -264,24 +300,88 @@ void starpu_disk_close(unsigned node, void *obj, size_t size)
 void starpu_disk_wait_request(struct _starpu_async_channel *async_channel)
 {
 	int position = get_location_with_node(async_channel->event.disk_event.memory_node);
-	disk_register_list[position]->functions->wait_request(async_channel->event.disk_event.backend_event);
+
+        if (async_channel->event.disk_event.requests != NULL && !_starpu_disk_backend_event_list_empty(async_channel->event.disk_event.requests))
+        {
+                struct _starpu_disk_backend_event * event = _starpu_disk_backend_event_list_begin(async_channel->event.disk_event.requests);
+                struct _starpu_disk_backend_event * next;
+
+                /* Wait all events in the list and remove them */
+                while (event != _starpu_disk_backend_event_list_end(async_channel->event.disk_event.requests))
+                {
+                        next = _starpu_disk_backend_event_list_next(event);
+
+                        disk_register_list[position]->functions->wait_request(event->backend_event);
+
+                        disk_register_list[position]->functions->free_request(event->backend_event);
+
+                        _starpu_disk_backend_event_list_erase(async_channel->event.disk_event.requests, event);
+
+                        _starpu_disk_backend_event_delete(event);
+
+                        event = next;
+                }
+
+                /* Remove the list because it doesn't contain any event */
+                _starpu_disk_backend_event_list_delete(async_channel->event.disk_event.requests);
+                async_channel->event.disk_event.requests = NULL;
+        }
 }
 
 int starpu_disk_test_request(struct _starpu_async_channel *async_channel)
 {
 	int position = get_location_with_node(async_channel->event.disk_event.memory_node);
-	return disk_register_list[position]->functions->test_request(async_channel->event.disk_event.backend_event);
+
+        if (async_channel->event.disk_event.requests != NULL && !_starpu_disk_backend_event_list_empty(async_channel->event.disk_event.requests))
+        {
+                struct _starpu_disk_backend_event * event = _starpu_disk_backend_event_list_begin(async_channel->event.disk_event.requests);
+                struct _starpu_disk_backend_event * next;
+
+                /* Wait all events in the list and remove them */
+                while (event != _starpu_disk_backend_event_list_end(async_channel->event.disk_event.requests))
+                {
+                        next = _starpu_disk_backend_event_list_next(event);
+
+                        int res = disk_register_list[position]->functions->test_request(event->backend_event);
+
+                                if (res)
+                                {
+                                        disk_register_list[position]->functions->free_request(event->backend_event);
+
+                                        _starpu_disk_backend_event_list_erase(async_channel->event.disk_event.requests, event);
+
+                                        _starpu_disk_backend_event_delete(event);
+                                }
+
+                        event = next;
+                }
+
+                /* Remove the list because it doesn't contain any event */
+                if (_starpu_disk_backend_event_list_empty(async_channel->event.disk_event.requests))
+                {
+                        _starpu_disk_backend_event_list_delete(async_channel->event.disk_event.requests);
+                        async_channel->event.disk_event.requests = NULL;
+                }
+        }
+
+	return async_channel->event.disk_event.requests == NULL;
 }
 
 void starpu_disk_free_request(struct _starpu_async_channel *async_channel)
 {
-	int position = get_location_with_node(async_channel->event.disk_event.memory_node);
+/* It does not have any sense to use this function currently because requests are freed in test of wait functions */
+        STARPU_ABORT();
+
+/*	int position = get_location_with_node(async_channel->event.disk_event.memory_node);
 	if (async_channel->event.disk_event.backend_event)
 		disk_register_list[position]->functions->free_request(async_channel->event.disk_event.backend_event);
+*/
 }
 
-static void add_disk_in_list(unsigned node,  struct starpu_disk_ops *func, void *base)
+static int add_disk_in_list(unsigned node,  struct starpu_disk_ops *func, void *base)
 {
+	int n;
+
 	/* initialization */
 	if (disk_register_list == NULL)
 	{
@@ -300,7 +400,9 @@ static void add_disk_in_list(unsigned node,  struct starpu_disk_ops *func, void
 	dr->base = base;
 	dr->flag = STARPU_DISK_ALL;
 	dr->functions = func;
-	disk_register_list[++disk_number] = dr;
+	n = ++disk_number;
+	disk_register_list[n] = dr;
+	return n;
 }
 
 static int get_location_with_node(unsigned node)
@@ -356,23 +458,19 @@ void _starpu_swap_init(void)
 	backend = starpu_getenv("STARPU_DISK_SWAP_BACKEND");
 	if (!backend)
 	{
-		_starpu_mkpath(path, S_IRWXU);
 		ops = &starpu_disk_unistd_ops;
 	}
 	else if (!strcmp(backend, "stdio"))
 	{
-		_starpu_mkpath(path, S_IRWXU);
 		ops = &starpu_disk_stdio_ops;
 	}
 	else if (!strcmp(backend, "unistd"))
 	{
-		_starpu_mkpath(path, S_IRWXU);
 		ops = &starpu_disk_unistd_ops;
 	}
 	else if (!strcmp(backend, "unistd_o_direct"))
 	{
 #ifdef STARPU_LINUX_SYS
-		_starpu_mkpath(path, S_IRWXU);
 		ops = &starpu_disk_unistd_o_direct_ops;
 #else
 		_STARPU_DISP("Warning: o_direct support is not compiled in, could not enable disk swap");
@@ -389,6 +487,15 @@ void _starpu_swap_init(void)
 		return;
 #endif
 	}
+        else if (!strcmp(backend, "hdf5"))
+        {
+#ifdef STARPU_HAVE_HDF5
+                ops = &starpu_disk_hdf5_ops;
+#else
+		_STARPU_DISP("Warning: hdf5 support is not compiled in, could not enable disk swap");
+		return;
+#endif
+        }
 	else
 	{
 		_STARPU_DISP("Warning: unknown disk swap backend %s, could not enable disk swap", backend);

+ 0 - 2
src/core/disk.h

@@ -18,8 +18,6 @@
 #ifndef __DISK_H__
 #define __DISK_H__
 
-#define SIZE_DISK_MIN (1024*1024)
-
 #define STARPU_DISK_ALL 1
 #define STARPU_DISK_NO_RECLAIM 2
 

+ 739 - 0
src/core/disk_ops/disk_hdf5.c

@@ -0,0 +1,739 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2017  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <fcntl.h>
+#include <errno.h>
+#include <common/config.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <time.h>
+#include <hdf5.h>
+
+#include <starpu.h>
+#include <core/disk.h>
+#include <core/perfmodel/perfmodel.h>
+
+#ifndef O_BINARY
+#define O_BINARY 0
+#endif
+
+#define NITER	_starpu_calibration_minimum
+
+#ifndef H5_HAVE_THREADSAFE
+/* if thread safe mode not enabled, we can't open more than one file */
+static int nb_disk_open = 0;
+#endif
+
+
+/* ------------------- use HDF5 to write on disk -------------------  */
+
+enum hdf5_work_type { READ, WRITE, FULL_READ, FULL_WRITE };
+
+LIST_TYPE(_starpu_hdf5_work,
+        enum hdf5_work_type type;
+        struct starpu_hdf5_obj * obj;
+        struct starpu_hdf5_base * base;
+        void * ptr;
+        off_t offset;
+        size_t size;
+        void * event;
+);
+
+struct starpu_hdf5_base
+{
+        hid_t fileID;
+        char * path;
+        unsigned created;       /* StarPU creates the HDF5 file */
+        unsigned next_dataset_id;
+        starpu_pthread_t thread;        /* This thread will perform each write/read because we don't have asynchronous functions */
+        int run;                        /* Ask to the thread if he can continue */
+	starpu_pthread_mutex_t mutex;   /* Mutex is used to protect work_list and if HDF5 library is not safe */
+        starpu_pthread_cond_t cond;
+        struct _starpu_hdf5_work_list * work_list;        /* This list contains the work for the hdf5 thread */
+};
+
+struct starpu_hdf5_obj
+{
+        hid_t dataset;          /* describe this object in HDF5 file */
+        char * path;            /* path where data are stored in HDF5 file */
+};
+
+static inline void _starpu_hdf5_protect_start(void * base)
+{
+#ifndef H5_HAVE_THREADSAFE
+        struct starpu_hdf5_base * fileBase = (struct starpu_hdf5_base *) base;
+        STARPU_PTHREAD_MUTEX_LOCK(&fileBase->mutex);
+#endif
+}
+
+static inline void _starpu_hdf5_protect_stop(void * base)
+{
+#ifndef H5_HAVE_THREADSAFE
+        struct starpu_hdf5_base * fileBase = (struct starpu_hdf5_base *) base;
+        STARPU_PTHREAD_MUTEX_UNLOCK(&fileBase->mutex);
+#endif
+}
+
+/* ------------------ Functions for internal thread -------------------- */
+
+static void starpu_hdf5_full_read_internal(struct _starpu_hdf5_work * work)
+{
+        herr_t status;
+
+        status = H5Dread(work->obj->dataset, H5T_NATIVE_CHAR, H5S_ALL, H5S_ALL, H5P_DEFAULT, work->ptr);
+        STARPU_ASSERT_MSG(status >= 0, "Can not read data associed to this dataset (%s)\n", work->obj->path);
+}
+
+static void starpu_hdf5_full_write_internal(struct _starpu_hdf5_work * work)
+{
+        herr_t status;
+
+        /* Write ALL the dataspace */
+        status = H5Dwrite(work->obj->dataset, H5T_NATIVE_CHAR, H5S_ALL, H5S_ALL, H5P_DEFAULT, work->ptr);
+        STARPU_ASSERT_MSG(status >= 0, "Can not write data to this dataset (%s)\n", work->obj->path);
+}
+
+static void starpu_hdf5_read_internal(struct _starpu_hdf5_work * work)
+{
+        herr_t status;
+
+        /* Get official datatype */
+        hid_t datatype = H5Dget_type(work->obj->dataset);
+        hsize_t sizeDatatype = H5Tget_size(datatype);
+
+        /* count in element, not in byte */
+        work->offset /= sizeDatatype;
+        work->size /= sizeDatatype;
+
+        /* duplicate the dataspace in the dataset */
+        hid_t dataspace_select = H5Dget_space(work->obj->dataset);
+        STARPU_ASSERT_MSG(dataspace_select >= 0, "Error when reading this HDF5 dataset (%s)\n", work->obj->path);
+
+        /* Select what we want of the duplicated dataspace (it's called an hyperslab). This operation is done on place */
+        hsize_t offsets[1] = {work->offset};
+        hsize_t count[1] = {work->size};
+        /* stride and block size are NULL which is equivalent of a shift of 1 */
+        status = H5Sselect_hyperslab(dataspace_select, H5S_SELECT_SET, offsets, NULL, count, NULL);
+        STARPU_ASSERT_MSG(status >= 0, "Error when reading this HDF5 dataset (%s)\n", work->obj->path);
+
+        /* create the dataspace for the received data which describes ptr */
+        hsize_t dims_receive[1] = {work->size};
+        hid_t dataspace_receive = H5Screate_simple(1, dims_receive, NULL);
+        STARPU_ASSERT_MSG(dataspace_receive >= 0, "Error when reading this HDF5 dataset (%s)\n", work->obj->path);
+
+        /* Receiver has to be an hyperslabs */
+        offsets[0] = 0;
+        count[0] = work->size;
+        status = H5Sselect_hyperslab(dataspace_receive, H5S_SELECT_SET, offsets, NULL, count, NULL);
+        STARPU_ASSERT_MSG(dataspace_receive >= 0, "Error when reading this HDF5 dataset (%s)\n", work->obj->path);
+
+        status = H5Dread(work->obj->dataset, datatype, dataspace_receive, dataspace_select, H5P_DEFAULT, work->ptr);
+        STARPU_ASSERT_MSG(status >= 0, "Error when reading this HDF5 dataset (%s)\n", work->obj->path);
+
+        /* don't need these dataspaces */
+        status = H5Sclose(dataspace_select);
+        STARPU_ASSERT_MSG(status >= 0, "Error when reading this HDF5 dataset (%s)\n", work->obj->path);
+        status = H5Sclose(dataspace_receive);
+        STARPU_ASSERT_MSG(status >= 0, "Error when reading this HDF5 dataset (%s)\n", work->obj->path);
+}
+
+static void starpu_hdf5_write_internal(struct _starpu_hdf5_work * work)
+{
+        herr_t status;
+
+        /* Get official datatype */
+        hid_t datatype = H5Dget_type(work->obj->dataset);
+        hsize_t sizeDatatype = H5Tget_size(datatype);
+
+        /* count in element, not in byte */
+        work->offset /= sizeDatatype;
+        work->size /= sizeDatatype;
+
+        /* duplicate the dataspace in the dataset */
+        hid_t dataspace_select = H5Dget_space(work->obj->dataset);
+        STARPU_ASSERT_MSG(dataspace_select >= 0, "Error when writing this HDF5 dataset (%s)\n", work->obj->path);
+
+        /* Select what we want of the duplicated dataspace (it's called an hyperslab). This operation is done on place */
+        hsize_t offsets[1] = {work->offset};
+        hsize_t count[1] = {work->size};
+        /* stride and block size are NULL which is equivalent of a shift of 1 */
+        status = H5Sselect_hyperslab(dataspace_select, H5S_SELECT_SET, offsets, NULL, count, NULL);
+        STARPU_ASSERT_MSG(status >= 0, "Error when writing this HDF5 dataset (%s)\n", work->obj->path);
+
+        /* create the dataspace for the received data which describes ptr */
+        hsize_t dims_send[1] = {work->size};
+        hid_t dataspace_send = H5Screate_simple(1, dims_send, NULL);
+        STARPU_ASSERT_MSG(dataspace_send >= 0, "Error when writing this HDF5 dataset (%s)\n", work->obj->path);
+
+        /* Receiver has to be an hyperslabs */
+        offsets[0] = 0;
+        count[0] = work->size;
+        status = H5Sselect_hyperslab(dataspace_send, H5S_SELECT_SET, offsets, NULL, count, NULL);
+        STARPU_ASSERT_MSG(dataspace_send >= 0, "Error when writing this HDF5 dataset (%s)\n", work->obj->path);
+
+        status = H5Dwrite(work->obj->dataset, datatype, dataspace_send, dataspace_select, H5P_DEFAULT, work->ptr);
+        STARPU_ASSERT_MSG(status >= 0, "Error when writing this HDF5 dataset (%s)\n", work->obj->path);
+
+        /* don't need these dataspaces */
+        status = H5Sclose(dataspace_select);
+        STARPU_ASSERT_MSG(status >= 0, "Error when writing this HDF5 dataset (%s)\n", work->obj->path);
+        status = H5Sclose(dataspace_send);
+        STARPU_ASSERT_MSG(status >= 0, "Error when writing this HDF5 dataset (%s)\n", work->obj->path);
+}
+
+static void * _starpu_hdf5_internal_thread(void * arg)
+{
+        struct starpu_hdf5_base * base = (struct starpu_hdf5_base *) arg;
+        while (base->run || !_starpu_hdf5_work_list_empty(base->work_list))
+        {
+                STARPU_PTHREAD_MUTEX_LOCK(&base->mutex);
+                if (_starpu_hdf5_work_list_empty(base->work_list) && base->run)
+                        STARPU_PTHREAD_COND_WAIT(&base->cond, &base->mutex);
+                STARPU_PTHREAD_MUTEX_UNLOCK(&base->mutex);
+
+                /* We are the only consummer here, don't need to protect here */
+                if (!_starpu_hdf5_work_list_empty(base->work_list))
+                {
+                        STARPU_PTHREAD_MUTEX_LOCK(&base->mutex);
+                        struct _starpu_hdf5_work * work = _starpu_hdf5_work_list_pop_back(base->work_list);
+                        STARPU_PTHREAD_MUTEX_UNLOCK(&base->mutex);
+
+                        _starpu_hdf5_protect_start(work->base);
+                        switch(work->type)
+                        {
+                                case READ:
+                                        starpu_hdf5_read_internal(work);
+                                        break;
+
+                                case WRITE:
+                                        starpu_hdf5_write_internal(work);
+                                        break;
+
+                                case FULL_READ:
+                                        starpu_hdf5_full_read_internal(work);
+                                        break;
+
+                                case FULL_WRITE:
+                                        starpu_hdf5_full_write_internal(work);
+                                        break;
+
+                                default:
+                                        STARPU_ABORT();
+                        }
+                        _starpu_hdf5_protect_stop(work->base);
+
+                        /* Update event to tell it's finished */
+                        starpu_sem_post((starpu_sem_t *) work->event);
+
+                        free(work);
+                }
+        }
+
+        STARPU_PTHREAD_MUTEX_LOCK(&base->mutex);
+        STARPU_PTHREAD_COND_BROADCAST(&base->cond);
+        STARPU_PTHREAD_MUTEX_UNLOCK(&base->mutex);
+
+        return NULL;
+}
+
+static void _starpu_hdf5_create_thread(struct starpu_hdf5_base * base)
+{
+        base->work_list = _starpu_hdf5_work_list_new();
+        base->run = 1;
+
+        STARPU_PTHREAD_COND_INIT(&base->cond, NULL);
+        STARPU_PTHREAD_CREATE(&base->thread, NULL, _starpu_hdf5_internal_thread, (void *) base); 
+}
+
+/* returns the size in BYTES */
+static hsize_t _starpu_get_size_obj(struct starpu_hdf5_obj * obj)
+{
+        herr_t status;
+
+        hid_t dataspace = H5Dget_space(obj->dataset);
+        STARPU_ASSERT_MSG(dataspace >= 0, "Can not get the size of this HDF5 dataset (%s)\n", obj->path);
+
+        hsize_t dims[1];
+        status = H5Sget_simple_extent_dims(dataspace, dims, NULL);
+        STARPU_ASSERT_MSG(status >= 0, "Can not get the size of this HDF5 dataset (%s)\n", obj->path);
+
+        hid_t datatype = H5Dget_type(obj->dataset);
+        STARPU_ASSERT_MSG(datatype >= 0, "Can not get the size of this HDF5 dataset (%s)\n", obj->path);
+
+        hsize_t sizeDatatype = H5Tget_size(datatype);
+        STARPU_ASSERT_MSG(sizeDatatype > 0, "Can not get the size of this HDF5 dataset (%s)\n", obj->path);
+
+        H5Sclose(dataspace);
+        H5Tclose(datatype);
+
+        return dims[0]*sizeDatatype;
+}
+
+static void starpu_hdf5_send_work(void *base, void *obj, void *buf, off_t offset, size_t size, void * event, enum hdf5_work_type type)
+{
+        struct starpu_hdf5_obj * dataObj = (struct starpu_hdf5_obj *) obj;
+        struct starpu_hdf5_base * fileBase = (struct starpu_hdf5_base *) base;
+
+        struct _starpu_hdf5_work * work;
+        _STARPU_MALLOC(work, sizeof(*work));
+
+        work->type = type;
+        work->obj = dataObj;
+        work->base = fileBase;
+        work->ptr = buf;
+        work->offset = offset;
+        work->size = size;
+        work->event = event;
+
+        STARPU_PTHREAD_MUTEX_LOCK(&fileBase->mutex);
+        _starpu_hdf5_work_list_push_front(fileBase->work_list, work);
+        /* Wake up internal thread */
+        STARPU_PTHREAD_COND_BROADCAST(&fileBase->cond);
+        STARPU_PTHREAD_MUTEX_UNLOCK(&fileBase->mutex);
+}
+
+static struct starpu_hdf5_obj * _starpu_hdf5_data_alloc(struct starpu_hdf5_base * fileBase,  char * name, size_t size)
+{
+        struct starpu_hdf5_obj * obj;
+	_STARPU_MALLOC(obj, sizeof(*obj));
+
+        _starpu_hdf5_protect_start((void *) fileBase);
+
+        /* create a dataspace with one dimension of size elements */
+        hsize_t dim[1] = {size};
+        hid_t dataspace = H5Screate_simple(1, dim, NULL);
+
+        if (dataspace < 0)
+        {
+                free(obj);
+                return NULL;
+        }
+
+        /* create a dataset at location name, with data described by the dataspace.
+         * Each element are like char in C (expected one byte) 
+         */
+        obj->dataset = H5Dcreate2(fileBase->fileID, name, H5T_NATIVE_CHAR, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+
+        H5Sclose(dataspace);
+
+        if (obj->dataset < 0)
+        {
+                free(obj);
+                return NULL;
+        }
+
+        obj->path = name;
+
+        _starpu_hdf5_protect_stop((void *) fileBase);
+        
+        return obj;
+}
+
+static struct starpu_hdf5_obj * _starpu_hdf5_data_open(struct starpu_hdf5_base * fileBase,  char * name, size_t size)
+{
+        struct starpu_hdf5_obj * obj;
+	_STARPU_MALLOC(obj, sizeof(*obj));
+
+        _starpu_hdf5_protect_start((void *) fileBase);
+
+        /* create a dataset at location name, with data described by the dataspace.
+         * Each element are like char in C (expected one byte) 
+         */
+        obj->dataset = H5Dopen2(fileBase->fileID, name, H5P_DEFAULT);
+
+        _starpu_hdf5_protect_stop((void *) fileBase);
+
+        if (obj->dataset < 0)
+        {
+                free(obj);
+                return NULL;
+        }
+
+        obj->path = name;
+        
+        return obj;
+}
+
+static void *starpu_hdf5_plug(void *parameter, starpu_ssize_t size STARPU_ATTRIBUTE_UNUSED)
+{
+#ifndef H5_HAVE_THREADSAFE
+        int nb_disk = STARPU_ATOMIC_ADD(&nb_disk_open, 1);
+        if (nb_disk != 1)
+                _STARPU_ERROR("HDF5 library is not compiled with --enable-threadsafe. You can't open more than one HDF5 file in the same time !\n");
+#endif
+
+        struct starpu_hdf5_base * base;
+        _STARPU_MALLOC(base, sizeof(struct starpu_hdf5_base));
+
+	STARPU_PTHREAD_MUTEX_INIT(&base->mutex, NULL);
+        _starpu_hdf5_protect_start(base);
+
+        struct stat buf;
+        if (stat(parameter, &buf) != 0 || !S_ISREG(buf.st_mode))
+        {
+                /* The file doesn't exist or the directory exists => create the datafile */
+                int id;
+                base->path = _starpu_mktemp_many(parameter, 0, O_RDWR | O_BINARY, &id);
+                if (!base->path)
+                {
+                        free(base);
+                        _STARPU_ERROR("Can not create the HDF5 file (%s)", (char *) parameter);
+                }
+
+                /* just use _starpu_mktemp_many to create a file, close the file descriptor */
+                close(id);
+
+                /* Truncate it */
+                base->fileID = H5Fcreate((char *)base->path, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
+                if (base->fileID < 0) 
+                {
+                        free(base); 
+                        _STARPU_ERROR("Can not create the HDF5 file (%s)", (char *) parameter);
+                }
+                base->created = 1;
+        } 
+        else
+        {
+                /* Well, open it ! */
+                char * path;
+                _STARPU_MALLOC(path, strlen((char *) parameter)+1);
+                strcpy(path, (char *) parameter);
+
+                base->fileID = H5Fopen((char *)parameter, H5F_ACC_RDWR, H5P_DEFAULT);
+                if (base->fileID < 0) 
+                {
+                        free(base); 
+                        _STARPU_ERROR("Can not open the HDF5 file (%s)", (char *) parameter);
+                }
+                base->created = 0;
+                base->path = path;
+        }
+
+        _starpu_hdf5_create_thread(base);
+
+        _starpu_hdf5_protect_stop(base);
+
+        base->next_dataset_id = 0;
+
+	return (void *) base;
+}
+
+/* free memory allocated for the base */
+static void starpu_hdf5_unplug(void *base)
+{
+#ifndef H5_HAVE_THREADSAFE
+        STARPU_ATOMIC_ADD(&nb_disk_open, -1);
+#endif
+
+        struct starpu_hdf5_base * fileBase = (struct starpu_hdf5_base *) base;
+        herr_t status;
+
+        STARPU_PTHREAD_MUTEX_LOCK(&fileBase->mutex);
+
+        fileBase->run = 0;
+        STARPU_PTHREAD_COND_BROADCAST(&fileBase->cond);
+        STARPU_PTHREAD_COND_WAIT(&fileBase->cond, &fileBase->mutex);
+        /* the internal thread is deleted */
+
+        status = H5Fclose(fileBase->fileID);
+
+        STARPU_PTHREAD_MUTEX_UNLOCK(&fileBase->mutex);
+	STARPU_PTHREAD_MUTEX_DESTROY(&fileBase->mutex);
+
+        STARPU_ASSERT_MSG(status >= 0, "Can not unplug this HDF5 disk (%s)\n", fileBase->path);
+        if (fileBase->created)
+        {
+                unlink(fileBase->path);        
+        }
+        else
+        {
+                /* Warn user about repack, because unlink dataset doesn't delete data in file */
+                _STARPU_DISP("This disk (%s)  was used to store temporary data. You may use the h5repack command to reduce the size of the file... \n", fileBase->path);
+        }
+        free(fileBase->path);
+	free(fileBase);
+}
+
+static void *starpu_hdf5_alloc(void *base, size_t size)
+{
+        struct starpu_hdf5_base * fileBase = (struct starpu_hdf5_base *) base;
+        struct starpu_hdf5_obj * obj;
+        char * name;
+        char * prefix = "STARPU_";
+        char name_id[16];
+
+        /* Save the name of the dataset */
+        STARPU_PTHREAD_MUTEX_LOCK(&fileBase->mutex);
+        sprintf(name_id, "%u", fileBase->next_dataset_id);
+        fileBase->next_dataset_id++;
+        STARPU_PTHREAD_MUTEX_UNLOCK(&fileBase->mutex);
+
+        /* name in HDF5 is like a path */
+        _STARPU_MALLOC(name, 1+strlen(prefix)+strlen(name_id)+1);
+        strcpy(name, "/");
+        strcat(name, prefix);
+        strcat(name, name_id);
+
+        obj = _starpu_hdf5_data_alloc(fileBase, name, size);
+
+        if (!obj)
+        {
+                free(name);
+        }
+
+        return (void *) obj;
+}
+
+static void starpu_hdf5_free(void *base, void *obj, size_t size STARPU_ATTRIBUTE_UNUSED)
+{
+        struct starpu_hdf5_base * fileBase = (struct starpu_hdf5_base *) base;
+        struct starpu_hdf5_obj * dataObj = (struct starpu_hdf5_obj *) obj;
+        herr_t status;
+
+        _starpu_hdf5_protect_start(base);
+
+        status = H5Dclose(dataObj->dataset);
+        STARPU_ASSERT_MSG(status >= 0, "Can not free this HDF5 dataset (%s)\n", dataObj->path);
+
+        /* remove the dataset link in the HDF5 
+         * But it doesn't delete the space in the file */
+        status = H5Ldelete(fileBase->fileID, dataObj->path, H5P_DEFAULT);
+        STARPU_ASSERT_MSG(status >= 0, "Can not delete the link associed to this dataset (%s)\n", dataObj->path);
+
+        _starpu_hdf5_protect_stop(base);
+
+        free(dataObj->path);
+        free(dataObj);
+}
+
+static void *starpu_hdf5_open(void *base, void *pos, size_t size)
+{
+        struct starpu_hdf5_base * fileBase = (struct starpu_hdf5_base *) base;
+        struct starpu_hdf5_obj * obj;
+        char * name;
+
+        _STARPU_MALLOC(name, strlen(pos)+1);
+        strcpy(name, (char *) pos);
+
+        obj = _starpu_hdf5_data_open(fileBase, name, size);
+
+        if (!obj)
+        {
+                free(name);
+        }
+
+        return (void *) obj;
+}
+
+static void starpu_hdf5_close(void *base, void *obj, size_t size STARPU_ATTRIBUTE_UNUSED)
+{
+        struct starpu_hdf5_obj * dataObj = (struct starpu_hdf5_obj *) obj;
+        herr_t status;
+
+        _starpu_hdf5_protect_start(base);
+
+        status = H5Dclose(dataObj->dataset);
+        STARPU_ASSERT_MSG(status >= 0, "Can not close this HDF5 dataset (%s)\n", dataObj->path);
+
+        _starpu_hdf5_protect_stop(base);
+
+        free(dataObj->path);
+        free(dataObj);
+}
+
+static void starpu_hdf5_wait(void * event)
+{
+        starpu_sem_t * finished = (starpu_sem_t *) event;
+
+        starpu_sem_wait(finished);
+}
+
+static int starpu_hdf5_test(void * event)
+{
+        starpu_sem_t * finished = (starpu_sem_t *) event;
+
+        return starpu_sem_trywait(finished) == 0;
+}
+
+static int starpu_hdf5_full_read(void *base, void *obj, void **ptr, size_t *size)
+{
+        struct starpu_hdf5_obj * dataObj = (struct starpu_hdf5_obj *) obj;
+
+        starpu_sem_t finished;
+        starpu_sem_init(&finished, 0, 0);
+
+        _starpu_hdf5_protect_start(base);
+        *size = _starpu_get_size_obj(dataObj);
+        _starpu_hdf5_protect_stop(base);
+
+        starpu_malloc_flags(ptr, *size, 0); 
+
+        starpu_hdf5_send_work(base, obj, *ptr, 0, *size, (void*) &finished, FULL_READ);
+        
+        starpu_hdf5_wait(&finished);
+
+        starpu_sem_destroy(&finished);
+        
+        return 0;
+}
+
+static int starpu_hdf5_full_write(void *base, void *obj, void *ptr, size_t size)
+{
+        starpu_sem_t finished;
+        starpu_sem_init(&finished, 0, 0);
+
+        starpu_hdf5_send_work(base, obj, ptr, 0, size, (void*) &finished, FULL_WRITE);
+
+        starpu_hdf5_wait(&finished);
+
+        starpu_sem_destroy(&finished);
+
+        return 0;
+}
+
+static int starpu_hdf5_read(void *base, void *obj, void *buf, off_t offset, size_t size)
+{
+        starpu_sem_t finished;
+        starpu_sem_init(&finished, 0, 0);
+
+        starpu_hdf5_send_work(base, obj, buf, offset, size, (void*) &finished, READ);
+
+        starpu_hdf5_wait(&finished);
+
+        starpu_sem_destroy(&finished);
+
+        return 0;
+}
+
+static int starpu_hdf5_write(void *base, void *obj, const void *buf, off_t offset, size_t size)
+{
+        starpu_sem_t finished;
+        starpu_sem_init(&finished, 0, 0);
+
+        starpu_hdf5_send_work(base, obj, (void *) buf, offset, size, (void*) &finished, WRITE);
+
+        starpu_hdf5_wait(&finished);
+
+        starpu_sem_destroy(&finished);
+
+        return 0;
+}
+
+static void * starpu_hdf5_async_read(void *base, void *obj, void *buf, off_t offset, size_t size)
+{
+        starpu_sem_t * finished;
+        _STARPU_MALLOC(finished, sizeof(*finished));
+        starpu_sem_init(finished, 0, 0);
+
+        starpu_hdf5_send_work(base, obj, buf, offset, size, (void*) finished, READ);
+
+        return finished;
+}
+
+static void * starpu_hdf5_async_write(void *base, void *obj, void *buf, off_t offset, size_t size)
+{
+        starpu_sem_t * finished;
+        _STARPU_MALLOC(finished, sizeof(*finished));
+        starpu_sem_init(finished, 0, 0);
+
+        starpu_hdf5_send_work(base, obj, (void *) buf, offset, size, (void*) finished, WRITE);
+
+        return finished;
+}
+
+static void starpu_hdf5_free_request(void * event)
+{
+        starpu_sem_destroy(event);
+        free(event);
+}
+
+static int get_hdf5_bandwidth_between_disk_and_main_ram(unsigned node)
+{
+	unsigned iter;
+	double timing_slowness, timing_latency;
+	double start;
+	double end;
+	char *buf;
+
+	srand(time(NULL));
+	starpu_malloc_flags((void **) &buf, STARPU_DISK_SIZE_MIN, 0);
+	STARPU_ASSERT(buf != NULL);
+
+	/* allocate memory */
+	void *mem = _starpu_disk_alloc(node, STARPU_DISK_SIZE_MIN);
+	/* fail to alloc */
+	if (mem == NULL)
+		return 0;
+
+	memset(buf, 0, STARPU_DISK_SIZE_MIN);
+
+	/* Measure upload slowness */
+	start = starpu_timing_now();
+	for (iter = 0; iter < NITER; ++iter)
+	{
+		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, 0, STARPU_DISK_SIZE_MIN, NULL);
+
+	}
+	end = starpu_timing_now();
+	timing_slowness = end - start;
+
+	/* free memory */
+	starpu_free_flags(buf, STARPU_DISK_SIZE_MIN, 0);
+
+	starpu_malloc_flags((void**) &buf, sizeof(char), 0);
+	STARPU_ASSERT(buf != NULL);
+
+	*buf = 0;
+
+	/* Measure latency */
+	start = starpu_timing_now();
+	for (iter = 0; iter < NITER; ++iter)
+	{
+		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, rand() % (STARPU_DISK_SIZE_MIN -1) , 1, NULL);
+	}
+	end = starpu_timing_now();
+	timing_latency = end - start;
+
+	_starpu_disk_free(node, mem, STARPU_DISK_SIZE_MIN);
+	starpu_free_flags(buf, sizeof(char), 0);
+
+	_starpu_save_bandwidth_and_latency_disk((NITER/timing_slowness)*STARPU_DISK_SIZE_MIN, (NITER/timing_slowness)*STARPU_DISK_SIZE_MIN,
+					       timing_latency/NITER, timing_latency/NITER, node);
+	return 1;
+}
+
+struct starpu_disk_ops starpu_disk_hdf5_ops =
+{
+	.alloc = starpu_hdf5_alloc,
+	.free = starpu_hdf5_free,
+	.open = starpu_hdf5_open,
+	.close = starpu_hdf5_close,
+	.read = starpu_hdf5_read,
+	.write = starpu_hdf5_write,
+	.plug = starpu_hdf5_plug,
+	.unplug = starpu_hdf5_unplug,
+	.copy = NULL,
+	.bandwidth = get_hdf5_bandwidth_between_disk_and_main_ram,
+	.full_read = starpu_hdf5_full_read,
+	.full_write = starpu_hdf5_full_write,
+
+	.async_read = starpu_hdf5_async_read,
+	.async_write = starpu_hdf5_async_write,
+	.wait_request = starpu_hdf5_wait,
+	.test_request = starpu_hdf5_test,
+	.free_request = starpu_hdf5_free_request
+};

+ 6 - 6
src/core/disk_ops/disk_leveldb.cpp

@@ -270,11 +270,11 @@ static int get_leveldb_bandwidth_between_disk_and_main_ram(unsigned node)
 	double end;
 
 	srand(time (NULL));
-	char *buf = (char *)malloc(SIZE_DISK_MIN*sizeof(char));
+	char *buf = (char *)malloc(STARPU_DISK_SIZE_MIN*sizeof(char));
 	STARPU_ASSERT(buf);
 
 	/* allocate memory */
-	void *mem = _starpu_disk_alloc(node, SIZE_DISK_MIN);
+	void *mem = _starpu_disk_alloc(node, STARPU_DISK_SIZE_MIN);
 	/* fail to alloc */
 	if (mem == NULL)
 	{
@@ -286,7 +286,7 @@ static int get_leveldb_bandwidth_between_disk_and_main_ram(unsigned node)
 	start = starpu_timing_now();
 	for (iter = 0; iter < NITER; ++iter)
 	{
-		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, 0, SIZE_DISK_MIN, NULL);
+		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, 0, STARPU_DISK_SIZE_MIN, NULL);
 	}
 	end = starpu_timing_now();
 	timing_slowness = end - start;
@@ -302,15 +302,15 @@ static int get_leveldb_bandwidth_between_disk_and_main_ram(unsigned node)
 	start = starpu_timing_now();
 	for (iter = 0; iter < NITER; ++iter)
 	{
-		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, rand() % (SIZE_DISK_MIN -1) , 1, NULL);
+		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, rand() % (STARPU_DISK_SIZE_MIN -1) , 1, NULL);
 	}
 	end = starpu_timing_now();
 	timing_latency = end - start;
 
-	_starpu_disk_free(node, mem, SIZE_DISK_MIN);
+	_starpu_disk_free(node, mem, STARPU_DISK_SIZE_MIN);
 	free(buf);
 
-	_starpu_save_bandwidth_and_latency_disk((NITER/timing_slowness)*1000000, (NITER/timing_slowness)*1000000,
+	_starpu_save_bandwidth_and_latency_disk((NITER/timing_slowness)*STARPU_DISK_SIZE_MIN, (NITER/timing_slowness)*STARPU_DISK_SIZE_MIN,
 					       timing_latency/NITER, timing_latency/NITER, node);
 	return 1;
 }

+ 35 - 20
src/core/disk_ops/disk_stdio.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2013 Corentin Salingue
- * Copyright (C) 2015, 2016 CNRS
+ * Copyright (C) 2015, 2016, 2017 CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -55,6 +55,12 @@ struct starpu_stdio_obj
 	starpu_pthread_mutex_t mutex;
 };
 
+struct starpu_stdio_base
+{
+	char * path;
+	int created;
+};
+
 static struct starpu_stdio_obj *_starpu_stdio_init(int descriptor, char *path, size_t size)
 {
 	struct starpu_stdio_obj *obj;
@@ -127,9 +133,10 @@ static void _starpu_stdio_fini(struct starpu_stdio_obj *obj)
 static void *starpu_stdio_alloc(void *base, size_t size)
 {
 	struct starpu_stdio_obj *obj;
+	struct starpu_stdio_base * fileBase = (struct starpu_stdio_base *) base;
 
 	int id;
-	char *baseCpy = _starpu_mktemp_many(base, TEMP_HIERARCHY_DEPTH, O_RDWR | O_BINARY, &id);
+	char *baseCpy = _starpu_mktemp_many(fileBase->path, TEMP_HIERARCHY_DEPTH, O_RDWR | O_BINARY, &id);
 
 	/* fail */
 	if (!baseCpy)
@@ -171,11 +178,12 @@ static void starpu_stdio_free(void *base STARPU_ATTRIBUTE_UNUSED, void *obj, siz
 /* open an existing memory on disk */
 static void *starpu_stdio_open(void *base, void *pos, size_t size)
 {
+	struct starpu_stdio_base * fileBase = (struct starpu_stdio_base *) base;
 	struct starpu_stdio_obj *obj;
 	/* create template */
 	char *baseCpy;
-	_STARPU_MALLOC(baseCpy, strlen(base)+1+strlen(pos)+1);
-	strcpy(baseCpy,(char *) base);
+	_STARPU_MALLOC(baseCpy, strlen(fileBase->path)+1+strlen(pos)+1);
+	strcpy(baseCpy,(char *) fileBase->path);
 	strcat(baseCpy,(char *) "/");
 	strcat(baseCpy,(char *) pos);
 
@@ -316,28 +324,35 @@ static int starpu_stdio_full_write(void *base STARPU_ATTRIBUTE_UNUSED, void *obj
 	return 0;
 }
 
-/* create a new copy of parameter == base */
 static void *starpu_stdio_plug(void *parameter, starpu_ssize_t size STARPU_ATTRIBUTE_UNUSED)
 {
-	char *tmp;
-	_STARPU_MALLOC(tmp, sizeof(char)*(strlen(parameter)+1));
-	strcpy(tmp,(char *) parameter);
+	struct starpu_stdio_base * base;
+	_STARPU_MALLOC(base, sizeof(*base));
+	base->created = 0;
+
+	_STARPU_MALLOC(base->path, sizeof(char)*(strlen(parameter)+1));
+	strcpy(base->path,(char *) parameter);
 
 	{
 		struct stat buf;
-		if (!(stat(tmp, &buf) == 0 && S_ISDIR(buf.st_mode)))
+		if (!(stat(base->path, &buf) == 0 && S_ISDIR(buf.st_mode)))
 		{
-			_STARPU_ERROR("Directory '%s' does not exist\n", tmp);
+			_starpu_mkpath(base->path, S_IRWXU);
+			base->created = 1;
 		}
 	}
 
-	return (void *) tmp;
+	return (void *) base;
 }
 
 /* free memory allocated for the base */
 static void starpu_stdio_unplug(void *base)
 {
-	free(base);
+	struct starpu_stdio_base * fileBase = (struct starpu_stdio_base *) base;
+	if (fileBase->created)
+		rmdir(fileBase->path);
+	free(fileBase->path);
+	free(fileBase);
 }
 
 static int get_stdio_bandwidth_between_disk_and_main_ram(unsigned node)
@@ -349,17 +364,17 @@ static int get_stdio_bandwidth_between_disk_and_main_ram(unsigned node)
 	char *buf;
 
 	srand(time(NULL));
-	starpu_malloc_flags((void **) &buf, SIZE_DISK_MIN, 0);
+	starpu_malloc_flags((void **) &buf, STARPU_DISK_SIZE_MIN, 0);
 	STARPU_ASSERT(buf != NULL);
 
 	/* allocate memory */
-	void *mem = _starpu_disk_alloc(node, SIZE_DISK_MIN);
+	void *mem = _starpu_disk_alloc(node, STARPU_DISK_SIZE_MIN);
 	/* fail to alloc */
 	if (mem == NULL)
 		return 0;
 	struct starpu_stdio_obj *tmp = (struct starpu_stdio_obj *) mem;
 
-	memset(buf, 0, SIZE_DISK_MIN);
+	memset(buf, 0, STARPU_DISK_SIZE_MIN);
 
 	/* Measure upload slowness */
 	start = starpu_timing_now();
@@ -367,7 +382,7 @@ static int get_stdio_bandwidth_between_disk_and_main_ram(unsigned node)
 	{
 		FILE *f = tmp->file;
 
-		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, 0, SIZE_DISK_MIN, NULL);
+		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, 0, STARPU_DISK_SIZE_MIN, NULL);
 
 		if (!f)
 			f = _starpu_stdio_reopen(tmp);
@@ -390,7 +405,7 @@ static int get_stdio_bandwidth_between_disk_and_main_ram(unsigned node)
 	timing_slowness = end - start;
 
 	/* free memory */
-	starpu_free_flags(buf, SIZE_DISK_MIN, 0);
+	starpu_free_flags(buf, STARPU_DISK_SIZE_MIN, 0);
 
 	starpu_malloc_flags((void**) &buf, sizeof(char), 0);
 	STARPU_ASSERT(buf != NULL);
@@ -403,7 +418,7 @@ static int get_stdio_bandwidth_between_disk_and_main_ram(unsigned node)
 	{
 		FILE *f = tmp->file;
 
-		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, rand() % (SIZE_DISK_MIN -1) , 1, NULL);
+		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, rand() % (STARPU_DISK_SIZE_MIN -1) , 1, NULL);
 
 		if (!f)
 			f = _starpu_stdio_reopen(tmp);
@@ -424,10 +439,10 @@ static int get_stdio_bandwidth_between_disk_and_main_ram(unsigned node)
 	end = starpu_timing_now();
 	timing_latency = end - start;
 
-	_starpu_disk_free(node, mem, SIZE_DISK_MIN);
+	_starpu_disk_free(node, mem, STARPU_DISK_SIZE_MIN);
 	starpu_free_flags(buf, sizeof(char), 0);
 
-	_starpu_save_bandwidth_and_latency_disk((NITER/timing_slowness)*1000000, (NITER/timing_slowness)*1000000,
+	_starpu_save_bandwidth_and_latency_disk((NITER/timing_slowness)*STARPU_DISK_SIZE_MIN, (NITER/timing_slowness)*STARPU_DISK_SIZE_MIN,
 					       timing_latency/NITER, timing_latency/NITER, node);
 	return 1;
 }

+ 2 - 2
src/core/disk_ops/disk_unistd_o_direct.c

@@ -78,7 +78,7 @@ static void *starpu_unistd_o_direct_plug(void *parameter, starpu_ssize_t size)
 	return starpu_unistd_global_plug (parameter, size);
 }
 
-#ifdef HAVE_AIO_H
+#if defined(HAVE_AIO_H) || defined(HAVE_LIBAIO_H)
 void *starpu_unistd_o_direct_global_async_read(void *base, void *obj, void *buf, off_t offset, size_t size)
 {
 	STARPU_ASSERT_MSG((size % getpagesize()) == 0, "The unistd_o_direct variant can only read a multiple of page size %lu Bytes (Here %lu). Use the non-o_direct unistd variant if your data is not a multiple of %lu",
@@ -112,7 +112,7 @@ struct starpu_disk_ops starpu_disk_unistd_o_direct_ops =
 	.unplug = starpu_unistd_global_unplug,
 	.copy = NULL,
 	.bandwidth = get_unistd_global_bandwidth_between_disk_and_main_ram,
-#ifdef HAVE_AIO_H
+#if defined(HAVE_AIO_H) || defined(HAVE_LIBAIO_H)
         .async_read = starpu_unistd_o_direct_global_async_read,
         .async_write = starpu_unistd_o_direct_global_async_write,
         .wait_request = starpu_unistd_global_wait_request,

+ 224 - 28
src/core/disk_ops/unistd/disk_unistd_global.c

@@ -19,12 +19,14 @@
 #include <stdlib.h>
 #include <sys/stat.h>
 #include <stdint.h>
-#ifdef HAVE_AIO_H
-#include <aio.h>
-#endif
 #include <errno.h>
 
 #include <common/config.h>
+#if defined(HAVE_LIBAIO_H)
+#include <libaio.h>
+#elif defined(HAVE_AIO_H)
+#include <aio.h>
+#endif
 #ifdef HAVE_UNISTD_H
 #  include <unistd.h>
 #endif
@@ -33,8 +35,10 @@
 #include <core/perfmodel/perfmodel.h>
 #include <core/disk_ops/unistd/disk_unistd_global.h>
 #include <datawizard/copy_driver.h>
+#include <datawizard/data_request.h>
 #include <datawizard/memory_manager.h>
 #include <starpu_parameters.h>
+#include <common/uthash.h>
 
 #ifdef STARPU_HAVE_WINDOWS
 #  include <io.h>
@@ -51,11 +55,35 @@
 #define MAX_OPEN_FILES 64
 #define TEMP_HIERARCHY_DEPTH 2
 
-/* TODO: on Linux, use io_submit */
-
 static unsigned starpu_unistd_opened_files;
 
-#ifdef HAVE_AIO_H
+struct starpu_unistd_base
+{
+	char * path;
+	int created;
+#if defined(HAVE_LIBAIO_H)
+	io_context_t ctx;
+        struct starpu_unistd_aiocb_link * hashtable;
+        starpu_pthread_mutex_t mutex;
+#endif
+};
+
+#if defined(HAVE_LIBAIO_H)
+struct starpu_unistd_aiocb_link
+{
+        UT_hash_handle hh;
+        void * starpu_aiocb;
+        void * aiocb;
+};
+struct starpu_unistd_aiocb
+{
+        int finished;
+	struct iocb iocb;
+	struct starpu_unistd_global_obj *obj;
+        struct starpu_unistd_base *base;
+	size_t len;
+};
+#elif defined(HAVE_AIO_H)
 struct starpu_unistd_aiocb
 {
 	struct aiocb aiocb;
@@ -119,7 +147,8 @@ static void _starpu_unistd_fini(struct starpu_unistd_global_obj *obj)
 void *starpu_unistd_global_alloc(struct starpu_unistd_global_obj *obj, void *base, size_t size)
 {
 	int id;
-	char *baseCpy = _starpu_mktemp_many(base, TEMP_HIERARCHY_DEPTH, obj->flags, &id);
+	struct starpu_unistd_base * fileBase = (struct starpu_unistd_base *) base;
+	char *baseCpy = _starpu_mktemp_many(fileBase->path, TEMP_HIERARCHY_DEPTH, obj->flags, &id);
 
 	/* fail */
 	if (!baseCpy)
@@ -159,10 +188,11 @@ void starpu_unistd_global_free(void *base STARPU_ATTRIBUTE_UNUSED, void *obj, si
 /* open an existing memory on disk */
 void *starpu_unistd_global_open(struct starpu_unistd_global_obj *obj, void *base, void *pos, size_t size)
 {
+	struct starpu_unistd_base * fileBase = (struct starpu_unistd_base *) base;
 	/* create template */
 	char *baseCpy;
-	_STARPU_MALLOC(baseCpy, strlen(base)+1+strlen(pos)+1);
-	strcpy(baseCpy,(char *) base);
+	_STARPU_MALLOC(baseCpy, strlen(fileBase->path)+1+strlen(pos)+1);
+	strcpy(baseCpy,(char *) fileBase->path);
 	strcat(baseCpy,(char *) "/");
 	strcat(baseCpy,(char *) pos);
 
@@ -223,7 +253,43 @@ int starpu_unistd_global_read(void *base STARPU_ATTRIBUTE_UNUSED, void *obj, voi
 	return nb;
 }
 
-#ifdef HAVE_AIO_H
+#if defined(HAVE_LIBAIO_H)
+void *starpu_unistd_global_async_read(void *base, void *obj, void *buf, off_t offset, size_t size)
+{
+        struct starpu_unistd_base * fileBase = (struct starpu_unistd_base *) base;
+        struct starpu_unistd_global_obj *tmp = obj;
+        struct starpu_unistd_aiocb *starpu_aiocb;
+	_STARPU_CALLOC(starpu_aiocb, 1,sizeof(*starpu_aiocb));
+        struct iocb *iocb = &starpu_aiocb->iocb;
+        starpu_aiocb->obj = obj;
+        int fd = tmp->descriptor;
+
+        if (fd < 0)
+                fd = _starpu_unistd_reopen(obj);
+
+	starpu_aiocb->len = size;
+        starpu_aiocb->finished = 0;
+        starpu_aiocb->base = fileBase;
+	io_prep_pread(iocb, fd, buf, size, offset);
+	if (io_submit(fileBase->ctx, 1, &iocb) < 0)
+	{
+                free(iocb);
+                if (tmp->descriptor < 0)
+                        _starpu_unistd_reclose(fd);
+                iocb = NULL;
+        }
+
+        struct starpu_unistd_aiocb_link *l;
+        _STARPU_MALLOC(l, sizeof(*l));
+        l->aiocb = iocb;
+        l->starpu_aiocb = starpu_aiocb;
+        STARPU_PTHREAD_MUTEX_LOCK(&fileBase->mutex);
+        HASH_ADD_PTR(fileBase->hashtable, aiocb, l);
+        STARPU_PTHREAD_MUTEX_UNLOCK(&fileBase->mutex);
+
+        return starpu_aiocb;
+}
+#elif defined(HAVE_AIO_H)
 void *starpu_unistd_global_async_read(void *base STARPU_ATTRIBUTE_UNUSED, void *obj, void *buf, off_t offset, size_t size)
 {
         struct starpu_unistd_global_obj *tmp = obj;
@@ -312,7 +378,43 @@ int starpu_unistd_global_write(void *base STARPU_ATTRIBUTE_UNUSED, void *obj, co
 	return 0;
 }
 
-#ifdef HAVE_AIO_H
+#if defined(HAVE_LIBAIO_H)
+void *starpu_unistd_global_async_write(void *base, void *obj, void *buf, off_t offset, size_t size)
+{
+        struct starpu_unistd_base * fileBase = (struct starpu_unistd_base *) base;
+        struct starpu_unistd_global_obj *tmp = obj;
+        struct starpu_unistd_aiocb *starpu_aiocb;
+	_STARPU_CALLOC(starpu_aiocb, 1,sizeof(*starpu_aiocb));
+        struct iocb *iocb = &starpu_aiocb->iocb;
+        starpu_aiocb->obj = obj;
+        int fd = tmp->descriptor;
+
+        if (fd < 0)
+                fd = _starpu_unistd_reopen(obj);
+
+	starpu_aiocb->len = size;
+        starpu_aiocb->finished = 0;
+        starpu_aiocb->base = fileBase;
+	io_prep_pwrite(iocb, fd, buf, size, offset);
+	if (io_submit(fileBase->ctx, 1, &iocb) < 0)
+        {
+                free(iocb);
+                if (tmp->descriptor < 0)
+                        _starpu_unistd_reclose(fd);
+                iocb = NULL;
+        }
+
+        struct starpu_unistd_aiocb_link *l;
+        _STARPU_MALLOC(l, sizeof(*l));
+        l->aiocb = iocb;
+        l->starpu_aiocb = starpu_aiocb;
+        STARPU_PTHREAD_MUTEX_LOCK(&fileBase->mutex);
+        HASH_ADD_PTR(fileBase->hashtable, aiocb, l);
+        STARPU_PTHREAD_MUTEX_UNLOCK(&fileBase->mutex);
+
+        return starpu_aiocb;
+}
+#elif defined(HAVE_AIO_H)
 void *starpu_unistd_global_async_write(void *base STARPU_ATTRIBUTE_UNUSED, void *obj, void *buf, off_t offset, size_t size)
 {
         struct starpu_unistd_global_obj *tmp = obj;
@@ -368,25 +470,46 @@ int starpu_unistd_global_full_write(void *base STARPU_ATTRIBUTE_UNUSED, void *ob
 /* create a new copy of parameter == base */
 void *starpu_unistd_global_plug(void *parameter, starpu_ssize_t size STARPU_ATTRIBUTE_UNUSED)
 {
-	char *tmp;
-	_STARPU_MALLOC(tmp, sizeof(char)*(strlen(parameter)+1));
-	strcpy(tmp,(char *) parameter);
+	struct starpu_unistd_base * base;
+	_STARPU_MALLOC(base, sizeof(*base));
+	base->created = 0;
+
+	_STARPU_MALLOC(base->path, sizeof(char)*(strlen(parameter)+1));
+	strcpy(base->path,(char *) parameter);
 
 	{
 		struct stat buf;
-		if (!(stat(tmp, &buf) == 0 && S_ISDIR(buf.st_mode)))
+		if (!(stat(base->path, &buf) == 0 && S_ISDIR(buf.st_mode)))
 		{
-			_STARPU_ERROR("Directory '%s' does not exist\n", tmp);
+			_starpu_mkpath(base->path, S_IRWXU);
+			base->created = 1;
 		}
 	}
 
-	return (void *) tmp;
+#if defined(HAVE_LIBAIO_H)
+        STARPU_PTHREAD_MUTEX_INIT(&base->mutex, NULL);
+        base->hashtable = NULL;
+        unsigned nb_event = MAX_PENDING_REQUESTS_PER_NODE + MAX_PENDING_PREFETCH_REQUESTS_PER_NODE + MAX_PENDING_IDLE_REQUESTS_PER_NODE;
+        memset(&base->ctx, 0, sizeof(base->ctx));
+	int ret = io_setup(nb_event, &base->ctx);
+	STARPU_ASSERT(ret == 0);
+#endif
+
+	return (void *) base;
 }
 
 /* free memory allocated for the base */
 void starpu_unistd_global_unplug(void *base)
 {
-	free(base);
+	struct starpu_unistd_base * fileBase = (struct starpu_unistd_base *) base;
+#if defined(HAVE_LIBAIO_H)
+        STARPU_PTHREAD_MUTEX_DESTROY(&fileBase->mutex);
+        io_destroy(fileBase->ctx);
+#endif
+	if (fileBase->created)
+		rmdir(fileBase->path);
+	free(fileBase->path);
+	free(fileBase);
 }
 
 int get_unistd_global_bandwidth_between_disk_and_main_ram(unsigned node)
@@ -399,12 +522,12 @@ int get_unistd_global_bandwidth_between_disk_and_main_ram(unsigned node)
 
 	srand(time(NULL));
 	char *buf;
-	starpu_malloc_flags((void *) &buf, SIZE_DISK_MIN, 0);
+	starpu_malloc_flags((void *) &buf, STARPU_DISK_SIZE_MIN, 0);
 	STARPU_ASSERT(buf != NULL);
-	memset(buf, 0, SIZE_DISK_MIN);
+	memset(buf, 0, STARPU_DISK_SIZE_MIN);
 
 	/* allocate memory */
-	void *mem = _starpu_disk_alloc(node, SIZE_DISK_MIN);
+	void *mem = _starpu_disk_alloc(node, STARPU_DISK_SIZE_MIN);
 	/* fail to alloc */
 	if (mem == NULL)
 		return 0;
@@ -417,7 +540,7 @@ int get_unistd_global_bandwidth_between_disk_and_main_ram(unsigned node)
 	{
 		int fd = tmp->descriptor;
 
-		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, 0, SIZE_DISK_MIN, NULL);
+		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, 0, STARPU_DISK_SIZE_MIN, NULL);
 
 		if (fd < 0)
 			fd = _starpu_unistd_reopen(tmp);
@@ -435,7 +558,7 @@ int get_unistd_global_bandwidth_between_disk_and_main_ram(unsigned node)
 	timing_slowness = end - start;
 
 	/* free memory */
-	starpu_free_flags(buf, SIZE_DISK_MIN, 0);
+	starpu_free_flags(buf, STARPU_DISK_SIZE_MIN, 0);
 
 	starpu_malloc_flags((void *) &buf, MEM_SIZE, 0);
 	STARPU_ASSERT(buf != NULL);
@@ -448,7 +571,7 @@ int get_unistd_global_bandwidth_between_disk_and_main_ram(unsigned node)
 	{
 		int fd = tmp->descriptor;
 
-		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, (rand() % (SIZE_DISK_MIN/MEM_SIZE)) * MEM_SIZE, MEM_SIZE, NULL);
+		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, (rand() % (STARPU_DISK_SIZE_MIN/MEM_SIZE)) * MEM_SIZE, MEM_SIZE, NULL);
 
 		if (fd < 0)
 			fd = _starpu_unistd_reopen(tmp);
@@ -465,15 +588,88 @@ int get_unistd_global_bandwidth_between_disk_and_main_ram(unsigned node)
 	end = starpu_timing_now();
 	timing_latency = end - start;
 
-	_starpu_disk_free(node, mem, SIZE_DISK_MIN);
+	_starpu_disk_free(node, mem, STARPU_DISK_SIZE_MIN);
 	starpu_free_flags(buf, MEM_SIZE, 0);
 
-	_starpu_save_bandwidth_and_latency_disk((NITER/timing_slowness)*1000000, (NITER/timing_slowness)*1000000,
+	_starpu_save_bandwidth_and_latency_disk((NITER/timing_slowness)*STARPU_DISK_SIZE_MIN, (NITER/timing_slowness)*STARPU_DISK_SIZE_MIN,
 					       timing_latency/NITER, timing_latency/NITER, node);
 	return 1;
 }
 
-#ifdef HAVE_AIO_H
+#if defined(HAVE_LIBAIO_H)
+void starpu_unistd_global_wait_request(void *async_channel)
+{
+	struct starpu_unistd_aiocb *starpu_aiocb = async_channel;
+	struct io_event event;
+
+        int values = -1;
+        int myerrno = EAGAIN;
+        while(!starpu_aiocb->finished || (values <= 0 && (myerrno == EAGAIN || myerrno == EINTR)))
+        {
+                /* Wait the answer of the request timeout IS NULL */
+		values = io_getevents(starpu_aiocb->base->ctx, 1, 1, &event, NULL);
+		if (values < 0)
+			myerrno = -values;
+                if (values > 0)
+                {
+                        //we may catch an other request...
+                        STARPU_PTHREAD_MUTEX_LOCK(&starpu_aiocb->base->mutex);
+                        struct starpu_unistd_aiocb_link *l = NULL;
+                        while (l == NULL)
+                                HASH_FIND_PTR(starpu_aiocb->base->hashtable, &event.obj, l);
+
+                        HASH_DEL(starpu_aiocb->base->hashtable, l);
+                        STARPU_PTHREAD_MUTEX_UNLOCK(&starpu_aiocb->base->mutex);
+                        ((struct starpu_unistd_aiocb *) l->starpu_aiocb)->finished = 1;
+                        free(l);
+                }
+        }
+}
+
+int starpu_unistd_global_test_request(void *async_channel)
+{
+	struct starpu_unistd_aiocb *starpu_aiocb = async_channel;
+	struct io_event event;
+	struct timespec ts;
+        int ret;
+
+        if (starpu_aiocb->finished)
+                return 1;
+
+	memset(&ts, 0, sizeof(ts));
+
+        /* Test the answer of the request */
+	ret = io_getevents(starpu_aiocb->base->ctx, 0, 1, &event, &ts);
+
+        if (ret == 1)
+	{
+                //we may catch an other request...
+                STARPU_PTHREAD_MUTEX_LOCK(&starpu_aiocb->base->mutex);
+                struct starpu_unistd_aiocb_link *l = NULL;
+                while (l == NULL)
+                        HASH_FIND_PTR(starpu_aiocb->base->hashtable, &event.obj, l);
+
+                HASH_DEL(starpu_aiocb->base->hashtable, l);
+                STARPU_PTHREAD_MUTEX_UNLOCK(&starpu_aiocb->base->mutex);
+                ((struct starpu_unistd_aiocb *) l->starpu_aiocb)->finished = 1;
+                free(l);
+
+                if (starpu_aiocb->finished)
+                        return 1;
+	}
+
+        return 0;
+}
+
+void starpu_unistd_global_free_request(void *async_channel)
+{
+        struct starpu_unistd_aiocb *starpu_aiocb = async_channel;
+        struct iocb *iocb = &starpu_aiocb->iocb;
+        if (starpu_aiocb->obj->descriptor < 0)
+                _starpu_unistd_reclose(iocb->aio_fildes);
+        free(starpu_aiocb);
+}
+#elif defined(HAVE_AIO_H)
 void starpu_unistd_global_wait_request(void *async_channel)
 {
         const struct aiocb *aiocb = async_channel;
@@ -507,7 +703,7 @@ int starpu_unistd_global_test_request(void *async_channel)
         if (ret == 0)
                 /* request is finished */
                 return 1;
-        if (ret == EINPROGRESS || ret == EAGAIN)
+        if (ret == EINTR || ret == EINPROGRESS || ret == EAGAIN)
                 return 0;
         /* an error occured */
         STARPU_ABORT_MSG("aio_error returned %d", ret);

+ 1 - 5
src/core/jobs.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2013, 2014, 2015, 2017  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2014  INRIA
@@ -180,10 +180,6 @@ struct _starpu_job {
 	 * parallel tasks only). */
 	int active_task_alias_count;
 
-	/* Used to record codelet start time instead of using a
-	 * local variable */
-	struct timespec cl_start;
-
 	struct bound_task *bound_task;
 
 	/* Parallel workers may have to synchronize before/after the execution of a parallel task. */

+ 12 - 12
src/core/perfmodel/multiple_regression.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2011, 2015-2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2016, 2017  CNRS
  * Copyright (C) 2016-2017  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -29,15 +29,15 @@ int dgels_(char *trans, integer *m, integer *n, integer *nrhs, doublereal *a, in
 
 static long count_file_lines(FILE *f)
 {
-	int ch, lines=0;
+	int lines=0;
 	while(!feof(f))
 	{
-		ch = fgetc(f);
-	    if(ch == '\n')
-	    {
-		  lines++;
+		int ch = fgetc(f);
+		if(ch == '\n')
+		{
+			lines++;
 		}
-    }
+	}
 	rewind(f);
 
 	return lines;
@@ -62,20 +62,20 @@ static void dump_multiple_regression_list(double *mpar, double *my, int start, u
 static void load_old_calibration(double *mx, double *my, unsigned nparameters, char *filepath)
 {
 	char buffer[1024];
-	char *record,*line;
-	int i=0,j=0;
+	char *line;
+	int i=0;
 
 	FILE *f=NULL;
 	f = fopen(filepath, "a+");
 	STARPU_ASSERT_MSG(f, "Could not save performance model into the file %s\n", filepath);
 
-	line=fgets(buffer,sizeof(buffer),f);//skipping first line
+	fgets(buffer,sizeof(buffer),f);//skipping first line
 	while((line=fgets(buffer,sizeof(buffer),f))!=NULL)
 	{
-		record = strtok(line,",");
+		char *record = strtok(line,",");
 		my[i] = atof(record);
 		record = strtok(NULL,",");
-		j=0;
+		int j=0;
 		while(record != NULL)
 		{
 			mx[i*nparameters+j] = atof(record) ;

+ 1 - 1
src/core/perfmodel/perfmodel.c

@@ -482,7 +482,7 @@ void _starpu_set_perf_model_dirs()
 
 #ifdef STARPU_PERF_MODEL_DIR
 	/* use the directory specified at configure time */
-	snprintf(_perf_model_dir, _PERF_MODEL_DIR_MAXLEN, "%s", STARPU_PERF_MODEL_DIR);
+	snprintf(_perf_model_dir, _PERF_MODEL_DIR_MAXLEN, "%s", (char *)STARPU_PERF_MODEL_DIR);
 #else
 	snprintf(_perf_model_dir, _PERF_MODEL_DIR_MAXLEN, "%s/.starpu/sampling/", _starpu_get_home_path());
 #endif

+ 1 - 3
src/core/perfmodel/perfmodel.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2016  Inria
@@ -108,8 +108,6 @@ void _starpu_write_double(FILE *f, char *format, double val);
 int _starpu_read_double(FILE *f, char *format, double *val);
 void _starpu_simgrid_get_platform_path(int version, char *path, size_t maxlen);
 
-struct starpu_perfmodel_arch * _starpu_arch_comb_get(int comb);
-
 void _starpu_perfmodel_realloc(struct starpu_perfmodel *model, int nb);
 
 #if defined(STARPU_HAVE_HWLOC)

+ 24 - 2
src/core/perfmodel/perfmodel_bus.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2017  Inria
  * Copyright (C) 2013 Corentin Salingue
@@ -19,7 +19,7 @@
 
 #ifdef STARPU_USE_CUDA
 #ifndef _GNU_SOURCE
-#define _GNU_SOURCE
+#define _GNU_SOURCE 1
 #endif
 #include <sched.h>
 #endif
@@ -2993,6 +2993,13 @@ void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double band
 {
 	unsigned int i, j;
 	double slowness_disk_between_main_ram, slowness_main_ram_between_node;
+	int print_stats = starpu_get_env_number_default("STARPU_BUS_STATS", 0);
+
+	if (print_stats)
+	{
+		fprintf(stderr, "\n#---------------------\n");
+		fprintf(stderr, "Data transfer speed for %u:\n", node);
+	}
 
 	/* save bandwith */
 	for(i = 0; i < STARPU_MAXNODES; ++i)
@@ -3017,6 +3024,9 @@ void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double band
 					slowness_main_ram_between_node = 0;
 
 				bandwidth_matrix[i][j] = 1/(slowness_disk_between_main_ram+slowness_main_ram_between_node);
+
+				if (!isnan(bandwidth_matrix[i][j]) && print_stats)
+					fprintf(stderr,"%u -> %u: %.0f MB/s\n", i, j, bandwidth_matrix[i][j]);
 			}
 			else if (j == node) /* destination == disk */
 			{
@@ -3032,6 +3042,9 @@ void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double band
 					slowness_main_ram_between_node = 0;
 
 				bandwidth_matrix[i][j] = 1/(slowness_disk_between_main_ram+slowness_main_ram_between_node);
+
+				if (!isnan(bandwidth_matrix[i][j]) && print_stats)
+					fprintf(stderr,"%u -> %u: %.0f MB/s\n", i, j, bandwidth_matrix[i][j]);
 			}
 			else if (j > node || i > node) /* not affected by the node */
 			{
@@ -3052,10 +3065,16 @@ void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double band
 			else if (i == node) /* source == disk */
 			{
 				latency_matrix[i][j] = (latency_write+latency_matrix[STARPU_MAIN_RAM][j]);
+
+				if (!isnan(latency_matrix[i][j]) && print_stats)
+					fprintf(stderr,"%u -> %u: %.0f µs\n", i, j, latency_matrix[i][j]);
 			}
 			else if (j == node) /* destination == disk */
 			{
 				latency_matrix[i][j] = (latency_read+latency_matrix[i][STARPU_MAIN_RAM]);
+
+				if (!isnan(latency_matrix[i][j]) && print_stats)
+					fprintf(stderr,"%u -> %u: %.0f µs\n", i, j, latency_matrix[i][j]);
 			}
 			else if (j > node || i > node) /* not affected by the node */
 			{
@@ -3063,4 +3082,7 @@ void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double band
 			}
 		}
 	}
+
+	if (print_stats)
+		fprintf(stderr, "\n#---------------------\n");
 }

+ 3 - 3
src/core/perfmodel/perfmodel_history.c

@@ -185,7 +185,7 @@ int starpu_perfmodel_get_narch_combs()
 	return current_arch_comb;
 }
 
-struct starpu_perfmodel_arch *_starpu_arch_comb_get(int comb)
+struct starpu_perfmodel_arch *starpu_perfmodel_arch_comb_fetch(int comb)
 {
 	return arch_combs[comb];
 }
@@ -718,7 +718,7 @@ static int parse_model_file(FILE *f, const char *path, struct starpu_perfmodel *
 		model->state->ncombs = ncombs;
 	}
 
-	if (ncombs >= model->state->ncombs_set)
+	if (ncombs > model->state->ncombs_set)
 	{
 		// The model has more combs than the original number of arch_combs, we need to reallocate
 		_starpu_perfmodel_realloc(model, ncombs);
@@ -1868,7 +1868,7 @@ int starpu_perfmodel_list_combs(FILE *output, struct starpu_perfmodel *model)
 		struct starpu_perfmodel_arch *arch;
 		int device;
 
-		arch = _starpu_arch_comb_get(model->state->combs[comb]);
+		arch = starpu_perfmodel_arch_comb_fetch(model->state->combs[comb]);
 		fprintf(output, "\tComb %d: %d device%s\n", model->state->combs[comb], arch->ndevices, arch->ndevices>1?"s":"");
 		for(device=0 ; device<arch->ndevices ; device++)
 		{

+ 3 - 3
src/core/perfmodel/perfmodel_print.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2013-2016  Université de Bordeaux
+ * Copyright (C) 2011, 2013-2017  Université de Bordeaux
  * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  *
@@ -176,7 +176,7 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 		int comb, impl;
 		for(comb = 0; comb < starpu_perfmodel_get_narch_combs(); comb++)
 		{
-			struct starpu_perfmodel_arch *arch_comb = _starpu_arch_comb_get(comb);
+			struct starpu_perfmodel_arch *arch_comb = starpu_perfmodel_arch_comb_fetch(comb);
 			int nimpls = model->state ? model->state->nimpls[comb] : 0;
 			for(impl = 0; impl < nimpls; impl++)
 				starpu_perfmodel_print(model, arch_comb, impl, parameter, footprint, output);
@@ -240,7 +240,7 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 			int comb;
 			for(comb = 0; comb < starpu_perfmodel_get_narch_combs(); comb++)
 			{
-				struct starpu_perfmodel_arch *arch_comb = _starpu_arch_comb_get(comb);
+				struct starpu_perfmodel_arch *arch_comb = starpu_perfmodel_arch_comb_fetch(comb);
 				if(arch_comb->ndevices == 1 && arch_comb->devices[0].type == STARPU_CUDA_WORKER)
 				{
 					perf_arch.devices[0].devid = arch_comb->devices[0].devid;

+ 5 - 10
src/core/sched_ctx.c

@@ -161,10 +161,10 @@ static void _starpu_update_workers_with_ctx(int *workerids, int nworkers, int sc
 static void _starpu_update_notified_workers_with_ctx(int *workerids, int nworkers, int sched_ctx_id)
 {
 	int i;
-	struct _starpu_worker *worker = NULL;
 
 	for(i = 0; i < nworkers; i++)
 	{
+		struct _starpu_worker *worker;
 		worker = _starpu_get_worker_struct(workerids[i]);
 		_starpu_worker_gets_into_ctx(sched_ctx_id, worker);
 	}
@@ -201,10 +201,10 @@ static void _starpu_update_workers_without_ctx(int *workerids, int nworkers, int
 static void _starpu_update_notified_workers_without_ctx(int *workerids, int nworkers, int sched_ctx_id, unsigned now)
 {
 	int i;
-	struct _starpu_worker *worker = NULL;
 
 	for(i = 0; i < nworkers; i++)
 	{
+		struct _starpu_worker *worker;
 		worker = _starpu_get_worker_struct(workerids[i]);
 		if(now)
 		{
@@ -1027,10 +1027,10 @@ void _starpu_delete_all_sched_ctxs()
 	unsigned i;
 	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
 	{
-		_starpu_sched_ctx_lock_write(i);
 		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(i);
 		if(sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
 		{
+			_starpu_sched_ctx_lock_write(i);
 			_starpu_sched_ctx_free_scheduling_data(sched_ctx);
 			_starpu_barrier_counter_destroy(&sched_ctx->tasks_barrier);
 			_starpu_barrier_counter_destroy(&sched_ctx->ready_tasks_barrier);
@@ -1038,11 +1038,6 @@ void _starpu_delete_all_sched_ctxs()
 			STARPU_PTHREAD_RWLOCK_DESTROY(&sched_ctx->rwlock);
 			_starpu_delete_sched_ctx(sched_ctx);
 		}
-		else
-		{
-			_starpu_sched_ctx_unlock_write(i);
-			STARPU_PTHREAD_RWLOCK_DESTROY(&sched_ctx->rwlock);
-		}
 	}
 
 	STARPU_PTHREAD_KEY_DELETE(sched_ctx_key);
@@ -1583,12 +1578,12 @@ int _starpu_wait_for_no_ready_of_sched_ctx(unsigned sched_ctx_id)
 
 void starpu_sched_ctx_set_context(unsigned *sched_ctx)
 {
-	starpu_pthread_setspecific(sched_ctx_key, (void*)sched_ctx);
+	STARPU_PTHREAD_SETSPECIFIC(sched_ctx_key, (void*)sched_ctx);
 }
 
 unsigned starpu_sched_ctx_get_context()
 {
-	unsigned *sched_ctx = (unsigned*)starpu_pthread_getspecific(sched_ctx_key);
+	unsigned *sched_ctx = (unsigned*)STARPU_PTHREAD_GETSPECIFIC(sched_ctx_key);
 	if(sched_ctx == NULL)
 		return STARPU_NMAX_SCHED_CTXS;
 	STARPU_ASSERT(*sched_ctx < STARPU_NMAX_SCHED_CTXS);

+ 12 - 10
src/core/sched_policy.c

@@ -1049,17 +1049,18 @@ void _starpu_sched_pre_exec_hook(struct starpu_task *task)
 	{
 		int workerid = starpu_worker_get_id();
 		struct _starpu_worker *worker =  _starpu_get_worker_struct(workerid);
-		struct _starpu_sched_ctx *other_sched_ctx;
-		struct _starpu_sched_ctx_elt *e = NULL;
 		struct _starpu_sched_ctx_list_iterator list_it;
-		
+
 		_starpu_sched_ctx_list_iterator_init(worker->sched_ctx_list, &list_it);
 		while (_starpu_sched_ctx_list_iterator_has_next(&list_it))
 		{
+			struct _starpu_sched_ctx *other_sched_ctx;
+			struct _starpu_sched_ctx_elt *e = NULL;
+
 			e = _starpu_sched_ctx_list_iterator_get_next(&list_it);
 			other_sched_ctx = _starpu_get_sched_ctx_struct(e->sched_ctx);
-			if (other_sched_ctx != sched_ctx && 
-			    other_sched_ctx->sched_policy != NULL && 
+			if (other_sched_ctx != sched_ctx &&
+			    other_sched_ctx->sched_policy != NULL &&
 			    other_sched_ctx->sched_policy->pre_exec_hook)
 			{
 				_STARPU_SCHED_BEGIN;
@@ -1086,17 +1087,18 @@ void _starpu_sched_post_exec_hook(struct starpu_task *task)
 	{
 		int workerid = starpu_worker_get_id();
 		struct _starpu_worker *worker =  _starpu_get_worker_struct(workerid);
-		struct _starpu_sched_ctx *other_sched_ctx;
-		struct _starpu_sched_ctx_elt *e = NULL;
 		struct _starpu_sched_ctx_list_iterator list_it;
-		
+
 		_starpu_sched_ctx_list_iterator_init(worker->sched_ctx_list, &list_it);
 		while (_starpu_sched_ctx_list_iterator_has_next(&list_it))
 		{
+			struct _starpu_sched_ctx *other_sched_ctx;
+			struct _starpu_sched_ctx_elt *e = NULL;
+
 			e = _starpu_sched_ctx_list_iterator_get_next(&list_it);
 			other_sched_ctx = _starpu_get_sched_ctx_struct(e->sched_ctx);
-			if (other_sched_ctx != sched_ctx && 
-			    other_sched_ctx->sched_policy != NULL && 
+			if (other_sched_ctx != sched_ctx &&
+			    other_sched_ctx->sched_policy != NULL &&
 			    other_sched_ctx->sched_policy->post_exec_hook)
 			{
 				_STARPU_SCHED_BEGIN;

+ 5 - 7
src/core/simgrid.c

@@ -241,11 +241,6 @@ static void start_simgrid(int *argc, char **argv)
 	MSG_create_environment(path);
 }
 
-struct main_args
-{
-	int argc;
-	char **argv;
-};
 static int main_ret;
 
 int do_starpu_main(int argc, char *argv[])
@@ -445,7 +440,7 @@ static int task_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_AT
 	/* FIXME: Ugly work-around for bug in simgrid: the MPI context is not properly set at MSG process startup */
 	MSG_process_sleep(0.000001);
 
-	unsigned workerid = (uintptr_t) starpu_pthread_getspecific(0);
+	unsigned workerid = (uintptr_t) STARPU_PTHREAD_GETSPECIFIC(0);
 	struct worker_runner *w = &worker_runner[workerid];
 
 	_STARPU_DEBUG("worker runner %u started\n", workerid);
@@ -687,7 +682,7 @@ static int transfer_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARP
 	/* FIXME: Ugly work-around for bug in simgrid: the MPI context is not properly set at MSG process startup */
 	MSG_process_sleep(0.000001);
 
-	unsigned src_dst = (uintptr_t) starpu_pthread_getspecific(0);
+	unsigned src_dst = (uintptr_t) STARPU_PTHREAD_GETSPECIFIC(0);
 	unsigned src = src_dst >> 16;
 	unsigned dst = src_dst & 0xffff;
 	struct transfer_runner *t = &transfer_runner[src][dst];
@@ -957,6 +952,9 @@ _starpu_simgrid_get_memnode_host(unsigned node)
 		case STARPU_OPENCL_RAM:
 			fmt = "OpenCL%u";
 			break;
+		case STARPU_DISK_RAM:
+			fmt = "DISK%u";
+			break;
 		default:
 			STARPU_ABORT();
 			break;

+ 16 - 9
src/core/task.c

@@ -56,6 +56,8 @@ static int limit_min_submitted_tasks;
 static int limit_max_submitted_tasks;
 static int watchdog_crash;
 
+#define _STARPU_TASK_MAGIC 42
+
 /* Called once at starpu_init */
 void _starpu_task_init(void)
 {
@@ -98,7 +100,7 @@ void starpu_task_init(struct starpu_task *task)
 	task->predicted_transfer = NAN;
 	task->predicted_start = NAN;
 
-	task->magic = 42;
+	task->magic = _STARPU_TASK_MAGIC;
 	task->sched_ctx = STARPU_NMAX_SCHED_CTXS;
 
 	task->flops = 0.0;
@@ -179,7 +181,7 @@ void _starpu_task_destroy(struct starpu_task *task)
 	{
 		starpu_task_clean(task);
 		/* TODO handle the case of task with detach = 1 and destroy = 1 */
-		/* TODO handle the case of non terminated tasks -> return -EINVAL */
+		/* TODO handle the case of non terminated tasks -> assertion failure, it's too dangerous to be doing something like this */
 
 		/* Does user want StarPU release cl_arg ? */
 		if (task->cl_arg_free)
@@ -566,7 +568,9 @@ static int _starpu_task_submit_head(struct starpu_task *task)
 
 		/* Check buffers */
 		if (task->dyn_handles == NULL)
-			STARPU_ASSERT_MSG(STARPU_TASK_GET_NBUFFERS(task) <= STARPU_NMAXBUFS, "Codelet %p has too many buffers (%d vs max %d). Either use --enable-maxbuffers configure option to increase the max, or use dyn_handles instead of handles.", task->cl, STARPU_TASK_GET_NBUFFERS(task), STARPU_NMAXBUFS);
+			STARPU_ASSERT_MSG(STARPU_TASK_GET_NBUFFERS(task) <= STARPU_NMAXBUFS,
+					  "Codelet %p has too many buffers (%d vs max %d). Either use --enable-maxbuffers configure option to increase the max, or use dyn_handles instead of handles.",
+					  task->cl, STARPU_TASK_GET_NBUFFERS(task), STARPU_NMAXBUFS);
 
 		if (task->dyn_handles)
 		{
@@ -577,7 +581,7 @@ static int _starpu_task_submit_head(struct starpu_task *task)
 		{
 			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 			/* Make sure handles are valid */
-			STARPU_ASSERT_MSG(handle->magic == 42, "data %p is invalid (was it already unregistered?)", handle);
+			STARPU_ASSERT_MSG(handle->magic == _STARPU_TASK_MAGIC, "data %p is invalid (was it already unregistered?)", handle);
 			/* Make sure handles are not partitioned */
 			STARPU_ASSERT_MSG(handle->nchildren == 0, "only unpartitioned data (or the pieces of a partitioned data) can be used in a task");
 			/* Provide the home interface for now if any,
@@ -617,7 +621,7 @@ int starpu_task_submit(struct starpu_task *task)
 {
 	_STARPU_LOG_IN();
 	STARPU_ASSERT(task);
-	STARPU_ASSERT_MSG(task->magic == 42, "Tasks must be created with starpu_task_create, or initialized with starpu_task_init.");
+	STARPU_ASSERT_MSG(task->magic == _STARPU_TASK_MAGIC, "Tasks must be created with starpu_task_create, or initialized with starpu_task_init.");
 
 	int ret;
 	unsigned is_sync = task->synchronous;
@@ -817,6 +821,8 @@ void starpu_codelet_init(struct starpu_codelet *cl)
 	memset(cl, 0, sizeof(struct starpu_codelet));
 }
 
+#define _STARPU_CODELET_WORKER_NAME_LEN 32
+
 void starpu_codelet_display_stats(struct starpu_codelet *cl)
 {
 	unsigned worker;
@@ -834,8 +840,8 @@ void starpu_codelet_display_stats(struct starpu_codelet *cl)
 
 	for (worker = 0; worker < nworkers; worker++)
 	{
-		char name[32];
-		starpu_worker_get_name(worker, name, 32);
+		char name[_STARPU_CODELET_WORKER_NAME_LEN];
+		starpu_worker_get_name(worker, name, _STARPU_CODELET_WORKER_NAME_LEN);
 
 		fprintf(stderr, "\t%s -> %lu / %lu (%2.2f %%)\n", name, cl->per_worker_stats[worker], total, (100.0f*cl->per_worker_stats[worker])/total);
 	}
@@ -1276,7 +1282,8 @@ static void *watchdog_func(void *arg)
 		if (!config->watchdog_ok && last_nsubmitted
 				&& last_nsubmitted == starpu_task_nsubmitted())
 		{
-			_STARPU_MSG("The StarPU watchdog detected that no task finished for %fs (can be configured through STARPU_WATCHDOG_TIMEOUT)\n", timeout);
+			_STARPU_MSG("The StarPU watchdog detected that no task finished for %fs (can be configured through STARPU_WATCHDOG_TIMEOUT)\n",
+				    timeout);
 			if (watchdog_crash)
 			{
 				_STARPU_MSG("Crashing the process\n");
@@ -1312,5 +1319,5 @@ void _starpu_watchdog_shutdown(void)
 	if (!timeout_env)
 		return;
 
-	starpu_pthread_join(watchdog_thread, NULL);
+	STARPU_PTHREAD_JOIN(watchdog_thread, NULL);
 }

+ 10 - 6
src/core/topology.c

@@ -895,11 +895,15 @@ _starpu_get_next_bindid (struct _starpu_machine_config *config,
 {
 	struct _starpu_machine_topology *topology = &config->topology;
 
-	unsigned found = 0;
 	int current_preferred;
 	int nhyperthreads = topology->nhwpus / topology->nhwcpus;
 	unsigned i;
 
+	if (npreferred)
+	{
+		STARPU_ASSERT_MSG(preferred_binding, "Passing NULL pointer for parameter preferred_binding with a non-0 value of parameter npreferred");
+	}
+
 	/* loop over the preference list */
 	for (current_preferred = 0;
 	     current_preferred < npreferred;
@@ -1073,7 +1077,7 @@ _starpu_init_mic_config (struct _starpu_machine_config *config,
 		if ((unsigned) nmiccores > topology->nhwmiccores[mic_idx])
 		{
 			/* The user requires more MIC cores than there is available */
-			_STARPU_MSG("# Warning: %d MIC cores requested. Only %d available.\n", nmiccores, topology->nhwmiccores[mic_idx]);
+			_STARPU_MSG("# Warning: %d MIC cores requested. Only %u available.\n", nmiccores, topology->nhwmiccores[mic_idx]);
 			nmiccores = topology->nhwmiccores[mic_idx];
 		}
 	}
@@ -1137,7 +1141,7 @@ _starpu_init_mpi_config (struct _starpu_machine_config *config,
                 if ((unsigned) nmpicores > topology->nhwmpicores[mpi_idx])
                 {
                         /* The user requires more MPI cores than there is available */
-                        _STARPU_MSG("# Warning: %d MPI cores requested. Only %d available.\n",
+                        _STARPU_MSG("# Warning: %d MPI cores requested. Only %u available.\n",
 				    nmpicores, topology->nhwmpicores[mpi_idx]);
                         nmpicores = topology->nhwmpicores[mpi_idx];
                 }
@@ -1206,7 +1210,7 @@ _starpu_init_mp_config (struct _starpu_machine_config *config,
 			if ((unsigned) reqmicdevices > nhwmicdevices)
 			{
 				/* The user requires more MIC devices than there is available */
-				_STARPU_MSG("# Warning: %d MIC devices requested. Only %d available.\n", reqmicdevices, nhwmicdevices);
+				_STARPU_MSG("# Warning: %d MIC devices requested. Only %u available.\n", reqmicdevices, nhwmicdevices);
 				reqmicdevices = nhwmicdevices;
 			}
 		}
@@ -1242,7 +1246,7 @@ _starpu_init_mp_config (struct _starpu_machine_config *config,
 			if ((unsigned) reqmpidevices > nhwmpidevices)
 			{
 				/* The user requires more MPI devices than there is available */
-				_STARPU_MSG("# Warning: %d MPI Master-Slave devices requested. Only %d available.\n",
+				_STARPU_MSG("# Warning: %d MPI Master-Slave devices requested. Only %u available.\n",
 					    reqmpidevices, nhwmpidevices);
 				reqmpidevices = nhwmpidevices;
 			}
@@ -1675,7 +1679,7 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 		if (ncpu == -1)
 		{
 			unsigned mic_busy_cpus = 0;
-			unsigned j = 0;
+			int j = 0;
 			for (j = 0; j < STARPU_MAXMICDEVS; j++)
 				mic_busy_cpus += (topology->nmiccores[j] ? 1 : 0);
 

+ 11 - 2
src/core/workers.c

@@ -90,7 +90,7 @@ char ***_starpu_get_argv()
 	return my_argv;
 }
 
-int _starpu_is_initialized(void)
+int starpu_is_initialized(void)
 {
 	return initialized == INITIALIZED;
 }
@@ -1383,7 +1383,8 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 		_starpu_launch_drivers(&_starpu_config);
 
 	/* Allocate swap, if any */
-	_starpu_swap_init();
+	if (!is_a_sink)
+		_starpu_swap_init();
 
 	_starpu_watchdog_init();
 
@@ -2403,3 +2404,11 @@ int starpu_wake_worker_relax(int workerid)
 {
 	return _starpu_wake_worker_relax(workerid);
 }
+
+#ifdef STARPU_HAVE_HWLOC
+hwloc_cpuset_t starpu_worker_get_hwloc_cpuset(int workerid)
+{
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	return hwloc_bitmap_dup(worker->hwloc_cpu_set);
+}
+#endif

+ 6 - 4
src/core/workers.h

@@ -133,6 +133,8 @@ LIST_TYPE(_starpu_worker,
 	starpu_pthread_wait_t wait;
 #endif
 
+	struct timespec cl_start; /* Codelet start time of the task currently running */
+	struct timespec cl_end; /* Codelet end time of the last task running */
 	unsigned char first_task; /* Index of first task in the pipeline */
 	unsigned char ntasks; /* number of tasks in the pipeline */
 	unsigned char pipeline_length; /* number of tasks to be put in the pipeline */
@@ -399,7 +401,8 @@ struct _starpu_machine_config
 	struct _starpu_combined_worker combined_workers[STARPU_NMAX_COMBINEDWORKERS];
 
 	/* Translation table from bindid to worker IDs */
-	struct {
+	struct
+	{
 		int *workerids;
 		unsigned nworkers; /* size of workerids */
 	} *bindid_workers;
@@ -552,8 +555,6 @@ static inline struct _starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id
 
 struct _starpu_combined_worker *_starpu_get_combined_worker_struct(unsigned id);
 
-int _starpu_is_initialized(void);
-
 /* Returns the structure that describes the overall machine configuration (eg.
  * all workers and topology). */
 static inline struct _starpu_machine_config *_starpu_get_machine_config(void)
@@ -1041,7 +1042,8 @@ static inline int _starpu_worker_trylock(int workerid)
 	int cur_workerid = starpu_worker_get_id();
 	if (!ret)
 	{
-		if (workerid != cur_workerid) {
+		if (workerid != cur_workerid)
+		{
 			ret = !worker->state_relax_refcnt;
 			if (ret)
 				STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);

+ 15 - 0
src/datawizard/copy_driver.c

@@ -521,6 +521,11 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 #endif
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_DISK_RAM):
+                if (req && !starpu_asynchronous_copy_disabled())
+                {
+                        req->async_channel.type = STARPU_DISK_RAM;
+                        req->async_channel.event.disk_event.requests = NULL;
+                }
 		if(copy_methods->any_to_any)
 			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
 
@@ -541,6 +546,11 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 		break;
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM,STARPU_CPU_RAM):
+                if (req && !starpu_asynchronous_copy_disabled())
+                {
+                        req->async_channel.type = STARPU_DISK_RAM;
+                        req->async_channel.event.disk_event.requests = NULL;
+                }
 		if(copy_methods->any_to_any)
 			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled()  ? &req->async_channel : NULL);
 		else
@@ -563,6 +573,11 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 		break;
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM,STARPU_DISK_RAM):
+                if (req && !starpu_asynchronous_copy_disabled())
+                {
+                        req->async_channel.type = STARPU_DISK_RAM;
+                        req->async_channel.event.disk_event.requests = NULL;
+                }
 		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req ? &req->async_channel : NULL);
 		break;
 

+ 6 - 1
src/datawizard/copy_driver.h

@@ -71,10 +71,15 @@ struct _starpu_mpi_ms_async_event
 };
 #endif
 
+LIST_TYPE(_starpu_disk_backend_event,
+	void *backend_event;
+);
+        
+
 struct _starpu_disk_async_event
 {
 	unsigned memory_node;
-	void *backend_event;
+        struct _starpu_disk_backend_event_list * requests;
 };
 
 /* this is a structure that can be queried to see whether an asynchronous

+ 0 - 17
src/datawizard/data_request.c

@@ -24,15 +24,6 @@
 #include <core/disk.h>
 #include <core/simgrid.h>
 
-/* TODO: This should be tuned according to driver capabilities
- * Data interfaces should also have to declare how many asynchronous requests
- * they have actually started (think of e.g. csr).
- */
-#define MAX_PENDING_REQUESTS_PER_NODE 20
-#define MAX_PENDING_PREFETCH_REQUESTS_PER_NODE 10
-#define MAX_PENDING_IDLE_REQUESTS_PER_NODE 1
-#define MAX_PUSH_TIME 1000 /* Maximum time in us that we can afford pushing requests before going back to the driver loop, e.g. for checking GPU task termination */
-
 /* requests that have not been treated at all */
 static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES];
 static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES];
@@ -118,14 +109,6 @@ static void _starpu_data_request_unlink(struct _starpu_data_request *r)
 
 static void _starpu_data_request_destroy(struct _starpu_data_request *r)
 {
-	switch (r->async_channel.type)
-	{
-		case STARPU_DISK_RAM:
-			starpu_disk_free_request(&r->async_channel);
-			break;
-		default:
-			break;
-	}
 	//fprintf(stderr, "DESTROY REQ %p (%d) refcnt %d\n", r, node, r->refcnt);
 	_starpu_data_request_delete(r);
 }

+ 9 - 0
src/datawizard/data_request.h

@@ -27,6 +27,15 @@
 #include <common/prio_list.h>
 #include <common/starpu_spinlock.h>
 
+/* TODO: This should be tuned according to driver capabilities
+ * Data interfaces should also have to declare how many asynchronous requests
+ * they have actually started (think of e.g. csr).
+ */
+#define MAX_PENDING_REQUESTS_PER_NODE 20
+#define MAX_PENDING_PREFETCH_REQUESTS_PER_NODE 10
+#define MAX_PENDING_IDLE_REQUESTS_PER_NODE 1
+#define MAX_PUSH_TIME 1000 /* Maximum time in us that we can afford pushing requests before going back to the driver loop, e.g. for checking GPU task termination */
+
 struct _starpu_data_replicate;
 
 struct _starpu_callback_list

+ 9 - 1
src/datawizard/filters.c

@@ -201,6 +201,11 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 	for (node = 0; node < STARPU_MAXNODES; node++)
 		_starpu_data_unregister_ram_pointer(initial_handle, node);
 
+	if (nparts && !inherit_state)
+	{
+		STARPU_ASSERT_MSG(childrenp, "Passing NULL pointer for parameter childrenp while parameter inherit_state is 0");
+	}
+
 	for (i = 0; i < nparts; i++)
 	{
 		starpu_data_handle_t child;
@@ -210,7 +215,6 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		else
 			child = childrenp[i];
 		STARPU_ASSERT(child);
-		_STARPU_TRACE_HANDLE_DATA_REGISTER(child);
 
 		struct starpu_data_interface_ops *ops;
 
@@ -332,6 +336,8 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 			if (ptr != NULL)
 				_starpu_data_register_ram_pointer(child, ptr);
 		}
+
+		_STARPU_TRACE_HANDLE_DATA_REGISTER(child);
 	}
 	/* now let the header */
 	_starpu_spin_unlock(&initial_handle->header_lock);
@@ -536,6 +542,8 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 		STARPU_PTHREAD_MUTEX_DESTROY(&child_handle->busy_mutex);
 		STARPU_PTHREAD_COND_DESTROY(&child_handle->busy_cond);
 		STARPU_PTHREAD_MUTEX_DESTROY(&child_handle->sequential_consistency_mutex);
+
+		_STARPU_TRACE_HANDLE_DATA_UNREGISTER(child_handle);
 	}
 
 	/* there is no child anymore */

+ 0 - 0
src/datawizard/interfaces/bcsr_interface.c


Some files were not shown because too many files changed in this diff