Parcourir la source

Merge branch 'master' into ft_checkpoint

Nathalie Furmento il y a 5 ans
Parent
commit
cde827b909
57 fichiers modifiés avec 868 ajouts et 607 suppressions
  1. 1 0
      ChangeLog
  2. 38 33
      configure.ac
  3. 17 0
      doc/doxygen/chapters/501_environment_variables.doxy
  4. 246 15
      include/starpu_bitmap.h
  5. 3 3
      include/starpu_sched_component.h
  6. 1 1
      julia/examples/mandelbrot/Makefile
  7. 51 0
      julia/examples/task_insert_color/Makefile
  8. 89 0
      julia/examples/task_insert_color/task_insert_color.c
  9. 48 0
      julia/examples/task_insert_color/task_insert_color.jl
  10. 7 0
      julia/examples/vector_scal/vector_scal.jl
  11. 27 4
      julia/src/StarPU.jl
  12. 14 2
      julia/src/compiler/expressions.jl
  13. 2 0
      julia/src/jlstarpu_task.h
  14. 2 0
      julia/src/jlstarpu_task_submit.c
  15. 1 1
      mpi/Makefile.am
  16. 8 7
      mpi/src/mpi/starpu_mpi_mpi.c
  17. 4 0
      mpi/src/starpu_mpi_init.c
  18. 8 9
      mpi/src/starpu_mpi_task_insert.c
  19. 10 2
      mpi/tests/Makefile.am
  20. 4 0
      mpi/tests/bench_helper.h
  21. 35 3
      mpi/tests/sendrecv_bench.c
  22. 4 3
      mpi/tests/sendrecv_gemm_bench.c
  23. 8 0
      mpi/tests/sendrecv_parallel_tasks_bench.c
  24. 2 2
      tools/replay-mpi/Makefile.am
  25. 0 1
      src/Makefile.am
  26. 0 265
      src/common/bitmap.c
  27. 1 1
      src/common/fxt.h
  28. 16 2
      src/core/perfmodel/perfmodel_bus.c
  29. 13 0
      src/core/simgrid.c
  30. 1 1
      src/core/topology.c
  31. 0 2
      src/debug/traces/starpu_fxt.c
  32. 1 1
      src/sched_policies/component_best_implementation.c
  33. 6 7
      src/sched_policies/component_composed.c
  34. 6 6
      src/sched_policies/component_eager.c
  35. 4 4
      src/sched_policies/component_eager_calibration.c
  36. 4 4
      src/sched_policies/component_eager_prio.c
  37. 12 13
      src/sched_policies/component_fifo.c
  38. 6 6
      src/sched_policies/component_heteroprio.c
  39. 6 6
      src/sched_policies/component_prio.c
  40. 2 2
      src/sched_policies/component_random.c
  41. 23 26
      src/sched_policies/component_sched.c
  42. 23 28
      src/sched_policies/component_work_stealing.c
  43. 9 9
      src/sched_policies/component_worker.c
  44. 23 39
      src/sched_policies/deque_modeling_policy_data_aware.c
  45. 15 24
      src/sched_policies/eager_central_policy.c
  46. 7 8
      src/sched_policies/eager_central_priority_policy.c
  47. 12 4
      src/sched_policies/fifo_queues.c
  48. 1 0
      src/sched_policies/fifo_queues.h
  49. 15 22
      src/sched_policies/graph_test_policy.c
  50. 6 8
      src/sched_policies/heteroprio.c
  51. 4 4
      src/sched_policies/modular_gemm.c
  52. 11 24
      src/sched_policies/parallel_eager.c
  53. 0 4
      tools/Makefile.am
  54. 1 0
      tools/dev/lsan/suppressions
  55. 7 0
      tools/dev/valgrind/valgrind.sh
  56. 1 1
      tools/starpu_replay.c
  57. 2 0
      tools/starpu_replay_sched.c

+ 1 - 0
ChangeLog

@@ -53,6 +53,7 @@ Small features:
   * Add STARPU_LIMIT_CPU_NUMA_MEM environment variable.
   * Add STARPU_WORKERS_GETBIND environment variable.
   * Add STARPU_SCHED_SIMPLE_DECIDE_ALWAYS modular scheduler flag.
+  * Add STARPU_LIMIT_BANDWIDTH environment variable.
 
 StarPU 1.3.3 (git revision 11afc5b007fe1ab1c729b55b47a5a98ef7f3cfad)
 ====================================================================

+ 38 - 33
configure.ac

@@ -88,13 +88,23 @@ AC_CHECK_PROGS(PROG_DATE,gdate date)
 dnl locate pkg-config
 PKG_PROG_PKG_CONFIG
 
+AC_ARG_ENABLE(simgrid, [AS_HELP_STRING([--enable-simgrid],
+			[Enable simulating execution in simgrid])],
+			enable_simgrid=$enableval, enable_simgrid=no)
+
 if test x$enable_perf_debug = xyes; then
     enable_shared=no
 fi
+
 default_enable_mpi_check=maybe
-default_enable_mpi=maybe
 default_enable_mpi_ft=no
 
+if test x$enable_simgrid = xyes ; then
+	default_enable_mpi=no
+else
+	default_enable_mpi=maybe
+fi
+
 ###############################################################################
 #                                                                             #
 #                                 Drivers                                     #
@@ -136,9 +146,6 @@ AC_ARG_WITH(simgrid-lib-dir,
 		enable_simgrid=yes
 	], [simgrid_lib_dir=no])
 
-AC_ARG_ENABLE(simgrid, [AS_HELP_STRING([--enable-simgrid],
-			[Enable simulating execution in simgrid])],
-			enable_simgrid=$enableval, enable_simgrid=no)
 if test x$enable_simgrid = xyes ; then
    	if test -n "$SIMGRID_CFLAGS" ; then
 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
@@ -191,7 +198,7 @@ if test x$enable_simgrid = xyes ; then
 
 	# Latest functions
 	AC_CHECK_FUNCS([MSG_process_attach sg_actor_attach sg_actor_init sg_actor_set_stacksize MSG_zone_get_hosts sg_zone_get_hosts MSG_process_self_name MSG_process_userdata_init sg_actor_data])
-	AC_CHECK_FUNCS([xbt_mutex_try_acquire smpi_process_set_user_data SMPI_thread_create sg_zone_get_by_name sg_link_name sg_host_route sg_host_self sg_host_list sg_host_speed simcall_process_create sg_config_continue_after_help])
+	AC_CHECK_FUNCS([xbt_mutex_try_acquire smpi_process_set_user_data SMPI_thread_create sg_zone_get_by_name sg_link_name sg_link_bandwidth_set sg_host_route sg_host_self sg_host_list sg_host_speed simcall_process_create sg_config_continue_after_help])
 	AC_CHECK_FUNCS([simgrid_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_SIMGRID_INIT], [1], [Define to 1 if you have the `simgrid_init' function.])])
 	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
 	AC_CHECK_FUNCS([sg_actor_sleep_for sg_actor_self sg_actor_ref sg_host_get_properties sg_host_send_to sg_host_sendto sg_cfg_set_int sg_actor_self_execute sg_actor_execute simgrid_get_clock])
@@ -373,6 +380,30 @@ AC_MSG_CHECKING(whether mpicxx is available)
 AC_MSG_RESULT($mpicxx_path)
 AC_SUBST(MPICXX, $mpicxx_path)
 
+# Check if mpiexec is available
+if test x$enable_simgrid = xyes ; then
+    DEFAULT_MPIEXEC=smpirun
+    AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<name of smpirun or path to smpirun>]], [Name or path of the smpirun helper])], [DEFAULT_MPIEXEC=$withval])
+else
+    DEFAULT_MPIEXEC=mpiexec
+    AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec=<name of mpiexec or path to mpiexec>], [Name or path of mpiexec])], [DEFAULT_MPIEXEC=$withval])
+fi
+
+case $DEFAULT_MPIEXEC in
+    /*) mpiexec_path="$DEFAULT_MPIEXEC" ;;
+    *)  AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$MPIPATH])
+esac
+AC_MSG_CHECKING(whether mpiexec is available)
+AC_MSG_RESULT($mpiexec_path)
+
+# We test if MPIEXEC exists
+if test ! -x $mpiexec_path; then
+    AC_MSG_RESULT(The mpiexec script '$mpiexec_path' is not valid)
+    default_enable_mpi_check=no
+    mpiexec_path=""
+fi
+AC_SUBST(MPIEXEC,$mpiexec_path)
+
 ###############################################################################
 #                                                                             #
 #                                    MPI                                      #
@@ -505,32 +536,6 @@ if test x$enable_mpi = xno ; then
     running_mpi_check=no
 fi
 
-if test x$enable_mpi = xyes -a x$running_mpi_check = xyes ; then
-    # Check if mpiexec is available
-    if test x$enable_simgrid = xyes ; then
-	DEFAULT_MPIEXEC=smpirun
-        AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<name of smpirun or path to smpirun>]], [Name or path of the smpirun helper])], [DEFAULT_MPIEXEC=$withval])
-    else
-	DEFAULT_MPIEXEC=mpiexec
-	AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec=<name of mpiexec or path to mpiexec>], [Name or path of mpiexec])], [DEFAULT_MPIEXEC=$withval])
-    fi
-
-    case $DEFAULT_MPIEXEC in
-	/*) mpiexec_path="$DEFAULT_MPIEXEC" ;;
-	*)  AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$MPIPATH])
-    esac
-    AC_MSG_CHECKING(whether mpiexec is available)
-    AC_MSG_RESULT($mpiexec_path)
-
-    # We test if MPIEXEC exists
-    if test ! -x $mpiexec_path; then
-        AC_MSG_RESULT(The mpiexec script '$mpiexec_path' is not valid)
-        running_mpi_check=no
-        mpiexec_path=""
-    fi
-    AC_SUBST(MPIEXEC,$mpiexec_path)
-fi
-
 AM_CONDITIONAL(STARPU_MPI_CHECK, test x$running_mpi_check = xyes)
 AC_MSG_CHECKING(whether MPI tests should be run)
 AC_MSG_RESULT($running_mpi_check)
@@ -553,7 +558,7 @@ fi
 if test x$enable_mpi = xyes ; then
     if test x$enable_simgrid = xyes ; then
         if test x$enable_shared = xyes ; then
-	    AC_MSG_ERROR([MPI with simgrid can not work with shared libraries, if you need the MPI support, theb use --disable-shared to fix this, else disable MPI with --disable-mpi])
+	    AC_MSG_ERROR([MPI with simgrid can not work with shared libraries, if you need the MPI support, then use --disable-shared to fix this, else disable MPI with --disable-mpi])
         else
 	    CFLAGS="$CFLAGS -fPIC"
 	    CXXFLAGS="$CXXFLAGS -fPIC"
@@ -3534,7 +3539,6 @@ AC_OUTPUT([
 	Makefile
 	src/Makefile
 	tools/Makefile
-	tools/replay-mpi/Makefile
 	tools/starpu_env
 	tools/starpu_codelet_profile
 	tools/starpu_codelet_histo_profile
@@ -3585,6 +3589,7 @@ AC_OUTPUT([
 	mpi/src/Makefile
 	mpi/tests/Makefile
 	mpi/examples/Makefile
+	mpi/tools/Makefile
 	sc_hypervisor/Makefile
 	sc_hypervisor/src/Makefile
 	sc_hypervisor/examples/Makefile

+ 17 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -986,6 +986,23 @@ NUMA nodes used by StarPU. Any \ref STARPU_LIMIT_CPU_NUMA_devid_MEM additionally
 specified will take over STARPU_LIMIT_CPU_NUMA_MEM.
 </dd>
 
+<dt>STARPU_LIMIT_BANDWIDTH</dt>
+<dd>
+\anchor STARPU_LIMIT_BANDWIDTH
+\addindex __env__STARPU_LIMIT_BANDWIDTH
+Specify the maximum available PCI bandwidth of the system in MB/s. This is only
+effective with simgrid simulation. It makes it easy to override the
+bandwidths stored in the platform file generated from measurements on the native
+system. This can be used e.g. for conveniently experimenting with other bandwidth values.
+
+Specify the maximum number of megabytes that should be available to the
+application on each NUMA node. This is the same as specifying that same amount
+with \ref STARPU_LIMIT_CPU_NUMA_devid_MEM for each NUMA node number. The total
+memory available to StarPU will thus be this amount multiplied by the number of
+NUMA nodes used by StarPU. Any \ref STARPU_LIMIT_CPU_NUMA_devid_MEM additionally
+specified will take over STARPU_LIMIT_BANDWIDTH.
+</dd>
+
 <dt>STARPU_MINIMUM_AVAILABLE_MEM</dt>
 <dd>
 \anchor STARPU_MINIMUM_AVAILABLE_MEM

+ 246 - 15
include/starpu_bitmap.h

@@ -18,6 +18,12 @@
 #ifndef __STARPU_BITMAP_H__
 #define __STARPU_BITMAP_H__
 
+#include <starpu_util.h>
+#include <starpu_config.h>
+
+#include <string.h>
+#include <stdlib.h>
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -28,43 +34,268 @@ extern "C"
    @brief This is the interface for the bitmap utilities provided by StarPU.
    @{
  */
+#ifndef _STARPU_LONG_BIT
+#define _STARPU_LONG_BIT ((int)(sizeof(unsigned long) * 8))
+#endif
+
+/* Number of unsigned long words needed to hold STARPU_NMAXWORKERS bits.
+ * The expansion is fully parenthesized so that it stays a single operand in
+ * expressions such as _STARPU_BITMAP_SIZE * sizeof(unsigned long). */
+#define _STARPU_BITMAP_SIZE (((STARPU_NMAXWORKERS - 1)/_STARPU_LONG_BIT) + 1)
 
 /** create an empty starpu_bitmap */
-struct starpu_bitmap *starpu_bitmap_create(void) STARPU_ATTRIBUTE_MALLOC;
+static inline struct starpu_bitmap *starpu_bitmap_create(void) STARPU_ATTRIBUTE_MALLOC;
+/** zero a starpu_bitmap */
+static inline void starpu_bitmap_init(struct starpu_bitmap *b);
 /** free \p b */
-void starpu_bitmap_destroy(struct starpu_bitmap *b);
+static inline void starpu_bitmap_destroy(struct starpu_bitmap *b);
 
 /** set bit \p e in \p b */
-void starpu_bitmap_set(struct starpu_bitmap *b, int e);
+static inline void starpu_bitmap_set(struct starpu_bitmap *b, int e);
 /** unset bit \p e in \p b */
-void starpu_bitmap_unset(struct starpu_bitmap *b, int e);
+static inline void starpu_bitmap_unset(struct starpu_bitmap *b, int e);
 /** unset all bits in \p b */
-void starpu_bitmap_unset_all(struct starpu_bitmap *b);
+static inline void starpu_bitmap_unset_all(struct starpu_bitmap *b);
 
 /** return true iff bit \p e is set in \p b */
-int starpu_bitmap_get(struct starpu_bitmap *b, int e);
+static inline int starpu_bitmap_get(struct starpu_bitmap *b, int e);
 /** Basically compute \c starpu_bitmap_unset_all(\p a) ; \p a = \p b & \p c; */
-void starpu_bitmap_unset_and(struct starpu_bitmap *a, struct starpu_bitmap *b, struct starpu_bitmap *c);
+static inline void starpu_bitmap_unset_and(struct starpu_bitmap *a, struct starpu_bitmap *b, struct starpu_bitmap *c);
 /** Basically compute \p a |= \p b */
-void starpu_bitmap_or(struct starpu_bitmap *a, struct starpu_bitmap *b);
+static inline void starpu_bitmap_or(struct starpu_bitmap *a, struct starpu_bitmap *b);
 /** return 1 iff \p e is set in \p b1 AND \p e is set in \p b2 */
-int starpu_bitmap_and_get(struct starpu_bitmap *b1, struct starpu_bitmap *b2, int e);
+static inline int starpu_bitmap_and_get(struct starpu_bitmap *b1, struct starpu_bitmap *b2, int e);
 /** return the number of set bits in \p b */
-int starpu_bitmap_cardinal(struct starpu_bitmap *b);
+static inline int starpu_bitmap_cardinal(struct starpu_bitmap *b);
 
 /** return the index of the first set bit of \p b, -1 if none */
-int starpu_bitmap_first(struct starpu_bitmap *b);
+static inline int starpu_bitmap_first(struct starpu_bitmap *b);
 /** return the position of the last set bit of \p b, -1 if none */
-int starpu_bitmap_last(struct starpu_bitmap *b);
+static inline int starpu_bitmap_last(struct starpu_bitmap *b);
 /** return the position of set bit right after \p e in \p b, -1 if none */
-int starpu_bitmap_next(struct starpu_bitmap *b, int e);
+static inline int starpu_bitmap_next(struct starpu_bitmap *b, int e);
 /** return 1 iff a bit is set in \p b at a position after \p e, 0 otherwise */
-int starpu_bitmap_has_next(struct starpu_bitmap *b, int e);
+static inline int starpu_bitmap_has_next(struct starpu_bitmap *b, int e);
 
 /** @} */
 
-#ifdef __cplusplus
+struct starpu_bitmap
+{
+	unsigned long bits[_STARPU_BITMAP_SIZE];
+	int cardinal;
+};
+
+#ifdef _STARPU_DEBUG_BITMAP
+static int _starpu_check_bitmap(struct starpu_bitmap *b)
+{
+	int card = b->cardinal;
+	int i = starpu_bitmap_first(b);
+	int j;
+	for(j = 0; j < card; j++)
+	{
+		if(i == -1)
+			return 0;
+		int tmp = starpu_bitmap_next(b,i);
+		if(tmp == i)
+			return 0;
+		i = tmp;
+	}
+	if(i != -1)
+		return 0;
+	return 1;
 }
+#else
+#define _starpu_check_bitmap(b) 1
 #endif
 
+static int _starpu_count_bit_static(unsigned long e)
+{
+#if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__) >= 4)
+	return __builtin_popcountl(e);
+#else
+	int c = 0;
+	while(e)
+	{
+		c += e&1;
+		e >>= 1;
+	}
+	return c;
 #endif
+}
+
+static inline struct starpu_bitmap *starpu_bitmap_create()
+{
+	return (struct starpu_bitmap *) calloc(1, sizeof(struct starpu_bitmap));
+}
+
+static inline void starpu_bitmap_init(struct starpu_bitmap *b)
+{
+	memset(b, 0, sizeof(*b));
+}
+
+static inline void starpu_bitmap_destroy(struct starpu_bitmap * b)
+{
+	free(b);
+}
+
+static inline void starpu_bitmap_set(struct starpu_bitmap * b, int e)
+{
+	if(!starpu_bitmap_get(b, e))
+		b->cardinal++;
+	else
+		return;
+	STARPU_ASSERT(e/_STARPU_LONG_BIT < _STARPU_BITMAP_SIZE);
+	b->bits[e/_STARPU_LONG_BIT] |= (1ul << (e%_STARPU_LONG_BIT));
+	STARPU_ASSERT(_starpu_check_bitmap(b));
+}
+/* Clear bit e of b and update the cached count; does nothing when the bit is
+ * not currently set. */
+static inline void starpu_bitmap_unset(struct starpu_bitmap *b, int e)
+{
+	/* starpu_bitmap_get() asserts on an out-of-range e and otherwise
+	 * returns 0 for it, so an invalid index never reaches the code below */
+	if(!starpu_bitmap_get(b, e))
+		return;
+	STARPU_ASSERT(e/_STARPU_LONG_BIT < _STARPU_BITMAP_SIZE);
+	/* valid word indices are 0 .. _STARPU_BITMAP_SIZE - 1 */
+	if(e / _STARPU_LONG_BIT >= _STARPU_BITMAP_SIZE)
+		return;
+	/* only touch the count once the index is known to be valid */
+	b->cardinal--;
+	b->bits[e/_STARPU_LONG_BIT] &= ~(1ul << (e%_STARPU_LONG_BIT));
+	STARPU_ASSERT(_starpu_check_bitmap(b));
+}
+
+/* Clear every bit of b and reset its cached count of set bits. */
+static inline void starpu_bitmap_unset_all(struct starpu_bitmap * b)
+{
+	/* sizeof(b->bits) covers the whole array whatever its word count is */
+	memset(b->bits, 0, sizeof(b->bits));
+	/* keep the cached count in sync: starpu_bitmap_cardinal() and
+	 * starpu_bitmap_last() read it directly */
+	b->cardinal = 0;
+}
+
+static inline void starpu_bitmap_unset_and(struct starpu_bitmap * a, struct starpu_bitmap * b, struct starpu_bitmap * c)
+{
+	a->cardinal = 0;
+	int i;
+	for(i = 0; i < _STARPU_BITMAP_SIZE; i++)
+	{
+		a->bits[i] = b->bits[i] & c->bits[i];
+		a->cardinal += _starpu_count_bit_static(a->bits[i]);
+	}
+}
+
+static inline int starpu_bitmap_get(struct starpu_bitmap * b, int e)
+{
+	STARPU_ASSERT(e / _STARPU_LONG_BIT < _STARPU_BITMAP_SIZE);
+	if(e / _STARPU_LONG_BIT >= _STARPU_BITMAP_SIZE)
+		return 0;
+	return (b->bits[e/_STARPU_LONG_BIT] & (1ul << (e%_STARPU_LONG_BIT))) ?
+		1:
+		0;
+}
+
+static inline void starpu_bitmap_or(struct starpu_bitmap * a, struct starpu_bitmap * b)
+{
+	int i;
+	a->cardinal = 0;
+	for(i = 0; i < _STARPU_BITMAP_SIZE; i++)
+	{
+		a->bits[i] |= b->bits[i];
+		a->cardinal += _starpu_count_bit_static(a->bits[i]);
+	}
+}
+
+
+static inline int starpu_bitmap_and_get(struct starpu_bitmap * b1, struct starpu_bitmap * b2, int e)
+{
+	return starpu_bitmap_get(b1,e) && starpu_bitmap_get(b2,e);
+}
+
+static inline int starpu_bitmap_cardinal(struct starpu_bitmap * b)
+{
+	return b->cardinal;
+}
+
+
+static inline int _starpu_get_first_bit_rank(unsigned long ms)
+{
+	STARPU_ASSERT(ms != 0);
+#if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))
+	return __builtin_ffsl(ms) - 1;
+#else
+	unsigned long m = 1ul;
+	int i = 0;
+	while(!(m&ms))
+		i++,m<<=1;
+	return i;
+#endif
+}
+
+/* Return the 0-based rank of the most significant set bit of l.
+ * l must not be 0. */
+static inline int _starpu_get_last_bit_rank(unsigned long l)
+{
+	STARPU_ASSERT(l != 0);
+#if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))
+	/* __builtin_clzl() counts the zero bits above the highest set bit, so
+	 * the rank is the top bit index minus that count */
+	return _STARPU_LONG_BIT - 1 - __builtin_clzl(l);
+#else
+	int ibit = _STARPU_LONG_BIT - 1;
+	/* walk down from the top bit until a set one is met */
+	while(!((1ul << ibit) & l))
+		ibit--;
+	STARPU_ASSERT(ibit >= 0);
+	return ibit;
+#endif
+}
+
+static inline int starpu_bitmap_first(struct starpu_bitmap * b)
+{
+	int i = 0;
+	while(i < _STARPU_BITMAP_SIZE && !b->bits[i])
+		i++;
+	if( i == _STARPU_BITMAP_SIZE)
+		return -1;
+	int nb_long = i;
+	unsigned long ms = b->bits[i];
+
+	return (nb_long * _STARPU_LONG_BIT) + _starpu_get_first_bit_rank(ms);
+}
+
+/* Return 1 if b has a set bit at a position strictly greater than e,
+ * 0 otherwise. */
+static inline int starpu_bitmap_has_next(struct starpu_bitmap * b, int e)
+{
+	int nb_long = (e+1) / _STARPU_LONG_BIT;
+	int nb_bit = (e+1) % _STARPU_LONG_BIT;
+	/* e is the last representable bit (or beyond): nothing can follow, and
+	 * bits[nb_long] would be past the end of the array */
+	if(nb_long >= _STARPU_BITMAP_SIZE)
+		return 0;
+	/* keep only the bits at position e+1 and above in the current word */
+	unsigned long mask = (~0ul) << nb_bit;
+	if(b->bits[nb_long] & mask)
+		return 1;
+	/* any non-zero later word means a later bit is set */
+	for(nb_long++; nb_long < _STARPU_BITMAP_SIZE; nb_long++)
+		if(b->bits[nb_long])
+			return 1;
+	return 0;
+}
+
+static inline int starpu_bitmap_last(struct starpu_bitmap * b)
+{
+	if(b->cardinal == 0)
+		return -1;
+	int ilong;
+	for(ilong = _STARPU_BITMAP_SIZE - 1; ilong >= 0; ilong--)
+	{
+		if(b->bits[ilong])
+			break;
+	}
+	STARPU_ASSERT(ilong >= 0);
+	unsigned long l = b->bits[ilong];
+	return ilong * _STARPU_LONG_BIT + _starpu_get_last_bit_rank(l);
+}
+
+static inline int starpu_bitmap_next(struct starpu_bitmap *b, int e)
+{
+	int nb_long = e / _STARPU_LONG_BIT;
+	int nb_bit = e % _STARPU_LONG_BIT;
+	unsigned long rest = nb_bit == _STARPU_LONG_BIT - 1 ? 0 : (~0ul << (nb_bit + 1)) & b->bits[nb_long];
+	if(nb_bit != (_STARPU_LONG_BIT - 1) && rest)
+	{
+		int i = _starpu_get_first_bit_rank(rest);
+		STARPU_ASSERT(i >= 0 && i < _STARPU_LONG_BIT);
+		return (nb_long * _STARPU_LONG_BIT) + i;
+	}
+
+	for(nb_long++;nb_long < _STARPU_BITMAP_SIZE; nb_long++)
+		if(b->bits[nb_long])
+			return nb_long * _STARPU_LONG_BIT + _starpu_get_first_bit_rank(b->bits[nb_long]);
+	return -1;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_BITMAP_H__ */

+ 3 - 3
include/starpu_sched_component.h

@@ -69,14 +69,14 @@ struct starpu_sched_component
 	/** The tree containing the component*/
 	struct starpu_sched_tree *tree;
 	/** set of underlying workers */
-	struct starpu_bitmap *workers;
+	struct starpu_bitmap workers;
 	/**
 	   subset of starpu_sched_component::workers that is currently available in the context
 	   The push method should take this value into account, it is set with:
 	   component->workers UNION tree->workers UNION
 	   component->child[i]->workers_in_ctx iff exist x such as component->children[i]->parents[x] == component
 	*/
-	struct starpu_bitmap *workers_in_ctx;
+	struct starpu_bitmap workers_in_ctx;
 	/** private data */
 	void *data;
 	char *name;
@@ -188,7 +188,7 @@ struct starpu_sched_tree
 	/**
 	   set of workers available in this context, this value is used to mask workers in modules
 	*/
-	struct starpu_bitmap *workers;
+	struct starpu_bitmap workers;
 	/**
 	   context id of the scheduler
 	*/

+ 1 - 1
julia/examples/mandelbrot/Makefile

@@ -26,7 +26,7 @@ LIBPATH=${PWD}/../StarPU.jl/lib
 all: ${EXTERNLIB}
 
 mandelbrot: mandelbrot.c cpu_mandelbrot.o #gpu_mandelbrot.o
-	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS)
+	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS) -lm
 
 %.o: %.c
 	$(CC) -c -fPIC $(CPU_CFLAGS) $^ -o $@

+ 51 - 0
julia/examples/task_insert_color/Makefile

@@ -0,0 +1,51 @@
+CC=gcc
+NVCC=nvcc
+ENABLE_CUDA=no
+LD=$(CC)
+
+ifeq ($(ENABLE_CUDA),yes)
+        LD := ${NVCC}
+endif
+
+CFLAGS = -O3 -g $(shell pkg-config --cflags starpu-1.3)
+CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+CUDA_CFLAGS = ${CFLAGS}
+LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
+
+EXTERNLIB=extern_tasks.so
+GENERATEDLIB=generated_tasks.so
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+ifneq ($(ENABLE_CUDA),yes)
+	CUDA_OBJECTS:=
+endif
+
+LIBPATH=${PWD}/../StarPU.jl/lib
+
+all: task_insert_color
+
+task_insert_color: task_insert_color.o
+	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS)
+
+%.o: %.c
+	$(CC) -c -fPIC $(CPU_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $(LDFLAGS) $^ -o $@
+
+# clean is not a file: the special target name needs its leading dot
+.PHONY: clean
+
+# Remove the example binary built by this Makefile and the generated files
+clean:
+	rm -f task_insert_color *.so *.o genc_*.c gencuda_*.cu *.dat
+
+# Performance Tests
+cstarpu.dat: task_insert_color
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./task_insert_color > $@
+julia_generatedc.dat:
+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia task_insert_color.jl
+
+test: cstarpu.dat julia_generatedc.dat

+ 89 - 0
julia/examples/task_insert_color/task_insert_color.c

@@ -0,0 +1,89 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2018-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+void func(void *descr[], void *_args)
+{
+	int *x = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	(void)_args;
+
+	*x *= 2;
+}
+
+struct starpu_codelet mycodelet =
+{
+	.modes = { STARPU_RW },
+	.cpu_funcs = {func},
+	.cpu_funcs_name = {"func"},
+        .nbuffers = 1
+};
+
+struct starpu_codelet mycodelet_color =
+{
+	.modes = { STARPU_RW },
+	.cpu_funcs = {func},
+	.cpu_funcs_name = {"func"},
+        .nbuffers = 1,
+	.color = 0x0000FF,
+};
+
+int main(void)
+{
+	unsigned i;
+	int value=42;
+	starpu_data_handle_t handle;
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&value, sizeof(value));
+
+	// In the trace file, the following task should be green (executed on CPU)
+	ret = starpu_task_insert(&mycodelet, STARPU_RW, handle, STARPU_NAME, "mytask",
+				 0);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+		starpu_data_unregister(handle);
+		goto enodev;
+	}
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+	// In the trace file, the following task will be red as specified by STARPU_TASK_COLOR
+	ret = starpu_task_insert(&mycodelet, STARPU_RW, handle, STARPU_NAME, "mytask",
+				 STARPU_TASK_COLOR, 0xFF0000,
+				 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+	// In the trace file, the following task will be blue as specified by the field color of mycodelet_color
+	ret = starpu_task_insert(&mycodelet_color, STARPU_RW, handle, STARPU_NAME, "mytask",
+				 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+	starpu_task_wait_for_all();
+	starpu_data_unregister(handle);
+
+	starpu_shutdown();
+
+	return 0;
+
+ enodev:
+	return 77;
+}

+ 48 - 0
julia/examples/task_insert_color/task_insert_color.jl

@@ -0,0 +1,48 @@
+import Libdl
+using StarPU
+
+@target STARPU_CPU
+@codelet function task_insert_color(val ::Ref{Int32}) :: Nothing
+    val[] = val[] * 2
+
+    return
+end
+
+starpu_init()
+
+function task_insert_color_with_starpu(val ::Ref{Int32})
+    @starpu_block let
+	hVal = starpu_data_register(val)
+
+        cl1 = StarpuCodelet(
+            cpu_func = CPU_CODELETS["task_insert_color"],
+            modes = [STARPU_RW]
+        )
+
+        cl2 = StarpuCodelet(
+            cpu_func = CPU_CODELETS["task_insert_color"],
+            modes = [STARPU_RW],
+            color = 0x0000FF
+        )
+
+	@starpu_sync_tasks begin
+
+            # In the trace file, the following task should be green (executed on CPU)
+            starpu_task_submit(StarpuTask(cl = cl1, handles = [hVal]))
+
+            # In the trace file, the following task will be blue as specified by the field color of cl2
+            starpu_task_submit(StarpuTask(cl = cl2, handles = [hVal]))
+
+            # In the trace file, the following tasks will be red as specified in @starpu_async_cl
+            @starpu_async_cl task_insert_color(hVal) [STARPU_RW] [] 0xFF0000
+
+	end
+    end
+end
+
+
+foo = Ref(convert(Int32, 42))
+
+task_insert_color_with_starpu(foo)
+
+starpu_shutdown()

+ 7 - 0
julia/examples/vector_scal/vector_scal.jl

@@ -55,11 +55,18 @@ end
 function compute_times(io,start_dim, step_dim, stop_dim)
     for size in (start_dim : step_dim : stop_dim)
         V = Array(rand(Cfloat, size))
+        starpu_memory_pin(V)
+
         m :: Int32 = 10
         k :: Float32 = 2.
         l :: Float32 = 3.
+
         println("INPUT ", V[1:10])
+
         mt =  vector_scal_with_starpu(V, m, k, l)
+
+        starpu_memory_unpin(V)
+
         println("OUTPUT ", V[1:10])
         println(io,"$size $mt")
         println("$size $mt")

+ 27 - 4
julia/src/StarPU.jl

@@ -49,6 +49,8 @@ end
 
 export starpu_init
 export starpu_shutdown
+export starpu_memory_pin
+export starpu_memory_unpin
 export starpu_data_unregister
 export starpu_data_register
 export starpu_data_get_sub_data
@@ -185,6 +187,8 @@ end
 struct StarpuCodelet
     where_to_execute :: UInt32
 
+    color :: UInt32
+
     cpu_func :: String
     cuda_func :: String
     opencl_func :: String
@@ -201,7 +205,8 @@ struct StarpuCodelet
                            opencl_func :: String = "",
                            modes :: Vector{StarpuDataAccessMode} = StarpuDataAccessMode[],
                            perfmodel :: StarpuPerfmodel = StarpuPerfmodel(),
-                           where_to_execute :: Union{Cvoid, UInt32} = nothing
+                           where_to_execute :: Union{Cvoid, UInt32} = nothing,
+                           color :: UInt32 = 0x00000000
                            )
 
         if (length(modes) > STARPU_NMAXBUFS)
@@ -217,7 +222,7 @@ struct StarpuCodelet
             real_where = where_to_execute
         end
 
-        output = new(real_where, cpu_func, cuda_func, opencl_func,modes, perfmodel, real_c_codelet_ptr)
+        output = new(real_where, color, cpu_func, cuda_func, opencl_func,modes, perfmodel, real_c_codelet_ptr)
 
         starpu_c_codelet_update(output)
 
@@ -651,9 +656,23 @@ end
 
 STARPU_MAIN_RAM = 0 #TODO: ENUM
 
+function starpu_memory_pin(data) :: Nothing
+    data_pointer = pointer(data)
 
+    @starpucall(starpu_memory_pin,
+                Cvoid, (Ptr{Cvoid}, Csize_t),
+                data_pointer,
+                sizeof(data))
+end
 
+function starpu_memory_unpin(data) :: Nothing
+    data_pointer = pointer(data)
 
+    @starpucall(starpu_memory_unpin,
+                Cvoid, (Ptr{Cvoid}, Csize_t),
+                data_pointer,
+                sizeof(data))
+end
 
 function StarpuNewDataHandle(ptr :: StarpuDataHandlePointer, destr :: Function...) :: StarpuDataHandle
     return StarpuDestructible(ptr, destr...)
@@ -858,7 +877,7 @@ end
     Creates and submits an asynchronous task running cl Codelet function.
     Ex : @starpu_async_cl cl(handle1, handle2)
 """
-macro starpu_async_cl(expr,modes,cl_arg=[])
+macro starpu_async_cl(expr, modes, cl_arg=[], color ::UInt32=0x00000000)
 
     if (!isa(expr, Expr) || expr.head != :call)
         error("Invalid task submit syntax")
@@ -877,7 +896,8 @@ macro starpu_async_cl(expr,modes,cl_arg=[])
         #opencl_func="ocl_matrix_mult",
         ### TODO: CORRECT !
         modes = map((x -> starpu_modes(x)),modes.args),
-        perfmodel = perfmodel
+        perfmodel = perfmodel,
+        color = color
     )
     handles = Expr(:vect, expr.args[2:end]...)
     #dump(handles)
@@ -1214,6 +1234,8 @@ mutable struct StarpuCodeletTranslator
 
     where_to_execute :: UInt32
 
+    color :: UInt32
+
     cpu_func :: Ptr{Cvoid}
     cpu_func_name :: Cstring
 
@@ -1237,6 +1259,7 @@ mutable struct StarpuCodeletTranslator
         end
 
         output.where_to_execute = cl.where_to_execute
+        output.color = cl.color
 
         cpu_func_ptr = load_starpu_function_pointer(cl.cpu_func)
         cuda_func_ptr = load_starpu_function_pointer(cl.cuda_func)

+ 14 - 2
julia/src/compiler/expressions.jl

@@ -422,6 +422,10 @@ end
 function print(io :: IO, x :: StarpuExprFor ; indent = 0,restrict=false)
 
     print_newline(io, indent)
+    print(io, "{")
+    indent += starpu_indent_size
+    print_newline(io, indent)
+
     print(io, StarpuExprBlock(x.set_declarations), indent = indent)
 
     id = x.set.id
@@ -449,12 +453,20 @@ function print(io :: IO, x :: StarpuExprFor ; indent = 0,restrict=false)
 
     print_newline(io, indent)
     print(io, "{")
-    print_newline(io, indent + starpu_indent_size)
-    print(io, x.body, indent = indent + starpu_indent_size)
+    indent += starpu_indent_size
+
+    print_newline(io, indent)
+    print(io, x.body, indent = indent)
+
+    indent -= starpu_indent_size
     print_newline(io, indent)
     print(io, "}")
+
+    indent -= starpu_indent_size
     print_newline(io, indent)
+    print(io, "}")
 
+    print_newline(io, indent)
 end
 
 

+ 2 - 0
julia/src/jlstarpu_task.h

@@ -30,6 +30,8 @@ struct jlstarpu_codelet
 {
 	uint32_t where;
 
+  	uint32_t color;
+
 	starpu_cpu_func_t cpu_func;
 	char * cpu_func_name;
 

+ 2 - 0
julia/src/jlstarpu_task_submit.c

@@ -63,6 +63,8 @@ void jlstarpu_codelet_update(const struct jlstarpu_codelet * const input, struct
 {
 	output->where = input->where;
 
+	output->color = input->color;
+
 	output->cpu_funcs[0] = input->cpu_func;
 	output->cpu_funcs_name[0] = input->cpu_func_name;
 

+ 1 - 1
mpi/Makefile.am

@@ -16,7 +16,7 @@
 
 include $(top_srcdir)/starpu-subdirtests.mk
 
-SUBDIRS=src tests examples
+SUBDIRS=src tests examples tools
 
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = libstarpumpi.pc starpumpi-1.0.pc starpumpi-1.1.pc starpumpi-1.2.pc starpumpi-1.3.pc

+ 8 - 7
mpi/src/mpi/starpu_mpi_mpi.c

@@ -51,7 +51,10 @@ static unsigned nready_process;
 /* Number of send requests to submit to MPI at the same time */
 static unsigned ndetached_send;
 
+#ifdef STARPU_USE_FXT
 static void _starpu_mpi_add_sync_point_in_fxt(void);
+#endif
+
 static void _starpu_mpi_handle_ready_request(struct _starpu_mpi_req *req);
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req);
 #ifdef STARPU_MPI_VERBOSE
@@ -1173,6 +1176,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 	starpu_pthread_setname("MPI");
 
+	_starpu_mpi_env_init();
+
 #ifndef STARPU_SIMGRID
 	if (_starpu_mpi_thread_cpuid < 0)
 	{
@@ -1189,11 +1194,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	if (_starpu_mpi_thread_cpuid >= 0)
 		/* In case MPI changed the binding */
 		starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI");
-#endif
-
-	_starpu_mpi_env_init();
-
-#ifdef STARPU_SIMGRID
+#else
 	/* Now that MPI is set up, let the rest of simgrid get initialized */
 	char **argv_cpy;
 	_STARPU_MPI_MALLOC(argv_cpy, *(argc_argv->argc) * sizeof(char*));
@@ -1532,9 +1533,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	return NULL;
 }
 
+#ifdef STARPU_USE_FXT
 static void _starpu_mpi_add_sync_point_in_fxt(void)
 {
-#ifdef STARPU_USE_FXT
 	int rank;
 	int worldsize;
 	int ret;
@@ -1563,8 +1564,8 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 	_STARPU_MPI_TRACE_BARRIER(rank, worldsize, random_number);
 
 	_STARPU_MPI_DEBUG(3, "unique key %x\n", random_number);
-#endif
 }
+#endif
 
 int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 {

+ 4 - 0
mpi/src/starpu_mpi_init.c

@@ -232,6 +232,10 @@ int starpu_mpi_shutdown(void)
 	void *value;
 	int rank, world_size;
 
+	/* Make sure we do not have MPI communications pending in the task graph
+	 * before shutting down MPI */
+	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+
 	/* We need to get the rank before calling MPI_Finalize to pass to _starpu_mpi_comm_amounts_display() */
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	starpu_mpi_comm_size(MPI_COMM_WORLD, &world_size);

+ 8 - 9
mpi/src/starpu_mpi_task_insert.c

@@ -818,8 +818,12 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 	// need to count how many nodes have the data in redux mode
 	if (me == rank)
 	{
-		int i, j=0;
-		struct starpu_task *taskBs[nb_nodes];
+		int i;
+
+		// taskC depends on all taskBs created
+		struct starpu_task *taskC = starpu_task_create();
+		taskC->cl = &_starpu_mpi_redux_data_readwrite_cl;
+		STARPU_TASK_SET_HANDLE(taskC, data_handle, 0);
 
 		for(i=0 ; i<nb_nodes ; i++)
 		{
@@ -857,8 +861,8 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 				args->taskB->cl = args->data_handle->redux_cl;
 				args->taskB->sequential_consistency = 0;
 				STARPU_TASK_SET_HANDLE(args->taskB, args->data_handle, 0);
-				taskBs[j] = args->taskB;
-				j++;
+
+				starpu_task_declare_deps_array(taskC, 1, &args->taskB);
 
 				// Submit taskA
 				starpu_task_insert(&_starpu_mpi_redux_data_read_cl,
@@ -868,11 +872,6 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 			}
 		}
 
-		// Submit taskC which depends on all taskBs created
-		struct starpu_task *taskC = starpu_task_create();
-		taskC->cl = &_starpu_mpi_redux_data_readwrite_cl;
-		STARPU_TASK_SET_HANDLE(taskC, data_handle, 0);
-		starpu_task_declare_deps_array(taskC, j, taskBs);
 		int ret = starpu_task_submit(taskC);
 		STARPU_ASSERT(ret == 0);
 	}

+ 10 - 2
mpi/tests/Makefile.am

@@ -139,8 +139,12 @@ starpu_mpi_TESTS +=				\
 	temporary				\
 	user_defined_datatype			\
 	early_stuff				\
-	sendrecv_bench				\
+	sendrecv_bench
+
+if !STARPU_USE_MPI_MPI
+starpu_mpi_TESTS +=				\
 	sendrecv_parallel_tasks_bench
+endif
 
 if !NO_BLAS_LIB
 starpu_mpi_TESTS +=				\
@@ -235,9 +239,13 @@ noinst_PROGRAMS =				\
 	load_balancer				\
 	driver					\
 	sendrecv_bench				\
-	sendrecv_gemm_bench			\
 	sendrecv_parallel_tasks_bench
 
+if !NO_BLAS_LIB
+noinst_PROGRAMS +=				\
+	sendrecv_gemm_bench
+endif
+
 if STARPU_USE_MPI_FT
 noinst_PROGRAMS +=  \
 	checkpoints

+ 4 - 0
mpi/tests/bench_helper.h

@@ -24,6 +24,10 @@
 	#define MULT_DEFAULT 4
 	#define LOOPS_DEFAULT 100
 	#define NX_MAX (64 * 1024 * 1024) // kB
+#elif !defined(STARPU_LONG_CHECK)
+	#define MULT_DEFAULT 4
+	#define LOOPS_DEFAULT 10000
+	#define NX_MAX (128 * 1024 * 1024) // kB
 #else
 	#define MULT_DEFAULT 2
 	#define LOOPS_DEFAULT 100000

+ 35 - 3
mpi/tests/sendrecv_bench.c

@@ -27,6 +27,30 @@ int main(int argc, char **argv)
 {
 	int ret, rank, worldsize;
 	int mpi_init;
+	int pause_workers = 0;
+
+
+	for (int i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-p") == 0)
+		{
+			pause_workers = 1;
+			printf("Workers will be paused during benchmark.\n");
+		}
+		else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
+		{
+			fprintf(stderr, "Options:\n");
+			fprintf(stderr, "\t-h --help   display this help\n");
+			fprintf(stderr, "\t-p          pause workers during benchmark\n");
+			exit(EXIT_SUCCESS);
+		}
+		else
+		{
+			fprintf(stderr,"Unrecognized option %s\n", argv[i]);
+			exit(EXIT_FAILURE);
+		}
+	}
+
 
 	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
 	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
@@ -46,12 +70,20 @@ int main(int argc, char **argv)
 		return STARPU_TEST_SKIPPED;
 	}
 
-	/* Pause workers for this bench: all workers polling for tasks has a strong impact on performances */
-	starpu_pause();
+
+	if (pause_workers)
+	{
+		/* Pause workers for this bench: having all workers poll for tasks has a strong impact on performance */
+		starpu_pause();
+	}
 
 	sendrecv_bench(rank, NULL);
 
-	starpu_resume();
+	if (pause_workers)
+	{
+		starpu_resume();
+	}
+
 	starpu_mpi_shutdown();
 	if (!mpi_init)
 		MPI_Finalize();

+ 4 - 3
mpi/tests/sendrecv_gemm_bench.c

@@ -287,14 +287,15 @@ static void parse_args(int argc, char **argv)
 
 		else if (strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0)
 		{
-			fprintf(stderr,"Usage: %s [-nblocks n] [-size size] [-check] [-spmd] [-comm_thread_cpuid cpuid]\n", argv[0]);
+			fprintf(stderr,"Usage: %s [-nblocks n] [-size size] [-check] [-spmd] [-comm-thread-cpuid cpuid]\n", argv[0]);
 			fprintf(stderr,"Currently selected: matrix size: %u - %u blocks\n", matrix_dim, nslices);
-			fprintf(stderr, "Use -comm_thread_cpuid to specifiy where to bind the comm benchmarking thread\n");
+			fprintf(stderr, "Use -comm-thread-cpuid to specifiy where to bind the comm benchmarking thread\n");
 			exit(EXIT_SUCCESS);
 		}
+
 		else
 		{
-			fprintf(stderr,"Unrecognized option %s", argv[i]);
+			fprintf(stderr,"Unrecognized option %s\n", argv[i]);
 			exit(EXIT_FAILURE);
 		}
 	}

+ 8 - 0
mpi/tests/sendrecv_parallel_tasks_bench.c

@@ -13,6 +13,7 @@
  *
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
+
 /*
  * sendrecv benchmark from different tasks, executed simultaneously on several
  * workers.
@@ -21,6 +22,13 @@
  * The goal is to measure impact of calls to starpu_mpi_* from different threads.
  *
  * Use STARPU_NCPU to set the number of parallel ping pongs
+ *
+ *
+ * Note: This currently cannot work with the MPI backend with more than 1 CPU,
+ * since with big sizes, the MPI_Wait call in the MPI thread may block waiting
+ * for the peer to call MPI_Recv+Wait, and there is no guarantee that the peer
+ * will call MPI_Recv+Wait for the same data since tasks can proceed in any
+ * order.
  */
 
 #include <starpu_mpi.h>

+ 2 - 2
tools/replay-mpi/Makefile.am

@@ -29,14 +29,14 @@ CC=$(CC_OR_MPICC)
 CCLD=$(CC_OR_MPICC)
 
 if STARPU_USE_MPI
-LIBS += ../../mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 AM_CPPFLAGS += -I$(top_srcdir)/mpi/include
 endif
 
 bin_PROGRAMS = starpu_replay_mpi
 
 starpu_replay.c starpu_replay_sched.c:
-	$(LN_S) $(abs_srcdir)/../$(notdir $@) $@
+	$(LN_S) $(top_srcdir)/tools/$(notdir $@) $@
 
 starpu_replay_mpi_SOURCES = \
 	starpu_replay.c \

+ 0 - 1
src/Makefile.am

@@ -160,7 +160,6 @@ noinst_HEADERS = 						\
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	common/barrier.c					\
 	common/barrier_counter.c				\
-	common/bitmap.c						\
 	common/hash.c 						\
 	common/rwlock.c						\
 	common/starpu_spinlock.c				\

+ 0 - 265
src/common/bitmap.c

@@ -1,265 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- * Copyright (C) 2013       Simon Archipoff
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <starpu.h>
-#include <starpu_bitmap.h>
-
-#include <limits.h>
-#include <string.h>
-#include <stdlib.h>
-
-#ifndef LONG_BIT
-#define LONG_BIT (sizeof(unsigned long) * 8)
-#endif
-
-struct starpu_bitmap
-{
-	unsigned long * bits;
-	int size; /* the size of bits array in number of unsigned long */
-	int cardinal;
-};
-
-//#define DEBUG_BITMAP
-
-#ifdef DEBUG_BITMAP
-static int check_bitmap(struct starpu_bitmap *b)
-{
-	int card = b->cardinal;
-	int i = starpu_bitmap_first(b);
-	int j;
-	for(j = 0; j < card; j++)
-	{
-		if(i == -1)
-			return 0;
-		int tmp = starpu_bitmap_next(b,i);
-		if(tmp == i)
-			return 0;
-		i = tmp;
-	}
-	if(i != -1)
-		return 0;
-	return 1;
-}
-#else
-#define check_bitmap(b) 1
-#endif
-
-static int _count_bit(unsigned long e)
-{
-#if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__) >= 4)
-	return __builtin_popcountl(e);
-#else
-	int c = 0;
-	while(e)
-	{
-		c += e&1;
-		e >>= 1;
-	}
-	return c;
-#endif
-}
-
-struct starpu_bitmap * starpu_bitmap_create(void)
-{
-	struct starpu_bitmap *b;
-	_STARPU_CALLOC(b, 1, sizeof(*b));
-	return b;
-}
-void starpu_bitmap_destroy(struct starpu_bitmap * b)
-{
-	if(b)
-	{
-		free(b->bits);
-		free(b);
-	}
-}
-
-void starpu_bitmap_set(struct starpu_bitmap * b, int e)
-{
-
-	if(!starpu_bitmap_get(b, e))
-		b->cardinal++;
-	else
-		return;
-	if((e/LONG_BIT) + 1 > b->size)
-	{
-		_STARPU_REALLOC(b->bits, sizeof(unsigned long) * ((e/LONG_BIT) + 1));
-		memset(b->bits + b->size, 0, sizeof(unsigned long) * ((e/LONG_BIT + 1) - b->size));
-		b->size = (e/LONG_BIT) + 1;
-	}
-	b->bits[e/LONG_BIT] |= (1ul << (e%LONG_BIT));
-	STARPU_ASSERT(check_bitmap(b));
-}
-void starpu_bitmap_unset(struct starpu_bitmap *b, int e)
-{
-	if(starpu_bitmap_get(b, e))
-		b->cardinal--;
-	else
-		return;
-	if(e / LONG_BIT > b->size)
-		return;
-	b->bits[e/LONG_BIT] &= ~(1ul << (e%LONG_BIT));
-	STARPU_ASSERT(check_bitmap(b));
-}
-
-void starpu_bitmap_unset_all(struct starpu_bitmap * b)
-{
-	free(b->bits);
-	b->bits = NULL;
-	b->size = 0;
-}
-
-void starpu_bitmap_unset_and(struct starpu_bitmap * a, struct starpu_bitmap * b, struct starpu_bitmap * c)
-{
-	int n = STARPU_MIN(b->size, c->size);
-	_STARPU_REALLOC(a->bits, sizeof(unsigned long) * n);
-	a->size = n;
-	a->cardinal = 0;
-	int i;
-	for(i = 0; i < n; i++)
-	{
-		a->bits[i] = b->bits[i] & c->bits[i];
-		a->cardinal += _count_bit(a->bits[i]);
-	}
-}
-
-int starpu_bitmap_get(struct starpu_bitmap * b, int e)
-{
-	if(e / LONG_BIT >= b->size)
-		return 0;
-	return (b->bits[e/LONG_BIT] & (1ul << (e%LONG_BIT))) ?
-		1:
-		0;
-}
-
-void starpu_bitmap_or(struct starpu_bitmap * a, struct starpu_bitmap * b)
-{
-	if(a->size < b->size)
-	{
-		_STARPU_REALLOC(a->bits, b->size * sizeof(unsigned long));
-		memset(a->bits + a->size, 0, (b->size - a->size) * sizeof(unsigned long));
-		a->size = b->size;
-
-	}
-	int i;
-	for(i = 0; i < b->size; i++)
-	{
-		a->bits[i] |= b->bits[i];
-	}
-	a->cardinal = 0;
-	for(i = 0; i < a->size; i++)
-		a->cardinal += _count_bit(a->bits[i]);
-}
-
-
-int starpu_bitmap_and_get(struct starpu_bitmap * b1, struct starpu_bitmap * b2, int e)
-{
-	return starpu_bitmap_get(b1,e) && starpu_bitmap_get(b2,e);
-}
-
-int starpu_bitmap_cardinal(struct starpu_bitmap * b)
-{
-	return b->cardinal;
-}
-
-
-static inline int get_first_bit_rank(unsigned long ms)
-{
-	STARPU_ASSERT(ms != 0);
-#if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))
-	return __builtin_ffsl(ms) - 1;
-#else
-	unsigned long m = 1ul;
-	int i = 0;
-	while(!(m&ms))
-		i++,m<<=1;
-	return i;
-#endif
-}
-
-static inline int get_last_bit_rank(unsigned long l)
-{
-	STARPU_ASSERT(l != 0);
-#if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))
-	return 8*sizeof(l) - __builtin_clzl(l);
-#else
-	int ibit = LONG_BIT - 1;
-	while((!(1ul << ibit)) & l)
-		ibit--;
-	STARPU_ASSERT(ibit >= 0);
-	return ibit;
-#endif
-}
-
-int starpu_bitmap_first(struct starpu_bitmap * b)
-{
-	int i = 0;
-	while(i < b->size && !b->bits[i])
-		i++;
-	if( i == b->size)
-		return -1;
-	int nb_long = i;
-	unsigned long ms = b->bits[i];
-
-	return (nb_long * LONG_BIT) + get_first_bit_rank(ms);
-}
-
-int starpu_bitmap_has_next(struct starpu_bitmap * b, int e)
-{
-	int nb_long = (e+1) / LONG_BIT;
-	int nb_bit = (e+1) % LONG_BIT;
-	unsigned long mask = (~0ul) << nb_bit;
-	if(b->bits[nb_long] & mask)
-		return 1;
-	for(nb_long++; nb_long < b->size; nb_long++)
-		if(b->bits[nb_long])
-			return 1;
-	return 0;
-}
-
-int starpu_bitmap_last(struct starpu_bitmap * b)
-{
-	if(b->cardinal == 0)
-		return -1;
-	int ilong;
-	for(ilong = b->size - 1; ilong >= 0; ilong--)
-	{
-		if(b->bits[ilong])
-			break;
-	}
-	STARPU_ASSERT(ilong >= 0);
-	unsigned long l = b->bits[ilong];
-	return ilong * LONG_BIT + get_last_bit_rank(l);
-}
-
-int starpu_bitmap_next(struct starpu_bitmap *b, int e)
-{
-	int nb_long = e / LONG_BIT;
-	int nb_bit = e % LONG_BIT;
-	unsigned long rest = nb_bit == LONG_BIT - 1 ? 0 : (~0ul << (nb_bit + 1)) & b->bits[nb_long];
-	if(nb_bit != (LONG_BIT - 1) && rest)
-	{
-		int i = get_first_bit_rank(rest);
-		STARPU_ASSERT(i >= 0 && i < LONG_BIT);
-		return (nb_long * LONG_BIT) + i;
-	}
-
-	for(nb_long++;nb_long < b->size; nb_long++)
-		if(b->bits[nb_long])
-			return nb_long * LONG_BIT + get_first_bit_rank(b->bits[nb_long]);
-	return -1;
-}

+ 1 - 1
src/common/fxt.h

@@ -753,7 +753,7 @@ do {									\
 	{								\
 		/* we include the task name */			\
 		_STARPU_FUT_FULL_PROBE5STR(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_START_CODELET_BODY, (job)->job_id, ((job)->task)->sched_ctx, workerid, starpu_worker_get_memory_node(workerid), 1, name); \
-		if (model_name && strcmp(model_name, name))				\
+		if (model_name)					\
 			_STARPU_FUT_FULL_PROBE1STR(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_MODEL_NAME, (job)->job_id, model_name); \
 	}								\
 	else {                                                          \

+ 16 - 2
src/core/perfmodel/perfmodel_bus.c

@@ -80,8 +80,8 @@ struct dev_timing
 };
 
 /* TODO: measure latency */
-static double bandwidth_matrix[STARPU_MAXNODES][STARPU_MAXNODES];
-static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES];
+static double bandwidth_matrix[STARPU_MAXNODES][STARPU_MAXNODES]; /* MB/s */
+static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES]; /* µs */
 static unsigned was_benchmarked = 0;
 #ifndef STARPU_SIMGRID
 static unsigned ncpus = 0;
@@ -1546,6 +1546,20 @@ static int load_bus_bandwidth_file_content(void)
 				return 0;
 			}
 
+			int limit_bandwidth = starpu_get_env_number("STARPU_LIMIT_BANDWIDTH");
+			if (limit_bandwidth >= 0)
+			{
+#ifndef STARPU_SIMGRID
+				_STARPU_DISP("Warning: STARPU_LIMIT_BANDWIDTH set to %d but simgrid not enabled, thus ignored\n", limit_bandwidth);
+#else
+#ifdef HAVE_SG_LINK_BANDWIDTH_SET
+				bandwidth = limit_bandwidth;
+#else
+				_STARPU_DISP("Warning: STARPU_LIMIT_BANDWIDTH set to %d but this requires simgrid 3.26\n", limit_bandwidth);
+#endif
+#endif
+			}
+
 			bandwidth_matrix[src][dst] = bandwidth;
 
 			/* Look out for \t\n */

+ 13 - 0
src/core/simgrid.c

@@ -337,6 +337,19 @@ void _starpu_start_simgrid(int *argc, char **argv)
 #else
 	MSG_create_environment(path);
 #endif
+	int limit_bandwidth = starpu_get_env_number("STARPU_LIMIT_BANDWIDTH");
+	if (limit_bandwidth >= 0)
+	{
+#ifdef HAVE_SG_LINK_BANDWIDTH_SET
+		sg_link_t *links = sg_link_list();
+		int count = sg_link_count(), i;
+		for (i = 0; i < count; i++) {
+			sg_link_bandwidth_set(links[i], limit_bandwidth * 1000000.);
+		}
+#else
+		_STARPU_DISP("Warning: STARPU_LIMIT_BANDWIDTH set to %d but this requires simgrid 3.26, thus ignored\n", limit_bandwidth);
+#endif
+	}
 
 	simgrid_transfer_cost = starpu_get_env_number_default("STARPU_SIMGRID_TRANSFER_COST", 1);
 }

+ 1 - 1
src/core/topology.c

@@ -2854,7 +2854,7 @@ static void _starpu_init_workers_binding_and_memory(struct _starpu_machine_confi
 					config->nbindid = STARPU_NMAXWORKERS;
 				else
 					config->nbindid = 2 * old_nbindid;
-				if (bindid > config->nbindid)
+				if (bindid >= config->nbindid)
 				{
 					config->nbindid = bindid+1;
 				}

+ 0 - 2
src/debug/traces/starpu_fxt.c

@@ -199,8 +199,6 @@ static void task_dump(struct task_info *task, struct starpu_fxt_options *options
 	if (task->name)
 	{
 		fprintf(tasks_file, "Name: %s\n", task->name);
-		if (!task->model_name)
-			fprintf(tasks_file, "Model: %s\n", task->name);
 		free(task->name);
 	}
 	if (task->model_name)

+ 1 - 1
src/sched_policies/component_best_implementation.c

@@ -85,7 +85,7 @@ static void select_best_implementation_and_set_preds(unsigned sched_ctx_id, stru
 static int best_implementation_push_task(struct starpu_sched_component * component, struct starpu_task * task)
 {
 	STARPU_ASSERT(component->nchildren == 1);
-	select_best_implementation_and_set_preds(component->tree->sched_ctx_id, component->workers_in_ctx, task);
+	select_best_implementation_and_set_preds(component->tree->sched_ctx_id, &component->workers_in_ctx, task);
 	return starpu_sched_component_push_task(component,component->children[0],task);
 }
 

+ 6 - 7
src/sched_policies/component_composed.c

@@ -168,16 +168,16 @@ static void composed_component_remove_child(struct starpu_sched_component * comp
 static void composed_component_notify_change_workers(struct starpu_sched_component * component)
 {
 	struct composed_component * c = component->data;
-	struct starpu_bitmap * workers = component->workers;
-	struct starpu_bitmap * workers_in_ctx = component->workers_in_ctx;
+	struct starpu_bitmap * workers = &component->workers;
+	struct starpu_bitmap * workers_in_ctx = &component->workers_in_ctx;
 	struct starpu_sched_component * n;
 	for(n = c->top; ;n = n->children[0])
 	{
-		starpu_bitmap_unset_all(n->workers);
-		starpu_bitmap_or(n->workers, workers);
+		starpu_bitmap_unset_all(&n->workers);
+		starpu_bitmap_or(&n->workers, workers);
 
-		starpu_bitmap_unset_all(n->workers_in_ctx);
-		starpu_bitmap_or(n->workers_in_ctx, workers_in_ctx);
+		starpu_bitmap_unset_all(&n->workers_in_ctx);
+		starpu_bitmap_or(&n->workers_in_ctx, workers_in_ctx);
 
 		n->properties = component->properties;
 		if(n == c->bottom)
@@ -195,7 +195,6 @@ void composed_component_deinit_data(struct starpu_sched_component * _component)
 	do
 	{
 		component = next;
-		component->workers = NULL;
 		next = component->children ? component->children[0] : NULL;
 		starpu_sched_component_destroy(component);
 	}

+ 6 - 6
src/sched_policies/component_eager.c

@@ -35,9 +35,9 @@ static int eager_push_task(struct starpu_sched_component * component, struct sta
 	{
 		/* target told us we could push to it, try to */
 		int idworker;
-		for(idworker = starpu_bitmap_first(target->workers);
+		for(idworker = starpu_bitmap_first(&target->workers);
 			idworker != -1;
-			idworker = starpu_bitmap_next(target->workers, idworker))
+			idworker = starpu_bitmap_next(&target->workers, idworker))
 		{
 			int nimpl;
 			for(nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
@@ -55,9 +55,9 @@ static int eager_push_task(struct starpu_sched_component * component, struct sta
 
 	/* FIXME: should rather just loop over children before looping over its workers */
 	int workerid;
-	for(workerid = starpu_bitmap_first(component->workers_in_ctx);
+	for(workerid = starpu_bitmap_first(&component->workers_in_ctx);
 	    workerid != -1;
-	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
+	    workerid = starpu_bitmap_next(&component->workers_in_ctx, workerid))
 	{
 		int nimpl;
 		for(nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
@@ -70,9 +70,9 @@ static int eager_push_task(struct starpu_sched_component * component, struct sta
 				for (i = 0; i < component->nchildren; i++)
 				{
 					int idworker;
-					for(idworker = starpu_bitmap_first(component->children[i]->workers);
+					for(idworker = starpu_bitmap_first(&component->children[i]->workers);
 						idworker != -1;
-						idworker = starpu_bitmap_next(component->children[i]->workers, idworker))
+						idworker = starpu_bitmap_next(&component->children[i]->workers, idworker))
 					{
 						if (idworker == workerid)
 						{

+ 4 - 4
src/sched_policies/component_eager_calibration.c

@@ -25,9 +25,9 @@ static int eager_calibration_push_task(struct starpu_sched_component * component
 	starpu_task_bundle_t bundle = task->bundle;
 
 	int workerid;
-	for(workerid = starpu_bitmap_first(component->workers_in_ctx);
+	for(workerid = starpu_bitmap_first(&component->workers_in_ctx);
 	    workerid != -1;
-	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
+	    workerid = starpu_bitmap_next(&component->workers_in_ctx, workerid))
 	{
 		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, component->tree->sched_ctx_id);
 		int nimpl;
@@ -49,9 +49,9 @@ static int eager_calibration_push_task(struct starpu_sched_component * component
 					for (i = 0; i < component->nchildren; i++)
 					{
 						int idworker;
-						for(idworker = starpu_bitmap_first(component->children[i]->workers);
+						for(idworker = starpu_bitmap_first(&component->children[i]->workers);
 							idworker != -1;
-							idworker = starpu_bitmap_next(component->children[i]->workers, idworker))
+							idworker = starpu_bitmap_next(&component->children[i]->workers, idworker))
 						{
 							if (idworker == workerid)
 							{

+ 4 - 4
src/sched_policies/component_eager_prio.c

@@ -50,9 +50,9 @@ static int eager_prio_progress_one(struct starpu_sched_component *component)
 
 	/* FIXME: should rather just loop over children before looping over its workers */
 	int workerid;
-	for(workerid = starpu_bitmap_first(component->workers_in_ctx);
+	for(workerid = starpu_bitmap_first(&component->workers_in_ctx);
 	    workerid != -1;
-	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
+	    workerid = starpu_bitmap_next(&component->workers_in_ctx, workerid))
 	{
 		int nimpl;
 		for(nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
@@ -65,9 +65,9 @@ static int eager_prio_progress_one(struct starpu_sched_component *component)
 				for (i = 0; i < component->nchildren; i++)
 				{
 					int idworker;
-					for(idworker = starpu_bitmap_first(component->children[i]->workers);
+					for(idworker = starpu_bitmap_first(&component->children[i]->workers);
 						idworker != -1;
-						idworker = starpu_bitmap_next(component->children[i]->workers, idworker))
+						idworker = starpu_bitmap_next(&component->children[i]->workers, idworker))
 					{
 						if (idworker == workerid)
 						{

+ 12 - 13
src/sched_policies/component_fifo.c

@@ -23,7 +23,7 @@
 
 struct _starpu_fifo_data
 {
-	struct _starpu_fifo_taskq * fifo;
+	struct _starpu_fifo_taskq fifo;
 	starpu_pthread_mutex_t mutex;
 	unsigned ntasks_threshold;
 	double exp_len_threshold;
@@ -35,7 +35,6 @@ static void fifo_component_deinit_data(struct starpu_sched_component * component
 {
 	STARPU_ASSERT(component && component->data);
 	struct _starpu_fifo_data * f = component->data;
-	_starpu_destroy_fifo(f->fifo);
 	STARPU_PTHREAD_MUTEX_DESTROY(&f->mutex);
 	free(f);
 }
@@ -44,22 +43,22 @@ static double fifo_estimated_end(struct starpu_sched_component * component)
 {
 	STARPU_ASSERT(component && component->data);
 	struct _starpu_fifo_data * data = component->data;
-	struct _starpu_fifo_taskq * queue = data->fifo;
+	struct _starpu_fifo_taskq * queue = &data->fifo;
 	return starpu_sched_component_estimated_end_min_add(component, queue->exp_len);
 }
 
 static double fifo_estimated_load(struct starpu_sched_component * component)
 {
 	STARPU_ASSERT(component && component->data);
-	STARPU_ASSERT(starpu_bitmap_cardinal(component->workers_in_ctx) != 0);
+	STARPU_ASSERT(starpu_bitmap_cardinal(&component->workers_in_ctx) != 0);
 	struct _starpu_fifo_data * data = component->data;
-	struct _starpu_fifo_taskq * queue = data->fifo;
+	struct _starpu_fifo_taskq * queue = &data->fifo;
 	starpu_pthread_mutex_t * mutex = &data->mutex;
 	double relative_speedup = 0.0;
 	double load = starpu_sched_component_estimated_load(component);
 	if(STARPU_SCHED_COMPONENT_IS_HOMOGENEOUS(component))
 	{
-		int first_worker = starpu_bitmap_first(component->workers_in_ctx);
+		int first_worker = starpu_bitmap_first(&component->workers_in_ctx);
 		relative_speedup = starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(first_worker, component->tree->sched_ctx_id));
 		STARPU_COMPONENT_MUTEX_LOCK(mutex);
 		load += queue->ntasks / relative_speedup;
@@ -69,11 +68,11 @@ static double fifo_estimated_load(struct starpu_sched_component * component)
 	else
 	{
 		int i;
-		for(i = starpu_bitmap_first(component->workers_in_ctx);
+		for(i = starpu_bitmap_first(&component->workers_in_ctx);
 		    i != -1;
-		    i = starpu_bitmap_next(component->workers_in_ctx, i))
+		    i = starpu_bitmap_next(&component->workers_in_ctx, i))
 			relative_speedup += starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(i, component->tree->sched_ctx_id));
-		relative_speedup /= starpu_bitmap_cardinal(component->workers_in_ctx);
+		relative_speedup /= starpu_bitmap_cardinal(&component->workers_in_ctx);
 		STARPU_ASSERT(!_STARPU_IS_ZERO(relative_speedup));
 		STARPU_COMPONENT_MUTEX_LOCK(mutex);
 		load += queue->ntasks / relative_speedup;
@@ -87,7 +86,7 @@ static int fifo_push_local_task(struct starpu_sched_component * component, struc
 	STARPU_ASSERT(component && component->data && task);
 	STARPU_ASSERT(starpu_sched_component_can_execute_task(component,task));
 	struct _starpu_fifo_data * data = component->data;
-	struct _starpu_fifo_taskq * queue = data->fifo;
+	struct _starpu_fifo_taskq * queue = &data->fifo;
 	starpu_pthread_mutex_t * mutex = &data->mutex;
 	int ret = 0;
 	const double now = starpu_timing_now();
@@ -169,7 +168,7 @@ static struct starpu_task * fifo_pull_task(struct starpu_sched_component * compo
 {
 	STARPU_ASSERT(component && component->data);
 	struct _starpu_fifo_data * data = component->data;
-	struct _starpu_fifo_taskq * queue = data->fifo;
+	struct _starpu_fifo_taskq * queue = &data->fifo;
 	starpu_pthread_mutex_t * mutex = &data->mutex;
 	const double now = starpu_timing_now();
 
@@ -182,7 +181,7 @@ static struct starpu_task * fifo_pull_task(struct starpu_sched_component * compo
 	STARPU_COMPONENT_MUTEX_LOCK(mutex);
 	struct starpu_task * task;
 	if (data->ready && to->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE)
-		task = _starpu_fifo_pop_first_ready_task(queue, starpu_bitmap_first(to->workers_in_ctx), -1);
+		task = _starpu_fifo_pop_first_ready_task(queue, starpu_bitmap_first(&to->workers_in_ctx), -1);
 	else
 		task = _starpu_fifo_pop_task(queue, starpu_worker_get_id_check());
 	if(task && data->exp)
@@ -269,7 +268,7 @@ struct starpu_sched_component * starpu_sched_component_fifo_create(struct starpu
 	struct starpu_sched_component *component = starpu_sched_component_create(tree, "fifo");
 	struct _starpu_fifo_data *data;
 	_STARPU_MALLOC(data, sizeof(*data));
-	data->fifo = _starpu_create_fifo();
+	_starpu_init_fifo(&data->fifo);
 	STARPU_PTHREAD_MUTEX_INIT(&data->mutex,NULL);
 	component->data = data;
 	component->estimated_end = fifo_estimated_end;

+ 6 - 6
src/sched_policies/component_heteroprio.c

@@ -128,9 +128,9 @@ static int heteroprio_progress_accel(struct starpu_sched_component *component, s
 		for (i = 0; i < component->nchildren; i++)
 		{
 			int idworker;
-			for(idworker = starpu_bitmap_first(component->children[i]->workers);
+			for(idworker = starpu_bitmap_first(&component->children[i]->workers);
 				idworker != -1;
-				idworker = starpu_bitmap_next(component->children[i]->workers, idworker))
+				idworker = starpu_bitmap_next(&component->children[i]->workers, idworker))
 			{
 				if (starpu_worker_get_type(idworker) == archtype)
 					break;
@@ -173,9 +173,9 @@ static int heteroprio_progress_accel(struct starpu_sched_component *component, s
 	best_component = component->children[best_icomponent];
 
 	int idworker;
-	for(idworker = starpu_bitmap_first(best_component->workers);
+	for(idworker = starpu_bitmap_first(&best_component->workers);
 		idworker != -1;
-		idworker = starpu_bitmap_next(best_component->workers, idworker))
+		idworker = starpu_bitmap_next(&best_component->workers, idworker))
 	{
 		if (starpu_worker_get_type(idworker) == archtype)
 			break;
@@ -356,9 +356,9 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 
 	/* Compute acceleration between best-performing arch and least-performing arch */
 	int workerid;
-	for(workerid = starpu_bitmap_first(component->workers_in_ctx);
+	for(workerid = starpu_bitmap_first(&component->workers_in_ctx);
 	    workerid != -1;
-	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
+	    workerid = starpu_bitmap_next(&component->workers_in_ctx, workerid))
 	{
 		unsigned impl_mask;
 		if (!starpu_worker_can_execute_task_impl(workerid, task, &impl_mask))

+ 6 - 6
src/sched_policies/component_prio.c

@@ -70,7 +70,7 @@ static double prio_estimated_end(struct starpu_sched_component * component)
 static double prio_estimated_load(struct starpu_sched_component * component)
 {
 	STARPU_ASSERT(component && component->data);
-	STARPU_ASSERT(starpu_bitmap_cardinal(component->workers_in_ctx) != 0);
+	STARPU_ASSERT(starpu_bitmap_cardinal(&component->workers_in_ctx) != 0);
 	struct _starpu_prio_data * data = component->data;
 	struct _starpu_prio_deque * queue = &data->prio;
 	starpu_pthread_mutex_t * mutex = &data->mutex;
@@ -78,7 +78,7 @@ static double prio_estimated_load(struct starpu_sched_component * component)
 	double load = starpu_sched_component_estimated_load(component);
 	if(STARPU_SCHED_COMPONENT_IS_HOMOGENEOUS(component))
 	{
-		int first_worker = starpu_bitmap_first(component->workers_in_ctx);
+		int first_worker = starpu_bitmap_first(&component->workers_in_ctx);
 		relative_speedup = starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(first_worker, component->tree->sched_ctx_id));
 		STARPU_COMPONENT_MUTEX_LOCK(mutex);
 		load += queue->ntasks / relative_speedup;
@@ -88,11 +88,11 @@ static double prio_estimated_load(struct starpu_sched_component * component)
 	else
 	{
 		int i;
-		for(i = starpu_bitmap_first(component->workers_in_ctx);
+		for(i = starpu_bitmap_first(&component->workers_in_ctx);
 		    i != -1;
-		    i = starpu_bitmap_next(component->workers_in_ctx, i))
+		    i = starpu_bitmap_next(&component->workers_in_ctx, i))
 			relative_speedup += starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(i, component->tree->sched_ctx_id));
-		relative_speedup /= starpu_bitmap_cardinal(component->workers_in_ctx);
+		relative_speedup /= starpu_bitmap_cardinal(&component->workers_in_ctx);
 		STARPU_ASSERT(!_STARPU_IS_ZERO(relative_speedup));
 		STARPU_COMPONENT_MUTEX_LOCK(mutex);
 		load += queue->ntasks / relative_speedup;
@@ -204,7 +204,7 @@ static struct starpu_task * prio_pull_task(struct starpu_sched_component * compo
 	STARPU_COMPONENT_MUTEX_LOCK(mutex);
 	struct starpu_task * task;
 	if (data->ready && to->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE)
-		task = _starpu_prio_deque_deque_first_ready_task(queue, starpu_bitmap_first(to->workers_in_ctx));
+		task = _starpu_prio_deque_deque_first_ready_task(queue, starpu_bitmap_first(&to->workers_in_ctx));
 	else
 		task = _starpu_prio_deque_pop_task(queue);
 	if(task && data->exp)

+ 2 - 2
src/sched_policies/component_random.c

@@ -24,9 +24,9 @@ static double compute_relative_speedup(struct starpu_sched_component * component
 {
 	double sum = 0.0;
 	int id;
-	for(id = starpu_bitmap_first(component->workers_in_ctx);
+	for(id = starpu_bitmap_first(&component->workers_in_ctx);
 	    id != -1;
-	    id = starpu_bitmap_next(component->workers_in_ctx, id))
+	    id = starpu_bitmap_next(&component->workers_in_ctx, id))
 	{
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(id, component->tree->sched_ctx_id);
 		sum += starpu_worker_get_relative_speedup(perf_arch);

+ 23 - 26
src/sched_policies/component_sched.c

@@ -45,9 +45,9 @@ int starpu_sched_component_execute_preds(struct starpu_sched_component * compone
 
 
 	int workerid;
-	for(workerid = starpu_bitmap_first(component->workers_in_ctx);
+	for(workerid = starpu_bitmap_first(&component->workers_in_ctx);
 	    workerid != -1;
-	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
+	    workerid = starpu_bitmap_next(&component->workers_in_ctx, workerid))
 	{
 		int nimpl;
 		for(nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
@@ -100,9 +100,9 @@ int starpu_sched_component_can_execute_task(struct starpu_sched_component * comp
 	unsigned nimpl;
 	int worker;
 	for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-		for(worker = starpu_bitmap_first(component->workers_in_ctx);
+		for(worker = starpu_bitmap_first(&component->workers_in_ctx);
 		    -1 != worker;
-		    worker = starpu_bitmap_next(component->workers_in_ctx, worker))
+		    worker = starpu_bitmap_next(&component->workers_in_ctx, worker))
 			if (starpu_worker_can_execute_task(worker, task, nimpl)
 			     || starpu_combined_worker_can_execute_task(worker, task, nimpl))
 			    return 1;
@@ -115,21 +115,21 @@ int starpu_sched_component_can_execute_task(struct starpu_sched_component * comp
 double starpu_sched_component_transfer_length(struct starpu_sched_component * component, struct starpu_task * task)
 {
 	STARPU_ASSERT(component && task);
-	int nworkers = starpu_bitmap_cardinal(component->workers_in_ctx);
+	int nworkers = starpu_bitmap_cardinal(&component->workers_in_ctx);
 	double sum = 0.0;
 	int worker;
 	if(STARPU_SCHED_COMPONENT_IS_SINGLE_MEMORY_NODE(component))
 	{
-		unsigned memory_node  = starpu_worker_get_memory_node(starpu_bitmap_first(component->workers_in_ctx));
+		unsigned memory_node  = starpu_worker_get_memory_node(starpu_bitmap_first(&component->workers_in_ctx));
 		if(task->bundle)
 			return starpu_task_bundle_expected_data_transfer_time(task->bundle,memory_node);
 		else
 			return starpu_task_expected_data_transfer_time(memory_node, task);
 	}
 
-	for(worker = starpu_bitmap_first(component->workers_in_ctx);
+	for(worker = starpu_bitmap_first(&component->workers_in_ctx);
 	    worker != -1;
-	    worker = starpu_bitmap_next(component->workers_in_ctx, worker))
+	    worker = starpu_bitmap_next(&component->workers_in_ctx, worker))
 	{
 		unsigned memory_node  = starpu_worker_get_memory_node(worker);
 		if(task->bundle)
@@ -156,7 +156,7 @@ void starpu_sched_component_prefetch_on_node(struct starpu_sched_component * com
 	if (starpu_get_prefetch_flag() && (!task->prefetched)
 		&& (component->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE))
 	{
-		int worker = starpu_bitmap_first(component->workers_in_ctx);
+		int worker = starpu_bitmap_first(&component->workers_in_ctx);
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 		starpu_prefetch_task_input_on_node(task, memory_node);
 		task->prefetched = 1;
@@ -195,8 +195,6 @@ void starpu_sched_component_destroy(struct starpu_sched_component *component)
 	free(component->children);
 	free(component->parents);
 	free(component->name);
-	starpu_bitmap_destroy(component->workers);
-	starpu_bitmap_destroy(component->workers_in_ctx);
 	free(component);
 }
 
@@ -223,7 +221,7 @@ void set_properties(struct starpu_sched_component * component)
 	STARPU_ASSERT(component);
 	component->properties = 0;
 
-	int worker = starpu_bitmap_first(component->workers_in_ctx);
+	int worker = starpu_bitmap_first(&component->workers_in_ctx);
 	if (worker == -1)
 		return;
 	if (starpu_worker_is_combined_worker(worker))
@@ -237,7 +235,7 @@ void set_properties(struct starpu_sched_component * component)
 	int is_all_same_component = 1;
 	for(;
 	    worker != -1;
-	    worker = starpu_bitmap_next(component->workers_in_ctx, worker))
+	    worker = starpu_bitmap_next(&component->workers_in_ctx, worker))
 	{
 		if(starpu_worker_is_combined_worker(worker))
 			continue;
@@ -262,12 +260,12 @@ void _starpu_sched_component_update_workers(struct starpu_sched_component * comp
 	STARPU_ASSERT(component);
 	if(starpu_sched_component_is_worker(component))
 		return;
-	starpu_bitmap_unset_all(component->workers);
+	starpu_bitmap_unset_all(&component->workers);
 	unsigned i;
 	for(i = 0; i < component->nchildren; i++)
 	{
 		_starpu_sched_component_update_workers(component->children[i]);
-		starpu_bitmap_or(component->workers, component->children[i]->workers);
+		starpu_bitmap_or(&component->workers, &component->children[i]->workers);
 	}
 	component->notify_change_workers(component);
 }
@@ -282,11 +280,11 @@ void _starpu_sched_component_update_workers_in_ctx(struct starpu_sched_component
 	if(starpu_sched_component_is_worker(component))
 		return;
 	struct starpu_bitmap * workers_in_ctx = _starpu_get_worker_mask(sched_ctx_id);
-	starpu_bitmap_unset_and(component->workers_in_ctx,component->workers, workers_in_ctx);
+	starpu_bitmap_unset_and(&component->workers_in_ctx,&component->workers, workers_in_ctx);
 	unsigned i,j;
 	for(i = starpu_worker_get_count(); i < starpu_worker_get_count() + starpu_combined_worker_get_count(); i++)
 	{
-		if (starpu_bitmap_get(component->workers, i))
+		if (starpu_bitmap_get(&component->workers, i))
 		{
 			/* Component has this combined worker, check whether the
 			 * context has all the corresponding workers */
@@ -297,7 +295,7 @@ void _starpu_sched_component_update_workers_in_ctx(struct starpu_sched_component
 				if (!starpu_bitmap_get(workers_in_ctx, combined_workerid[j]))
 					goto nocombined;
 			/* We have all workers, add it */
-			starpu_bitmap_set(component->workers_in_ctx, i);
+			starpu_bitmap_set(&component->workers_in_ctx, i);
 		}
 nocombined:
 		(void)0;
@@ -324,7 +322,7 @@ struct starpu_bitmap * _starpu_get_worker_mask(unsigned sched_ctx_id)
 	STARPU_ASSERT(sched_ctx_id < STARPU_NMAX_SCHED_CTXS);
 	struct starpu_sched_tree * t = starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	STARPU_ASSERT(t);
-	return t->workers;
+	return &t->workers;
 }
 
 void starpu_sched_tree_update_workers_in_ctx(struct starpu_sched_tree * t)
@@ -442,7 +440,7 @@ void starpu_sched_tree_add_workers(unsigned sched_ctx_id, int *workerids, unsign
 
 	unsigned i;
 	for(i = 0; i < nworkers; i++)
-		starpu_bitmap_set(t->workers, workerids[i]);
+		starpu_bitmap_set(&t->workers, workerids[i]);
 
 	starpu_sched_tree_update_workers_in_ctx(t);
 
@@ -461,7 +459,7 @@ void starpu_sched_tree_remove_workers(unsigned sched_ctx_id, int *workerids, uns
 
 	unsigned i;
 	for(i = 0; i < nworkers; i++)
-		starpu_bitmap_unset(t->workers, workerids[i]);
+		starpu_bitmap_unset(&t->workers, workerids[i]);
 
 	starpu_sched_tree_update_workers_in_ctx(t);
 
@@ -478,7 +476,7 @@ struct starpu_sched_tree * starpu_sched_tree_create(unsigned sched_ctx_id)
 	struct starpu_sched_tree *t;
 	_STARPU_CALLOC(t, 1, sizeof(*t));
 	t->sched_ctx_id = sched_ctx_id;
-	t->workers = starpu_bitmap_create();
+	starpu_bitmap_init(&t->workers);
 	STARPU_PTHREAD_MUTEX_INIT(&t->lock,NULL);
 	trees[sched_ctx_id] = t;
 	return t;
@@ -491,7 +489,6 @@ void starpu_sched_tree_destroy(struct starpu_sched_tree * tree)
 	trees[tree->sched_ctx_id] = NULL;
 	if(tree->root)
 		starpu_sched_component_destroy_rec(tree->root);
-	starpu_bitmap_destroy(tree->workers);
 	STARPU_PTHREAD_MUTEX_DESTROY(&tree->lock);
 	free(tree);
 }
@@ -694,7 +691,7 @@ double starpu_sched_component_estimated_end_min_add(struct starpu_sched_componen
 	{
 		/* We don't know which workers will do this, assume it will be
 		 * evenly distributed to existing work */
-		int card = starpu_bitmap_cardinal(component->workers_in_ctx);
+		int card = starpu_bitmap_cardinal(&component->workers_in_ctx);
 		if (card == 0)
 			/* Oops, no resources to compute our tasks. Let's just hope that
 			 * we will be given one at some point */
@@ -732,8 +729,8 @@ struct starpu_sched_component * starpu_sched_component_create(struct starpu_sche
 	struct starpu_sched_component *component;
 	_STARPU_CALLOC(component, 1, sizeof(*component));
 	component->tree = tree;
-	component->workers = starpu_bitmap_create();
-	component->workers_in_ctx = starpu_bitmap_create();
+	starpu_bitmap_init(&component->workers);
+	starpu_bitmap_init(&component->workers_in_ctx);
 	component->add_child = starpu_sched_component_add_child;
 	component->remove_child = starpu_sched_component_remove_child;
 	component->add_parent = starpu_sched_component_add_parent;

+ 23 - 28
src/sched_policies/component_work_stealing.c

@@ -37,7 +37,7 @@ struct _starpu_component_work_stealing_data
  */
 	unsigned performed_total, last_pop_child, last_push_child;
 
-	struct _starpu_prio_deque ** fifos;
+	struct _starpu_prio_deque * fifos;
 	starpu_pthread_mutex_t ** mutexes;
 	unsigned size;
 };
@@ -59,7 +59,7 @@ static struct starpu_task *  steal_task_round_robin(struct starpu_sched_componen
 	struct starpu_task * task = NULL;
 	while (1)
 	{
-		struct _starpu_prio_deque * fifo = wsd->fifos[i];
+		struct _starpu_prio_deque * fifo = &wsd->fifos[i];
 
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
 		task = _starpu_prio_deque_deque_task_for_worker(fifo, workerid, NULL);
@@ -123,7 +123,7 @@ static inline unsigned select_worker(struct starpu_sched_component * component)
 
 static int is_worker_of_component(struct starpu_sched_component * component, int workerid)
 {
-	return starpu_bitmap_get(component->workers, workerid);
+	return starpu_bitmap_get(&component->workers, workerid);
 }
 
 
@@ -141,17 +141,17 @@ static struct starpu_task * pull_task(struct starpu_sched_component * component,
 	struct _starpu_component_work_stealing_data * wsd = component->data;
 	const double now = starpu_timing_now();
 	STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-	struct starpu_task * task = _starpu_prio_deque_pop_task(wsd->fifos[i]);
+	struct starpu_task * task = _starpu_prio_deque_pop_task(&wsd->fifos[i]);
 	if(task)
 	{
 		if(!isnan(task->predicted))
 		{
-			wsd->fifos[i]->exp_len -= task->predicted;
-			wsd->fifos[i]->exp_start = now + task->predicted;
+			wsd->fifos[i].exp_len -= task->predicted;
+			wsd->fifos[i].exp_start = now + task->predicted;
 		}
 	}
 	else
-		wsd->fifos[i]->exp_len = 0.0;
+		wsd->fifos[i].exp_len = 0.0;
 
 	STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 	if(task)
@@ -163,7 +163,7 @@ static struct starpu_task * pull_task(struct starpu_sched_component * component,
 	if(task)
 	{
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-		wsd->fifos[i]->nprocessed++;
+		wsd->fifos[i].nprocessed++;
 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 
 		return task;
@@ -196,13 +196,13 @@ double _ws_estimated_end(struct starpu_sched_component * component)
 	for(i = 0; i < component->nchildren; i++)
 	{
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-		sum_len += wsd->fifos[i]->exp_len;
-		wsd->fifos[i]->exp_start = STARPU_MAX(now, wsd->fifos[i]->exp_start);
-		sum_start += wsd->fifos[i]->exp_start;
+		sum_len += wsd->fifos[i].exp_len;
+		wsd->fifos[i].exp_start = STARPU_MAX(now, wsd->fifos[i].exp_start);
+		sum_start += wsd->fifos[i].exp_start;
 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 
 	}
-	int nb_workers = starpu_bitmap_cardinal(component->workers_in_ctx);
+	int nb_workers = starpu_bitmap_cardinal(&component->workers_in_ctx);
 
 	return (sum_start + sum_len) / nb_workers;
 }
@@ -216,14 +216,14 @@ double _ws_estimated_load(struct starpu_sched_component * component)
 	for(i = 0; i < component->nchildren; i++)
 	{
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-		ntasks += wsd->fifos[i]->ntasks;
+		ntasks += wsd->fifos[i].ntasks;
 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 	}
 	double speedup = 0.0;
 	int workerid;
-	for(workerid = starpu_bitmap_first(component->workers_in_ctx);
+	for(workerid = starpu_bitmap_first(&component->workers_in_ctx);
 	    -1 != workerid;
-	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
+	    workerid = starpu_bitmap_next(&component->workers_in_ctx, workerid))
 	{
 		speedup += starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(workerid, component->tree->sched_ctx_id));
 	}
@@ -243,9 +243,9 @@ static int push_task(struct starpu_sched_component * component, struct starpu_ta
 	while(1)
 	{
 		int workerid;
-		for(workerid = starpu_bitmap_first(component->children[i]->workers_in_ctx);
+		for(workerid = starpu_bitmap_first(&component->children[i]->workers_in_ctx);
 		    -1 != workerid;
-		    workerid = starpu_bitmap_next(component->children[i]->workers_in_ctx, workerid))
+		    workerid = starpu_bitmap_next(&component->children[i]->workers_in_ctx, workerid))
 		{
 			unsigned impl;
 			int can_execute = starpu_worker_can_execute_task_first_impl(workerid, task, &impl);
@@ -265,7 +265,7 @@ static int push_task(struct starpu_sched_component * component, struct starpu_ta
 
 	STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
 	starpu_sched_task_break(task);
-	ret = _starpu_prio_deque_push_front_task(wsd->fifos[i], task);
+	ret = _starpu_prio_deque_push_front_task(&wsd->fifos[i], task);
 	STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 
 	wsd->last_push_child = i;
@@ -308,9 +308,9 @@ int starpu_sched_tree_work_stealing_push_task(struct starpu_task *task)
 
 			struct _starpu_component_work_stealing_data * wsd = component->data;
 			STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-			int ret = _starpu_prio_deque_push_front_task(wsd->fifos[i] , task);
+			int ret = _starpu_prio_deque_push_front_task(&wsd->fifos[i] , task);
 			if(ret == 0 && !isnan(task->predicted))
-				wsd->fifos[i]->exp_len += task->predicted;
+				wsd->fifos[i].exp_len += task->predicted;
 			STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 
 			component->can_pull(component);
@@ -334,10 +334,7 @@ void _ws_add_child(struct starpu_sched_component * component, struct starpu_sche
 		wsd->size = component->nchildren;
 	}
 
-	struct _starpu_prio_deque *fifo;
-	_STARPU_MALLOC(fifo, sizeof(*fifo));
-	_starpu_prio_deque_init(fifo);
-	wsd->fifos[component->nchildren - 1] = fifo;
+	_starpu_prio_deque_init(&wsd->fifos[component->nchildren - 1]);
 
 	starpu_pthread_mutex_t *mutex;
 	_STARPU_MALLOC(mutex, sizeof(*mutex));
@@ -359,19 +356,17 @@ void _ws_remove_child(struct starpu_sched_component * component, struct starpu_s
 			break;
 	}
 	STARPU_ASSERT(i_component != component->nchildren);
-	struct _starpu_prio_deque * tmp_fifo = wsd->fifos[i_component];
+	struct _starpu_prio_deque tmp_fifo = wsd->fifos[i_component];
 	wsd->fifos[i_component] = wsd->fifos[component->nchildren - 1];
 
 
 	component->children[i_component] = component->children[component->nchildren - 1];
 	component->nchildren--;
 	struct starpu_task * task;
-	while ((task = _starpu_prio_deque_pop_task(tmp_fifo)))
+	while ((task = _starpu_prio_deque_pop_task(&tmp_fifo)))
 	{
 		starpu_sched_component_push_task(NULL, component, task);
 	}
-	_starpu_prio_deque_destroy(tmp_fifo);
-	free(tmp_fifo);
 }
 
 void _work_stealing_component_deinit_data(struct starpu_sched_component * component)

+ 9 - 9
src/sched_policies/component_worker.c

@@ -408,7 +408,7 @@ static int simple_worker_push_task(struct starpu_sched_component * component, st
 	t->task = task;
 	t->ntasks = 1;
 
-	task->workerid = starpu_bitmap_first(component->workers);
+	task->workerid = starpu_bitmap_first(&component->workers);
 #if 1 /* dead lock problem? */
 	if (starpu_get_prefetch_flag() && !task->prefetched)
 		starpu_prefetch_task_input_for(task, task->workerid);
@@ -522,7 +522,7 @@ static double simple_worker_estimated_load(struct starpu_sched_component * compo
 	int ntasks_in_fifo = l ? l->ntasks : 0;
 	return (double) (nb_task + ntasks_in_fifo)
 		/ starpu_worker_get_relative_speedup(
-				starpu_worker_get_perf_archtype(starpu_bitmap_first(component->workers), component->tree->sched_ctx_id));
+				starpu_worker_get_perf_archtype(starpu_bitmap_first(&component->workers), component->tree->sched_ctx_id));
 }
 
 static void _worker_component_deinit_data(struct starpu_sched_component * component)
@@ -567,8 +567,8 @@ static struct starpu_sched_component * starpu_sched_component_worker_create(stru
 	component->estimated_end = simple_worker_estimated_end;
 	component->estimated_load = simple_worker_estimated_load;
 	component->deinit_data = _worker_component_deinit_data;
-	starpu_bitmap_set(component->workers, workerid);
-	starpu_bitmap_or(component->workers_in_ctx, component->workers);
+	starpu_bitmap_set(&component->workers, workerid);
+	starpu_bitmap_or(&component->workers_in_ctx, &component->workers);
 	_worker_components[tree->sched_ctx_id][workerid] = component;
 
 	/*
@@ -616,7 +616,7 @@ static int combined_worker_push_task(struct starpu_sched_component * component,
 	struct _starpu_worker_component_data * data = component->data;
 	STARPU_ASSERT(data->parallel_worker.worker_size >= 1);
 	struct _starpu_task_grid * task_alias[data->parallel_worker.worker_size];
-	starpu_parallel_task_barrier_init(task, starpu_bitmap_first(component->workers));
+	starpu_parallel_task_barrier_init(task, starpu_bitmap_first(&component->workers));
 	task_alias[0] = _starpu_task_grid_create();
 	task_alias[0]->task = starpu_task_dup(task);
 	task_alias[0]->task->workerid = data->parallel_worker.workerids[0];
@@ -750,8 +750,8 @@ static struct starpu_sched_component  * starpu_sched_component_combined_worker_c
 
 	struct starpu_sched_component *component = starpu_sched_component_parallel_worker_create(tree, combined_worker->worker_size, (unsigned *) combined_worker->combined_workerid);
 
-	starpu_bitmap_set(component->workers, workerid);
-	starpu_bitmap_or(component->workers_in_ctx, component->workers);
+	starpu_bitmap_set(&component->workers, workerid);
+	starpu_bitmap_or(&component->workers_in_ctx, &component->workers);
 
 	_worker_components[tree->sched_ctx_id][workerid] = component;
 
@@ -803,8 +803,8 @@ int starpu_sched_component_worker_get_workerid(struct starpu_sched_component * w
 #ifndef STARPU_NO_ASSERT
 	STARPU_ASSERT(_worker_consistant(worker_component));
 #endif
-	STARPU_ASSERT(1 == starpu_bitmap_cardinal(worker_component->workers));
-	return starpu_bitmap_first(worker_component->workers);
+	STARPU_ASSERT(1 == starpu_bitmap_cardinal(&worker_component->workers));
+	return starpu_bitmap_first(&worker_component->workers);
 }
 
 void starpu_sched_component_worker_pre_exec_hook(struct starpu_task * task, unsigned sched_ctx_id STARPU_ATTRIBUTE_UNUSED)

+ 23 - 39
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -52,7 +52,7 @@ struct _starpu_dmda_data
 	double _gamma;
 	double idle_power;
 
-	struct _starpu_fifo_taskq **queue_array;
+	struct _starpu_fifo_taskq queue_array[STARPU_NMAXWORKERS];
 
 	long int total_task_cnt;
 	long int ready_task_cnt;
@@ -234,7 +234,7 @@ static struct starpu_task *_dmda_pop_task(unsigned sched_ctx_id, int ready)
 	struct starpu_task *task;
 
 	unsigned workerid = starpu_worker_get_id_check();
-	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
+	struct _starpu_fifo_taskq *fifo = &dt->queue_array[workerid];
 
 	/* Take the opportunity to update start time */
 	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
@@ -284,7 +284,7 @@ static struct starpu_task *dmda_pop_every_task(unsigned sched_ctx_id)
 	struct starpu_task *new_list, *task;
 
 	unsigned workerid = starpu_worker_get_id_check();
-	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
+	struct _starpu_fifo_taskq *fifo = &dt->queue_array[workerid];
 
 	/* Take the opportunity to update start time */
 	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
@@ -323,7 +323,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 			return 0;
 	}
 
-	struct _starpu_fifo_taskq *fifo = dt->queue_array[best_workerid];
+	struct _starpu_fifo_taskq *fifo = &dt->queue_array[best_workerid];
 
 	double now = starpu_timing_now();
 
@@ -405,13 +405,13 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	if (prio)
 	{
 		starpu_worker_lock(best_workerid);
-		ret =_starpu_fifo_push_sorted_task(dt->queue_array[best_workerid], task);
+		ret =_starpu_fifo_push_sorted_task(&dt->queue_array[best_workerid], task);
 		if(dt->num_priorities != -1)
 		{
 			int i;
 			int task_prio = _starpu_normalize_prio(task->priority, dt->num_priorities, task->sched_ctx);
 			for(i = 0; i <= task_prio; i++)
-				dt->queue_array[best_workerid]->ntasks_per_priority[i]++;
+				dt->queue_array[best_workerid].ntasks_per_priority[i]++;
 		}
 
 
@@ -424,9 +424,9 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	else
 	{
 		starpu_worker_lock(best_workerid);
-		starpu_task_list_push_back (&dt->queue_array[best_workerid]->taskq, task);
-		dt->queue_array[best_workerid]->ntasks++;
-		dt->queue_array[best_workerid]->nprocessed++;
+		starpu_task_list_push_back (&dt->queue_array[best_workerid].taskq, task);
+		dt->queue_array[best_workerid].ntasks++;
+		dt->queue_array[best_workerid].nprocessed++;
 #if !defined(STARPU_NON_BLOCKING_DRIVERS) || defined(STARPU_SIMGRID)
 		starpu_wake_worker_locked(best_workerid);
 #endif
@@ -469,7 +469,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 		unsigned nimpl;
 		unsigned impl_mask;
 		unsigned worker = workers->get_next(workers, &it);
-		struct _starpu_fifo_taskq *fifo  = dt->queue_array[worker];
+		struct _starpu_fifo_taskq *fifo  = &dt->queue_array[worker];
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
 
 		/* Sometimes workers didn't take the tasks as early as we expected */
@@ -624,7 +624,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 		unsigned nimpl;
 		unsigned impl_mask;
 		unsigned workerid = workers->get_next(workers, &it);
-		struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
+		struct _starpu_fifo_taskq *fifo = &dt->queue_array[workerid];
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
 		unsigned memory_node = starpu_worker_get_memory_node(workerid);
 
@@ -965,15 +965,12 @@ static void dmda_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nwo
 		int workerid = workerids[i];
 		/* if the worker has alreadry belonged to this context
 		   the queue and the synchronization variables have been already initialized */
-		q = dt->queue_array[workerid];
-		if(q == NULL)
-		{
-			q = dt->queue_array[workerid] = _starpu_create_fifo();
-			/* These are only stats, they can be read with races */
-			STARPU_HG_DISABLE_CHECKING(q->exp_start);
-			STARPU_HG_DISABLE_CHECKING(q->exp_len);
-			STARPU_HG_DISABLE_CHECKING(q->exp_end);
-		}
+		q = &dt->queue_array[workerid];
+		_starpu_init_fifo(q);
+		/* These are only stats, they can be read with races */
+		STARPU_HG_DISABLE_CHECKING(q->exp_start);
+		STARPU_HG_DISABLE_CHECKING(q->exp_len);
+		STARPU_HG_DISABLE_CHECKING(q->exp_end);
 
 		if(dt->num_priorities != -1)
 		{
@@ -997,16 +994,10 @@ static void dmda_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned
 	for (i = 0; i < nworkers; i++)
 	{
 		int workerid = workerids[i];
-		if(dt->queue_array[workerid] != NULL)
+		if(dt->num_priorities != -1)
 		{
-			if(dt->num_priorities != -1)
-			{
-				free(dt->queue_array[workerid]->exp_len_per_priority);
-				free(dt->queue_array[workerid]->ntasks_per_priority);
-			}
-
-			_starpu_destroy_fifo(dt->queue_array[workerid]);
-			dt->queue_array[workerid] = NULL;
+			free(dt->queue_array[workerid].exp_len_per_priority);
+			free(dt->queue_array[workerid].ntasks_per_priority);
 		}
 	}
 }
@@ -1018,12 +1009,6 @@ static void initialize_dmda_policy(unsigned sched_ctx_id)
 
 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)dt);
 
-	_STARPU_MALLOC(dt->queue_array, STARPU_NMAXWORKERS*sizeof(struct _starpu_fifo_taskq*));
-
-	int i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
-		dt->queue_array[i] = NULL;
-
 	dt->alpha = starpu_get_env_float_default("STARPU_SCHED_ALPHA", _STARPU_SCHED_ALPHA_DEFAULT);
 	dt->beta = starpu_get_env_float_default("STARPU_SCHED_BETA", _STARPU_SCHED_BETA_DEFAULT);
 	dt->_gamma = starpu_get_env_float_default("STARPU_SCHED_GAMMA", _STARPU_SCHED_GAMMA_DEFAULT);
@@ -1069,7 +1054,6 @@ static void deinitialize_dmda_policy(unsigned sched_ctx_id)
 	}
 #endif
 
-	free(dt->queue_array);
 	free(dt);
 }
 
@@ -1080,7 +1064,7 @@ static void dmda_pre_exec_hook(struct starpu_task *task, unsigned sched_ctx_id)
 {
 	unsigned workerid = starpu_worker_get_id_check();
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
-	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
+	struct _starpu_fifo_taskq *fifo = &dt->queue_array[workerid];
 	const double now = starpu_timing_now();
 
 	/* Once the task is executing, we can update the predicted amount
@@ -1099,7 +1083,7 @@ static void dmda_pre_exec_hook(struct starpu_task *task, unsigned sched_ctx_id)
 static void dmda_push_task_notify(struct starpu_task *task, int workerid, int perf_workerid, unsigned sched_ctx_id)
 {
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
-	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
+	struct _starpu_fifo_taskq *fifo = &dt->queue_array[workerid];
 
 	/* Compute the expected penality */
 	double predicted = starpu_task_worker_expected_length(task, perf_workerid, STARPU_NMAX_SCHED_CTXS,
@@ -1174,7 +1158,7 @@ static void dmda_post_exec_hook(struct starpu_task * task, unsigned sched_ctx_id
 {
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	unsigned workerid = starpu_worker_get_id_check();
-	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
+	struct _starpu_fifo_taskq *fifo = &dt->queue_array[workerid];
 	starpu_worker_lock_self();
 	_starpu_fifo_task_finished(fifo, task, dt->num_priorities);
 	starpu_worker_unlock_self();

+ 15 - 24
src/sched_policies/eager_central_policy.c

@@ -29,9 +29,9 @@
 
 struct _starpu_eager_center_policy_data
 {
-	struct _starpu_fifo_taskq *fifo;
+	struct _starpu_fifo_taskq fifo;
 	starpu_pthread_mutex_t policy_mutex;
-	struct starpu_bitmap *waiters;
+	struct starpu_bitmap waiters;
 };
 
 static void initialize_eager_center_policy(unsigned sched_ctx_id)
@@ -40,13 +40,8 @@ static void initialize_eager_center_policy(unsigned sched_ctx_id)
 	_STARPU_MALLOC(data, sizeof(struct _starpu_eager_center_policy_data));
 
 	/* there is only a single queue in that trivial design */
-	data->fifo =  _starpu_create_fifo();
-	data->waiters = starpu_bitmap_create();
-
-	 /* Tell helgrind that it's fine to check for empty fifo in
-	  * pop_task_eager_policy without actual mutex (it's just an integer)
-	  */
-	STARPU_HG_DISABLE_CHECKING(data->fifo->ntasks);
+	_starpu_init_fifo(&data->fifo);
+	starpu_bitmap_init(&data->waiters);
 
 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)data);
 	STARPU_PTHREAD_MUTEX_INIT(&data->policy_mutex, NULL);
@@ -55,14 +50,10 @@ static void initialize_eager_center_policy(unsigned sched_ctx_id)
 static void deinitialize_eager_center_policy(unsigned sched_ctx_id)
 {
 	struct _starpu_eager_center_policy_data *data = (struct _starpu_eager_center_policy_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
-	struct _starpu_fifo_taskq *fifo = data->fifo;
+	struct _starpu_fifo_taskq *fifo = &data->fifo;
 
 	STARPU_ASSERT(starpu_task_list_empty(&fifo->taskq));
 
-	/* deallocate the job queue */
-	_starpu_destroy_fifo(fifo);
-	starpu_bitmap_destroy(data->waiters);
-
 	STARPU_PTHREAD_MUTEX_DESTROY(&data->policy_mutex);
 	free(data);
 }
@@ -75,9 +66,9 @@ static int push_task_eager_policy(struct starpu_task *task)
 	starpu_worker_relax_on();
 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
 	starpu_worker_relax_off();
-	starpu_task_list_push_back(&data->fifo->taskq,task);
-	data->fifo->ntasks++;
-	data->fifo->nprocessed++;
+	starpu_task_list_push_back(&data->fifo.taskq,task);
+	data->fifo.ntasks++;
+	data->fifo.nprocessed++;
 
 	if (_starpu_get_nsched_ctxs() > 1)
 	{
@@ -105,7 +96,7 @@ static int push_task_eager_policy(struct starpu_task *task)
 		unsigned worker = workers->get_next(workers, &it);
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
-		if (!starpu_bitmap_get(data->waiters, worker))
+		if (!starpu_bitmap_get(&data->waiters, worker))
 			/* This worker is not waiting for a task */
 			continue;
 #endif
@@ -114,7 +105,7 @@ static int push_task_eager_policy(struct starpu_task *task)
 		{
 			/* It can execute this one, tell him! */
 #ifdef STARPU_NON_BLOCKING_DRIVERS
-			starpu_bitmap_unset(data->waiters, worker);
+			starpu_bitmap_unset(&data->waiters, worker);
 			/* We really woke at least somebody, no need to wake somebody else */
 			break;
 #else
@@ -146,7 +137,7 @@ static struct starpu_task *pop_every_task_eager_policy(unsigned sched_ctx_id)
 	struct _starpu_eager_center_policy_data *data = (struct _starpu_eager_center_policy_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	unsigned workerid = starpu_worker_get_id_check();
 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
-	struct starpu_task* task = _starpu_fifo_pop_every_task(data->fifo, workerid);
+	struct starpu_task* task = _starpu_fifo_pop_every_task(&data->fifo, workerid);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 
 	starpu_sched_ctx_list_task_counters_reset_all(task, sched_ctx_id);
@@ -163,13 +154,13 @@ static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 	/* Here helgrind would shout that this is unprotected, this is just an
 	 * integer access, and we hold the sched mutex, so we can not miss any
 	 * wake up. */
-	if (!STARPU_RUNNING_ON_VALGRIND && _starpu_fifo_empty(data->fifo))
+	if (!STARPU_RUNNING_ON_VALGRIND && _starpu_fifo_empty(&data->fifo))
 	{
 		return NULL;
 	}
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
-	if (!STARPU_RUNNING_ON_VALGRIND && starpu_bitmap_get(data->waiters, workerid))
+	if (!STARPU_RUNNING_ON_VALGRIND && starpu_bitmap_get(&data->waiters, workerid))
 		/* Nobody woke us, avoid bothering the mutex */
 	{
 		return NULL;
@@ -180,10 +171,10 @@ static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
 	starpu_worker_relax_off();
 
-	chosen_task = _starpu_fifo_pop_task(data->fifo, workerid);
+	chosen_task = _starpu_fifo_pop_task(&data->fifo, workerid);
 	if (!chosen_task)
 		/* Tell pushers that we are waiting for tasks for us */
-		starpu_bitmap_set(data->waiters, workerid);
+		starpu_bitmap_set(&data->waiters, workerid);
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 	if(chosen_task &&_starpu_get_nsched_ctxs() > 1)

+ 7 - 8
src/sched_policies/eager_central_priority_policy.c

@@ -35,7 +35,7 @@ struct _starpu_eager_central_prio_data
 {
 	struct _starpu_prio_deque taskq;
 	starpu_pthread_mutex_t policy_mutex;
-	struct starpu_bitmap *waiters;
+	struct starpu_bitmap waiters;
 };
 
 /*
@@ -49,7 +49,7 @@ static void initialize_eager_center_priority_policy(unsigned sched_ctx_id)
 
 	/* only a single queue (even though there are several internally) */
 	_starpu_prio_deque_init(&data->taskq);
-	data->waiters = starpu_bitmap_create();
+	starpu_bitmap_init(&data->waiters);
 
 	/* Tell helgrind that it's fine to check for empty fifo in
 	 * _starpu_priority_pop_task without actual mutex (it's just an
@@ -72,7 +72,6 @@ static void deinitialize_eager_center_priority_policy(unsigned sched_ctx_id)
 
 	/* deallocate the job queue */
 	_starpu_prio_deque_destroy(&data->taskq);
-	starpu_bitmap_destroy(data->waiters);
 
 	STARPU_PTHREAD_MUTEX_DESTROY(&data->policy_mutex);
 	free(data);
@@ -115,7 +114,7 @@ static int _starpu_priority_push_task(struct starpu_task *task)
 		unsigned worker = workers->get_next(workers, &it);
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
-		if (!starpu_bitmap_get(data->waiters, worker))
+		if (!starpu_bitmap_get(&data->waiters, worker))
 			/* This worker is not waiting for a task */
 			continue;
 #endif
@@ -124,7 +123,7 @@ static int _starpu_priority_push_task(struct starpu_task *task)
 		{
 			/* It can execute this one, tell him! */
 #ifdef STARPU_NON_BLOCKING_DRIVERS
-			starpu_bitmap_unset(data->waiters, worker);
+			starpu_bitmap_unset(&data->waiters, worker);
 			/* We really woke at least somebody, no need to wake somebody else */
 			break;
 #else
@@ -170,7 +169,7 @@ static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 	}
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
-	if (!STARPU_RUNNING_ON_VALGRIND && starpu_bitmap_get(data->waiters, workerid))
+	if (!STARPU_RUNNING_ON_VALGRIND && starpu_bitmap_get(&data->waiters, workerid))
 		/* Nobody woke us, avoid bothering the mutex */
 	{
 		return NULL;
@@ -197,7 +196,7 @@ static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 			if(worker != workerid)
 			{
 #ifdef STARPU_NON_BLOCKING_DRIVERS
-				starpu_bitmap_unset(data->waiters, worker);
+				starpu_bitmap_unset(&data->waiters, worker);
 #else
 				starpu_wake_worker_relax_light(worker);
 #endif
@@ -208,7 +207,7 @@ static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 
 	if (!chosen_task)
 		/* Tell pushers that we are waiting for tasks for us */
-		starpu_bitmap_set(data->waiters, workerid);
+		starpu_bitmap_set(&data->waiters, workerid);
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 	if(chosen_task &&_starpu_get_nsched_ctxs() > 1)

+ 12 - 4
src/sched_policies/fifo_queues.c

@@ -44,14 +44,14 @@ static int is_sorted_task_list(struct starpu_task * task)
 }
 */
 
-struct _starpu_fifo_taskq *_starpu_create_fifo(void)
+void _starpu_init_fifo(struct _starpu_fifo_taskq *fifo)
 {
-	struct _starpu_fifo_taskq *fifo;
-	_STARPU_MALLOC(fifo, sizeof(struct _starpu_fifo_taskq));
-
 	/* note that not all mechanisms (eg. the semaphore) have to be used */
 	starpu_task_list_init(&fifo->taskq);
 	fifo->ntasks = 0;
+	/* Tell helgrind that it's fine to check for empty fifo in
+	 * pop_task_graph_test_policy without actual mutex (it's just an integer)
+	 */
 	STARPU_HG_DISABLE_CHECKING(fifo->ntasks);
 	fifo->nprocessed = 0;
 
@@ -60,6 +60,14 @@ struct _starpu_fifo_taskq *_starpu_create_fifo(void)
 	fifo->exp_end = fifo->exp_start;
 	fifo->exp_len_per_priority = NULL;
 	fifo->pipeline_len = 0.0;
+}
+
+struct _starpu_fifo_taskq *_starpu_create_fifo(void)
+{
+	struct _starpu_fifo_taskq *fifo;
+	_STARPU_MALLOC(fifo, sizeof(struct _starpu_fifo_taskq));
+
+	_starpu_init_fifo(fifo);
 
 	return fifo;
 }

+ 1 - 0
src/sched_policies/fifo_queues.h

@@ -50,6 +50,7 @@ struct _starpu_fifo_taskq
 };
 
 struct _starpu_fifo_taskq*_starpu_create_fifo(void) STARPU_ATTRIBUTE_MALLOC;
+void _starpu_init_fifo(struct _starpu_fifo_taskq *fifo);
 void _starpu_destroy_fifo(struct _starpu_fifo_taskq *fifo);
 
 int _starpu_fifo_empty(struct _starpu_fifo_taskq *fifo);

+ 15 - 22
src/sched_policies/graph_test_policy.c

@@ -36,11 +36,11 @@
 
 struct _starpu_graph_test_policy_data
 {
-	struct _starpu_fifo_taskq *fifo;	/* Bag of tasks which are ready before do_schedule is called */
+	struct _starpu_fifo_taskq fifo;	/* Bag of tasks which are ready before do_schedule is called */
 	struct _starpu_prio_deque prio_cpu;
 	struct _starpu_prio_deque prio_gpu;
 	starpu_pthread_mutex_t policy_mutex;
-	struct starpu_bitmap *waiters;
+	struct starpu_bitmap waiters;
 	unsigned computed;
 	unsigned descendants;			/* Whether we use descendants, or depths, for priorities */
 };
@@ -51,20 +51,15 @@ static void initialize_graph_test_policy(unsigned sched_ctx_id)
 	_STARPU_MALLOC(data, sizeof(struct _starpu_graph_test_policy_data));
 
 	/* there is only a single queue in that trivial design */
-	data->fifo =  _starpu_create_fifo();
+	_starpu_init_fifo(&data->fifo);
 	 _starpu_prio_deque_init(&data->prio_cpu);
 	 _starpu_prio_deque_init(&data->prio_gpu);
-	data->waiters = starpu_bitmap_create();
+	starpu_bitmap_init(&data->waiters);
 	data->computed = 0;
 	data->descendants = starpu_get_env_number_default("STARPU_SCHED_GRAPH_TEST_DESCENDANTS", 0);
 
 	_starpu_graph_record = 1;
 
-	 /* Tell helgrind that it's fine to check for empty fifo in
-	  * pop_task_graph_test_policy without actual mutex (it's just an integer)
-	  */
-	STARPU_HG_DISABLE_CHECKING(data->fifo->ntasks);
-
 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)data);
 	STARPU_PTHREAD_MUTEX_INIT(&data->policy_mutex, NULL);
 }
@@ -72,15 +67,13 @@ static void initialize_graph_test_policy(unsigned sched_ctx_id)
 static void deinitialize_graph_test_policy(unsigned sched_ctx_id)
 {
 	struct _starpu_graph_test_policy_data *data = (struct _starpu_graph_test_policy_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
-	struct _starpu_fifo_taskq *fifo = data->fifo;
+	struct _starpu_fifo_taskq *fifo = &data->fifo;
 
 	STARPU_ASSERT(starpu_task_list_empty(&fifo->taskq));
 
 	/* deallocate the job queue */
-	_starpu_destroy_fifo(fifo);
 	 _starpu_prio_deque_destroy(&data->prio_cpu);
 	 _starpu_prio_deque_destroy(&data->prio_gpu);
-	starpu_bitmap_destroy(data->waiters);
 
 	_starpu_graph_record = 0;
 	STARPU_PTHREAD_MUTEX_DESTROY(&data->policy_mutex);
@@ -194,9 +187,9 @@ static void do_schedule_graph_test_policy(unsigned sched_ctx_id)
 	}
 
 	/* Now that we have priorities, move tasks from bag to priority queue */
-	while(!_starpu_fifo_empty(data->fifo))
+	while(!_starpu_fifo_empty(&data->fifo))
 	{
-		struct starpu_task *task = _starpu_fifo_pop_task(data->fifo, -1);
+		struct starpu_task *task = _starpu_fifo_pop_task(&data->fifo, -1);
 		struct _starpu_prio_deque *prio = select_prio(sched_ctx_id, data, task);
 		_starpu_prio_deque_push_back_task(prio, task);
 	}
@@ -210,7 +203,7 @@ static void do_schedule_graph_test_policy(unsigned sched_ctx_id)
 	{
 		/* Tell each worker is shouldn't sleep any more */
 		unsigned worker = workers->get_next(workers, &it);
-		starpu_bitmap_unset(data->waiters, worker);
+		starpu_bitmap_unset(&data->waiters, worker);
 	}
 #endif
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
@@ -237,9 +230,9 @@ static int push_task_graph_test_policy(struct starpu_task *task)
 	if (!data->computed)
 	{
 		/* Priorities are not computed, leave the task in the bag for now */
-		starpu_task_list_push_back(&data->fifo->taskq,task);
-		data->fifo->ntasks++;
-		data->fifo->nprocessed++;
+		starpu_task_list_push_back(&data->fifo.taskq,task);
+		data->fifo.ntasks++;
+		data->fifo.nprocessed++;
 		starpu_push_task_end(task);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 		return 0;
@@ -266,7 +259,7 @@ static int push_task_graph_test_policy(struct starpu_task *task)
 		unsigned worker = workers->get_next(workers, &it);
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
-		if (!starpu_bitmap_get(data->waiters, worker))
+		if (!starpu_bitmap_get(&data->waiters, worker))
 			/* This worker is not waiting for a task */
 			continue;
 #endif
@@ -281,7 +274,7 @@ static int push_task_graph_test_policy(struct starpu_task *task)
 		{
 			/* It can execute this one, tell him! */
 #ifdef STARPU_NON_BLOCKING_DRIVERS
-			starpu_bitmap_unset(data->waiters, worker);
+			starpu_bitmap_unset(&data->waiters, worker);
 			/* We really woke at least somebody, no need to wake somebody else */
 			break;
 #else
@@ -333,7 +326,7 @@ static struct starpu_task *pop_task_graph_test_policy(unsigned sched_ctx_id)
 	if (!STARPU_RUNNING_ON_VALGRIND && !data->computed)
 		/* Not computed yet */
 		return NULL;
-	if (!STARPU_RUNNING_ON_VALGRIND && starpu_bitmap_get(data->waiters, workerid))
+	if (!STARPU_RUNNING_ON_VALGRIND && starpu_bitmap_get(&data->waiters, workerid))
 		/* Nobody woke us, avoid bothering the mutex */
 		return NULL;
 #endif
@@ -350,7 +343,7 @@ static struct starpu_task *pop_task_graph_test_policy(unsigned sched_ctx_id)
 	chosen_task = _starpu_prio_deque_pop_task_for_worker(prio, workerid, NULL);
 	if (!chosen_task)
 		/* Tell pushers that we are waiting for tasks for us */
-		starpu_bitmap_set(data->waiters, workerid);
+		starpu_bitmap_set(&data->waiters, workerid);
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 

+ 6 - 8
src/sched_policies/heteroprio.c

@@ -88,7 +88,7 @@ struct _heteroprio_worker_wrapper
 struct _starpu_heteroprio_data
 {
 	starpu_pthread_mutex_t policy_mutex;
-	struct starpu_bitmap *waiters;
+	struct starpu_bitmap waiters;
 	/* The bucket to store the tasks */
 	struct _heteroprio_bucket buckets[STARPU_HETEROPRIO_MAX_PRIO];
 	/* The number of buckets for each arch */
@@ -216,7 +216,7 @@ static void initialize_heteroprio_policy(unsigned sched_ctx_id)
 	_STARPU_MALLOC(hp, sizeof(struct _starpu_heteroprio_data));
 	memset(hp, 0, sizeof(*hp));
 
-	hp->waiters = starpu_bitmap_create();
+	starpu_bitmap_init(&hp->waiters);
 
 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)hp);
 
@@ -295,8 +295,6 @@ static void deinitialize_heteroprio_policy(unsigned sched_ctx_id)
 		_heteroprio_bucket_release(&hp->buckets[idx_prio]);
 	}
 
-	starpu_bitmap_destroy(hp->waiters);
-
 	STARPU_PTHREAD_MUTEX_DESTROY(&hp->policy_mutex);
 	free(hp);
 }
@@ -404,7 +402,7 @@ static int push_task_heteroprio_policy(struct starpu_task *task)
 		unsigned worker = workers->get_next(workers, &it);
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
-		if (!starpu_bitmap_get(hp->waiters, worker))
+		if (!starpu_bitmap_get(&hp->waiters, worker))
 			/* This worker is not waiting for a task */
 			continue;
 #endif
@@ -413,7 +411,7 @@ static int push_task_heteroprio_policy(struct starpu_task *task)
 		{
 			/* It can execute this one, tell him! */
 #ifdef STARPU_NON_BLOCKING_DRIVERS
-			starpu_bitmap_unset(hp->waiters, worker);
+			starpu_bitmap_unset(&hp->waiters, worker);
 			/* We really woke at least somebody, no need to wake somebody else */
 			break;
 #else
@@ -455,7 +453,7 @@ static struct starpu_task *pop_task_heteroprio_policy(unsigned sched_ctx_id)
 		return NULL;
 	}
 
-	if (!STARPU_RUNNING_ON_VALGRIND && starpu_bitmap_get(hp->waiters, workerid))
+	if (!STARPU_RUNNING_ON_VALGRIND && starpu_bitmap_get(&hp->waiters, workerid))
 	{
 		/* Nobody woke us, avoid bothering the mutex */
 		return NULL;
@@ -602,7 +600,7 @@ done:		;
 	if (!task)
 	{
 		/* Tell pushers that we are waiting for tasks_queue for us */
-		starpu_bitmap_set(hp->waiters, workerid);
+		starpu_bitmap_set(&hp->waiters, workerid);
 	}
 	STARPU_PTHREAD_MUTEX_UNLOCK(&hp->policy_mutex);
 

+ 4 - 4
src/sched_policies/modular_gemm.c

@@ -119,9 +119,9 @@ static int gemm_push_task(struct starpu_sched_component * component, struct star
 
 	int workerid;
 	/* It's not a GEMM, or no GPU wanted to take it, find somebody else */
-	for(workerid = starpu_bitmap_first(component->workers_in_ctx);
+	for(workerid = starpu_bitmap_first(&component->workers_in_ctx);
 	    workerid != -1;
-	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
+	    workerid = starpu_bitmap_next(&component->workers_in_ctx, workerid))
 	{
 		int nimpl;
 		for(nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
@@ -133,9 +133,9 @@ static int gemm_push_task(struct starpu_sched_component * component, struct star
 				{
 					struct starpu_sched_component *child = component->children[i];
 					int idworker;
-					for(idworker = starpu_bitmap_first(component->children[i]->workers);
+					for(idworker = starpu_bitmap_first(&component->children[i]->workers);
 						idworker != -1;
-						idworker = starpu_bitmap_next(component->children[i]->workers, idworker))
+						idworker = starpu_bitmap_next(&component->children[i]->workers, idworker))
 					{
 						if (idworker == workerid)
 						{

+ 11 - 24
src/sched_policies/parallel_eager.c

@@ -35,8 +35,8 @@ struct _starpu_peager_common_data *_peager_common_data = NULL;
 struct _starpu_peager_data
 {
 	starpu_pthread_mutex_t policy_mutex;
-	struct _starpu_fifo_taskq *fifo;
-	struct _starpu_fifo_taskq *local_fifo[STARPU_NMAXWORKERS];
+	struct _starpu_fifo_taskq fifo;
+	struct _starpu_fifo_taskq local_fifo[STARPU_NMAXWORKERS];
 };
 
 static void initialize_peager_common(void)
@@ -136,22 +136,12 @@ static void peager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned n
 		/* slaves pick up tasks from their local queue, their master
 		 * will put tasks directly in that local list when a parallel
 		 * task comes. */
-		data->local_fifo[workerid] = _starpu_create_fifo();
+		_starpu_init_fifo(&data->local_fifo[workerid]);
 	}
 }
 
-static void peager_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
+static void peager_remove_workers(unsigned sched_ctx_id, int *workerids STARPU_ATTRIBUTE_UNUSED, unsigned nworkers STARPU_ATTRIBUTE_UNUSED)
 {
-	struct _starpu_peager_data *data = (struct _starpu_peager_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
-	unsigned i;
-	for(i = 0; i < nworkers; i++)
-        {
-		int workerid = workerids[i];
-		if(!starpu_worker_is_combined_worker(workerid))
-		{
-			_starpu_destroy_fifo(data->local_fifo[workerid]);
-		}
-	}
 	if (sched_ctx_id == 0)
 	{
 		deinitialize_peager_common();
@@ -166,7 +156,7 @@ static void initialize_peager_policy(unsigned sched_ctx_id)
 	_STARPU_DISP("Warning: the peager scheduler is mostly a proof of concept and not really very optimized\n");
 
 	/* masters pick tasks from that queue */
-	data->fifo = _starpu_create_fifo();
+	_starpu_init_fifo(&data->fifo);
 
 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)data);
         STARPU_PTHREAD_MUTEX_INIT(&data->policy_mutex, NULL);
@@ -177,9 +167,6 @@ static void deinitialize_peager_policy(unsigned sched_ctx_id)
 	/* TODO check that there is no task left in the queue */
 	struct _starpu_peager_data *data = (struct _starpu_peager_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 
-	/* deallocate the job queue */
-	_starpu_destroy_fifo(data->fifo);
-
         STARPU_PTHREAD_MUTEX_DESTROY(&data->policy_mutex);
 
 	free(data);
@@ -193,7 +180,7 @@ static int push_task_peager_policy(struct starpu_task *task)
 	struct _starpu_peager_data *data = (struct _starpu_peager_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 
 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
-	ret_val = _starpu_fifo_push_task(data->fifo, task);
+	ret_val = _starpu_fifo_push_task(&data->fifo, task);
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 	int is_parallel_task = task->cl && task->cl->max_parallelism > 1;
 #endif
@@ -249,7 +236,7 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 		starpu_worker_relax_on();
 		STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
 		starpu_worker_relax_off();
-		task = _starpu_fifo_pop_task(data->fifo, workerid);
+		task = _starpu_fifo_pop_task(&data->fifo, workerid);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 
 		return task;
@@ -261,11 +248,11 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
 	starpu_worker_relax_off();
 	/* check if a slave task is available in the local queue */
-	task = _starpu_fifo_pop_task(data->local_fifo[workerid], workerid);
+	task = _starpu_fifo_pop_task(&data->local_fifo[workerid], workerid);
 	if (!task)
 	{
 		/* no slave task, try to pop a task as master */
-		task = _starpu_fifo_pop_task(data->fifo, workerid);
+		task = _starpu_fifo_pop_task(&data->fifo, workerid);
 		if (task)
 		{
 			_STARPU_DEBUG("poping master task %p\n", task);
@@ -277,7 +264,7 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 		{
 			/* task is potentially parallel, leave it for a combined worker master */
 			_STARPU_DEBUG("pushing back master task %p\n", task);
-			_starpu_fifo_push_back_task(data->fifo, task);
+			_starpu_fifo_push_back_task(&data->fifo, task);
 			task = NULL;
 		}
 #endif
@@ -339,7 +326,7 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 		int local_worker = combined_workerid[i];
 		alias->destroy = 1;
 		_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
-		_starpu_fifo_push_task(data->local_fifo[local_worker], alias);
+		_starpu_fifo_push_task(&data->local_fifo[local_worker], alias);
 	}
 
 	/* The master also manipulated an alias */

+ 0 - 4
tools/Makefile.am

@@ -399,10 +399,6 @@ starpu_replay_SOURCES = \
 	starpu_replay.c \
 	starpu_replay_sched.c
 
-if STARPU_USE_MPI
-SUBDIRS += replay-mpi
-endif
-
 endif
 
 starpu_perfmodel_plot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS) $(FXT_CFLAGS)

+ 1 - 0
tools/dev/lsan/suppressions

@@ -29,3 +29,4 @@ leak:hwloc_topology_load
 leak:hwloc_topology_set_xml
 leak:hwloc_components_init
 leak:hwloc_plugins_init
+leak:hwloc_plugins_exit

+ 7 - 0
tools/dev/valgrind/valgrind.sh

@@ -17,6 +17,13 @@
 EXEC=$(basename $0 .sh)
 DIRNAME=$(dirname $0)
 
+CLIMIT=$(ulimit -c)
+if [ "$CLIMIT" = unlimited ]
+then
+	# valgrind cores are often *huge*, 100MB will already be quite big...
+	ulimit -c 100000
+fi
+
 if test "$EXEC" == "valgrind"
 then
     RUN="valgrind --track-origins=yes --show-reachable=yes --leak-check=full --errors-for-leak-kinds=all --show-leak-kinds=all --error-exitcode=42"

+ 1 - 1
tools/starpu_replay.c

@@ -431,7 +431,7 @@ static void arrays_managing(int mode)
 	{
 		_STARPU_MALLOC(handles_ptr, sizeof(*handles_ptr) * nb_parameters);
 		_STARPU_MALLOC(modes_ptr, sizeof(*modes_ptr) * nb_parameters);
-		_STARPU_CALLOC(reg_signal, nb_parameters, sizeof(char *));
+		_STARPU_CALLOC(reg_signal, nb_parameters, sizeof(char));
 
 	}
 }

+ 2 - 0
tools/starpu_replay_sched.c

@@ -344,6 +344,8 @@ void schedRecInit(const char * filename)
 	}
 
 	fclose(f);
+
+	free(s);
 }
 
 static void do_prefetch(void *arg)