Browse Source

Merge branch 'master' into knobs

Nathalie Furmento 6 years ago
parent
commit
db1951c1f2
66 changed files with 1858 additions and 211 deletions
  1. 11 0
      ChangeLog
  2. 1 0
      Makefile.am
  3. 1 0
      configure.ac
  4. 2 1
      doc/doxygen/Makefile.am
  5. 2 1
      doc/doxygen/chapters/000_introduction.doxy
  6. 1 1
      doc/doxygen/chapters/101_building.doxy
  7. 1 12
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  8. 48 0
      doc/doxygen/chapters/415_fault_tolerance.doxy
  9. 6 1
      doc/doxygen/refman.tex
  10. 4 4
      examples/Makefile.am
  11. 6 2
      examples/lu/lu_example.c
  12. 2 1
      examples/lu/xlu_implicit_pivot.c
  13. 4 1
      examples/native_fortran/nf_example.f90
  14. 4 2
      include/starpu_profiling.h
  15. 1 0
      include/starpu_stdlib.h
  16. 67 0
      include/starpu_task.h
  17. 13 0
      src/common/utils.c
  18. 114 3
      src/core/dependencies/data_concurrency.c
  19. 2 1
      src/core/dependencies/data_concurrency.h
  20. 16 2
      src/core/jobs.c
  21. 2 1
      src/core/jobs.h
  22. 1 1
      src/core/perfmodel/perfmodel_history.c
  23. 126 30
      src/core/task.c
  24. 2 6
      src/core/task.h
  25. 14 8
      src/core/topology.c
  26. 50 24
      src/core/workers.c
  27. 0 2
      src/core/workers.h
  28. 0 6
      src/datawizard/filters.h
  29. 19 27
      src/datawizard/interfaces/block_filters.c
  30. 10 16
      src/datawizard/interfaces/matrix_filters.c
  31. 7 9
      src/datawizard/interfaces/vector_filters.c
  32. 11 3
      src/debug/traces/starpu_fxt.c
  33. 9 3
      src/drivers/driver_common/driver_common.c
  34. 1 0
      tests/Makefile.am
  35. 123 0
      tests/fault-tolerance/retry.c
  36. 2 2
      tests/perfmodels/regression_based.c
  37. 26 0
      tools/Makefile.am
  38. 10 2
      tools/dev/valgrind/hdf5.suppr
  39. 1 0
      tools/perfmodels/sampling/bus/hannibal-pitch.affinity
  40. 1 0
      tools/perfmodels/sampling/bus/hannibal-pitch.bandwidth
  41. 1 0
      tools/perfmodels/sampling/bus/hannibal-pitch.config
  42. 1 0
      tools/perfmodels/sampling/bus/hannibal-pitch.latency
  43. 70 0
      tools/perfmodels/sampling/bus/hannibal-pitch.platform.v4.xml
  44. 70 0
      tools/perfmodels/sampling/bus/hannibal-pitch.platform.xml
  45. 7 0
      tools/perfmodels/sampling/bus/hannibal.affinity
  46. 17 0
      tools/perfmodels/sampling/bus/hannibal.bandwidth
  47. 4 0
      tools/perfmodels/sampling/bus/hannibal.config
  48. 17 0
      tools/perfmodels/sampling/bus/hannibal.latency
  49. 70 0
      tools/perfmodels/sampling/bus/hannibal.platform.v4.xml
  50. 70 0
      tools/perfmodels/sampling/bus/hannibal.platform.xml
  51. 104 0
      tools/perfmodels/sampling/codelets/45/chol_model_11.hannibal
  52. 1 0
      tools/perfmodels/sampling/codelets/45/chol_model_11.hannibal-pitch
  53. 104 0
      tools/perfmodels/sampling/codelets/45/chol_model_21.hannibal
  54. 1 0
      tools/perfmodels/sampling/codelets/45/chol_model_21.hannibal-pitch
  55. 104 0
      tools/perfmodels/sampling/codelets/45/chol_model_22.hannibal
  56. 1 0
      tools/perfmodels/sampling/codelets/45/chol_model_22.hannibal-pitch
  57. 104 0
      tools/perfmodels/sampling/codelets/45/starpu_slu_lu_model_11.hannibal
  58. 1 0
      tools/perfmodels/sampling/codelets/45/starpu_slu_lu_model_11.hannibal-pitch
  59. 104 0
      tools/perfmodels/sampling/codelets/45/starpu_slu_lu_model_12.hannibal
  60. 1 0
      tools/perfmodels/sampling/codelets/45/starpu_slu_lu_model_12.hannibal-pitch
  61. 104 0
      tools/perfmodels/sampling/codelets/45/starpu_slu_lu_model_21.hannibal
  62. 1 0
      tools/perfmodels/sampling/codelets/45/starpu_slu_lu_model_21.hannibal-pitch
  63. 104 0
      tools/perfmodels/sampling/codelets/45/starpu_slu_lu_model_22.hannibal
  64. 1 0
      tools/perfmodels/sampling/codelets/45/starpu_slu_lu_model_22.hannibal-pitch
  65. 21 3
      tools/starpu_fxt_data_trace.c
  66. 54 36
      tools/starpu_replay.c

+ 11 - 0
ChangeLog

@@ -19,10 +19,21 @@
 StarPU 1.4.0 (svn revision xxxx)
 StarPU 1.4.0 (svn revision xxxx)
 ==============================================
 ==============================================
 New features:
 New features:
+  * Fault tolerance support with starpu_task_ft_failed().
 
 
 StarPU 1.3.2 (git revision xxx)
 StarPU 1.3.2 (git revision xxx)
 ==============================================
 ==============================================
 
 
+Small changes:
+  * Improve OpenMP support to detect that the environment is valid before
+    launching OpenMP
+  * Delete old code (drivers gordon, scc, starpu-top, and plugin gcc)
+    and update authors file accordingly
+  * Add Heteroprio documentation (including a simple example)
+  * Add a progression hook, to be called when workers are idle, which
+    is used in the NewMadeleine implementation of StarPU-MPI to ensure
+    communications progress.
+
 StarPU 1.3.1 (git revision 01949488b4f8e6fe26d2c200293b8aae5876b038)
 StarPU 1.3.1 (git revision 01949488b4f8e6fe26d2c200293b8aae5876b038)
 ==============================================
 ==============================================
 
 

+ 1 - 0
Makefile.am

@@ -120,6 +120,7 @@ noinst_HEADERS = \
 	include/pthread_win32/semaphore.h
 	include/pthread_win32/semaphore.h
 
 
 if STARPU_DEVEL
 if STARPU_DEVEL
+all-local:
 	@if $(GREP) -r sys/time.h $$( find $(srcdir)/examples $(srcdir)/tests $(srcdir)/src $(srcdir)/mpi/src $(srcdir)/include -name \*.[ch] -a \! -name starpu_util.h -a \! -name timer.h -a \! -name loader.c ) ; \
 	@if $(GREP) -r sys/time.h $$( find $(srcdir)/examples $(srcdir)/tests $(srcdir)/src $(srcdir)/mpi/src $(srcdir)/include -name \*.[ch] -a \! -name starpu_util.h -a \! -name timer.h -a \! -name loader.c ) ; \
 	then \
 	then \
 		echo "Please do not include sys/time, it is not available on Windows, include starpu_util.h and use starpu_timing_now() instead" ; \
 		echo "Please do not include sys/time, it is not available on Windows, include starpu_util.h and use starpu_timing_now() instead" ; \

+ 1 - 0
configure.ac

@@ -789,6 +789,7 @@ AC_SEARCH_LIBS([sqrt],[m],,AC_MSG_ERROR([math library unavailable]))
 AC_HAVE_LIBRARY([ws2_32])
 AC_HAVE_LIBRARY([ws2_32])
 AC_CHECK_FUNCS([sysconf])
 AC_CHECK_FUNCS([sysconf])
 AC_CHECK_FUNCS([getrlimit])
 AC_CHECK_FUNCS([getrlimit])
+AC_CHECK_FUNCS([scandir])
 
 
 AC_CHECK_FUNC([pthread_spin_lock], have_pthread_spin_lock=yes, have_pthread_spin_lock=no)
 AC_CHECK_FUNC([pthread_spin_lock], have_pthread_spin_lock=yes, have_pthread_spin_lock=no)
 if test x$have_pthread_spin_lock = xyes; then
 if test x$have_pthread_spin_lock = xyes; then

+ 2 - 1
doc/doxygen/Makefile.am

@@ -2,7 +2,7 @@
 #
 #
 # Copyright (C) 2013-2018                                Inria
 # Copyright (C) 2013-2018                                Inria
 # Copyright (C) 2010-2019                                CNRS
 # Copyright (C) 2010-2019                                CNRS
-# Copyright (C) 2009,2011,2013,2014,2017                 Université de Bordeaux
+# Copyright (C) 2009,2011,2013,2014,2017,2019            Université de Bordeaux
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
 # it under the terms of the GNU Lesser General Public License as published by
@@ -72,6 +72,7 @@ chapters =	\
 	chapters/390_faq.doxy		\
 	chapters/390_faq.doxy		\
 	chapters/401_out_of_core.doxy		\
 	chapters/401_out_of_core.doxy		\
 	chapters/410_mpi_support.doxy		\
 	chapters/410_mpi_support.doxy		\
+	chapters/415_fault_tolerance.doxy	\
 	chapters/420_fft_support.doxy		\
 	chapters/420_fft_support.doxy		\
 	chapters/430_mic_support.doxy		\
 	chapters/430_mic_support.doxy		\
 	chapters/450_native_fortran_support.doxy		\
 	chapters/450_native_fortran_support.doxy		\

+ 2 - 1
doc/doxygen/chapters/000_introduction.doxy

@@ -2,7 +2,7 @@
  *
  *
  * Copyright (C) 2010-2019                                CNRS
  * Copyright (C) 2010-2019                                CNRS
  * Copyright (C) 2011-2013,2016                           Inria
  * Copyright (C) 2011-2013,2016                           Inria
- * Copyright (C) 2009-2011,2014,2016                      Université de Bordeaux
+ * Copyright (C) 2009-2011,2014,2016,2019                 Université de Bordeaux
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -290,6 +290,7 @@ The documentation chapters include
 <ul>
 <ul>
 <li> \ref OutOfCore
 <li> \ref OutOfCore
 <li> \ref MPISupport
 <li> \ref MPISupport
+<li> \ref FaultTolerance
 <li> \ref FFTSupport
 <li> \ref FFTSupport
 <li> \ref MICSupport
 <li> \ref MICSupport
 <li> \ref NativeFortranSupport
 <li> \ref NativeFortranSupport

+ 1 - 1
doc/doxygen/chapters/101_building.doxy

@@ -209,7 +209,7 @@ $ export LD_LIBRARY_PATH=$STARPU_PATH/lib:$LD_LIBRARY_PATH
 And it is useful to get access to the StarPU tools:
 And it is useful to get access to the StarPU tools:
 
 
 \verbatim
 \verbatim
-$ PATH=$PATH:$STARPU_PATH/bin
+$ export PATH=$PATH:$STARPU_PATH/bin
 \endverbatim
 \endverbatim
 
 
 It is then useful to check that StarPU executes correctly and finds your hardware:
 It is then useful to check that StarPU executes correctly and finds your hardware:

+ 1 - 12
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -60,18 +60,7 @@ StarPU can use the FxT library (see
 https://savannah.nongnu.org/projects/fkt/) to generate traces
 https://savannah.nongnu.org/projects/fkt/) to generate traces
 with a limited runtime overhead.
 with a limited runtime overhead.
 
 
-You can either get a tarball:
-
-\verbatim
-$ wget http://download.savannah.gnu.org/releases/fkt/fxt-0.2.11.tar.gz
-\endverbatim
-
-or use the FxT library from CVS (autotools are required):
-
-\verbatim
-$ cvs -d :pserver:anonymous\@cvs.sv.gnu.org:/sources/fkt co FxT
-$ ./bootstrap
-\endverbatim
+You can get a tarball from http://download.savannah.gnu.org/releases/fkt/
 
 
 Compiling and installing the FxT library in the <c>$FXTDIR</c> path is
 Compiling and installing the FxT library in the <c>$FXTDIR</c> path is
 done following the standard procedure:
 done following the standard procedure:

+ 48 - 0
doc/doxygen/chapters/415_fault_tolerance.doxy

@@ -0,0 +1,48 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*! \page FaultTolerance Fault Tolerance
+
+\section Introduction Introduction
+
+Due to e.g. hardware error, some tasks may fail, or even complete nodes may
+fail.  For now, StarPU provides some support for failure of tasks.
+
+\section TaskRetry Retrying tasks
+
+In case a task implementation notices that it failed to compute properly, it can
+call starpu_task_ft_failed() to notify StarPU of the failure.
+
+<c>tests/fault-tolerance/retry.c</c> is an example of coping with such failure:
+the principle is that when submitting the task, one sets its prologue callback
+to starpu_task_ft_prologue(). That prologue will turn the task into a meta
+task which will manage the repeated submission of try-tasks to perform the
+computation until one of the computations succeeds.
+
+By default, try-tasks will be just retried until one of them succeeds (i.e. the
+task implementation does not call starpu_task_ft_failed()). One can change the
+behavior by passing a <c>check_ft</c> function as prologue parameter,
+which will be called at the end of the try-task attempt. It can look at
+<c>starpu_task_get_current()->failed</c> to determine whether the try-task
+succeeded, in which case it can call starpu_task_ft_success() on the meta-task to
+notify success, or if it failed, in which case it can call
+starpu_task_ft_create_retry() to create another try-task, and submit it
+with starpu_task_submit_nodeps().
+
+This can however only work if the task inputs are not modified, and is thus not
+supported for tasks with data access mode ::STARPU_RW.
+
+*/

+ 6 - 1
doc/doxygen/refman.tex

@@ -2,7 +2,7 @@
 %
 %
 % Copyright (C) 2013-2016,2018                           Inria
 % Copyright (C) 2013-2016,2018                           Inria
 % Copyright (C) 2013-2019                                CNRS
 % Copyright (C) 2013-2019                                CNRS
-% Copyright (C) 2014,2018                                Université de Bordeaux
+% Copyright (C) 2014,2018-2019                                Université de Bordeaux
 % Copyright (C) 2013                                     Simon Archipoff
 % Copyright (C) 2013                                     Simon Archipoff
 %
 %
 % StarPU is free software; you can redistribute it and/or modify
 % StarPU is free software; you can redistribute it and/or modify
@@ -154,6 +154,11 @@ Documentation License”.
 \hypertarget{MPISupport}{}
 \hypertarget{MPISupport}{}
 \input{MPISupport}
 \input{MPISupport}
 
 
+\chapter{Fault Tolerance}
+\label{FaultTolerance}
+\hypertarget{FaultTolerance}{}
+\input{FaultTolerance}
+
 \chapter{FFT Support}
 \chapter{FFT Support}
 \label{FFTSupport}
 \label{FFTSupport}
 \hypertarget{FFTSupport}{}
 \hypertarget{FFTSupport}{}

+ 4 - 4
examples/Makefile.am

@@ -304,6 +304,10 @@ if !NO_BLAS_LIB
 STARPU_EXAMPLES +=				\
 STARPU_EXAMPLES +=				\
 	mult/sgemm 				\
 	mult/sgemm 				\
 	mult/dgemm				\
 	mult/dgemm				\
+	lu/lu_example_float			\
+	lu/lu_example_double			\
+	lu/lu_implicit_example_float		\
+	lu/lu_implicit_example_double		\
 	cholesky/cholesky_tag			\
 	cholesky/cholesky_tag			\
 	cholesky/cholesky_tile_tag		\
 	cholesky/cholesky_tile_tag		\
 	cholesky/cholesky_implicit		\
 	cholesky/cholesky_implicit		\
@@ -313,10 +317,6 @@ if !STARPU_SIMGRID
 STARPU_EXAMPLES +=				\
 STARPU_EXAMPLES +=				\
 	axpy/axpy				\
 	axpy/axpy				\
 	cholesky/cholesky_grain_tag		\
 	cholesky/cholesky_grain_tag		\
-	lu/lu_example_float			\
-	lu/lu_example_double			\
-	lu/lu_implicit_example_float		\
-	lu/lu_implicit_example_double		\
 	heat/heat				\
 	heat/heat				\
 	cg/cg					\
 	cg/cg					\
 	pipeline/pipeline
 	pipeline/pipeline

+ 6 - 2
examples/lu/lu_example.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2017                                Université de Bordeaux
+ * Copyright (C) 2009-2017,2019                           Université de Bordeaux
  * Copyright (C) 2011,2012                                Inria
  * Copyright (C) 2011,2012                                Inria
  * Copyright (C) 2010-2013,2015-2018                      CNRS
  * Copyright (C) 2010-2013,2015-2018                      CNRS
  *
  *
@@ -176,7 +176,11 @@ void copy_matrix_into_blocks(void)
 static void init_matrix(void)
 static void init_matrix(void)
 {
 {
 	/* allocate matrix */
 	/* allocate matrix */
+#ifdef STARPU_SIMGRID
+	A = (void*) 1;
+#else
 	starpu_malloc_flags((void **)&A, (size_t)size*size*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
 	starpu_malloc_flags((void **)&A, (size_t)size*size*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+#endif
 	STARPU_ASSERT(A);
 	STARPU_ASSERT(A);
 
 
 	starpu_srand48((long int)time(NULL));
 	starpu_srand48((long int)time(NULL));
@@ -435,10 +439,10 @@ int main(int argc, char **argv)
 
 
 		check_result();
 		check_result();
 	}
 	}
-#endif
 
 
 	if (pivot)
 	if (pivot)
 		free(ipiv);
 		free(ipiv);
+#endif
 
 
 	starpu_free_flags(A, (size_t)size*size*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
 	starpu_free_flags(A, (size_t)size*size*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
 
 

+ 2 - 1
examples/lu/xlu_implicit_pivot.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2015,2017,2018                      Université de Bordeaux
+ * Copyright (C) 2010-2015,2017,2018-2019                 Université de Bordeaux
  * Copyright (C) 2013                                     Inria
  * Copyright (C) 2013                                     Inria
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010-2013,2015-2019                      CNRS
  * Copyright (C) 2010-2013,2015-2019                      CNRS
@@ -367,5 +367,6 @@ int STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, uns
 		starpu_data_unregister(dataAp[bi+nblocks*bj]);
 		starpu_data_unregister(dataAp[bi+nblocks*bj]);
 	}
 	}
 	free(dataAp);
 	free(dataAp);
+	free(piv_description);
 	return ret;
 	return ret;
 }
 }

+ 4 - 1
examples/native_fortran/nf_example.f90

@@ -2,7 +2,7 @@
 !
 !
 ! Copyright (C) 2017                                     CNRS
 ! Copyright (C) 2017                                     CNRS
 ! Copyright (C) 2015,2016                                Inria
 ! Copyright (C) 2015,2016                                Inria
-! Copyright (C) 2016                                     Université de Bordeaux
+! Copyright (C) 2016,2019                                Université de Bordeaux
 ! Copyright (C) 2015                                     ONERA
 ! Copyright (C) 2015                                     ONERA
 !
 !
 ! StarPU is free software; you can redistribute it and/or modify
 ! StarPU is free software; you can redistribute it and/or modify
@@ -36,6 +36,7 @@ PROGRAM f90_example
   INTEGER(KIND=C_INT)            :: starpu_maj,starpu_min,starpu_rev
   INTEGER(KIND=C_INT)            :: starpu_maj,starpu_min,starpu_rev
   INTEGER(KIND=C_INT)            :: neq,ng,nb,it,it_tot
   INTEGER(KIND=C_INT)            :: neq,ng,nb,it,it_tot
   REAL(KIND=C_DOUBLE)            :: r, coeff2
   REAL(KIND=C_DOUBLE)            :: r, coeff2
+  REAL(KIND=C_DOUBLE),TARGET     :: flops
 
 
   TYPE(C_PTR) :: cl_loop_element = C_NULL_PTR ! loop codelet
   TYPE(C_PTR) :: cl_loop_element = C_NULL_PTR ! loop codelet
   TYPE(C_PTR) :: cl_copy_element = C_NULL_PTR ! copy codelet
   TYPE(C_PTR) :: cl_copy_element = C_NULL_PTR ! copy codelet
@@ -95,11 +96,13 @@ PROGRAM f90_example
      ! compute new dro for each element
      ! compute new dro for each element
      DO i = 1,Nelt
      DO i = 1,Nelt
         elt => mesh%elt(i)
         elt => mesh%elt(i)
+        flops = elt%Ng * ( (elt%Np * numpar%Neq_max * 2) + 1 + elt%Np * numpar%Neq_max)
         CALL fstarpu_insert_task((/ cl_loop_element,    &
         CALL fstarpu_insert_task((/ cl_loop_element,    &
                 FSTARPU_VALUE, c_loc(numpar%coeff), FSTARPU_SZ_C_DOUBLE, &
                 FSTARPU_VALUE, c_loc(numpar%coeff), FSTARPU_SZ_C_DOUBLE, &
                 FSTARPU_R, elt%ro_h,                 &
                 FSTARPU_R, elt%ro_h,                 &
                 FSTARPU_RW, elt%dro_h,                &
                 FSTARPU_RW, elt%dro_h,                &
                 FSTARPU_R, elt%basis_h,              &
                 FSTARPU_R, elt%basis_h,              &
+                FSTARPU_FLOPS, c_loc(flops),         &
                 C_NULL_PTR /))
                 C_NULL_PTR /))
      ENDDO
      ENDDO
      ! sync (if needed by the algorithm)
      ! sync (if needed by the algorithm)

+ 4 - 2
include/starpu_profiling.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2014,2016,2017                      Université de Bordeaux
- * Copyright (C) 2010,2011,2013,2015,2017,2019                 CNRS
+ * Copyright (C) 2010-2014,2016,2017,2019                 Université de Bordeaux
+ * Copyright (C) 2010,2011,2013,2015,2017,2019            CNRS
  * Copyright (C) 2016                                     Inria
  * Copyright (C) 2016                                     Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -118,6 +118,8 @@ struct starpu_profiling_worker_info
 	/** Energy consumed by the worker, in Joules */
 	/** Energy consumed by the worker, in Joules */
 	double energy_consumed;
 	double energy_consumed;
 
 
+	/* TODO: add wasted time due to failed tasks */
+
 	double flops;
 	double flops;
 };
 };
 
 

+ 1 - 0
include/starpu_stdlib.h

@@ -242,6 +242,7 @@ void starpu_memory_deallocate(unsigned node, size_t size);
 void starpu_memory_wait_available(unsigned node, size_t size);
 void starpu_memory_wait_available(unsigned node, size_t size);
 
 
 void starpu_sleep(float nb_sec);
 void starpu_sleep(float nb_sec);
+void starpu_usleep(float nb_micro_sec);
 
 
 /** @} */
 /** @} */
 
 

+ 67 - 0
include/starpu_task.h

@@ -919,6 +919,13 @@ struct starpu_task
 	unsigned no_submitorder:1;
 	unsigned no_submitorder:1;
 
 
 	/**
 	/**
+	   Whether this task has failed and will thus have to be retried
+
+	   Set by StarPU.
+	*/
+	unsigned failed:1;
+
+	/**
 	   Whether the scheduler has pushed the task on some queue
 	   Whether the scheduler has pushed the task on some queue
 
 
 	   Set by StarPU.
 	   Set by StarPU.
@@ -1348,6 +1355,15 @@ void starpu_task_destroy(struct starpu_task *task);
 int starpu_task_submit(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 int starpu_task_submit(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 
 
 /**
 /**
+   Submit \p task to StarPU with dependency bypass.
+
+   This can only be called on behalf of another task which has already taken the
+   proper dependencies, e.g. this task is just an attempt of doing the actual
+   computation of that task.
+*/
+int starpu_task_submit_nodeps(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
+
+/**
    Submit \p task to the context \p sched_ctx_id. By default,
    Submit \p task to the context \p sched_ctx_id. By default,
    starpu_task_submit() submits the task to a global context that is
    starpu_task_submit() submits the task to a global context that is
    created automatically by StarPU.
    created automatically by StarPU.
@@ -1504,6 +1520,57 @@ unsigned starpu_task_get_implementation(struct starpu_task *task);
  */
  */
 void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg);
 void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg);
 
 
+
+
+
+/**
+   Function to be used as a prologue callback to enable fault tolerance for the
+   task. This prologue will create a try-task, i.e. a duplicate of the task,
+   which will do the actual computation.
+
+   The prologue argument can be set to a check_ft function that will be
+   called on termination of the duplicate, which can check the result of the
+   task, and either confirm success, or resubmit another attempt.
+   If it is not set, the default implementation is to just resubmit a new
+   try-task.
+ */
+void starpu_task_ft_prologue(void *check_ft);
+
+
+/**
+   Create a try-task for a \p meta_task, given a \p template_task task
+   template. The meta task can be passed as template on the first call, but
+   since it is mangled by starpu_task_ft_create_retry(), further calls
+   (typically made by the check_ft callback) need to be passed the previous
+   try-task as template task.
+
+   \p check_ft is similar to the prologue argument of
+   starpu_task_ft_prologue(), and is typically set to the very function calling
+   starpu_task_ft_create_retry().
+
+   The try-task is returned, and can be modified (e.g. to change scheduling
+   parameters) before being submitted with starpu_task_submit_nodeps().
+ */
+struct starpu_task * starpu_task_ft_create_retry(const struct starpu_task *meta_task, const struct starpu_task *template_task, void (*check_ft)(void*));
+
+/**
+   Record that this task failed, and should thus be retried.
+   This is usually called from the task codelet function itself, after checking
+   the result and noticing that the computation went wrong, and thus the task
+   should be retried. The performance of this task execution will not be
+   recorded for performance models.
+
+   This can only be called for a task whose data access modes are either
+   STARPU_R or STARPU_W.
+ */
+void starpu_task_ft_failed(struct starpu_task *task);
+
+/**
+   Notify that the try-task was successful and thus the meta-task was
+   successful.
+ */
+void starpu_task_ft_success(struct starpu_task *meta_task);
+
 /** @} */
 /** @} */
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus

+ 13 - 0
src/common/utils.c

@@ -549,6 +549,19 @@ void starpu_sleep(float nb_sec)
 #endif
 #endif
 }
 }
 
 
+void starpu_usleep(float nb_micro_sec)
+{
+#ifdef STARPU_SIMGRID
+	MSG_process_sleep(nb_micro_sec / 1000000);
+#elif defined(STARPU_HAVE_WINDOWS)
+	Sleep(nb_micro_sec / 1000);
+#elif HAVE_UNISTD_H
+	usleep(nb_micro_sec);
+#else
+#error no implementation of usleep
+#endif
+}
+
 char *starpu_getenv(const char *str)
 char *starpu_getenv(const char *str)
 {
 {
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID

+ 114 - 3
src/core/dependencies/data_concurrency.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2013,2015-2017                           Inria
  * Copyright (C) 2013,2015-2017                           Inria
- * Copyright (C) 2009-2015,2017,2018                      Université de Bordeaux
+ * Copyright (C) 2009-2015,2017,2018-2019                      Université de Bordeaux
  * Copyright (C) 2010-2013,2015,2017,2018,2019            CNRS
  * Copyright (C) 2010-2013,2015,2017,2018,2019            CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -241,6 +241,60 @@ static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_cod
 
 
 }
 }
 
 
+/* Take a data, without waiting for it to be available (it is assumed to be).
+ * This is typically used for nodeps tasks, for which a previous task has already
+ * waited for the proper conditions, and we just need to take another reference
+ * for overall reference coherency. */
+/* No lock is held, this acquires and releases the handle header lock */
+static void _starpu_take_data(unsigned request_from_codelet,
+						       starpu_data_handle_t handle, enum starpu_data_access_mode mode,
+						       struct _starpu_job *j)
+{
+	STARPU_ASSERT_MSG(!handle->arbiter, "TODO");
+
+	/* Do not care about some flags */
+	mode &= ~STARPU_COMMUTE;
+	mode &= ~STARPU_SSEND;
+	mode &= ~STARPU_LOCALITY;
+	if (mode == STARPU_RW)
+		mode = STARPU_W;
+
+	/* Take the lock protecting the header. We try to do some progression
+	 * in case this is called from a worker, otherwise we just wait for the
+	 * lock to be available. */
+	if (request_from_codelet)
+	{
+		int cpt = 0;
+		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
+		{
+			cpt++;
+			_starpu_datawizard_progress(0);
+		}
+		if (cpt == STARPU_SPIN_MAXTRY)
+			_starpu_spin_lock(&handle->header_lock);
+	}
+	else
+	{
+		_starpu_spin_lock(&handle->header_lock);
+	}
+
+	/* If we are currently performing a reduction, we freeze any request
+	 * that is not explicitly a reduction task. */
+	unsigned is_a_reduction_task = (request_from_codelet && j && j->reduction_task);
+
+	STARPU_ASSERT_MSG(!is_a_reduction_task, "TODO");
+
+	enum starpu_data_access_mode previous_mode = handle->current_mode;
+
+	STARPU_ASSERT_MSG(mode == previous_mode, "mode was %d, but requested %d", previous_mode, mode);
+
+	handle->refcnt++;
+	handle->busy_count++;
+
+	_starpu_spin_unlock(&handle->header_lock);
+}
+
+
 /* No lock is held */
 /* No lock is held */
 unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle_t handle, enum starpu_data_access_mode mode,
 unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle_t handle, enum starpu_data_access_mode mode,
 							  void (*callback)(void *), void *argcb)
 							  void (*callback)(void *), void *argcb)
@@ -260,7 +314,7 @@ static unsigned attempt_to_submit_data_request_from_job(struct _starpu_job *j, u
 	return _starpu_attempt_to_submit_data_request(1, handle, mode, NULL, NULL, j, buffer_index);
 	return _starpu_attempt_to_submit_data_request(1, handle, mode, NULL, NULL, j, buffer_index);
 }
 }
 
 
-/* Acquire all data of the given job, one by one in handle pointer value order
+/* Try to acquire all data of the given job, one by one in handle pointer value order
  */
  */
 /* No lock is held */
 /* No lock is held */
 static unsigned _submit_job_enforce_data_deps(struct _starpu_job *j, unsigned start_buffer_index)
 static unsigned _submit_job_enforce_data_deps(struct _starpu_job *j, unsigned start_buffer_index)
@@ -301,6 +355,50 @@ static unsigned _submit_job_enforce_data_deps(struct _starpu_job *j, unsigned st
 	return 0;
 	return 0;
 }
 }
 
 
+static void take_data_from_job(struct _starpu_job *j, unsigned buffer_index)
+{
+	/* Note that we do not access j->task->handles, but j->ordered_buffers
+	 * which is a sorted copy of it. */
+	struct _starpu_data_descr *buffer = &(_STARPU_JOB_GET_ORDERED_BUFFERS(j)[buffer_index]);
+	starpu_data_handle_t handle = buffer->handle;
+	enum starpu_data_access_mode mode = buffer->mode & ~STARPU_COMMUTE;
+
+	_starpu_take_data(1, handle, mode, j);
+}
+
+/* Immediately acquire all data of the given job, one by one in handle pointer value order
+ */
+/* No lock is held */
+static void _submit_job_take_data_deps(struct _starpu_job *j, unsigned start_buffer_index)
+{
+	unsigned buf;
+
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(j->task);
+	for (buf = start_buffer_index; buf < nbuffers; buf++)
+	{
+		starpu_data_handle_t handle = _STARPU_JOB_GET_ORDERED_BUFFER_HANDLE(j, buf);
+		if (buf)
+		{
+			starpu_data_handle_t handle_m1 = _STARPU_JOB_GET_ORDERED_BUFFER_HANDLE(j, buf-1);
+			if (handle_m1 == handle)
+				/* We have already requested this data, skip it. This
+				 * depends on ordering putting writes before reads, see
+				 * _starpu_compar_handles.  */
+				continue;
+		}
+
+		if(handle->arbiter)
+		{
+			/* We arrived on an arbitered data, we stop and proceed
+			 * with the arbiter second step.  */
+			STARPU_ASSERT_MSG(0, "TODO");
+			//_starpu_submit_job_take_arbitered_deps(j, buf, nbuffers);
+		}
+
+                take_data_from_job(j, buf);
+	}
+}
+
 /* This is called when the tag+task dependencies are to be finished releasing.  */
 /* This is called when the tag+task dependencies are to be finished releasing.  */
 void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _starpu_notify_job_start_data *data)
 void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _starpu_notify_job_start_data *data)
 {
 {
@@ -363,7 +461,7 @@ void _starpu_job_set_ordered_buffers(struct _starpu_job *j)
 }
 }
 
 
 /* Sort the data used by the given job by handle pointer value order, and
 /* Sort the data used by the given job by handle pointer value order, and
- * acquire them in that order */
+ * try to acquire them in that order */
 /* No  lock is held */
 /* No  lock is held */
 unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j)
 unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j)
 {
 {
@@ -391,6 +489,19 @@ static unsigned unlock_one_requester(struct _starpu_data_requester *r)
 		return 0;
 		return 0;
 }
 }
 
 
+/* Immediately acquire the data used by the given job, following the order
+ * of its ordered buffers. Nothing is done for a task without codelet or
+ * without buffers. */
+/* No  lock is held */
+void _starpu_submit_job_take_data_deps(struct _starpu_job *j)
+{
+	struct starpu_task *task = j->task;
+
+	/* The codelet is tested before the buffer count, as originally */
+	if (task->cl != NULL && STARPU_TASK_GET_NBUFFERS(task) != 0)
+		_submit_job_take_data_deps(j, 0);
+}
+
 /* This is called when a task is finished with a piece of data
 /* This is called when a task is finished with a piece of data
  * (or on starpu_data_release)
  * (or on starpu_data_release)
  *
  *

+ 2 - 1
src/core/dependencies/data_concurrency.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2015                                     Inria
  * Copyright (C) 2015                                     Inria
- * Copyright (C) 2009-2012,2014,2015,2018                 Université de Bordeaux
+ * Copyright (C) 2009-2012,2014,2015,2018-2019                 Université de Bordeaux
  * Copyright (C) 2010,2011,2013,2015,2017                 CNRS
  * Copyright (C) 2010,2011,2013,2015,2017                 CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -25,6 +25,7 @@ void _starpu_job_set_ordered_buffers(struct _starpu_job *j);
 
 
 unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j);
 unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j);
 void _starpu_submit_job_enforce_arbitered_deps(struct _starpu_job *j, unsigned buf, unsigned nbuffers);
 void _starpu_submit_job_enforce_arbitered_deps(struct _starpu_job *j, unsigned buf, unsigned nbuffers);
+void _starpu_submit_job_take_data_deps(struct _starpu_job *j);
 void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _starpu_notify_job_start_data *data);
 void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _starpu_notify_job_start_data *data);
 
 
 int _starpu_notify_data_dependencies(starpu_data_handle_t handle);
 int _starpu_notify_data_dependencies(starpu_data_handle_t handle);

+ 16 - 2
src/core/jobs.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2011-2017                                Inria
  * Copyright (C) 2011-2017                                Inria
- * Copyright (C) 2008-2018                                Université de Bordeaux
+ * Copyright (C) 2008-2019                                Université de Bordeaux
  * Copyright (C) 2010-2019                                CNRS
  * Copyright (C) 2010-2019                                CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2011                                     Télécom-SudParis
@@ -519,7 +519,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 			{
 			{
 				/* We reuse the same job structure */
 				/* We reuse the same job structure */
 				task->status = STARPU_TASK_BLOCKED;
 				task->status = STARPU_TASK_BLOCKED;
-				int ret = _starpu_submit_job(j);
+				int ret = _starpu_submit_job(j, 0);
 				STARPU_ASSERT(!ret);
 				STARPU_ASSERT(!ret);
 			}
 			}
 #ifdef STARPU_OPENMP
 #ifdef STARPU_OPENMP
@@ -700,6 +700,20 @@ unsigned _starpu_reenforce_task_deps_and_schedule(struct _starpu_job *j)
 }
 }
 #endif
 #endif
 
 
+/* Release j->sync_mutex, acquire the data of the job right away and push
+ * the task. Returns the value returned by _starpu_push_task(). */
+unsigned _starpu_take_deps_and_schedule(struct _starpu_job *j)
+{
+	/* NOTE(review): presumably the caller holds j->sync_mutex on entry -- confirm */
+	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
+
+	/* Take references */
+	_starpu_submit_job_take_data_deps(j);
+
+	/* And immediately push task */
+	return _starpu_push_task(j);
+}
+
 /* This is called when a tag or task dependency is to be released.  */
 /* This is called when a tag or task dependency is to be released.  */
 void _starpu_enforce_deps_notify_job_ready_soon(struct _starpu_job *j, _starpu_notify_job_start_data *data, int tag)
 void _starpu_enforce_deps_notify_job_ready_soon(struct _starpu_job *j, _starpu_notify_job_start_data *data, int tag)
 {
 {

+ 2 - 1
src/core/jobs.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2008-2018                                Université de Bordeaux
+ * Copyright (C) 2008-2019                                Université de Bordeaux
  * Copyright (C) 2011,2014                                Inria
  * Copyright (C) 2011,2014                                Inria
  * Copyright (C) 2010,2011,2013-2015,2017,2018,2019       CNRS
  * Copyright (C) 2010,2011,2013-2015,2017,2018,2019       CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2013                                     Thibaut Lambert
@@ -255,6 +255,7 @@ unsigned _starpu_enforce_deps_starting_from_task(struct _starpu_job *j);
 /* When waking up a continuation, we only enforce new task dependencies */
 /* When waking up a continuation, we only enforce new task dependencies */
 unsigned _starpu_reenforce_task_deps_and_schedule(struct _starpu_job *j);
 unsigned _starpu_reenforce_task_deps_and_schedule(struct _starpu_job *j);
 #endif
 #endif
+unsigned _starpu_take_deps_and_schedule(struct _starpu_job *j);
 void _starpu_enforce_deps_notify_job_ready_soon(struct _starpu_job *j, _starpu_notify_job_start_data *data, int tag);
 void _starpu_enforce_deps_notify_job_ready_soon(struct _starpu_job *j, _starpu_notify_job_start_data *data, int tag);
 
 
 /* Called at the submission of the job */
 /* Called at the submission of the job */

+ 1 - 1
src/core/perfmodel/perfmodel_history.c

@@ -1408,7 +1408,7 @@ void starpu_perfmodel_directory(FILE *output)
  * the performance model files */
  * the performance model files */
 int starpu_perfmodel_list(FILE *output)
 int starpu_perfmodel_list(FILE *output)
 {
 {
-#if !defined(_WIN32) || defined(__MINGW32__) || defined(__CYGWIN__)
+#ifdef HAVE_SCANDIR
         char *path;
         char *path;
 	struct dirent **list;
 	struct dirent **list;
 	int n;
 	int n;

+ 126 - 30
src/core/task.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2011-2019                                Inria
  * Copyright (C) 2011-2019                                Inria
- * Copyright (C) 2009-2018                                Université de Bordeaux
+ * Copyright (C) 2009-2019                                Université de Bordeaux
  * Copyright (C) 2017                                     Erwan Leria
  * Copyright (C) 2017                                     Erwan Leria
  * Copyright (C) 2010-2019                                CNRS
  * Copyright (C) 2010-2019                                CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2013                                     Thibaut Lambert
@@ -478,7 +478,7 @@ int _starpu_task_test_termination(struct starpu_task *task)
 
 
 /* NB in case we have a regenerable task, it is possible that the job was
 /* NB in case we have a regenerable task, it is possible that the job was
  * already counted. */
  * already counted. */
-int _starpu_submit_job(struct _starpu_job *j)
+int _starpu_submit_job(struct _starpu_job *j, int nodeps)
 {
 {
 	struct starpu_task *task = j->task;
 	struct starpu_task *task = j->task;
 	int ret;
 	int ret;
@@ -552,15 +552,22 @@ int _starpu_submit_job(struct _starpu_job *j)
 	}
 	}
 #endif
 #endif
 
 
-#ifdef STARPU_OPENMP
-	if (continuation)
+	if (nodeps)
 	{
 	{
-		ret = _starpu_reenforce_task_deps_and_schedule(j);
+		ret = _starpu_take_deps_and_schedule(j);
 	}
 	}
 	else
 	else
-#endif
 	{
 	{
-		ret = _starpu_enforce_deps_and_schedule(j);
+#ifdef STARPU_OPENMP
+		if (continuation)
+		{
+			ret = _starpu_reenforce_task_deps_and_schedule(j);
+		}
+		else
+#endif
+		{
+			ret = _starpu_enforce_deps_and_schedule(j);
+		}
 	}
 	}
 
 
 	_STARPU_LOG_OUT();
 	_STARPU_LOG_OUT();
@@ -810,7 +817,7 @@ static int _starpu_task_submit_head(struct starpu_task *task)
 }
 }
 
 
 /* application should submit new tasks to StarPU through this function */
 /* application should submit new tasks to StarPU through this function */
-int starpu_task_submit(struct starpu_task *task)
+int _starpu_task_submit(struct starpu_task *task, int nodeps)
 {
 {
 	_STARPU_LOG_IN();
 	_STARPU_LOG_IN();
 	STARPU_ASSERT(task);
 	STARPU_ASSERT(task);
@@ -826,6 +833,7 @@ int starpu_task_submit(struct starpu_task *task)
 	}
 	}
 	unsigned is_sync = task->synchronous;
 	unsigned is_sync = task->synchronous;
 	starpu_task_bundle_t bundle = task->bundle;
 	starpu_task_bundle_t bundle = task->bundle;
+	STARPU_ASSERT_MSG(!(nodeps && bundle), "not supported\n");
 	/* internally, StarPU manipulates a struct _starpu_job * which is a wrapper around a
 	/* internally, StarPU manipulates a struct _starpu_job * which is a wrapper around a
 	* task structure, it is possible that this job structure was already
 	* task structure, it is possible that this job structure was already
 	* allocated. */
 	* allocated. */
@@ -854,6 +862,7 @@ int starpu_task_submit(struct starpu_task *task)
 			_starpu_perf_counter_update_per_codelet_sample(task->cl);
 			_starpu_perf_counter_update_per_codelet_sample(task->cl);
 		}
 		}
 	}
 	}
+	STARPU_ASSERT_MSG(!(nodeps && continuation), "not supported\n");
 
 
 	if (!j->internal)
 	if (!j->internal)
 	{
 	{
@@ -889,7 +898,8 @@ int starpu_task_submit(struct starpu_task *task)
 	if (task->cl && !continuation)
 	if (task->cl && !continuation)
 	{
 	{
 		_starpu_job_set_ordered_buffers(j);
 		_starpu_job_set_ordered_buffers(j);
-		_starpu_detect_implicit_data_deps(task);
+		if (!nodeps)
+			_starpu_detect_implicit_data_deps(task);
 	}
 	}
 
 
 	if (bundle)
 	if (bundle)
@@ -930,7 +940,7 @@ int starpu_task_submit(struct starpu_task *task)
 	if (profiling)
 	if (profiling)
 		_starpu_clock_gettime(&info->submit_time);
 		_starpu_clock_gettime(&info->submit_time);
 
 
-	ret = _starpu_submit_job(j);
+	ret = _starpu_submit_job(j, nodeps);
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 	if (_starpu_simgrid_task_submit_cost())
 	if (_starpu_simgrid_task_submit_cost())
 		MSG_process_sleep(0.000001);
 		MSG_process_sleep(0.000001);
@@ -949,6 +959,11 @@ int starpu_task_submit(struct starpu_task *task)
 	return ret;
 	return ret;
 }
 }
 
 
+/* Submit a task with regular dependency handling: forwards to
+ * _starpu_task_submit() with nodeps set to 0 and returns its result. */
+int starpu_task_submit(struct starpu_task *task)
+{
+	return _starpu_task_submit(task, 0);
+}
+
 int _starpu_task_submit_internally(struct starpu_task *task)
 int _starpu_task_submit_internally(struct starpu_task *task)
 {
 {
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
@@ -965,27 +980,9 @@ int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx_id)
 
 
 /* The StarPU core can submit tasks directly to the scheduler or a worker,
 /* The StarPU core can submit tasks directly to the scheduler or a worker,
  * skipping dependencies completely (when it knows what it is doing).  */
  * skipping dependencies completely (when it knows what it is doing).  */
-int _starpu_task_submit_nodeps(struct starpu_task *task)
+int starpu_task_submit_nodeps(struct starpu_task *task)
 {
 {
-	int ret = _starpu_task_submit_head(task);
-	STARPU_ASSERT(ret == 0);
-
-	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
-
-	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
-	_starpu_sched_task_submit(task);
-
-	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
-	_starpu_handle_job_submission(j);
-	_starpu_increment_nready_tasks_of_sched_ctx(j->task->sched_ctx, j->task->flops, j->task);
-	if (task->cl)
-		/* This would be done by data dependencies checking */
-		_starpu_job_set_ordered_buffers(j);
-	STARPU_ASSERT(task->status == STARPU_TASK_BLOCKED);
-	task->status = STARPU_TASK_READY;
-	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
-
-	return _starpu_push_task(j);
+	return _starpu_task_submit(task, 1);
 }
 }
 
 
 /*
 /*
@@ -1570,3 +1567,102 @@ void _starpu_watchdog_shutdown(void)
 
 
 	STARPU_PTHREAD_JOIN(watchdog_thread, NULL);
 	STARPU_PTHREAD_JOIN(watchdog_thread, NULL);
 }
 }
+
+/* Assert that every buffer of the task is accessed with exactly the
+ * STARPU_R or the STARPU_W mode; any other access mode triggers the
+ * assertion. Called from starpu_task_ft_failed(). */
+static void _starpu_ft_check_support(const struct starpu_task *task)
+{
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+	unsigned i;
+
+	for (i = 0; i < nbuffers; i++)
+	{
+		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, i);
+		/* The message names starpu_task_ft_failed, the function
+		 * through which this check is reached. */
+		STARPU_ASSERT_MSG(mode == STARPU_R || mode == STARPU_W,
+				"starpu_task_ft_failed is only supported for tasks with access modes STARPU_R and STARPU_W");
+	}
+}
+
+/* Build a new task which is a copy of template_task, adjusted so that it can
+ * be submitted on its own: check_ft is installed as its completion callback
+ * with meta_task as argument, and the per-submission fields are reset.
+ * Returns the new task, which is not submitted here. */
+struct starpu_task *starpu_task_ft_create_retry
+(const struct starpu_task *meta_task, const struct starpu_task *template_task, void (*check_ft)(void *))
+{
+	/* Create a new task which will actually perform the computation */
+	struct starpu_task *new_task = starpu_task_create();
+
+	/* Start from a plain structure copy of the template; the fields which
+	 * must not be inherited are overwritten below. */
+	*new_task = *template_task;
+	new_task->prologue_callback_func = NULL;
+	/* XXX: cl_arg needs to be duplicated */
+	STARPU_ASSERT_MSG(!meta_task->cl_arg_free || !meta_task->cl_arg, "not supported yet");
+	STARPU_ASSERT_MSG(!meta_task->callback_func, "not supported");
+	/* The check function runs as completion callback and gets the meta
+	 * task as argument; that argument must not be freed with this task. */
+	new_task->callback_func = check_ft;
+	new_task->callback_arg = (void*) meta_task;
+	new_task->callback_arg_free = 0;
+	new_task->prologue_callback_arg_free = 0;
+	STARPU_ASSERT_MSG(!new_task->prologue_callback_pop_arg_free, "not supported");
+	new_task->use_tag = 0;
+	new_task->synchronous = 0;
+	new_task->destroy = 1;
+	new_task->regenerate = 0;
+	new_task->no_submitorder = 1;
+	new_task->failed = 0;
+	/* Clear the state copied over from the template task */
+	new_task->status = STARPU_TASK_INVALID;
+	new_task->profiling_info = NULL;
+	new_task->prev = NULL;
+	new_task->next = NULL;
+	new_task->starpu_private = NULL;
+	new_task->omp_task = NULL;
+
+	return new_task;
+}
+
+/* Check function used when none is given to starpu_task_ft_prologue(): arg
+ * is the meta task. When the current task did not fail, the meta task is
+ * released; otherwise a new retry built from the current task is submitted. */
+static void _starpu_default_check_ft(void *arg)
+{
+	struct starpu_task *meta_task = arg;
+	struct starpu_task *finished_task = starpu_task_get_current();
+
+	if (finished_task->failed)
+	{
+		/* Try again with a new task built from the failed one */
+		struct starpu_task *retry_task = starpu_task_ft_create_retry(meta_task, finished_task, _starpu_default_check_ft);
+		int ret = starpu_task_submit_nodeps(retry_task);
+		STARPU_ASSERT(!ret);
+	}
+	else
+	{
+		starpu_task_ft_success(meta_task);
+	}
+}
+
+/* Turn the current task into a meta task: a copy of it is submitted without
+ * dependencies to do the actual work, and the current task gets an end
+ * dependency and is set to run nowhere.
+ * arg is the check function to install as callback of the copy; when it is
+ * NULL, _starpu_default_check_ft is used. */
+void starpu_task_ft_prologue(void *arg)
+{
+	struct starpu_task *meta_task = starpu_task_get_current();
+	struct starpu_task *new_task;
+	void (*check_ft)(void*) = arg;
+	int ret;
+
+	if (!check_ft)
+		check_ft = _starpu_default_check_ft;
+
+	/* Create a task which will do the actual computation */
+	new_task = starpu_task_ft_create_retry
+(meta_task, meta_task, check_ft);
+
+	ret = starpu_task_submit_nodeps(new_task);
+	STARPU_ASSERT(!ret);
+
+	/* Make the parent task wait for the result getting correct */
+	/* NOTE(review): the new task is submitted before the end dependency is
+	 * added -- confirm that an early starpu_task_end_dep_release() from
+	 * its callback cannot happen before this point. */
+	starpu_task_end_dep_add(meta_task, 1);
+	meta_task->where = STARPU_NOWHERE;
+}
+
+/* Mark the given task as failed by setting its failed field, after
+ * asserting that its access modes are supported (see
+ * _starpu_ft_check_support). */
+void starpu_task_ft_failed(struct starpu_task *task)
+{
+	_starpu_ft_check_support(task);
+
+	task->failed = 1;
+}
+
+/* Report success for the given meta task by releasing its end dependency
+ * with starpu_task_end_dep_release(). */
+void starpu_task_ft_success(struct starpu_task *meta_task)
+{
+	starpu_task_end_dep_release(meta_task);
+}

+ 2 - 6
src/core/task.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2011-2014,2016,2017                      Inria
  * Copyright (C) 2011-2014,2016,2017                      Inria
- * Copyright (C) 2009-2018                                Université de Bordeaux
+ * Copyright (C) 2009-2019                                Université de Bordeaux
  * Copyright (C) 2010-2017, 2019                          CNRS
  * Copyright (C) 2010-2017, 2019                          CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -39,11 +39,7 @@ void _starpu_task_init(void);
 void _starpu_task_deinit(void);
 void _starpu_task_deinit(void);
 void _starpu_set_current_task(struct starpu_task *task);
 void _starpu_set_current_task(struct starpu_task *task);
 
 
-/* NB the second argument makes it possible to count regenerable tasks only
- * once. */
-int _starpu_submit_job(struct _starpu_job *j);
-
-int _starpu_task_submit_nodeps(struct starpu_task *task);
+int _starpu_submit_job(struct _starpu_job *j, int nodeps);
 
 
 void _starpu_task_declare_deps_array(struct starpu_task *task, unsigned ndeps, struct starpu_task *task_array[], int check);
 void _starpu_task_declare_deps_array(struct starpu_task *task, unsigned ndeps, struct starpu_task *task_array[], int check);
 
 

+ 14 - 8
src/core/topology.c

@@ -353,32 +353,38 @@ struct _starpu_worker *_starpu_get_worker_from_driver(struct starpu_driver *d)
 
 
 	for (workerid = 0; workerid < nworkers; workerid++)
 	for (workerid = 0; workerid < nworkers; workerid++)
 	{
 	{
-		struct _starpu_worker *worker;
-		worker = _starpu_get_worker_struct(workerid);
-
 		if (starpu_worker_get_type(workerid) == d->type)
 		if (starpu_worker_get_type(workerid) == d->type)
 		{
 		{
+			struct _starpu_worker *worker;
+			worker = _starpu_get_worker_struct(workerid);
 			switch (d->type)
 			switch (d->type)
 			{
 			{
+#ifdef STARPU_USE_CPU
 			case STARPU_CPU_WORKER:
 			case STARPU_CPU_WORKER:
-			{
-				if (worker->driver.id.cpu_id == d->id.cpu_id)
+				if (worker->devid == d->id.cpu_id)
 					return worker;
 					return worker;
 				break;
 				break;
-			}
+#endif
+#ifdef STARPU_USE_OPENCL
 			case STARPU_OPENCL_WORKER:
 			case STARPU_OPENCL_WORKER:
 			{
 			{
-				if (worker->driver.id.opencl_id == d->id.opencl_id)
+				cl_device_id device;
+				starpu_opencl_get_device(worker->devid, &device);
+				if (device == d->id.opencl_id)
 					return worker;
 					return worker;
 				break;
 				break;
 			}
 			}
+#endif
+#ifdef STARPU_USE_CUDA
 			case STARPU_CUDA_WORKER:
 			case STARPU_CUDA_WORKER:
 			{
 			{
-				if (worker->driver.id.cuda_id == d->id.cuda_id)
+				if (worker->devid == d->id.cuda_id)
 					return worker;
 					return worker;
 				break;
 				break;
 
 
 			}
 			}
+#endif
+
 			default:
 			default:
 				(void) worker;
 				(void) worker;
 				_STARPU_DEBUG("Invalid device type\n");
 				_STARPU_DEBUG("Invalid device type\n");

+ 50 - 24
src/core/workers.c

@@ -810,18 +810,18 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
 
 		_starpu_init_worker_queue(workerarg);
 		_starpu_init_worker_queue(workerarg);
 
 
-		struct starpu_driver *driver = &(workerarg->driver);
-		driver->type = workerarg->arch;
+		struct starpu_driver driver;
+		driver.type = workerarg->arch;
 		switch (workerarg->arch)
 		switch (workerarg->arch)
 		{
 		{
 #if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
 			case STARPU_CPU_WORKER:
 			case STARPU_CPU_WORKER:
-				driver->id.cpu_id = devid;
+			{
+				driver.id.cpu_id = devid;
 				workerarg->driver_ops = &_starpu_driver_cpu_ops;
 				workerarg->driver_ops = &_starpu_driver_cpu_ops;
 				workerarg->wait_for_worker_initialization = 1;
 				workerarg->wait_for_worker_initialization = 1;
-				workerarg->may_launch_driver = _starpu_may_launch_driver(&pconfig->conf, driver);
 
 
-				if (workerarg->may_launch_driver)
+				if (_starpu_may_launch_driver(&pconfig->conf, &driver))
 				{
 				{
 					STARPU_PTHREAD_CREATE_ON(
 					STARPU_PTHREAD_CREATE_ON(
 						"CPU",
 						"CPU",
@@ -836,11 +836,13 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 					workerarg->run_by_starpu = 0;
 					workerarg->run_by_starpu = 0;
 				}
 				}
 				break;
 				break;
+			}
 #endif
 #endif
 
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 			case STARPU_CUDA_WORKER:
 			case STARPU_CUDA_WORKER:
-				driver->id.cuda_id = devid;
+			{
+				driver.id.cuda_id = devid;
 				workerarg->driver_ops = &_starpu_driver_cuda_ops;
 				workerarg->driver_ops = &_starpu_driver_cuda_ops;
 				struct _starpu_worker_set *worker_set = workerarg->set;
 				struct _starpu_worker_set *worker_set = workerarg->set;
 
 
@@ -852,9 +854,8 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 				worker_set->set_is_initialized = 0;
 				worker_set->set_is_initialized = 0;
 				worker_set->wait_for_set_initialization = 1;
 				worker_set->wait_for_set_initialization = 1;
 				workerarg->wait_for_worker_initialization = 0;
 				workerarg->wait_for_worker_initialization = 0;
-				workerarg->may_launch_driver = _starpu_may_launch_driver(&pconfig->conf, driver);
 
 
-				if (workerarg->may_launch_driver)
+				if (_starpu_may_launch_driver(&pconfig->conf, &driver))
 				{
 				{
 					STARPU_PTHREAD_CREATE_ON(
 					STARPU_PTHREAD_CREATE_ON(
 						"CUDA",
 						"CUDA",
@@ -869,17 +870,18 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 					workerarg->run_by_starpu = 0;
 					workerarg->run_by_starpu = 0;
 				}
 				}
 				break;
 				break;
+			}
 #endif
 #endif
 
 
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
 			case STARPU_OPENCL_WORKER:
 			case STARPU_OPENCL_WORKER:
+			{
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
-				starpu_opencl_get_device(devid, &driver->id.opencl_id);
+				starpu_opencl_get_device(devid, &driver.id.opencl_id);
 				workerarg->driver_ops = &_starpu_driver_opencl_ops;
 				workerarg->driver_ops = &_starpu_driver_opencl_ops;
 				workerarg->wait_for_worker_initialization = 1;
 				workerarg->wait_for_worker_initialization = 1;
-				workerarg->may_launch_driver = _starpu_may_launch_driver(&pconfig->conf, driver);
 
 
-				if (workerarg->may_launch_driver)
+				if (_starpu_may_launch_driver(&pconfig->conf, &driver))
 				{
 				{
 					STARPU_PTHREAD_CREATE_ON(
 					STARPU_PTHREAD_CREATE_ON(
 						"OpenCL",
 						"OpenCL",
@@ -895,10 +897,12 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 				}
 				}
 #endif
 #endif
 				break;
 				break;
+			}
 #endif
 #endif
 
 
 #ifdef STARPU_USE_MIC
 #ifdef STARPU_USE_MIC
 			case STARPU_MIC_WORKER:
 			case STARPU_MIC_WORKER:
+			{
 				/* We spawn only one thread
 				/* We spawn only one thread
 				 * per MIC device, which will control all MIC
 				 * per MIC device, which will control all MIC
 				 * workers of this device. (by using a worker set). */
 				 * workers of this device. (by using a worker set). */
@@ -919,10 +923,12 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 						_starpu_simgrid_get_host_by_worker(workerarg));
 						_starpu_simgrid_get_host_by_worker(workerarg));
 
 
 				break;
 				break;
+			}
 #endif /* STARPU_USE_MIC */
 #endif /* STARPU_USE_MIC */
 
 
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 			case STARPU_MPI_MS_WORKER:
 			case STARPU_MPI_MS_WORKER:
+			{
 				/* We spawn only one thread
 				/* We spawn only one thread
 				 * per MPI device, which will control all MPI
 				 * per MPI device, which will control all MPI
 				 * workers of this device. (by using a worker set). */
 				 * workers of this device. (by using a worker set). */
@@ -950,6 +956,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
 
 #endif /* STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD */
 #endif /* STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD */
 				break;
 				break;
+			}
 #endif /* STARPU_USE_MPI_MASTER_SLAVE */
 #endif /* STARPU_USE_MPI_MASTER_SLAVE */
 
 
 			default:
 			default:
@@ -1741,24 +1748,43 @@ unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED, stru
 	if (worker->state_changing_ctx_notice)
 	if (worker->state_changing_ctx_notice)
 		return 0;
 		return 0;
 
 
-	if (worker->driver.type == STARPU_CPU_WORKER || worker->driver.type == STARPU_CUDA_WORKER || worker->driver.type == STARPU_OPENCL_WORKER)
+	unsigned can_block = 1;
+
+	struct starpu_driver driver;
+	driver.type = worker->arch;
+	switch (driver.type)
 	{
 	{
-		if (worker->may_launch_driver == 0)
-			return 0;
+	case STARPU_CPU_WORKER:
+		driver.id.cpu_id = worker->devid;
+		break;
+	case STARPU_CUDA_WORKER:
+		driver.id.cuda_id = worker->devid;
+		break;
+#ifdef STARPU_USE_OPENCL
+	case STARPU_OPENCL_WORKER:
+		starpu_opencl_get_device(worker->devid, &driver.id.opencl_id);
+		break;
+#endif
+	default:
+		goto always_launch;
 	}
 	}
-	else
-	{
+	if (!_starpu_may_launch_driver(&_starpu_config.conf, &driver))
+		return 0;
+
+always_launch:
+
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
-		if (!_starpu_check_that_no_data_request_exists(memnode))
-			return 0;
+	if (!_starpu_check_that_no_data_request_exists(memnode))
+		can_block = 0;
 #endif
 #endif
-		if (!_starpu_machine_is_running())
-			return 0;
 
 
-		if (!_starpu_execute_registered_progression_hooks())
-			return 0;
-	}
-	return 1;
+	if (!_starpu_machine_is_running())
+		can_block = 0;
+
+	if (!_starpu_execute_registered_progression_hooks())
+		can_block = 0;
+
+	return can_block;
 #endif
 #endif
 }
 }
 
 

+ 0 - 2
src/core/workers.h

@@ -155,8 +155,6 @@ LIST_TYPE(_starpu_worker,
 	char short_name[32];
 	char short_name[32];
 	unsigned run_by_starpu; /**< Is this run by StarPU or directly by the application ? */
 	unsigned run_by_starpu; /**< Is this run by StarPU or directly by the application ? */
 	struct _starpu_driver_ops *driver_ops;
 	struct _starpu_driver_ops *driver_ops;
-	struct starpu_driver driver;
-	unsigned may_launch_driver;
 
 
 	struct _starpu_sched_ctx_list *sched_ctx_list;
 	struct _starpu_sched_ctx_list *sched_ctx_list;
 	int tmp_sched_ctx;
 	int tmp_sched_ctx;

+ 0 - 6
src/datawizard/filters.h

@@ -26,12 +26,6 @@
 #include <starpu.h>
 #include <starpu.h>
 #include <common/config.h>
 #include <common/config.h>
 
 
-void
-_starpu_filter_nparts_compute_chunk_size_and_offset(unsigned n, unsigned nparts,
-					     size_t elemsize, unsigned id,
-					     unsigned ld, unsigned *chunk_size,
-					     size_t *offset);
-
 
 
 /* submit asynchronous unpartitioning / partitioning to make target active read-only or read-write */
 /* submit asynchronous unpartitioning / partitioning to make target active read-only or read-write */
 void _starpu_data_partition_access_submit(starpu_data_handle_t target, int write);
 void _starpu_data_partition_access_submit(starpu_data_handle_t target, int write);

+ 19 - 27
src/datawizard/interfaces/block_filters.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2012                                     Inria
  * Copyright (C) 2012                                     Inria
- * Copyright (C) 2010,2011,2013,2015,2017                 CNRS
- * Copyright (C) 2011-2014,2016, 2019                           Université de Bordeaux
+ * Copyright (C) 2010,2011,2013,2015,2017,2019            CNRS
+ * Copyright (C) 2011-2014,2016, 2019                     Université de Bordeaux
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,7 +21,7 @@
 #include <datawizard/filters.h>
 #include <datawizard/filters.h>
 
 
 void starpu_block_filter_block(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
 void starpu_block_filter_block(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
-                                    unsigned id, unsigned nparts)
+			       unsigned id, unsigned nparts)
 {
 {
         struct starpu_block_interface *block_father = (struct starpu_block_interface *) father_interface;
         struct starpu_block_interface *block_father = (struct starpu_block_interface *) father_interface;
         struct starpu_block_interface *block_child = (struct starpu_block_interface *) child_interface;
         struct starpu_block_interface *block_child = (struct starpu_block_interface *) child_interface;
@@ -31,12 +31,11 @@ void starpu_block_filter_block(void *father_interface, void *child_interface, ST
         uint32_t nz = block_father->nz;
         uint32_t nz = block_father->nz;
 	size_t elemsize = block_father->elemsize;
 	size_t elemsize = block_father->elemsize;
 
 
-	STARPU_ASSERT_MSG(nparts <= nx, "%u parts for %u elements", nparts, nx);
+	STARPU_ASSERT_MSG(nparts <= nx, "cannot split %u elements in %u parts", nx, nparts);
 
 
 	uint32_t chunk_size;
 	uint32_t chunk_size;
 	size_t offset;
 	size_t offset;
-	starpu_filter_nparts_compute_chunk_size_and_offset(nx, nparts, elemsize, id, 1,
-				       &chunk_size, &offset);
+	starpu_filter_nparts_compute_chunk_size_and_offset(nx, nparts, elemsize, id, 1, &chunk_size, &offset);
 
 
 	STARPU_ASSERT_MSG(block_father->id == STARPU_BLOCK_INTERFACE_ID, "%s can only be applied on a block data", __func__);
 	STARPU_ASSERT_MSG(block_father->id == STARPU_BLOCK_INTERFACE_ID, "%s can only be applied on a block data", __func__);
 	block_child->id = block_father->id;
 	block_child->id = block_father->id;
@@ -57,7 +56,7 @@ void starpu_block_filter_block(void *father_interface, void *child_interface, ST
 }
 }
 
 
 void starpu_block_filter_block_shadow(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
 void starpu_block_filter_block_shadow(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
-                                    unsigned id, unsigned nparts)
+				      unsigned id, unsigned nparts)
 {
 {
         struct starpu_block_interface *block_father = (struct starpu_block_interface *) father_interface;
         struct starpu_block_interface *block_father = (struct starpu_block_interface *) father_interface;
         struct starpu_block_interface *block_child = (struct starpu_block_interface *) child_interface;
         struct starpu_block_interface *block_child = (struct starpu_block_interface *) child_interface;
@@ -70,13 +69,11 @@ void starpu_block_filter_block_shadow(void *father_interface, void *child_interf
         uint32_t nz = block_father->nz;
         uint32_t nz = block_father->nz;
 	size_t elemsize = block_father->elemsize;
 	size_t elemsize = block_father->elemsize;
 
 
-	STARPU_ASSERT_MSG(nparts <= nx, "%u parts for %u elements", nparts, nx);
+	STARPU_ASSERT_MSG(nparts <= nx, "cannot split %u elements in %u parts", nx, nparts);
 
 
 	uint32_t child_nx;
 	uint32_t child_nx;
 	size_t offset;
 	size_t offset;
-	starpu_filter_nparts_compute_chunk_size_and_offset(nx, nparts, elemsize, id, 1,
-						     &child_nx, &offset);
-	
+	starpu_filter_nparts_compute_chunk_size_and_offset(nx, nparts, elemsize, id, 1, &child_nx, &offset);
 
 
 	STARPU_ASSERT_MSG(block_father->id == STARPU_BLOCK_INTERFACE_ID, "%s can only be applied on a block data", __func__);
 	STARPU_ASSERT_MSG(block_father->id == STARPU_BLOCK_INTERFACE_ID, "%s can only be applied on a block data", __func__);
 	block_child->id = block_father->id;
 	block_child->id = block_father->id;
@@ -97,7 +94,7 @@ void starpu_block_filter_block_shadow(void *father_interface, void *child_interf
 }
 }
 
 
 void starpu_block_filter_vertical_block(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
 void starpu_block_filter_vertical_block(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
-                                    unsigned id, unsigned nparts)
+					unsigned id, unsigned nparts)
 {
 {
         struct starpu_block_interface *block_father = (struct starpu_block_interface *) father_interface;
         struct starpu_block_interface *block_father = (struct starpu_block_interface *) father_interface;
         struct starpu_block_interface *block_child = (struct starpu_block_interface *) child_interface;
         struct starpu_block_interface *block_child = (struct starpu_block_interface *) child_interface;
@@ -107,12 +104,11 @@ void starpu_block_filter_vertical_block(void *father_interface, void *child_inte
         uint32_t nz = block_father->nz;
         uint32_t nz = block_father->nz;
 	size_t elemsize = block_father->elemsize;
 	size_t elemsize = block_father->elemsize;
 
 
-	STARPU_ASSERT_MSG(nparts <= ny, "%u parts for %u elements", nparts, ny);
+	STARPU_ASSERT_MSG(nparts <= ny, "cannot split %u elements in %u parts", ny, nparts);
 
 
 	uint32_t child_ny;
 	uint32_t child_ny;
 	size_t offset;
 	size_t offset;
-	starpu_filter_nparts_compute_chunk_size_and_offset(ny, nparts, elemsize, id, block_father->ldy,
-				       &child_ny, &offset);
+	starpu_filter_nparts_compute_chunk_size_and_offset(ny, nparts, elemsize, id, block_father->ldy, &child_ny, &offset);
 
 
 	STARPU_ASSERT_MSG(block_father->id == STARPU_BLOCK_INTERFACE_ID, "%s can only be applied on a block data", __func__);
 	STARPU_ASSERT_MSG(block_father->id == STARPU_BLOCK_INTERFACE_ID, "%s can only be applied on a block data", __func__);
 	block_child->id = block_father->id;
 	block_child->id = block_father->id;
@@ -133,7 +129,7 @@ void starpu_block_filter_vertical_block(void *father_interface, void *child_inte
 }
 }
 
 
 void starpu_block_filter_vertical_block_shadow(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
 void starpu_block_filter_vertical_block_shadow(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
-                                    unsigned id, unsigned nparts)
+					       unsigned id, unsigned nparts)
 {
 {
         struct starpu_block_interface *block_father = (struct starpu_block_interface *) father_interface;
         struct starpu_block_interface *block_father = (struct starpu_block_interface *) father_interface;
         struct starpu_block_interface *block_child = (struct starpu_block_interface *) child_interface;
         struct starpu_block_interface *block_child = (struct starpu_block_interface *) child_interface;
@@ -146,14 +142,12 @@ void starpu_block_filter_vertical_block_shadow(void *father_interface, void *chi
         uint32_t nz = block_father->nz;
         uint32_t nz = block_father->nz;
 	size_t elemsize = block_father->elemsize;
 	size_t elemsize = block_father->elemsize;
 
 
-	STARPU_ASSERT_MSG(nparts <= ny, "%u parts for %u elements", nparts, ny);
+	STARPU_ASSERT_MSG(nparts <= ny, "cannot split %u elements in %u parts", ny, nparts);
 
 
 	uint32_t child_ny;
 	uint32_t child_ny;
 	size_t offset;
 	size_t offset;
 
 
-	starpu_filter_nparts_compute_chunk_size_and_offset(ny, nparts, elemsize, id,
-						     block_father->ldy,
-						     &child_ny, &offset);
+	starpu_filter_nparts_compute_chunk_size_and_offset(ny, nparts, elemsize, id, block_father->ldy, &child_ny, &offset);
 
 
 	STARPU_ASSERT_MSG(block_father->id == STARPU_BLOCK_INTERFACE_ID, "%s can only be applied on a block data", __func__);
 	STARPU_ASSERT_MSG(block_father->id == STARPU_BLOCK_INTERFACE_ID, "%s can only be applied on a block data", __func__);
 	block_child->id = block_father->id;
 	block_child->id = block_father->id;
@@ -174,7 +168,7 @@ void starpu_block_filter_vertical_block_shadow(void *father_interface, void *chi
 }
 }
 
 
 void starpu_block_filter_depth_block(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
 void starpu_block_filter_depth_block(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
-                                    unsigned id, unsigned nparts)
+				     unsigned id, unsigned nparts)
 {
 {
         struct starpu_block_interface *block_father = (struct starpu_block_interface *) father_interface;
         struct starpu_block_interface *block_father = (struct starpu_block_interface *) father_interface;
         struct starpu_block_interface *block_child = (struct starpu_block_interface *) child_interface;
         struct starpu_block_interface *block_child = (struct starpu_block_interface *) child_interface;
@@ -184,7 +178,7 @@ void starpu_block_filter_depth_block(void *father_interface, void *child_interfa
         uint32_t nz = block_father->nz;
         uint32_t nz = block_father->nz;
 	size_t elemsize = block_father->elemsize;
 	size_t elemsize = block_father->elemsize;
 
 
-	STARPU_ASSERT_MSG(nparts <= nz, "%u parts for %u elements", nparts, nz);
+	STARPU_ASSERT_MSG(nparts <= nz, "cannot split %u elements in %u parts", nz, nparts);
 
 
 	uint32_t child_nz;
 	uint32_t child_nz;
 	size_t offset;
 	size_t offset;
@@ -211,7 +205,7 @@ void starpu_block_filter_depth_block(void *father_interface, void *child_interfa
 }
 }
 
 
 void starpu_block_filter_depth_block_shadow(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
 void starpu_block_filter_depth_block_shadow(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
-                                    unsigned id, unsigned nparts)
+					    unsigned id, unsigned nparts)
 {
 {
         struct starpu_block_interface *block_father = (struct starpu_block_interface *) father_interface;
         struct starpu_block_interface *block_father = (struct starpu_block_interface *) father_interface;
         struct starpu_block_interface *block_child = (struct starpu_block_interface *) child_interface;
         struct starpu_block_interface *block_child = (struct starpu_block_interface *) child_interface;
@@ -224,14 +218,12 @@ void starpu_block_filter_depth_block_shadow(void *father_interface, void *child_
         uint32_t nz = block_father->nz - 2 * shadow_size;
         uint32_t nz = block_father->nz - 2 * shadow_size;
 	size_t elemsize = block_father->elemsize;
 	size_t elemsize = block_father->elemsize;
 
 
-	STARPU_ASSERT_MSG(nparts <= nz, "%u parts for %u elements", nparts, nz);
+	STARPU_ASSERT_MSG(nparts <= nz, "cannot split %u elements into %u parts", nz, nparts);
 
 
 	uint32_t child_nz;
 	uint32_t child_nz;
 	size_t offset;
 	size_t offset;
 
 
-	starpu_filter_nparts_compute_chunk_size_and_offset(nz, nparts, elemsize, id,
-						     block_father->ldz,
-						     &child_nz, &offset);
+	starpu_filter_nparts_compute_chunk_size_and_offset(nz, nparts, elemsize, id, block_father->ldz, &child_nz, &offset);
 
 
 	STARPU_ASSERT_MSG(block_father->id == STARPU_BLOCK_INTERFACE_ID, "%s can only be applied on a block data", __func__);
 	STARPU_ASSERT_MSG(block_father->id == STARPU_BLOCK_INTERFACE_ID, "%s can only be applied on a block data", __func__);
 	block_child->id = block_father->id;
 	block_child->id = block_father->id;

+ 10 - 16
src/datawizard/interfaces/matrix_filters.c

@@ -3,7 +3,7 @@
  * Copyright (C) 2008-2014,2016,2019                      Université de Bordeaux
  * Copyright (C) 2008-2014,2016,2019                      Université de Bordeaux
  * Copyright (C) 2012                                     Inria
  * Copyright (C) 2012                                     Inria
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010                                     Mehdi Juhoor
- * Copyright (C) 2010,2011,2013,2015,2017                 CNRS
+ * Copyright (C) 2010,2011,2013,2015,2017,2019            CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -33,13 +33,12 @@ void starpu_matrix_filter_block(void *father_interface, void *child_interface, S
 	uint32_t ny = matrix_father->ny;
 	uint32_t ny = matrix_father->ny;
 	size_t elemsize = matrix_father->elemsize;
 	size_t elemsize = matrix_father->elemsize;
 
 
-	STARPU_ASSERT_MSG(nchunks <= nx, "%u parts for %u elements", nchunks, nx);
+	STARPU_ASSERT_MSG(nchunks <= nx, "cannot split %u elements in %u parts", nx, nchunks);
 
 
 	uint32_t child_nx;
 	uint32_t child_nx;
 	size_t offset;
 	size_t offset;
 
 
-	starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1,
-						     &child_nx, &offset);
+	starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1, &child_nx, &offset);
 
 
 	STARPU_ASSERT_MSG(matrix_father->id == STARPU_MATRIX_INTERFACE_ID, "%s can only be applied on a matrix data", __func__);
 	STARPU_ASSERT_MSG(matrix_father->id == STARPU_MATRIX_INTERFACE_ID, "%s can only be applied on a matrix data", __func__);
 
 
@@ -77,13 +76,12 @@ void starpu_matrix_filter_block_shadow(void *father_interface, void *child_inter
 	uint32_t ny = matrix_father->ny;
 	uint32_t ny = matrix_father->ny;
 	size_t elemsize = matrix_father->elemsize;
 	size_t elemsize = matrix_father->elemsize;
 
 
-	STARPU_ASSERT_MSG(nchunks <= nx, "%u parts for %u elements", nchunks, nx);
+	STARPU_ASSERT_MSG(nchunks <= nx, "cannot split %u elements in %u parts", nx, nchunks);
 
 
 	uint32_t child_nx;
 	uint32_t child_nx;
 	size_t offset;
 	size_t offset;
 
 
-	starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1,
-						     &child_nx, &offset);
+	starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1, &child_nx, &offset);
 
 
 	child_nx += 2 * shadow_size;
 	child_nx += 2 * shadow_size;
 
 
@@ -117,14 +115,12 @@ void starpu_matrix_filter_vertical_block(void *father_interface, void *child_int
 	uint32_t ny = matrix_father->ny;
 	uint32_t ny = matrix_father->ny;
 	size_t elemsize = matrix_father->elemsize;
 	size_t elemsize = matrix_father->elemsize;
 
 
-	STARPU_ASSERT_MSG(nchunks <= ny, "%u parts for %u elements", nchunks, ny);
+	STARPU_ASSERT_MSG(nchunks <= ny, "cannot split %u elements in %u parts", ny, nchunks);
 
 
 	uint32_t child_ny;
 	uint32_t child_ny;
 	size_t offset;
 	size_t offset;
 
 
-	starpu_filter_nparts_compute_chunk_size_and_offset(ny, nchunks, elemsize, id,
-						     matrix_father->ld,
-						     &child_ny, &offset);
+	starpu_filter_nparts_compute_chunk_size_and_offset(ny, nchunks, elemsize, id, matrix_father->ld, &child_ny, &offset);
 
 
 	STARPU_ASSERT_MSG(matrix_father->id == STARPU_MATRIX_INTERFACE_ID, "%s can only be applied on a matrix data", __func__);
 	STARPU_ASSERT_MSG(matrix_father->id == STARPU_MATRIX_INTERFACE_ID, "%s can only be applied on a matrix data", __func__);
 	matrix_child->id = matrix_father->id;
 	matrix_child->id = matrix_father->id;
@@ -157,14 +153,12 @@ void starpu_matrix_filter_vertical_block_shadow(void *father_interface, void *ch
 	uint32_t ny = matrix_father->ny - 2 * shadow_size;
 	uint32_t ny = matrix_father->ny - 2 * shadow_size;
 	size_t elemsize = matrix_father->elemsize;
 	size_t elemsize = matrix_father->elemsize;
 
 
-	STARPU_ASSERT_MSG(nchunks <= ny, "%u parts for %u elements", nchunks, ny);
+	STARPU_ASSERT_MSG(nchunks <= ny, "cannot split %u elements in %u parts", ny, nchunks);
 
 
 	uint32_t child_ny;
 	uint32_t child_ny;
 	size_t offset;
 	size_t offset;
 
 
-	starpu_filter_nparts_compute_chunk_size_and_offset(ny, nchunks, elemsize, id,
-						     matrix_father->ld,
-						     &child_ny, &offset);
+	starpu_filter_nparts_compute_chunk_size_and_offset(ny, nchunks, elemsize, id, matrix_father->ld, &child_ny, &offset);
 	child_ny += 2 * shadow_size;
 	child_ny += 2 * shadow_size;
 
 
 	STARPU_ASSERT_MSG(matrix_father->id == STARPU_MATRIX_INTERFACE_ID, "%s can only be applied on a matrix data", __func__);
 	STARPU_ASSERT_MSG(matrix_father->id == STARPU_MATRIX_INTERFACE_ID, "%s can only be applied on a matrix data", __func__);
@@ -172,7 +166,7 @@ void starpu_matrix_filter_vertical_block_shadow(void *father_interface, void *ch
 	matrix_child->nx = nx;
 	matrix_child->nx = nx;
 	matrix_child->ny = child_ny;
 	matrix_child->ny = child_ny;
 	matrix_child->elemsize = elemsize;
 	matrix_child->elemsize = elemsize;
-	STARPU_ASSERT_MSG(matrix_father->allocsize == matrix_father->nx * matrix_father->ny * matrix_father->elemsize, "partitioning matrix with non-trivial allocsize not supported yet, patch welcome");
+	STARPU_ASSERT_MSG(matrix_father->allocsize == matrix_father->nx * matrix_father->ny * matrix_father->elemsize, "partitioning matrix with non-trivial allocsize not supported yet, patch welcomed");
 	matrix_child->allocsize = matrix_child->nx * matrix_child->ny * elemsize;
 	matrix_child->allocsize = matrix_child->nx * matrix_child->ny * elemsize;
 
 
 	/* is the information on this node valid ? */
 	/* is the information on this node valid ? */

+ 7 - 9
src/datawizard/interfaces/vector_filters.c

@@ -3,7 +3,7 @@
  * Copyright (C) 2008-2014,2016,2017,2019                 Université de Bordeaux
  * Copyright (C) 2008-2014,2016,2017,2019                 Université de Bordeaux
  * Copyright (C) 2012                                     Inria
  * Copyright (C) 2012                                     Inria
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010                                     Mehdi Juhoor
- * Copyright (C) 2010,2011,2013,2015-2017                 CNRS
+ * Copyright (C) 2010,2011,2013,2015-2017,2019            CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -29,12 +29,11 @@ void starpu_vector_filter_block(void *father_interface, void *child_interface, S
 	uint32_t nx = vector_father->nx;
 	uint32_t nx = vector_father->nx;
 	size_t elemsize = vector_father->elemsize;
 	size_t elemsize = vector_father->elemsize;
 
 
-	STARPU_ASSERT_MSG(nchunks <= nx, "%u parts for %u elements", nchunks, nx);
+	STARPU_ASSERT_MSG(nchunks <= nx, "cannot split %u elements in %u parts", nx, nchunks);
 
 
 	uint32_t child_nx;
 	uint32_t child_nx;
 	size_t offset;
 	size_t offset;
-	starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1,
-						     &child_nx, &offset);
+	starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1, &child_nx, &offset);
 
 
 	STARPU_ASSERT_MSG(vector_father->id == STARPU_VECTOR_INTERFACE_ID, "%s can only be applied on a vector data", __func__);
 	STARPU_ASSERT_MSG(vector_father->id == STARPU_VECTOR_INTERFACE_ID, "%s can only be applied on a vector data", __func__);
 	vector_child->id = vector_father->id;
 	vector_child->id = vector_father->id;
@@ -64,12 +63,11 @@ void starpu_vector_filter_block_shadow(void *father_interface, void *child_inter
 	uint32_t nx = vector_father->nx - 2 * shadow_size;
 	uint32_t nx = vector_father->nx - 2 * shadow_size;
 	size_t elemsize = vector_father->elemsize;
 	size_t elemsize = vector_father->elemsize;
 
 
-	STARPU_ASSERT_MSG(nchunks <= nx, "%u parts for %u elements", nchunks, nx);
+	STARPU_ASSERT_MSG(nchunks <= nx, "cannot split %u elements in %u parts", nx, nchunks);
 
 
 	uint32_t child_nx;
 	uint32_t child_nx;
 	size_t offset;
 	size_t offset;
-	starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1,
-						     &child_nx, &offset);
+	starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1, &child_nx, &offset);
 	child_nx += 2*shadow_size;
 	child_nx += 2*shadow_size;
 
 
 	STARPU_ASSERT_MSG(vector_father->id == STARPU_VECTOR_INTERFACE_ID, "%s can only be applied on a vector data", __func__);
 	STARPU_ASSERT_MSG(vector_father->id == STARPU_VECTOR_INTERFACE_ID, "%s can only be applied on a vector data", __func__);
@@ -155,7 +153,7 @@ void starpu_vector_filter_list_long(void *father_interface, void *child_interfac
 	vector_child->id = vector_father->id;
 	vector_child->id = vector_father->id;
 	vector_child->nx = chunk_size;
 	vector_child->nx = chunk_size;
 	vector_child->elemsize = elemsize;
 	vector_child->elemsize = elemsize;
-	STARPU_ASSERT_MSG(vector_father->allocsize == vector_father->nx * vector_father->elemsize, "partitioning vector with non-trival allocsize not supported yet, patch welcome");
+	STARPU_ASSERT_MSG(vector_father->allocsize == vector_father->nx * vector_father->elemsize, "partitioning vector with non-trival allocsize not supported yet, patch welcomed");
 	vector_child->allocsize = vector_child->nx * elemsize;
 	vector_child->allocsize = vector_child->nx * elemsize;
 
 
 	if (vector_father->dev_handle)
 	if (vector_father->dev_handle)
@@ -188,7 +186,7 @@ void starpu_vector_filter_list(void *father_interface, void *child_interface, st
 	vector_child->id = vector_father->id;
 	vector_child->id = vector_father->id;
 	vector_child->nx = chunk_size;
 	vector_child->nx = chunk_size;
 	vector_child->elemsize = elemsize;
 	vector_child->elemsize = elemsize;
-	STARPU_ASSERT_MSG(vector_father->allocsize == vector_father->nx * vector_father->elemsize, "partitioning vector with non-trival allocsize not supported yet, patch welcome");
+	STARPU_ASSERT_MSG(vector_father->allocsize == vector_father->nx * vector_father->elemsize, "partitioning vector with non-trival allocsize not supported yet, patch welcomed");
 	vector_child->allocsize = vector_child->nx * elemsize;
 	vector_child->allocsize = vector_child->nx * elemsize;
 
 
 	if (vector_father->dev_handle)
 	if (vector_father->dev_handle)

+ 11 - 3
src/debug/traces/starpu_fxt.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2011-2017                                Inria
  * Copyright (C) 2011-2017                                Inria
- * Copyright (C) 2009-2018                                Université de Bordeaux
+ * Copyright (C) 2009-2019                                Université de Bordeaux
  * Copyright (C) 2013                                     Joris Pablo
  * Copyright (C) 2013                                     Joris Pablo
  * Copyright (C) 2017,2018                                Federal University of Rio Grande do Sul (UFRGS)
  * Copyright (C) 2017,2018                                Federal University of Rio Grande do Sul (UFRGS)
  * Copyright (C) 2011-2019                                CNRS
  * Copyright (C) 2011-2019                                CNRS
@@ -1609,7 +1609,7 @@ static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 		int i;
 		int i;
 		for (i = 0; i < last_codelet_parameter[worker] && i < MAX_PARAMETERS; i++)
 		for (i = 0; i < last_codelet_parameter[worker] && i < MAX_PARAMETERS; i++)
 		{
 		{
-			eaten += snprintf(parameters + eaten, sizeof(parameters) - eaten - 1, "%s%s", i?"_":"", last_codelet_parameter_description[worker][i]);
+			eaten += snprintf(parameters + eaten, sizeof(parameters) - eaten - 1, "%s%s", i?" ":"", last_codelet_parameter_description[worker][i]);
 		}
 		}
 	}
 	}
 	parameters[sizeof(parameters)-1] = 0;
 	parameters[sizeof(parameters)-1] = 0;
@@ -1641,6 +1641,12 @@ static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 		char *prefix = options->file_prefix;
 		char *prefix = options->file_prefix;
 		unsigned sched_ctx = ev->param[0];
 		unsigned sched_ctx = ev->param[0];
 
 
+		/* Paje won't like spaces, replace with underscores */
+		char *c;
+		for (c = parameters; *c; c++)
+			if (*c == ' ')
+				*c = '_';
+
 		worker_set_detailed_state(last_codelet_start[worker], prefix, worker, _starpu_last_codelet_symbol[worker], ev->param[1], parameters, ev->param[2], ev->param[4], job_id, ((double) task->kflops) / 1000000, X, Y, Z, task->iterations[0], task->iterations[1], options);
 		worker_set_detailed_state(last_codelet_start[worker], prefix, worker, _starpu_last_codelet_symbol[worker], ev->param[1], parameters, ev->param[2], ev->param[4], job_id, ((double) task->kflops) / 1000000, X, Y, Z, task->iterations[0], task->iterations[1], options);
 		if (sched_ctx != 0)
 		if (sched_ctx != 0)
 		{
 		{
@@ -4478,6 +4484,7 @@ struct parse_task
 {
 {
 	unsigned exec_time;
 	unsigned exec_time;
 	unsigned data_total;
 	unsigned data_total;
+	unsigned workerid;
 	char *codelet_name;
 	char *codelet_name;
 };
 };
 
 
@@ -4515,7 +4522,7 @@ static void write_task(struct parse_task pt)
 		fprintf(codelet_list, "%s\n", codelet_name);
 		fprintf(codelet_list, "%s\n", codelet_name);
 	}
 	}
 	double time = pt.exec_time * NANO_SEC_TO_MILI_SEC;
 	double time = pt.exec_time * NANO_SEC_TO_MILI_SEC;
-	fprintf(kernel->file, "%lf %u\n", time, pt.data_total);
+	fprintf(kernel->file, "%lf %u %u\n", time, pt.data_total, pt.workerid);
 }
 }
 
 
 void starpu_fxt_write_data_trace(char *filename_in)
 void starpu_fxt_write_data_trace(char *filename_in)
@@ -4570,6 +4577,7 @@ void starpu_fxt_write_data_trace(char *filename_in)
 
 
 		case _STARPU_FUT_START_CODELET_BODY:
 		case _STARPU_FUT_START_CODELET_BODY:
 			workerid = ev.param[2];
 			workerid = ev.param[2];
+			tasks[workerid].workerid = (unsigned)workerid;
 			tasks[workerid].exec_time = ev.time;
 			tasks[workerid].exec_time = ev.time;
 			has_name = ev.param[4];
 			has_name = ev.param[4];
 			tasks[workerid].codelet_name = strdup(has_name ? get_fxt_string(&ev, 5): "unknown");
 			tasks[workerid].codelet_name = strdup(has_name ? get_fxt_string(&ev, 5): "unknown");

+ 9 - 3
src/drivers/driver_common/driver_common.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2011-2017                                Inria
  * Copyright (C) 2011-2017                                Inria
- * Copyright (C) 2010-2018                                Université de Bordeaux
+ * Copyright (C) 2010-2019                                Université de Bordeaux
  * Copyright (C) 2010-2017, 2019                          CNRS
  * Copyright (C) 2010-2017, 2019                          CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2011                                     Télécom-SudParis
@@ -268,9 +268,12 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 				do_update_time_model = 1;
 				do_update_time_model = 1;
 			}
 			}
 #else
 #else
-			const unsigned do_update_time_model = 1;
+			unsigned do_update_time_model = 1;
 			const double time_consumed = measured;
 			const double time_consumed = measured;
 #endif
 #endif
+			if (j->task->failed)
+				/* Do not record perfmodel for failed tasks, they may terminate earlier */
+				do_update_time_model = 0;
 			if (do_update_time_model)
 			if (do_update_time_model)
 			{
 			{
 				_starpu_update_perfmodel_history(j, j->task->cl->model, perf_arch, worker->devid, time_consumed, j->nimpl);
 				_starpu_update_perfmodel_history(j, j->task->cl->model, perf_arch, worker->devid, time_consumed, j->nimpl);
@@ -301,9 +304,12 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 		}
 		}
 #else
 #else
 		const double energy_consumed = profiling_info->energy_consumed;
 		const double energy_consumed = profiling_info->energy_consumed;
-		const unsigned do_update_energy_model = 1;
+		unsigned do_update_energy_model = 1;
 #endif
 #endif
 
 
+		if (j->task->failed)
+			/* Do not record perfmodel for failed tasks, they may terminate earlier */
+			do_update_energy_model = 0;
 		if (do_update_energy_model)
 		if (do_update_energy_model)
 		{
 		{
 			_starpu_update_perfmodel_history(j, j->task->cl->energy_model, perf_arch, worker->devid, energy_consumed, j->nimpl);
 			_starpu_update_perfmodel_history(j, j->task->cl->energy_model, perf_arch, worker->devid, energy_consumed, j->nimpl);

+ 1 - 0
tests/Makefile.am

@@ -327,6 +327,7 @@ myPROGRAMS +=				\
 	disk/mem_reclaim			\
 	disk/mem_reclaim			\
 	errorcheck/invalid_blocking_calls	\
 	errorcheck/invalid_blocking_calls	\
 	errorcheck/workers_cpuid		\
 	errorcheck/workers_cpuid		\
+	fault-tolerance/retry			\
 	helper/starpu_data_cpy			\
 	helper/starpu_data_cpy			\
 	helper/starpu_create_sync_task		\
 	helper/starpu_create_sync_task		\
 	microbenchs/async_tasks_overhead	\
 	microbenchs/async_tasks_overhead	\

+ 123 - 0
tests/fault-tolerance/retry.c

@@ -0,0 +1,123 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011-2013,2015,2017                      CNRS
+ * Copyright (C) 2017                                     Inria
+ * Copyright (C) 2019                                     Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This tests the fault tolerance interface: it submits a task which repeatedly
+ * fails until it eventually succeeds
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+
+/* This task fakes some repeated errors.
+ *
+ * retry counts the failures faked so far; main() resets it to 0 before
+ * inserting the task.
+ *
+ * cpu_increment reads the variable behind descr[0], stores that value plus one
+ * into the variable behind descr[1], and then either reports the current task
+ * as failed (while retry < 10) or lets it complete normally. arg is unused. */
+static int retry;
+void cpu_increment(void *descr[], void *arg)
+{
+	(void)arg;
+	unsigned *var = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned *var2 = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	FPRINTF(stderr,"computing\n");
+	*var2 = *var + 1; /* computed on every attempt, including the ones reported as failed */
+	if (retry < 10) /* with retry starting at 0, the first ten executions report a failure */
+	{
+		FPRINTF(stderr,"failing\n");
+		retry++;
+		/* Fake failure: flag the task currently being executed as failed */
+		starpu_task_ft_failed(starpu_task_get_current());
+	}
+	else
+		FPRINTF(stderr,"succeed\n");
+}
+
+static struct starpu_codelet my_codelet = /* codelet whose only listed implementation is the CPU function cpu_increment */
+{
+	.cpu_funcs = {cpu_increment},
+	.cpu_funcs_name = {"cpu_increment"},
+	.modes = { STARPU_R, STARPU_W }, /* buffer 0 is read, buffer 1 is written */
+	.nbuffers = 2
+};
+
+/* This implements the retry strategy
+ * (Identical to the default implementation: just retry)
+ *
+ * arg is used as the "meta" task: it is the task handed to
+ * starpu_task_ft_success() once an attempt succeeded, and to
+ * starpu_task_ft_create_retry() when a new attempt is needed.
+ * NOTE(review): presumably this is the task originally inserted by main();
+ * confirm against starpu_task_ft_prologue.
+ *
+ * The attempt being checked is obtained with starpu_task_get_current(). */
+static void check_ft(void *arg)
+{
+	struct starpu_task *meta_task = arg;
+	struct starpu_task *current_task = starpu_task_get_current();
+	struct starpu_task *new_task;
+	int ret;
+
+	if (!current_task->failed) /* this attempt was not flagged as failed: stop retrying */
+	{
+		FPRINTF(stderr,"didn't fail, release main task\n");
+		starpu_task_ft_success(meta_task);
+		return;
+	}
+
+	FPRINTF(stderr,"failed, try again\n");
+
+	/* Build a new attempt, with check_ft passed again so the new attempt gets checked too */
+	new_task = starpu_task_ft_create_retry(meta_task, current_task, check_ft);
+
+	/* Here we could e.g. force the task to use only a CPU implementation
+	 * known to be failsafe */
+
+	ret = starpu_task_submit_nodeps(new_task);
+	STARPU_ASSERT(!ret); /* a non-zero return from the submission aborts the test */
+}
+
+int main(void)
+{
+	int x = 12; /* registered as h_x; the task only reads it */
+	int y = 1; /* registered as h_y; the task writes it */
+        starpu_data_handle_t h_x, h_y;
+	int ret, ret1;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_variable_data_register(&h_x, STARPU_MAIN_RAM, (uintptr_t)&x, sizeof(x));
+	starpu_variable_data_register(&h_y, STARPU_MAIN_RAM, (uintptr_t)&y, sizeof(y));
+
+	retry = 0; /* no failure has been faked yet */
+	/* Insert the task with starpu_task_ft_prologue as prologue callback and
+	 * check_ft as its argument */
+	ret1 = starpu_task_insert(&my_codelet,
+				  STARPU_PROLOGUE_CALLBACK, starpu_task_ft_prologue,
+				  STARPU_PROLOGUE_CALLBACK_ARG, check_ft,
+				  STARPU_R, h_x,
+				  STARPU_W, h_y,
+				  0);
+	if (ret1 != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret1, "starpu_task_insert");
+	starpu_task_wait_for_all();
+
+	starpu_data_unregister(h_x);
+	starpu_data_unregister(h_y);
+
+	starpu_shutdown();
+
+	if (x != 12) /* the read-only input must be left untouched */
+		ret = 1;
+	FPRINTF(stderr, "Value x = %d (expected 12)\n", x);
+
+	if (ret1 != -ENODEV) /* only check the output when the insertion did not report -ENODEV */
+	{
+		if (y != 13) /* x + 1, as stored by cpu_increment */
+			ret = 1;
+		FPRINTF(stderr, "Value y = %d (expected 13)\n", y);
+	}
+
+	STARPU_RETURN(ret);
+}

+ 2 - 2
tests/perfmodels/regression_based.c

@@ -2,7 +2,7 @@
  *
  *
  * Copyright (C) 2011,2012,2014                           Inria
  * Copyright (C) 2011,2012,2014                           Inria
  * Copyright (C) 2011-2016,2019                           Université de Bordeaux
  * Copyright (C) 2011-2016,2019                           Université de Bordeaux
- * Copyright (C) 2011-2017                                CNRS
+ * Copyright (C) 2011-2017, 2019                          CNRS
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2011                                     Télécom-SudParis
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -70,7 +70,7 @@ void memset_cpu(void *descr[], void *arg)
 	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 
 
-	usleep(10);
+	starpu_usleep(10);
 	memset(ptr, 42, n * sizeof(*ptr));
 	memset(ptr, 42, n * sizeof(*ptr));
 }
 }
 
 

+ 26 - 0
tools/Makefile.am

@@ -42,6 +42,18 @@ dist_pkgdata_perfmodels_sampling_bus_DATA = \
 	perfmodels/sampling/bus/attila.latency	\
 	perfmodels/sampling/bus/attila.latency	\
 	perfmodels/sampling/bus/attila.platform.xml	\
 	perfmodels/sampling/bus/attila.platform.xml	\
 	perfmodels/sampling/bus/attila.platform.v4.xml	\
 	perfmodels/sampling/bus/attila.platform.v4.xml	\
+	perfmodels/sampling/bus/hannibal.affinity	\
+	perfmodels/sampling/bus/hannibal.bandwidth	\
+	perfmodels/sampling/bus/hannibal.config	\
+	perfmodels/sampling/bus/hannibal.latency	\
+	perfmodels/sampling/bus/hannibal.platform.xml	\
+	perfmodels/sampling/bus/hannibal.platform.v4.xml	\
+	perfmodels/sampling/bus/hannibal-pitch.affinity	\
+	perfmodels/sampling/bus/hannibal-pitch.bandwidth	\
+	perfmodels/sampling/bus/hannibal-pitch.config	\
+	perfmodels/sampling/bus/hannibal-pitch.latency	\
+	perfmodels/sampling/bus/hannibal-pitch.platform.xml	\
+	perfmodels/sampling/bus/hannibal-pitch.platform.v4.xml	\
 	perfmodels/sampling/bus/idgraf.affinity	\
 	perfmodels/sampling/bus/idgraf.affinity	\
 	perfmodels/sampling/bus/idgraf.bandwidth	\
 	perfmodels/sampling/bus/idgraf.bandwidth	\
 	perfmodels/sampling/bus/idgraf.config	\
 	perfmodels/sampling/bus/idgraf.config	\
@@ -79,6 +91,20 @@ dist_pkgdata_perfmodels_sampling_codelets_DATA = \
 	perfmodels/sampling/codelets/45/starpu_dlu_lu_model_21.attila	\
 	perfmodels/sampling/codelets/45/starpu_dlu_lu_model_21.attila	\
 	perfmodels/sampling/codelets/45/starpu_dlu_lu_model_22.attila	\
 	perfmodels/sampling/codelets/45/starpu_dlu_lu_model_22.attila	\
 	perfmodels/sampling/codelets/45/overlap_sleep_1024_24.attila	\
 	perfmodels/sampling/codelets/45/overlap_sleep_1024_24.attila	\
+	perfmodels/sampling/codelets/45/chol_model_11.hannibal	\
+	perfmodels/sampling/codelets/45/chol_model_21.hannibal	\
+	perfmodels/sampling/codelets/45/chol_model_22.hannibal	\
+	perfmodels/sampling/codelets/45/starpu_slu_lu_model_11.hannibal	\
+	perfmodels/sampling/codelets/45/starpu_slu_lu_model_12.hannibal	\
+	perfmodels/sampling/codelets/45/starpu_slu_lu_model_21.hannibal	\
+	perfmodels/sampling/codelets/45/starpu_slu_lu_model_22.hannibal	\
+	perfmodels/sampling/codelets/45/chol_model_11.hannibal-pitch	\
+	perfmodels/sampling/codelets/45/chol_model_21.hannibal-pitch	\
+	perfmodels/sampling/codelets/45/chol_model_22.hannibal-pitch	\
+	perfmodels/sampling/codelets/45/starpu_slu_lu_model_11.hannibal-pitch	\
+	perfmodels/sampling/codelets/45/starpu_slu_lu_model_12.hannibal-pitch	\
+	perfmodels/sampling/codelets/45/starpu_slu_lu_model_21.hannibal-pitch	\
+	perfmodels/sampling/codelets/45/starpu_slu_lu_model_22.hannibal-pitch	\
 	perfmodels/sampling/codelets/45/chol_model_11.idgraf	\
 	perfmodels/sampling/codelets/45/chol_model_11.idgraf	\
 	perfmodels/sampling/codelets/45/chol_model_21.idgraf	\
 	perfmodels/sampling/codelets/45/chol_model_21.idgraf	\
 	perfmodels/sampling/codelets/45/chol_model_22.idgraf	\
 	perfmodels/sampling/codelets/45/chol_model_22.idgraf	\

+ 10 - 2
tools/dev/valgrind/hdf5.suppr

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
 # Copyright (C) 2017                                     CNRS
 # Copyright (C) 2017                                     CNRS
-# Copyright (C) 2017                                     Université de Bordeaux
+# Copyright (C) 2017, 2019                               Université de Bordeaux
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
 # it under the terms of the GNU Lesser General Public License as published by
@@ -20,7 +20,6 @@
    Memcheck:Leak
    Memcheck:Leak
    match-leak-kinds: reachable
    match-leak-kinds: reachable
    ...
    ...
-   fun:H5FL_reg_malloc
    fun:H5E_get_stack
    fun:H5E_get_stack
    ...
    ...
 }
 }
@@ -34,3 +33,12 @@
    ...
    ...
 }
 }
 
 
+{
+   thread-specific value
+   Memcheck:Leak
+   match-leak-kinds: reachable
+   ...
+   fun:H5CX_push
+   ...
+}
+

+ 1 - 0
tools/perfmodels/sampling/bus/hannibal-pitch.affinity

@@ -0,0 +1 @@
+hannibal.affinity

+ 1 - 0
tools/perfmodels/sampling/bus/hannibal-pitch.bandwidth

@@ -0,0 +1 @@
+hannibal.bandwidth

+ 1 - 0
tools/perfmodels/sampling/bus/hannibal-pitch.config

@@ -0,0 +1 @@
+hannibal.config

+ 1 - 0
tools/perfmodels/sampling/bus/hannibal-pitch.latency

@@ -0,0 +1 @@
+hannibal.latency

+ 70 - 0
tools/perfmodels/sampling/bus/hannibal-pitch.platform.v4.xml

@@ -0,0 +1,70 @@
+<?xml version="1.0"?>
+ <!DOCTYPE platform SYSTEM "http://simgrid.gforge.inria.fr/simgrid/simgrid.dtd">
+ <platform version="4">
+ <config id="General">
+   <prop id="network/TCP-gamma" value="-1"></prop>
+   <prop id="network/latency-factor" value="1"></prop>
+   <prop id="network/bandwidth-factor" value="1"></prop>
+ </config>
+ <AS  id="AS0"  routing="Full">
+   <host id="MAIN" speed="1f"/>
+   <host id="CPU0" speed="2000000000f"/>
+   <host id="CPU1" speed="2000000000f"/>
+   <host id="CPU2" speed="2000000000f"/>
+   <host id="CPU3" speed="2000000000f"/>
+   <host id="CPU4" speed="2000000000f"/>
+   <host id="CPU5" speed="2000000000f"/>
+   <host id="CPU6" speed="2000000000f"/>
+   <host id="CPU7" speed="2000000000f"/>
+   <host id="CUDA0" speed="2000000000f">
+     <prop id="memsize" value="3145728000"/>
+     <prop id="memcpy_peer" value="0"/>
+   </host>
+   <host id="CUDA1" speed="2000000000f">
+     <prop id="memsize" value="3145728000"/>
+     <prop id="memcpy_peer" value="0"/>
+   </host>
+   <host id="CUDA2" speed="2000000000f">
+     <prop id="memsize" value="3145728000"/>
+     <prop id="memcpy_peer" value="0"/>
+   </host>
+   <host id="OpenCL0" speed="2000000000f">
+     <prop id="memsize" value="3145728000"/>
+   </host>
+   <host id="OpenCL1" speed="2000000000f">
+     <prop id="memsize" value="3145728000"/>
+   </host>
+   <host id="OpenCL2" speed="2000000000f">
+     <prop id="memsize" value="3145728000"/>
+   </host>
+
+   <host id="RAM" speed="1f"/>
+
+   <link id="Share" bandwidth="5988779905.433726Bps" latency="0.000000s"/>
+
+   <link id="RAM-CUDA0" bandwidth="1653658596.433726Bps" latency="0.000012s"/>
+   <link id="CUDA0-RAM" bandwidth="993981963.299022Bps" latency="0.000012s"/>
+   <link id="RAM-CUDA1" bandwidth="869707794.319062Bps" latency="0.000013s"/>
+   <link id="CUDA1-RAM" bandwidth="925610046.160954Bps" latency="0.000013s"/>
+   <link id="RAM-CUDA2" bandwidth="1653711631.023217Bps" latency="0.000012s"/>
+   <link id="CUDA2-RAM" bandwidth="981498659.805904Bps" latency="0.000013s"/>
+   <link id="RAM-OpenCL0" bandwidth="3975378655.154796Bps" latency="0.000020s"/>
+   <link id="OpenCL0-RAM" bandwidth="2937163571.508681Bps" latency="0.000064s"/>
+   <link id="RAM-OpenCL1" bandwidth="2636838726.154693Bps" latency="0.000020s"/>
+   <link id="OpenCL1-RAM" bandwidth="2610203570.688437Bps" latency="0.000036s"/>
+   <link id="RAM-OpenCL2" bandwidth="3992447566.540525Bps" latency="0.000020s"/>
+   <link id="OpenCL2-RAM" bandwidth="2812550617.128727Bps" latency="0.000037s"/>
+   <route src="RAM" dst="CUDA0" symmetrical="NO"><link_ctn id="RAM-CUDA0"/><link_ctn id="Share"/></route>
+   <route src="CUDA0" dst="RAM" symmetrical="NO"><link_ctn id="CUDA0-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="CUDA1" symmetrical="NO"><link_ctn id="RAM-CUDA1"/><link_ctn id="Share"/></route>
+   <route src="CUDA1" dst="RAM" symmetrical="NO"><link_ctn id="CUDA1-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="CUDA2" symmetrical="NO"><link_ctn id="RAM-CUDA2"/><link_ctn id="Share"/></route>
+   <route src="CUDA2" dst="RAM" symmetrical="NO"><link_ctn id="CUDA2-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="OpenCL0" symmetrical="NO"><link_ctn id="RAM-OpenCL0"/><link_ctn id="Share"/></route>
+   <route src="OpenCL0" dst="RAM" symmetrical="NO"><link_ctn id="OpenCL0-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="OpenCL1" symmetrical="NO"><link_ctn id="RAM-OpenCL1"/><link_ctn id="Share"/></route>
+   <route src="OpenCL1" dst="RAM" symmetrical="NO"><link_ctn id="OpenCL1-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="OpenCL2" symmetrical="NO"><link_ctn id="RAM-OpenCL2"/><link_ctn id="Share"/></route>
+   <route src="OpenCL2" dst="RAM" symmetrical="NO"><link_ctn id="OpenCL2-RAM"/><link_ctn id="Share"/></route>
+ </AS>
+ </platform>

+ 70 - 0
tools/perfmodels/sampling/bus/hannibal-pitch.platform.xml

@@ -0,0 +1,70 @@
+<?xml version="1.0"?>
+ <!DOCTYPE platform SYSTEM "http://simgrid.gforge.inria.fr/simgrid.dtd">
+ <platform version="3">
+ <config id="General">
+   <prop id="network/TCP_gamma" value="-1"></prop>
+   <prop id="network/latency_factor" value="1"></prop>
+   <prop id="network/bandwidth_factor" value="1"></prop>
+ </config>
+ <AS  id="AS0"  routing="Full">
+   <host id="MAIN" power="1"/>
+   <host id="CPU0" power="2000000000"/>
+   <host id="CPU1" power="2000000000"/>
+   <host id="CPU2" power="2000000000"/>
+   <host id="CPU3" power="2000000000"/>
+   <host id="CPU4" power="2000000000"/>
+   <host id="CPU5" power="2000000000"/>
+   <host id="CPU6" power="2000000000"/>
+   <host id="CPU7" power="2000000000"/>
+   <host id="CUDA0" power="2000000000">
+     <prop id="memsize" value="3145728000"/>
+     <prop id="memcpy_peer" value="0"/>
+   </host>
+   <host id="CUDA1" power="2000000000">
+     <prop id="memsize" value="3145728000"/>
+     <prop id="memcpy_peer" value="0"/>
+   </host>
+   <host id="CUDA2" power="2000000000">
+     <prop id="memsize" value="3145728000"/>
+     <prop id="memcpy_peer" value="0"/>
+   </host>
+   <host id="OpenCL0" power="2000000000">
+     <prop id="memsize" value="3145728000"/>
+   </host>
+   <host id="OpenCL1" power="2000000000">
+     <prop id="memsize" value="3145728000"/>
+   </host>
+   <host id="OpenCL2" power="2000000000">
+     <prop id="memsize" value="3145728000"/>
+   </host>
+
+   <host id="RAM" power="1"/>
+
+   <link id="Share" bandwidth="5988779905.433726" latency="0.000000"/>
+   
+   <link id="RAM-CUDA0" bandwidth="1653658596.433726" latency="0.000012"/>
+   <link id="CUDA0-RAM" bandwidth="993981963.299022" latency="0.000012"/>
+   <link id="RAM-CUDA1" bandwidth="869707794.319062" latency="0.000013"/>
+   <link id="CUDA1-RAM" bandwidth="925610046.160954" latency="0.000013"/>
+   <link id="RAM-CUDA2" bandwidth="1653711631.023217" latency="0.000012"/>
+   <link id="CUDA2-RAM" bandwidth="981498659.805904" latency="0.000013"/>
+   <link id="RAM-OpenCL0" bandwidth="3975378655.154796" latency="0.000020"/>
+   <link id="OpenCL0-RAM" bandwidth="2937163571.508681" latency="0.000064"/>
+   <link id="RAM-OpenCL1" bandwidth="2636838726.154693" latency="0.000020"/>
+   <link id="OpenCL1-RAM" bandwidth="2610203570.688437" latency="0.000036"/>
+   <link id="RAM-OpenCL2" bandwidth="3992447566.540525" latency="0.000020"/>
+   <link id="OpenCL2-RAM" bandwidth="2812550617.128727" latency="0.000037"/>
+   <route src="RAM" dst="CUDA0" symmetrical="NO"><link_ctn id="RAM-CUDA0"/><link_ctn id="Share"/></route>
+   <route src="CUDA0" dst="RAM" symmetrical="NO"><link_ctn id="CUDA0-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="CUDA1" symmetrical="NO"><link_ctn id="RAM-CUDA1"/><link_ctn id="Share"/></route>
+   <route src="CUDA1" dst="RAM" symmetrical="NO"><link_ctn id="CUDA1-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="CUDA2" symmetrical="NO"><link_ctn id="RAM-CUDA2"/><link_ctn id="Share"/></route>
+   <route src="CUDA2" dst="RAM" symmetrical="NO"><link_ctn id="CUDA2-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="OpenCL0" symmetrical="NO"><link_ctn id="RAM-OpenCL0"/><link_ctn id="Share"/></route>
+   <route src="OpenCL0" dst="RAM" symmetrical="NO"><link_ctn id="OpenCL0-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="OpenCL1" symmetrical="NO"><link_ctn id="RAM-OpenCL1"/><link_ctn id="Share"/></route>
+   <route src="OpenCL1" dst="RAM" symmetrical="NO"><link_ctn id="OpenCL1-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="OpenCL2" symmetrical="NO"><link_ctn id="RAM-OpenCL2"/><link_ctn id="Share"/></route>
+   <route src="OpenCL2" dst="RAM" symmetrical="NO"><link_ctn id="OpenCL2-RAM"/><link_ctn id="Share"/></route>
+ </AS>
+ </platform>

+ 7 - 0
tools/perfmodels/sampling/bus/hannibal.affinity

@@ -0,0 +1,7 @@
+# GPU	CPU0	CPU1	CPU2	CPU3	CPU4	CPU5	CPU6	CPU7	
+0	0	1	2	3	4	5	6	7	
+1	4	5	6	7	0	1	2	3	
+2	4	5	6	7	0	1	2	3	
+0	0	1	2	3	4	5	6	7	
+1	4	5	6	7	0	1	2	3	
+2	4	5	6	7	0	1	2	3	

+ 17 - 0
tools/perfmodels/sampling/bus/hannibal.bandwidth

@@ -0,0 +1,17 @@
+# to 0		to 1		to 2		to 3		to 4		to 5		to 6		to 7		to 8		to 9		to 10		to 11		to 12		to 13		to 14		to 15		
+0.000000	5988.779905	3149.675860	5988.971975	3975.378655	2636.838726	3992.447567	nan	nan	nan	nan	nan	nan	nan	nan	nan
+3599.738919	0.000000	1679.850942	2248.345554	1889.122528	1521.977521	1892.968372	nan	nan	nan	nan	nan	nan	nan	nan	nan
+3352.127736	2149.165370	0.000000	2149.190105	1818.623736	1475.884075	1822.187624	nan	nan	nan	nan	nan	nan	nan	nan	nan
+3554.530216	2230.599117	1669.939421	0.000000	1876.596887	1513.836926	1880.391850	nan	nan	nan	nan	nan	nan	nan	nan	nan
+2937.163572	1970.662958	1519.854976	1970.683755	0.000000	1389.455231	1692.226493	nan	nan	nan	nan	nan	nan	nan	nan	nan
+2610.203571	1817.881699	1427.338068	1817.899396	1575.646193	0.000000	1578.320689	nan	nan	nan	nan	nan	nan	nan	nan	nan
+2812.550617	1913.772761	1485.791058	1913.792375	1647.181820	1360.930908	0.000000	nan	nan	nan	nan	nan	nan	nan	nan	nan
+nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan
+nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan
+nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan
+nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan
+nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan
+nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan
+nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan
+nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan
+nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan

+ 4 - 0
tools/perfmodels/sampling/bus/hannibal.config

@@ -0,0 +1,4 @@
+# Current configuration
+8 # Number of CPUs
+3 # Number of CUDA devices
+3 # Number of OpenCL devices

+ 17 - 0
tools/perfmodels/sampling/bus/hannibal.latency

@@ -0,0 +1,17 @@
+# to 0		to 1		to 2		to 3		to 4		to 5		to 6		to 7		to 8		to 9		to 10		to 11		to 12		to 13		to 14		to 15		
+0.000000	12.460938	12.570312	12.468750	20.000000	20.328125	19.593750	nan	nan	nan	nan	nan	nan	nan	nan	nan
+12.476562	0.000000	25.046875	24.945312	32.476562	32.804688	32.070312	nan	nan	nan	nan	nan	nan	nan	nan	nan
+12.593750	25.054688	0.000000	25.062500	32.593750	32.921875	32.187500	nan	nan	nan	nan	nan	nan	nan	nan	nan
+12.539062	25.000000	25.109375	0.000000	32.539062	32.867188	32.132812	nan	nan	nan	nan	nan	nan	nan	nan	nan
+63.601562	76.062500	76.171875	76.070312	0.000000	83.929688	83.195312	nan	nan	nan	nan	nan	nan	nan	nan	nan
+35.992188	48.453125	48.562500	48.460938	55.992188	0.000000	55.585938	nan	nan	nan	nan	nan	nan	nan	nan	nan
+36.765625	49.226562	49.335938	49.234375	56.765625	57.093750	0.000000	nan	nan	nan	nan	nan	nan	nan	nan	nan
+nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan
+nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan
+nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan
+nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan
+nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan
+nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan
+nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan
+nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan
+nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan

+ 70 - 0
tools/perfmodels/sampling/bus/hannibal.platform.v4.xml

@@ -0,0 +1,70 @@
+<?xml version="1.0"?>
+ <!DOCTYPE platform SYSTEM "http://simgrid.gforge.inria.fr/simgrid/simgrid.dtd">
+ <platform version="4">
+ <config id="General">
+   <prop id="network/TCP-gamma" value="-1"></prop>
+   <prop id="network/latency-factor" value="1"></prop>
+   <prop id="network/bandwidth-factor" value="1"></prop>
+ </config>
+ <AS  id="AS0"  routing="Full">
+   <host id="MAIN" speed="1f"/>
+   <host id="CPU0" speed="2000000000f"/>
+   <host id="CPU1" speed="2000000000f"/>
+   <host id="CPU2" speed="2000000000f"/>
+   <host id="CPU3" speed="2000000000f"/>
+   <host id="CPU4" speed="2000000000f"/>
+   <host id="CPU5" speed="2000000000f"/>
+   <host id="CPU6" speed="2000000000f"/>
+   <host id="CPU7" speed="2000000000f"/>
+   <host id="CUDA0" speed="2000000000f">
+     <prop id="memsize" value="3145728000"/>
+     <prop id="memcpy_peer" value="0"/>
+   </host>
+   <host id="CUDA1" speed="2000000000f">
+     <prop id="memsize" value="3145728000"/>
+     <prop id="memcpy_peer" value="0"/>
+   </host>
+   <host id="CUDA2" speed="2000000000f">
+     <prop id="memsize" value="3145728000"/>
+     <prop id="memcpy_peer" value="0"/>
+   </host>
+   <host id="OpenCL0" speed="2000000000f">
+     <prop id="memsize" value="3145728000"/>
+   </host>
+   <host id="OpenCL1" speed="2000000000f">
+     <prop id="memsize" value="3145728000"/>
+   </host>
+   <host id="OpenCL2" speed="2000000000f">
+     <prop id="memsize" value="3145728000"/>
+   </host>
+
+   <host id="RAM" speed="1f"/>
+
+   <link id="Share" bandwidth="5988971975.023217Bps" latency="0.000000s"/>
+
+   <link id="RAM-CUDA0" bandwidth="5988779905.433726Bps" latency="0.000012s"/>
+   <link id="CUDA0-RAM" bandwidth="3599738919.299022Bps" latency="0.000012s"/>
+   <link id="RAM-CUDA1" bandwidth="3149675860.319062Bps" latency="0.000013s"/>
+   <link id="CUDA1-RAM" bandwidth="3352127736.160954Bps" latency="0.000013s"/>
+   <link id="RAM-CUDA2" bandwidth="5988971975.023217Bps" latency="0.000012s"/>
+   <link id="CUDA2-RAM" bandwidth="3554530215.805904Bps" latency="0.000013s"/>
+   <link id="RAM-OpenCL0" bandwidth="3975378655.154796Bps" latency="0.000020s"/>
+   <link id="OpenCL0-RAM" bandwidth="2937163571.508681Bps" latency="0.000064s"/>
+   <link id="RAM-OpenCL1" bandwidth="2636838726.154693Bps" latency="0.000020s"/>
+   <link id="OpenCL1-RAM" bandwidth="2610203570.688437Bps" latency="0.000036s"/>
+   <link id="RAM-OpenCL2" bandwidth="3992447566.540525Bps" latency="0.000020s"/>
+   <link id="OpenCL2-RAM" bandwidth="2812550617.128727Bps" latency="0.000037s"/>
+   <route src="RAM" dst="CUDA0" symmetrical="NO"><link_ctn id="RAM-CUDA0"/><link_ctn id="Share"/></route>
+   <route src="CUDA0" dst="RAM" symmetrical="NO"><link_ctn id="CUDA0-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="CUDA1" symmetrical="NO"><link_ctn id="RAM-CUDA1"/><link_ctn id="Share"/></route>
+   <route src="CUDA1" dst="RAM" symmetrical="NO"><link_ctn id="CUDA1-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="CUDA2" symmetrical="NO"><link_ctn id="RAM-CUDA2"/><link_ctn id="Share"/></route>
+   <route src="CUDA2" dst="RAM" symmetrical="NO"><link_ctn id="CUDA2-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="OpenCL0" symmetrical="NO"><link_ctn id="RAM-OpenCL0"/><link_ctn id="Share"/></route>
+   <route src="OpenCL0" dst="RAM" symmetrical="NO"><link_ctn id="OpenCL0-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="OpenCL1" symmetrical="NO"><link_ctn id="RAM-OpenCL1"/><link_ctn id="Share"/></route>
+   <route src="OpenCL1" dst="RAM" symmetrical="NO"><link_ctn id="OpenCL1-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="OpenCL2" symmetrical="NO"><link_ctn id="RAM-OpenCL2"/><link_ctn id="Share"/></route>
+   <route src="OpenCL2" dst="RAM" symmetrical="NO"><link_ctn id="OpenCL2-RAM"/><link_ctn id="Share"/></route>
+ </AS>
+ </platform>

+ 70 - 0
tools/perfmodels/sampling/bus/hannibal.platform.xml

@@ -0,0 +1,70 @@
+<?xml version="1.0"?>
+ <!DOCTYPE platform SYSTEM "http://simgrid.gforge.inria.fr/simgrid.dtd">
+ <platform version="3">
+ <config id="General">
+   <prop id="network/TCP_gamma" value="-1"></prop>
+   <prop id="network/latency_factor" value="1"></prop>
+   <prop id="network/bandwidth_factor" value="1"></prop>
+ </config>
+ <AS  id="AS0"  routing="Full">
+   <host id="MAIN" power="1"/>
+   <host id="CPU0" power="2000000000"/>
+   <host id="CPU1" power="2000000000"/>
+   <host id="CPU2" power="2000000000"/>
+   <host id="CPU3" power="2000000000"/>
+   <host id="CPU4" power="2000000000"/>
+   <host id="CPU5" power="2000000000"/>
+   <host id="CPU6" power="2000000000"/>
+   <host id="CPU7" power="2000000000"/>
+   <host id="CUDA0" power="2000000000">
+     <prop id="memsize" value="3145728000"/>
+     <prop id="memcpy_peer" value="0"/>
+   </host>
+   <host id="CUDA1" power="2000000000">
+     <prop id="memsize" value="3145728000"/>
+     <prop id="memcpy_peer" value="0"/>
+   </host>
+   <host id="CUDA2" power="2000000000">
+     <prop id="memsize" value="3145728000"/>
+     <prop id="memcpy_peer" value="0"/>
+   </host>
+   <host id="OpenCL0" power="2000000000">
+     <prop id="memsize" value="3145728000"/>
+   </host>
+   <host id="OpenCL1" power="2000000000">
+     <prop id="memsize" value="3145728000"/>
+   </host>
+   <host id="OpenCL2" power="2000000000">
+     <prop id="memsize" value="3145728000"/>
+   </host>
+
+   <host id="RAM" power="1"/>
+
+   <link id="Share" bandwidth="5988971975.023217" latency="0.000000"/>
+
+   <link id="RAM-CUDA0" bandwidth="5988779905.433726" latency="0.000012"/>
+   <link id="CUDA0-RAM" bandwidth="3599738919.299022" latency="0.000012"/>
+   <link id="RAM-CUDA1" bandwidth="3149675860.319062" latency="0.000013"/>
+   <link id="CUDA1-RAM" bandwidth="3352127736.160954" latency="0.000013"/>
+   <link id="RAM-CUDA2" bandwidth="5988971975.023217" latency="0.000012"/>
+   <link id="CUDA2-RAM" bandwidth="3554530215.805904" latency="0.000013"/>
+   <link id="RAM-OpenCL0" bandwidth="3975378655.154796" latency="0.000020"/>
+   <link id="OpenCL0-RAM" bandwidth="2937163571.508681" latency="0.000064"/>
+   <link id="RAM-OpenCL1" bandwidth="2636838726.154693" latency="0.000020"/>
+   <link id="OpenCL1-RAM" bandwidth="2610203570.688437" latency="0.000036"/>
+   <link id="RAM-OpenCL2" bandwidth="3992447566.540525" latency="0.000020"/>
+   <link id="OpenCL2-RAM" bandwidth="2812550617.128727" latency="0.000037"/>
+   <route src="RAM" dst="CUDA0" symmetrical="NO"><link_ctn id="RAM-CUDA0"/><link_ctn id="Share"/></route>
+   <route src="CUDA0" dst="RAM" symmetrical="NO"><link_ctn id="CUDA0-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="CUDA1" symmetrical="NO"><link_ctn id="RAM-CUDA1"/><link_ctn id="Share"/></route>
+   <route src="CUDA1" dst="RAM" symmetrical="NO"><link_ctn id="CUDA1-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="CUDA2" symmetrical="NO"><link_ctn id="RAM-CUDA2"/><link_ctn id="Share"/></route>
+   <route src="CUDA2" dst="RAM" symmetrical="NO"><link_ctn id="CUDA2-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="OpenCL0" symmetrical="NO"><link_ctn id="RAM-OpenCL0"/><link_ctn id="Share"/></route>
+   <route src="OpenCL0" dst="RAM" symmetrical="NO"><link_ctn id="OpenCL0-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="OpenCL1" symmetrical="NO"><link_ctn id="RAM-OpenCL1"/><link_ctn id="Share"/></route>
+   <route src="OpenCL1" dst="RAM" symmetrical="NO"><link_ctn id="OpenCL1-RAM"/><link_ctn id="Share"/></route>
+   <route src="RAM" dst="OpenCL2" symmetrical="NO"><link_ctn id="RAM-OpenCL2"/><link_ctn id="Share"/></route>
+   <route src="OpenCL2" dst="RAM" symmetrical="NO"><link_ctn id="OpenCL2-RAM"/><link_ctn id="Share"/></route>
+ </AS>
+ </platform>

+ 104 - 0
tools/perfmodels/sampling/codelets/45/chol_model_11.hannibal

@@ -0,0 +1,104 @@
+##################
+#	Performance	Model	Version
+45
+
+####################
+# COMBs
+# number of combinations
+3
+####################
+# COMB_1
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+0
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda0_impl0 (Comb1)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+617e5fe6	3686400	0	1.701016e+05	7.229737e+03	4.082438e+06	6.956835e+11	24
+
+####################
+# COMB_2
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+1
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda1_impl0 (Comb2)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+617e5fe6	3686400	0	1.188776e+05	9.331204e+02	2.113643e+08	2.512803e+13	1778
+
+####################
+# COMB_3
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+2
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda2_impl0 (Comb3)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+617e5fe6	3686400	0	1.205438e+05	2.044578e+03	2.189075e+08	2.639552e+13	1816
+

+ 1 - 0
tools/perfmodels/sampling/codelets/45/chol_model_11.hannibal-pitch

@@ -0,0 +1 @@
+chol_model_11.hannibal

+ 104 - 0
tools/perfmodels/sampling/codelets/45/chol_model_21.hannibal

@@ -0,0 +1,104 @@
+##################
+#	Performance	Model	Version
+45
+
+####################
+# COMBs
+# number of combinations
+3
+####################
+# COMB_1
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+0
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda0_impl0 (Comb1)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+ff82dda0	7372800	8.856576e+08	1.551780e+04	9.258624e+03	5.415867e+08	1.139602e+13	34901
+
+####################
+# COMB_2
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+1
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda1_impl0 (Comb2)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+ff82dda0	7372800	8.856576e+08	1.787309e+04	1.121893e+04	5.782658e+08	1.440761e+13	32354
+
+####################
+# COMB_3
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+2
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda2_impl0 (Comb3)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+ff82dda0	7372800	8.856576e+08	1.675795e+04	1.012077e+04	5.931309e+08	1.356507e+13	35394
+

+ 1 - 0
tools/perfmodels/sampling/codelets/45/chol_model_21.hannibal-pitch

@@ -0,0 +1 @@
+chol_model_21.hannibal

+ 104 - 0
tools/perfmodels/sampling/codelets/45/chol_model_22.hannibal

@@ -0,0 +1,104 @@
+##################
+#	Performance	Model	Version
+45
+
+####################
+# COMBs
+# number of combinations
+3
+####################
+# COMB_1
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+0
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda0_impl0 (Comb1)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+24c84a50	11059200	1.769472e+09	5.763709e+03	3.768350e+03	4.501024e+09	3.703209e+13	780925
+
+####################
+# COMB_2
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+1
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda1_impl0 (Comb2)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+24c84a50	11059200	1.769472e+09	5.889910e+03	4.485232e+03	4.352661e+09	4.050353e+13	739003
+
+####################
+# COMB_3
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+2
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda2_impl0 (Comb3)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+24c84a50	11059200	1.769472e+09	5.782569e+03	3.939612e+03	4.412291e+09	3.735706e+13	763033
+

+ 1 - 0
tools/perfmodels/sampling/codelets/45/chol_model_22.hannibal-pitch

@@ -0,0 +1 @@
+chol_model_22.hannibal

+ 104 - 0
tools/perfmodels/sampling/codelets/45/starpu_slu_lu_model_11.hannibal

@@ -0,0 +1,104 @@
+##################
+#	Performance	Model	Version
+45
+
+####################
+# COMBs
+# number of combinations
+3
+####################
+# COMB_1
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+0
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda0_impl0 (Comb1)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+617e5fe6	3686400	0.000000e+00	1.250229e+05	4.416720e+03	1.500275e+06	1.878028e+11	12
+
+####################
+# COMB_2
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+1
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda1_impl0 (Comb2)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+617e5fe6	3686400	0.000000e+00	8.424585e+04	1.140908e+03	4.802014e+07	4.046239e+12	570
+
+####################
+# COMB_3
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+2
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda2_impl0 (Comb3)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+617e5fe6	3686400	0.000000e+00	8.331807e+04	6.460292e+02	5.782274e+07	4.817969e+12	694
+

+ 1 - 0
tools/perfmodels/sampling/codelets/45/starpu_slu_lu_model_11.hannibal-pitch

@@ -0,0 +1 @@
+starpu_slu_lu_model_11.hannibal

+ 104 - 0
tools/perfmodels/sampling/codelets/45/starpu_slu_lu_model_12.hannibal

@@ -0,0 +1,104 @@
+##################
+#	Performance	Model	Version
+45
+
+####################
+# COMBs
+# number of combinations
+3
+####################
+# COMB_1
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+0
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda0_impl0 (Comb1)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+ff82dda0	7372800	0.000000e+00	1.072902e+04	3.731292e+03	7.780684e+07	9.357572e+11	7252
+
+####################
+# COMB_2
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+1
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda1_impl0 (Comb2)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+ff82dda0	7372800	0.000000e+00	1.250147e+04	5.489974e+03	7.944684e+07	1.184741e+12	6355
+
+####################
+# COMB_3
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+2
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda2_impl0 (Comb3)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+ff82dda0	7372800	0.000000e+00	1.131230e+04	4.120480e+03	8.165221e+07	1.046224e+12	7218
+

+ 1 - 0
tools/perfmodels/sampling/codelets/45/starpu_slu_lu_model_12.hannibal-pitch

@@ -0,0 +1 @@
+starpu_slu_lu_model_12.hannibal

+ 104 - 0
tools/perfmodels/sampling/codelets/45/starpu_slu_lu_model_21.hannibal

@@ -0,0 +1,104 @@
+##################
+#	Performance	Model	Version
+45
+
+####################
+# COMBs
+# number of combinations
+3
+####################
+# COMB_1
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+0
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda0_impl0 (Comb1)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+ff82dda0	7372800	0.000000e+00	1.103789e+04	3.664518e+03	7.889881e+07	9.668643e+11	7148
+
+####################
+# COMB_2
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+1
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda1_impl0 (Comb2)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+ff82dda0	7372800	0.000000e+00	1.284524e+04	5.462619e+03	8.441889e+07	1.280490e+12	6572
+
+####################
+# COMB_3
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+2
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda2_impl0 (Comb3)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+ff82dda0	7372800	0.000000e+00	1.171798e+04	4.121992e+03	8.325626e+07	1.096315e+12	7105
+

+ 1 - 0
tools/perfmodels/sampling/codelets/45/starpu_slu_lu_model_21.hannibal-pitch

@@ -0,0 +1 @@
+starpu_slu_lu_model_21.hannibal

+ 104 - 0
tools/perfmodels/sampling/codelets/45/starpu_slu_lu_model_22.hannibal

@@ -0,0 +1,104 @@
+##################
+#	Performance	Model	Version
+45
+
+####################
+# COMBs
+# number of combinations
+3
+####################
+# COMB_1
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+0
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda0_impl0 (Comb1)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+24c84a50	11059200	0.000000e+00	5.116253e+03	1.361494e+03	9.170526e+08	5.024130e+12	179243
+
+####################
+# COMB_2
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+1
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda1_impl0 (Comb2)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+24c84a50	11059200	0.000000e+00	5.228920e+03	1.967478e+03	8.761527e+08	5.229949e+12	167559
+
+####################
+# COMB_3
+# number of types devices
+1
+####################
+# DEV_0
+# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3)
+1
+####################
+# DEV_0
+# device id 
+2
+####################
+# DEV_0
+# number of cores 
+1
+##########
+# number of implementations
+1
+#####
+# Model for cuda2_impl0 (Comb3)
+#	number	of	entries
+1
+#	sumlnx	sumlnx2	sumlny	sumlnxlny	alpha	beta	n	minx	maxx
+0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	nan	nan	0	0	0
+#	a	b	c
+nan	nan	nan
+# not multiple-regression-base
+0
+#	hash	size	flops	mean	(us)	dev	(us)	sum	sum2	n
+24c84a50	11059200	0.000000e+00	5.131691e+03	1.494139e+03	8.920059e+08	4.965550e+12	173823
+

+ 1 - 0
tools/perfmodels/sampling/codelets/45/starpu_slu_lu_model_22.hannibal-pitch

@@ -0,0 +1 @@
+starpu_slu_lu_model_22.hannibal

+ 21 - 3
tools/starpu_fxt_data_trace.c

@@ -2,7 +2,7 @@
  *
  *
  * Copyright (C) 2013                                     Joris Pablo
  * Copyright (C) 2013                                     Joris Pablo
  * Copyright (C) 2014,2015,2017                           CNRS
  * Copyright (C) 2014,2015,2017                           CNRS
- * Copyright (C) 2011-2014,2016                           Université de Bordeaux
+ * Copyright (C) 2011-2014,2016,2019                      Université de Bordeaux
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,6 +19,7 @@
 #include <stdio.h>
 #include <stdio.h>
 #include <starpu.h>
 #include <starpu.h>
 #include <string.h>
 #include <string.h>
+#include <sys/stat.h>
 #include <common/config.h>
 #include <common/config.h>
 
 
 #define PROGNAME "starpu_fxt_data_trace"
 #define PROGNAME "starpu_fxt_data_trace"
@@ -75,7 +76,8 @@ static void write_gp(int argc, char **argv)
 		exit(-1);
 		exit(-1);
 	}
 	}
 	char codelet_name[MAX_LINE_SIZE];
 	char codelet_name[MAX_LINE_SIZE];
-	FILE *plt = fopen("data_trace.gp", "w+");
+	const char *file_name = "data_trace.gp";
+	FILE *plt = fopen(file_name, "w+");
 	if(!plt)
 	if(!plt)
 	{
 	{
 		perror("Error while creating data_trace.gp:");
 		perror("Error while creating data_trace.gp:");
@@ -129,7 +131,6 @@ static void write_gp(int argc, char **argv)
 	}
 	}
 	fprintf(plt, "\n");
 	fprintf(plt, "\n");
 
 
-	fprintf(stdout, "Gnuplot file <data_trace.gp> has been successfully created.\n");
 	if(fclose(codelet_list))
 	if(fclose(codelet_list))
 	{
 	{
 		perror("close failed :");
 		perror("close failed :");
@@ -141,6 +142,23 @@ static void write_gp(int argc, char **argv)
 		perror("close failed :");
 		perror("close failed :");
 		exit(-1);
 		exit(-1);
 	}
 	}
+
+	struct stat sb;
+	int ret = stat(file_name, &sb);
+	if (ret)
+	{
+		perror("stat");
+		STARPU_ABORT();
+	}
+
+	/* Make the gnuplot script executable for the owner */
+	ret = chmod(file_name, sb.st_mode|S_IXUSR);
+	if (ret)
+	{
+		perror("chmod");
+		STARPU_ABORT();
+	}
+	fprintf(stdout, "Gnuplot file <data_trace.gp> has been successfully created.\n");
 }
 }
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)

+ 54 - 36
tools/starpu_replay.c

@@ -268,7 +268,10 @@ static void arrays_managing(int mode)
 /* Check if a handle hasn't been registered yet */
 /* Check if a handle hasn't been registered yet */
 static void variable_data_register_check(size_t * array_of_size, int nb_handles)
 static void variable_data_register_check(size_t * array_of_size, int nb_handles)
 {
 {
-	int h;
+	int h, i;
+	starpu_data_handle_t orig_handles[nb_handles];
+
+	ARRAY_DUP(handles_ptr, orig_handles, nb_handles);
 
 
 	for (h = 0 ; h < nb_handles ; h++)
 	for (h = 0 ; h < nb_handles ; h++)
 	{
 	{
@@ -276,16 +279,29 @@ static void variable_data_register_check(size_t * array_of_size, int nb_handles)
 		{
 		{
 			struct handle * handles_cell;
 			struct handle * handles_cell;
 
 
-			_STARPU_MALLOC(handles_cell, sizeof(*handles_cell));
-			STARPU_ASSERT(handles_cell != NULL);
+			for (i = 0; i < h; i++)
+			{
+				/* Maybe we just registered it in this very h loop */
+				if (handles_ptr[h] == orig_handles[i])
+				{
+					handles_ptr[h] = handles_ptr[i];
+					break;
+				}
+			}
+
+			if (i == h)
+			{
+				_STARPU_MALLOC(handles_cell, sizeof(*handles_cell));
+				STARPU_ASSERT(handles_cell != NULL);
 
 
-			handles_cell->handle = handles_ptr[h]; /* Get the hidden key (initial handle from the file) to store it as a key*/
+				handles_cell->handle = handles_ptr[h]; /* Get the hidden key (initial handle from the file) to store it as a key*/
 
 
-			starpu_variable_data_register(handles_ptr+h, STARPU_MAIN_RAM, (uintptr_t) 1, array_of_size[h]);
+				starpu_variable_data_register(handles_ptr+h, STARPU_MAIN_RAM, (uintptr_t) 1, array_of_size[h]);
 
 
-			handles_cell->mem_ptr = handles_ptr[h]; /* Store the new value of the handle into the hash table */
+				handles_cell->mem_ptr = handles_ptr[h]; /* Store the new value of the handle into the hash table */
 
 
-			HASH_ADD(hh, handles_hash, handle, sizeof(handles_ptr[h]), handles_cell);
+				HASH_ADD(hh, handles_hash, handle, sizeof(handles_ptr[h]), handles_cell);
+			}
 		}
 		}
 	}
 	}
 }
 }
@@ -532,6 +548,7 @@ int main(int argc, char **argv)
 	reset();
 	reset();
 
 
 	double start = starpu_timing_now();
 	double start = starpu_timing_now();
+	int linenum = 0;
 
 
 	while(1)
 	while(1)
 	{
 	{
@@ -571,6 +588,8 @@ int main(int argc, char **argv)
 			s_allocated *= 2;
 			s_allocated *= 2;
 		}
 		}
 
 
+		linenum++;
+
 		if (ln == s)
 		if (ln == s)
 		{
 		{
 			/* Empty line, do task */
 			/* Empty line, do task */
@@ -652,6 +671,7 @@ int main(int argc, char **argv)
 
 
 							fprintf(stderr, "[starpu][Warning] Error loading perfmodel symbol %s\n", model);
 							fprintf(stderr, "[starpu][Warning] Error loading perfmodel symbol %s\n", model);
 							fprintf(stderr, "[starpu][Warning] Taking only measurements from the given execution, and forcing execution on worker %d\n", workerid);
 							fprintf(stderr, "[starpu][Warning] Taking only measurements from the given execution, and forcing execution on worker %d\n", workerid);
+							starpu_perfmodel_unload_model(&realmodel->perfmodel);
 							free(realmodel->model_name);
 							free(realmodel->model_name);
 							free(realmodel);
 							free(realmodel);
 							realmodel = NULL;
 							realmodel = NULL;
@@ -659,6 +679,9 @@ int main(int argc, char **argv)
 
 
 					}
 					}
 
 
+					struct starpu_perfmodel_arch *arch = starpu_worker_get_perf_archtype(workerid, 0);
+
+					unsigned comb = starpu_perfmodel_arch_comb_add(arch->ndevices, arch->devices);
 					unsigned narch = starpu_perfmodel_get_narch_combs();
 					unsigned narch = starpu_perfmodel_get_narch_combs();
 
 
 					struct task_arg *arg;
 					struct task_arg *arg;
@@ -669,9 +692,6 @@ int main(int argc, char **argv)
 
 
 					if (realmodel == NULL)
 					if (realmodel == NULL)
 					{
 					{
-						struct starpu_perfmodel_arch *arch = starpu_worker_get_perf_archtype(workerid, 0);
-
-						unsigned comb = starpu_perfmodel_arch_comb_get(arch->ndevices, arch->devices);
 						/* Erf, do without perfmodel, for execution there */
 						/* Erf, do without perfmodel, for execution there */
 						task->task.workerid = workerid;
 						task->task.workerid = workerid;
 						task->task.execute_on_a_specific_worker = 1;
 						task->task.execute_on_a_specific_worker = 1;
@@ -795,19 +815,19 @@ int main(int argc, char **argv)
 		}
 		}
 		else if (TEST("Parameters"))
 		else if (TEST("Parameters"))
 		{
 		{
-			/* Parameters line format is PARAM1_PARAM2_(...)PARAMi_(...)PARAMn */
+			/* Parameters line format is PARAM1 PARAM2 (...)PARAMi (...)PARAMn */
 			char * param_str = s + 12;
 			char * param_str = s + 12;
 			int count = 0;
 			int count = 0;
 
 
 			for (i = 0 ; param_str[i] != '\n'; i++)
 			for (i = 0 ; param_str[i] != '\n'; i++)
 			{
 			{
-				if (param_str[i] == '_') /* Checking the number of '_' (underscore), assuming that the file is not corrupted */
+				if (param_str[i] == ' ') /* Checking the number of ' ' (space), assuming that the file is not corrupted */
 				{
 				{
 					count++;
 					count++;
 				}
 				}
 			}
 			}
 
 
-			nb_parameters = count + 1; /* There is one underscore per paramater execept for the last one, that's why we have to add +1 (dirty programming) */
+			nb_parameters = count + 1; /* There is one space per parameter except for the last one, that's why we have to add +1 (dirty programming) */
 
 
 			/* This part of the algorithm will determine if it needs static or dynamic arrays */
 			/* This part of the algorithm will determine if it needs static or dynamic arrays */
 			alloc_mode = set_alloc_mode(nb_parameters);
 			alloc_mode = set_alloc_mode(nb_parameters);
@@ -820,30 +840,28 @@ int main(int argc, char **argv)
 			const char *delim = " ";
 			const char *delim = " ";
 			char *token = strtok(buffer, delim);
 			char *token = strtok(buffer, delim);
 
 
-			while (token != NULL)
+			for (i = 0 ; i < nb_parameters ; i++)
 			{
 			{
-				for (i = 0 ; i < nb_parameters ; i++)
-				{
-					struct handle *handles_cell; /* A cell of the hash table for the handles */
-					starpu_data_handle_t  handle_value = (starpu_data_handle_t) strtol(token, NULL, 16); /* Get the ith handle on the line (in the file) */
-
-					HASH_FIND(hh, handles_hash, &handle_value, sizeof(handle_value), handles_cell); /* Find if the handle_value was already registered as a key in the hash table */
+				STARPU_ASSERT(token);
+				struct handle *handles_cell; /* A cell of the hash table for the handles */
+				starpu_data_handle_t  handle_value = (starpu_data_handle_t) strtol(token, NULL, 16); /* Get the ith handle on the line (in the file) */
 
 
-					/* If it wasn't, then add it to the hash table */
-					if (handles_cell == NULL)
-					{
-						/* Hide the initial handle from the file into the handles array to find it when necessary */
-						handles_ptr[i] = handle_value;
-						reg_signal[i] = 1;
-					}
-					else
-					{
-						handles_ptr[i] = handles_cell->mem_ptr;
-						reg_signal[i] = 0;
-					}
+				HASH_FIND(hh, handles_hash, &handle_value, sizeof(handle_value), handles_cell); /* Find if the handle_value was already registered as a key in the hash table */
 
 
-					token = strtok(NULL, delim);
+				/* If it wasn't, then add it to the hash table */
+				if (handles_cell == NULL)
+				{
+					/* Hide the initial handle from the file into the handles array to find it when necessary */
+					handles_ptr[i] = handle_value;
+					reg_signal[i] = 1;
 				}
 				}
+				else
+				{
+					handles_ptr[i] = handles_cell->mem_ptr;
+					reg_signal[i] = 0;
+				}
+
+				token = strtok(NULL, delim);
 			}
 			}
 		}
 		}
 		else if (TEST("Modes"))
 		else if (TEST("Modes"))
@@ -853,7 +871,7 @@ int main(int argc, char **argv)
 			const char * delim = " ";
 			const char * delim = " ";
 			char * token = strtok(buffer, delim);
 			char * token = strtok(buffer, delim);
 
 
-			while (token != NULL)
+			while (token != NULL && mode_i < nb_parameters)
 			{
 			{
 				/* Subject to the names of starpu modes enumerator are not modified */
 				/* Subject to the names of starpu modes enumerator are not modified */
 				if (!strncmp(token, "RW", 2))
 				if (!strncmp(token, "RW", 2))
@@ -888,7 +906,7 @@ int main(int argc, char **argv)
 
 
 			_STARPU_MALLOC(sizes_set, nb_parameters * sizeof(size_t));
 			_STARPU_MALLOC(sizes_set, nb_parameters * sizeof(size_t));
 
 
-			while (token != NULL)
+			while (token != NULL && k < nb_parameters)
 			{
 			{
 				sizes_set[k] = strtol(token, NULL, 10);
 				sizes_set[k] = strtol(token, NULL, 10);
 				token = strtok(NULL, delim);
 				token = strtok(NULL, delim);
@@ -969,9 +987,9 @@ eof:
         }
         }
 
 
 	starpu_shutdown();
 	starpu_shutdown();
-
 	return 0;
 	return 0;
 
 
 enodev:
 enodev:
+	starpu_shutdown();
 	return 77;
 	return 77;
 }
 }