
Merge branch 'fpga' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu into fpga

Samuel Thibault 5 years ago
parent
commit
584a70d891
100 changed files with 807 additions and 241 deletions
  1. ChangeLog (+11 -0)
  2. configure.ac (+15 -1)
  3. doc/doxygen/Makefile.am (+1 -1)
  4. doc/doxygen/chapters/210_check_list_performance.doxy (+2 -0)
  5. doc/doxygen/chapters/301_tasks.doxy (+1 -1)
  6. doc/doxygen/chapters/320_scheduling.doxy (+3 -1)
  7. doc/doxygen/chapters/350_scheduling_policy_definition.doxy (+3 -0)
  8. doc/doxygen/chapters/510_configure_options.doxy (+9 -0)
  9. doc/doxygen/chapters/code/vector_scal_cuda.c (+2 -0)
  10. doc/doxygen_dev/Makefile.am (+1 -1)
  11. doc/tutorial/vector_scal_cuda.cu (+2 -0)
  12. examples/basic_examples/block_cuda.cu (+2 -0)
  13. examples/basic_examples/multiformat_conversion_codelets_cuda.cu (+2 -0)
  14. examples/basic_examples/multiformat_cuda.cu (+2 -0)
  15. examples/basic_examples/variable_kernels.cu (+2 -0)
  16. examples/basic_examples/vector_scal_cuda.cu (+2 -0)
  17. examples/filters/custom_mf/conversion.cu (+2 -0)
  18. examples/filters/custom_mf/cuda.cu (+2 -0)
  19. examples/filters/fblock_cuda.cu (+2 -0)
  20. examples/filters/fmultiple_cuda.cu (+4 -0)
  21. examples/incrementer/incrementer_kernels.cu (+2 -0)
  22. examples/interface/complex_kernels.cu (+2 -0)
  23. examples/mult/double.h (+1 -0)
  24. examples/mult/simple.h (+1 -0)
  25. examples/mult/xgemm.c (+1 -1)
  26. examples/pi/SobolQRNG/sobol_gpu.cu (+2 -0)
  27. examples/pi/pi_kernel.cu (+4 -0)
  28. examples/pi/pi_redux_kernel.cu (+4 -0)
  29. examples/reductions/dot_product_kernels.cu (+2 -0)
  30. examples/sched_ctx/axpy_partition_gpu.cu (+2 -0)
  31. examples/spmv/spmv_cuda.cu (+2 -0)
  32. examples/stencil/life_cuda.cu (+2 -0)
  33. examples/stencil/shadow.cu (+2 -0)
  34. include/starpu_config.h.in (+9 -0)
  35. include/starpu_fxt.h (+1 -2)
  36. include/starpu_helper.h (+17 -0)
  37. include/starpu_scheduler.h (+5 -0)
  38. include/starpu_stdlib.h (+23 -0)
  39. include/starpu_task.h (+1 -1)
  40. julia/examples/mult/gpu_mult.cu (+2 -0)
  41. julia/examples/old_examples/gpu_mult.cu (+2 -0)
  42. julia/examples/old_examples/mandelbrot/gpu_mandelbrot.cu (+2 -0)
  43. julia/examples/old_examples/mandelbrot/gpu_mandelbrot_between.cu (+2 -0)
  44. julia/examples/old_examples/mult/gpu_mult.cu (+2 -0)
  45. julia/examples/old_examples/nbody/gpu_nbody.cu (+4 -0)
  46. julia/examples/old_examples/nbody/gpu_nbody_between.cu (+2 -0)
  47. julia/src/compiler/expressions.jl (+4 -0)
  48. mpi/examples/matrix_decomposition/mpi_cholesky.c (+1 -1)
  49. mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c (+223 -57)
  50. mpi/examples/matrix_decomposition/mpi_decomposition_params.c (+12 -1)
  51. mpi/examples/matrix_decomposition/mpi_decomposition_params.h (+8 -0)
  52. mpi/include/starpu_mpi.h (+6 -0)
  53. mpi/src/starpu_mpi_datatype.c (+2 -3)
  54. mpi/src/starpu_mpi_init.c (+5 -0)
  55. mpi/tests/ring_kernel.cu (+2 -0)
  56. src/Makefile.am (+11 -4)
  57. src/core/jobs.c (+3 -0)
  58. src/core/perfmodel/perfmodel_bus.c (+5 -4)
  59. src/core/perfmodel/perfmodel_print.c (+1 -0)
  60. src/core/sched_ctx.c (+1 -0)
  61. src/core/sched_policy.c (+20 -20)
  62. src/core/simgrid.c (+24 -2)
  63. src/core/simgrid.h (+1 -1)
  64. src/core/task.c (+1 -0)
  65. src/core/topology.c (+27 -0)
  66. src/core/workers.c (+6 -0)
  67. src/core/workers.h (+6 -0)
  68. src/datawizard/coherency.c (+58 -17)
  69. src/datawizard/coherency.h (+2 -2)
  70. src/datawizard/copy_driver.c (+2 -2)
  71. src/datawizard/copy_driver.h (+10 -1)
  72. src/datawizard/data_request.c (+47 -29)
  73. src/datawizard/data_request.h (+7 -8)
  74. src/datawizard/filters.c (+2 -1)
  75. src/datawizard/interfaces/bcsr_interface.c (+3 -0)
  76. src/datawizard/interfaces/block_interface.c (+3 -0)
  77. src/datawizard/interfaces/coo_interface.c (+3 -0)
  78. src/datawizard/interfaces/csr_interface.c (+3 -0)
  79. src/datawizard/interfaces/data_interface.c (+1 -1)
  80. src/datawizard/interfaces/matrix_interface.c (+3 -0)
  81. src/datawizard/interfaces/multiformat_interface.c (+3 -0)
  82. src/datawizard/interfaces/tensor_interface.c (+3 -0)
  83. src/datawizard/interfaces/variable_interface.c (+3 -0)
  84. src/datawizard/interfaces/vector_interface.c (+3 -0)
  85. src/datawizard/interfaces/void_interface.c (+3 -0)
  86. src/datawizard/memalloc.c (+23 -18)
  87. src/datawizard/memalloc.h (+9 -1)
  88. src/datawizard/memory_manager.c (+1 -0)
  89. src/datawizard/reduction.c (+1 -0)
  90. src/datawizard/user_interactions.c (+7 -6)
  91. src/datawizard/write_back.c (+2 -1)
  92. src/debug/latency.c (+2 -2)
  93. src/debug/traces/starpu_fxt.c (+10 -8)
  94. src/debug/traces/starpu_fxt.h (+0 -2)
  95. src/debug/traces/starpu_fxt_mpi.c (+19 -19)
  96. src/debug/traces/starpu_paje.c (+2 -0)
  97. src/drivers/cpu/driver_cpu.c (+4 -1)
  98. src/drivers/cuda/driver_cuda.c (+29 -20)
  99. src/drivers/disk/driver_disk.c (+1 -0)
  100. src/drivers/driver_common/driver_common.c (+0 -0)

+ 11 - 0
ChangeLog

@@ -31,9 +31,20 @@ New features:
     files. This file can be parsed by the new script
     starpu_fxt_number_events_to_names.py to convert event keys to event names.
   * New STARPU_PER_WORKER perfmodel.
+  * Add energy accounting in simgrid mode: starpu_energy_use() and
+    starpu_energy_used().
+  * New function starpu_mpi_get_thread_cpuid() to know where the MPI thread
+    is bound.
+  * New function starpu_get_pu_os_index() to convert the logical index of a PU
+    to its OS index.
+  * New function starpu_get_hwloc_topology() to get the hwloc topology used by
+    StarPU.
+  * Add a task prefetch level, to improve data retention in accelerators and
+    allow more aggressive prefetching.
 
 Small changes:
   * Use the S4U interface of Simgrid instead of xbt and MSG.
+  * Add a synthetic energy efficiency testcase.
 
 StarPU 1.3.4 (git revision xxx)
 ==============================================

+ 15 - 1
configure.ac

@@ -1459,6 +1459,9 @@ if test x$enable_cuda = xyes; then
 	    ]
 	)
 	if test x$have_valid_nvml = xyes ; then
+		AC_CHECK_DECLS([nvmlDeviceGetTotalEnergyConsumption], [
+			AC_CHECK_FUNCS([nvmlDeviceGetTotalEnergyConsumption])
+			], [], [[#include <nvml.h>]])
 		AC_DEFINE([HAVE_LIBNVIDIA_ML], [1], [Define to 1 if you have the nvidia-ml library])
 		STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lnvidia-ml"
 	fi
@@ -2321,6 +2324,14 @@ AC_MSG_RESULT($nmaxbuffers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
 		[how many buffers can be manipulated per task])
 
+AC_MSG_CHECKING(how many MPI nodes fxt files can be manipulated when generating traces)
+AC_ARG_ENABLE(fxt-max-files, [AS_HELP_STRING([--enable-fxt-max-files=<nbuffers>],
+			[maximum number of mpi nodes for traces])],
+			nmaxfxtfiles=$enableval, nmaxfxtfiles=64)
+AC_MSG_RESULT($nmaxfxtfiles)
+AC_DEFINE_UNQUOTED(STARPU_FXT_MAX_FILES, [$nmaxfxtfiles],
+		[how many MPI nodes fxt files can be manipulated when generating traces])
+
 AC_MSG_CHECKING(maximum number of memory nodes to use per MPI rank)
 AC_ARG_ENABLE(maxnodes, [AS_HELP_STRING([--enable-maxnodes=<nnodes>],
 			[maximum number of memory nodes per MPI rank])],
@@ -2645,7 +2656,7 @@ if test "x$enable_build_fortran_requested" = "xyes" ; then
 	fi
 	if test "x$enable_build_fortran" = "xyes" ; then
 		AC_DEFINE(STARPU_HAVE_FC, [1], [Define this if a Fortran compiler is available])
-		if test x$build_mpi_lib = xyes -o x$build_mpi_master_slave = xyes ; then
+		if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes -o x$build_mpi_master_slave = xyes ; then
 			#Check MPIFORT
 			if test x$enable_simgrid = xyes ; then
 				DEFAULT_MPIFORT=smpifort
@@ -3620,6 +3631,9 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh tests/microbenchs/
+  mkdir -p tests/energy
+  test -e tests/energy/static.sh || ln -sf $ac_abs_top_srcdir/tests/energy/static.sh tests/energy/
+  test -e tests/energy/dynamic.sh || ln -sf $ac_abs_top_srcdir/tests/energy/dynamic.sh tests/energy/
   mkdir -p tests/datawizard
   test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
   mkdir -p tests/overlap

+ 1 - 1
doc/doxygen/Makefile.am

@@ -307,5 +307,5 @@ EXTRA_DIST += doxygen.cfg refman.tex \
 # Rule to update documentation on web server. Should only be used locally.
 PUBLISHHOST	?= gforge
 update-web: $(DOX_PDF)
-	scp -pr starpu.pdf html $(PUBLISHHOST):/home/groups/starpu/htdocs/doc
+	scp -pr starpu.pdf html $(PUBLISHHOST):/home/groups/starpu/htdocs/files/doc
 

+ 2 - 0
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -91,6 +91,8 @@ operations to avoid this issue. For instance:
 
 \code{.c}
 func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
+cudaError_t status = cudaGetLastError();
+if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 cudaStreamSynchronize(starpu_cuda_get_local_stream());
 \endcode
 

+ 1 - 1
doc/doxygen/chapters/301_tasks.doxy

@@ -118,7 +118,7 @@ to delay the termination of a task until the termination of other tasks.
 
 \section SettingManyDataHandlesForATask Setting Many Data Handles For a Task
 
-The maximum number of data a task can manage is fixed by the environment variable
+The maximum number of data a task can manage is fixed by the macro
 \ref STARPU_NMAXBUFS which has a default value which can be changed
 through the \c configure option \ref enable-maxbuffers "--enable-maxbuffers".
 

+ 3 - 1
doc/doxygen/chapters/320_scheduling.doxy

@@ -205,7 +205,9 @@ simply tend to run all computations on the most energy-conservative processing
 unit. To account for the consumption of the whole machine (including idle
 processing units), the idle power of the machine should be given by setting
 <c>export STARPU_IDLE_POWER=200</c> (\ref STARPU_IDLE_POWER) for 200W, for instance. This value can often
-be obtained from the machine power supplier.
+be obtained from the machine power supplier, e.g. by running
+
+<c>ipmitool -I lanplus -H mymachine-ipmi -U myuser -P mypasswd sdr type Current</c>
 
 The energy actually consumed by the total execution can be displayed by setting
 <c>export STARPU_PROFILING=1 STARPU_WORKER_STATS=1</c> (\ref STARPU_PROFILING and \ref STARPU_WORKER_STATS).

+ 3 - 0
doc/doxygen/chapters/350_scheduling_policy_definition.doxy

@@ -60,6 +60,9 @@ queue the transfers on the idle prefetch queue, which is only processed when
 there are no non-idle prefetch to process.
 starpu_get_prefetch_flag() is a convenient helper for checking the value of the 
 \ref STARPU_PREFETCH environment variable.
+When a scheduler does such prefetching, it should set the <c>prefetches</c>
+field of its <c>starpu_sched_policy</c> structure to 1, to prevent the core
+from triggering its own prefetching.
 
 Usual functions can be used on tasks, for instance one can use the following to
 get the data size for a task.

+ 9 - 0
doc/doxygen/chapters/510_configure_options.doxy

@@ -527,6 +527,15 @@ Define the maximum number of buffers that tasks will be able to take
 as parameters, then available as the macro ::STARPU_NMAXBUFS.
 </dd>
 
+<dt>--enable-fxt-max-files=<c>count</c></dt>
+<dd>
+\anchor enable-fxt-max-files
+\addindex __configure__--enable-fxt-max-files
+Use at most <c>count</c> FxT trace files (one per MPI node) when generating traces.  This value is then available as
+the macro ::STARPU_FXT_MAX_FILES and is used by the FxT tools when processing multi-node traces.
+The default value is 64.
+</dd>
+
 <dt>--enable-allocation-cache</dt>
 <dd>
 \anchor enable-allocation-cache

+ 2 - 0
doc/doxygen/chapters/code/vector_scal_cuda.c

@@ -35,6 +35,8 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(n, val, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 1 - 1
doc/doxygen_dev/Makefile.am

@@ -248,5 +248,5 @@ EXTRA_DIST += doxygen.cfg refman.tex \
 # Rule to update documentation on web server. Should only be used locally.
 PUBLISHHOST	?= gforge
 update-web: $(DOX_PDF)
-	scp -pr starpu_dev.pdf html_dev $(PUBLISHHOST):/home/groups/starpu/htdocs/doc
+	scp -pr starpu_dev.pdf html_dev $(PUBLISHHOST):/home/groups/starpu/htdocs/files/doc
 

+ 2 - 0
doc/tutorial/vector_scal_cuda.cu

@@ -35,6 +35,8 @@ extern "C" void vector_scal_cuda(void *buffers[], void *_args)
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
examples/basic_examples/block_cuda.cu

@@ -40,5 +40,7 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
         float *multiplier = (float *)_args;
 
         cuda_block<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *multiplier);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
examples/basic_examples/multiformat_conversion_codelets_cuda.cu

@@ -44,4 +44,6 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         cpu_to_cuda_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(src, dst, n);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/basic_examples/multiformat_cuda.cu

@@ -39,6 +39,8 @@ extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         multiformat_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(soa, n);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
examples/basic_examples/variable_kernels.cu

@@ -27,5 +27,7 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 	float *val = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	cuda_variable<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
examples/basic_examples/vector_scal_cuda.cu

@@ -41,4 +41,6 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(n, val, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/filters/custom_mf/conversion.cu

@@ -45,4 +45,6 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         custom_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(aop, n, x, y);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/filters/custom_mf/cuda.cu

@@ -39,4 +39,6 @@ extern "C" void custom_scal_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         scal_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(n, x, y);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/filters/fblock_cuda.cu

@@ -43,4 +43,6 @@ extern "C" void cuda_func(void *buffers[], void *_args)
 
         /* TODO: use more blocks and threads in blocks */
         fblock_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 4 - 0
examples/filters/fmultiple_cuda.cu

@@ -44,6 +44,8 @@ extern "C" void fmultiple_check_scale_cuda(void *buffers[], void *cl_arg)
 
         /* TODO: use more vals and threads in vals */
 	_fmultiple_check_scale_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 
 static __global__ void _fmultiple_check_cuda(int *val, int nx, int ny, unsigned ld, int start, int factor)
@@ -71,4 +73,6 @@ extern "C" void fmultiple_check_cuda(void *buffers[], void *cl_arg)
 
         /* TODO: use more vals and threads in vals */
 	_fmultiple_check_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/incrementer/incrementer_kernels.cu

@@ -32,4 +32,6 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/interface/complex_kernels.cu

@@ -44,4 +44,6 @@ extern "C" void copy_complex_codelet_cuda(void *descr[], void *_args)
 	unsigned nblocks = (nx + threads_per_block-1) / threads_per_block;
 
         complex_copy_cuda<<<nblocks, threads_per_block, 0, starpu_cuda_get_local_stream()>>>(o_real, o_imaginary, i_real, i_imaginary, nx);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 1 - 0
examples/mult/double.h

@@ -15,6 +15,7 @@
  */
 
 #define TYPE	double
+#define EPSILON	0.000000000001
 
 #define CUBLAS_GEMM cublasDgemm
 #define CPU_GEMM	STARPU_DGEMM

+ 1 - 0
examples/mult/simple.h

@@ -15,6 +15,7 @@
  */
 
 #define TYPE	float
+#define EPSILON	0.000001
 
 #define CUBLAS_GEMM cublasSgemm
 #define CPU_GEMM	STARPU_SGEMM

+ 1 - 1
examples/mult/xgemm.c

@@ -75,7 +75,7 @@ static int check_output(void)
 	TYPE err;
 	err = CPU_ASUM(xdim*ydim, C, 1);
 
-	if (err < xdim*ydim*0.001)
+	if (err < EPSILON*xdim*ydim*zdim)
 	{
 		FPRINTF(stderr, "Results are OK\n");
 		return 0;

+ 2 - 0
examples/pi/SobolQRNG/sobol_gpu.cu

@@ -165,4 +165,6 @@ extern "C" void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_direct
 
     // Execute GPU kernel
     sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 4 - 0
examples/pi/pi_kernel.cu

@@ -137,12 +137,16 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 	/* each entry of per_block_cnt contains the number of successful shots
 	 * in the corresponding block. */
 	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 
 	/* compute the total number of successful shots by adding the elements
 	 * of the per_block_cnt array */
 	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	if (cures)
 		STARPU_CUDA_REPORT_ERROR(cures);

+ 4 - 0
examples/pi/pi_redux_kernel.cu

@@ -115,12 +115,16 @@ extern "C" void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned lo
 	/* each entry of per_block_cnt contains the number of successful shots
 	 * in the corresponding block. */
 	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, y, n, per_block_cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 
 	/* compute the total number of successful shots by adding the elements
 	 * of the per_block_cnt array */
 	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, shot_cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	if (cures)
 		STARPU_CUDA_REPORT_ERROR(cures);

+ 2 - 0
examples/reductions/dot_product_kernels.cu

@@ -33,4 +33,6 @@ extern "C" void redux_cuda_func(void *descr[], void *_args)
 	DOT_TYPE *dotb = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
 	cuda_redux<<<1,1, 0, starpu_cuda_get_local_stream()>>>(dota, dotb);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/sched_ctx/axpy_partition_gpu.cu

@@ -73,4 +73,6 @@ extern "C" void cuda_axpy(void *descr[], void *_args)
 	__P_HOSTSETUP(saxpy_partitioned,dim3(dimensions,1,1),dimensions,0,SM_mapping_start,SM_allocation,stream);
 
   	saxpy_partitioned<<<width,dimensions,0,stream>>>(__P_HKARGS,n,a,x,y);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/spmv/spmv_cuda.cu

@@ -97,6 +97,8 @@ extern "C" void spmv_kernel_cuda(void *descr[], void *args)
 
 	spmv_kernel_3<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>
 		(nnz, nrow, nzval, colind, rowptr, firstentry, vecin, nx_in, vecout, nx_out);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 
 

+ 2 - 0
examples/stencil/life_cuda.cu

@@ -73,4 +73,6 @@ extern "C" void cuda_life_update_host(int bz, const TYPE *old, TYPE *newp, int n
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 #endif
 	cuda_life_update <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, old, newp, nx, ny, nz, ldy, ldz, iter);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/stencil/shadow.cu

@@ -53,4 +53,6 @@ extern "C" void cuda_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 #endif
 	cuda_shadow <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, ptr, nx, ny, nz, ldy, ldz, i);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 9 - 0
include/starpu_config.h.in

@@ -187,6 +187,15 @@
 #undef STARPU_NMAXBUFS
 
 /**
+   Define the maximum number of FxT files (one per MPI node) that can be
+   read when generating traces. The default value is 64; it can be changed
+   with the configure option \ref enable-fxt-max-files
+   "--enable-fxt-max-files".
+   @ingroup API_MPI_Support
+*/
+#undef STARPU_FXT_MAX_FILES
+
+/**
    Define the maximum number of CPU workers managed by StarPU. The
    default value can be modified at configure by using the option \ref
    enable-maxcpus "--enable-maxcpus".

+ 1 - 2
include/starpu_fxt.h

@@ -20,6 +20,7 @@
 #ifndef __STARPU_FXT_H__
 #define __STARPU_FXT_H__
 
+#include <starpu_config.h>
 #include <starpu_perfmodel.h>
 
 #ifdef __cplusplus
@@ -32,8 +33,6 @@ extern "C"
    @{
 */
 
-#define STARPU_FXT_MAX_FILES	64
-
 struct starpu_fxt_codelet_event
 {
 	char symbol[256];

+ 17 - 0
include/starpu_helper.h

@@ -20,6 +20,10 @@
 #include <stdio.h>
 #include <starpu.h>
 
+#ifdef STARPU_HAVE_HWLOC
+#include <hwloc.h>
+#endif
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -190,6 +194,19 @@ int starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_ha
 */
 void starpu_display_bindings(void);
 
+/**
+   If \c hwloc is used, convert the given \p logical_index of a PU to the OS
+   index of this PU. If \c hwloc is not used, return \p logical_index.
+*/
+int starpu_get_pu_os_index(unsigned logical_index);
+
+#ifdef STARPU_HAVE_HWLOC
+/**
+   Get the hwloc topology used by StarPU. One can use this pointer to get
+   information about topology, but not to change settings related to topology.
+*/
+hwloc_topology_t starpu_get_hwloc_topology(void);
+#endif
 /** @} */
 
 #ifdef __cplusplus
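
A small illustration of the two new helpers declared above (the program below is a sketch, not part of the change; the hwloc calls are plain hwloc API, and starpu_init()/starpu_shutdown() are the usual entry points):

#include <stdio.h>
#include <starpu.h>
#ifdef STARPU_HAVE_HWLOC
#include <hwloc.h>
#endif

int main(void)
{
	if (starpu_init(NULL) != 0)
		return 1;

#ifdef STARPU_HAVE_HWLOC
	/* Read-only access to the topology StarPU already loaded */
	hwloc_topology_t topo = starpu_get_hwloc_topology();
	int i, npus = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PU);

	for (i = 0; i < npus; i++)
		printf("logical PU %d -> OS index %d\n", i, starpu_get_pu_os_index(i));
#else
	/* Without hwloc, the logical index is returned unchanged */
	printf("logical PU 0 -> OS index %d\n", starpu_get_pu_os_index(0));
#endif

	starpu_shutdown();
	return 0;
}
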

+ 5 - 0
include/starpu_scheduler.h

@@ -186,6 +186,11 @@ struct starpu_sched_policy
 	*/
 	void (*remove_workers)(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
 
+	/** Whether this scheduling policy does data prefetching, and thus the
+	    core should not try to do it opportunistically.
+	*/
+	int prefetches;
+
 	/**
 	   Optional field. Name of the policy.
 	*/
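
To illustrate the new prefetches field, here is a minimal sketch of a policy that prefetches by itself and therefore sets it to 1. The policy name, the push hook and the worker choice are made up; starpu_worker_get_memory_node() and starpu_push_local_task() are assumed to be the usual StarPU helpers, while starpu_get_prefetch_flag() and starpu_prefetch_task_input_on_node() appear elsewhere in this change:

#include <starpu.h>
#include <starpu_scheduler.h>

static int my_sched_push_task(struct starpu_task *task)
{
	int workerid = 0; /* toy decision: always pick worker 0 */
	unsigned node = starpu_worker_get_memory_node(workerid);

	/* The policy issues the prefetch itself... */
	if (starpu_get_prefetch_flag())
		starpu_prefetch_task_input_on_node(task, node);

	/* ...then queues the task on the chosen worker */
	return starpu_push_local_task(workerid, task, 0);
}

struct starpu_sched_policy my_sched_policy =
{
	.push_task = my_sched_push_task,
	.prefetches = 1, /* tell the core this policy prefetches on its own */
	.policy_name = "my_sched",
	.policy_description = "sketch of a policy doing its own prefetching",
};
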

+ 23 - 0
include/starpu_stdlib.h

@@ -239,9 +239,32 @@ void starpu_memory_deallocate(unsigned node, size_t size);
 */
 void starpu_memory_wait_available(unsigned node, size_t size);
 
+/**
+   Sleep for the given \p nb_sec seconds.
+   In simgrid mode, this only sleeps within virtual time.
+  */
 void starpu_sleep(float nb_sec);
+
+/**
+   Sleep for the given \p nb_micro_sec micro-seconds.
+   In simgrid mode, this only sleeps within virtual time.
+  */
 void starpu_usleep(float nb_micro_sec);
 
+/**
+   Account for \p joules J being used.
+   This is supported in simgrid mode, to record how much energy was used; it
+   will show up in further calls to starpu_energy_used().
+  */
+void starpu_energy_use(float joules);
+
+/**
+   Return the amount of energy that has been used, in J.
+   This accounts for the amounts passed to starpu_energy_use(), but also the
+   static energy use set by the \ref STARPU_IDLE_POWER environment variable.
+  */
+double starpu_energy_used(void);
+
 /** @} */
 
 #ifdef __cplusplus
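
A quick sketch of how the two new energy calls above can be used (toy program; it assumes a simgrid build, which is where the accounting is implemented in this change):

#include <stdio.h>
#include <starpu.h>

int main(void)
{
	if (starpu_init(NULL) != 0)
		return 1;

	starpu_energy_use(2.5f);   /* record 2.5 J of dynamic energy */
	starpu_sleep(1.0f);        /* one (virtual) second of elapsed time */

	/* total = dynamic energy + STARPU_IDLE_POWER * elapsed time */
	printf("energy used: %g J\n", starpu_energy_used());

	starpu_shutdown();
	return 0;
}
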

+ 1 - 1
include/starpu_task.h

@@ -563,7 +563,7 @@ struct starpu_codelet
 
 	/**
 	   Optional pointer to the task energy consumption performance
-	   model associated to this codelet. This optional field is
+	   model associated to this codelet (in J). This optional field is
 	   ignored when set to <c>NULL</c> or when its field
 	   starpu_perfmodel::symbol is not set. In the case of
 	   parallel codelets, this has to account for all processing
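
For reference, this is how a codelet can attach such an energy model next to its timing model. The codelet below is hypothetical, and the sketch assumes the field being documented here is the codelet's energy_model member:

#include <starpu.h>

extern void my_cpu_func(void *buffers[], void *cl_arg); /* assumed defined elsewhere */

static struct starpu_perfmodel my_time_model =
{
	.type = STARPU_HISTORY_BASED,
	.symbol = "my_codelet_time",
};

static struct starpu_perfmodel my_energy_model =
{
	.type = STARPU_HISTORY_BASED,
	.symbol = "my_codelet_energy", /* expected consumption, in J */
};

static struct starpu_codelet my_cl =
{
	.cpu_funcs = { my_cpu_func },
	.nbuffers = 1,
	.modes = { STARPU_RW },
	.model = &my_time_model,
	.energy_model = &my_energy_model,
};
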

+ 2 - 0
julia/examples/mult/gpu_mult.cu

@@ -79,6 +79,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 	gpuMultKernel
 		<<< nblocks, THREADS_PER_BLOCK, 0, NULL /*starpu_cuda_get_local_stream()*/
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 

+ 2 - 0
julia/examples/old_examples/gpu_mult.cu

@@ -78,6 +78,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 	gpuMultKernel
 		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 

+ 2 - 0
julia/examples/old_examples/mandelbrot/gpu_mandelbrot.cu

@@ -106,6 +106,8 @@ extern "C" void gpu_mandelbrot(void *descr[], void *args)
   nblocks = (nxP * nyP + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
 
   gpuMandelbrotKernel <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream() >>> (nxP, nyP, ldP, d_subP, *params);
+  cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
julia/examples/old_examples/mandelbrot/gpu_mandelbrot_between.cu

@@ -123,6 +123,8 @@ extern "C" void CUDA_mandelbrot(void** buffers_uwrYFDVe, void* cl_arg_uwrYFDVe)
              ptr_qoUGBRtY, local_height, conv_limit, ptr_A5zD9sJZ, 
              ld_A5zD9sJZ);
     ;
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
     cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 

+ 2 - 0
julia/examples/old_examples/mult/gpu_mult.cu

@@ -78,6 +78,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 	gpuMultKernel
 		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 

+ 4 - 0
julia/examples/old_examples/nbody/gpu_nbody.cu

@@ -94,6 +94,8 @@ extern "C" void gpu_nbody(void * descr[], void * args)
   gpuNbodyKernel
     <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
     >>> (d_P,  d_subA, d_M, nxP, nxA, nxM, ldP, ldA, *params);
+  cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
@@ -156,6 +158,8 @@ extern "C" void gpu_nbody2(void * descr[], void *args)
   gpuNbody2Kernel
     <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
     >>> (d_subP, d_subV, d_subA, nxP, nxV, nxA, ldP, ldV, ldA, *params);
+  cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
julia/examples/old_examples/nbody/gpu_nbody_between.cu

@@ -161,6 +161,8 @@ extern "C" void CUDA_nbody_updt(void** buffers_gj6UYWT4, void* cl_arg_gj6UYWT4)
              ld_jJ5f8wMA, ptr_piPvdbTs, ld_piPvdbTs, ptr_JBaPgPiT, 
              ptr_0STm2S4k, ld_0STm2S4k);
     ;
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
     cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 

+ 4 - 0
julia/src/compiler/expressions.jl

@@ -335,6 +335,10 @@ function print(io :: IO, expr :: StarpuExprCudaCall ; indent = 0,restrict=false)
 
     print(io, ");")
     print_newline(io, indent)
+    print(io, "cudaError_t status = cudaGetLastError();")
+    print_newline(io, indent)
+    print(io, "if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);")
+    print_newline(io, indent)
 
 end
 

+ 1 - 1
mpi/examples/matrix_decomposition/mpi_cholesky.c

@@ -58,7 +58,7 @@ int main(int argc, char **argv)
 #ifndef STARPU_SIMGRID
 	matrix_display(bmat, rank);
 
-	if (check)
+	if (check && rank == 0)
 		dw_cholesky_check_computation(bmat, rank, nodes, &correctness, &flops, 0.001);
 #endif
 

+ 223 - 57
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -68,6 +68,212 @@ static struct starpu_codelet cl22 =
 	.color = 0x00ff00,
 };
 
+static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int nodes)
+{
+	unsigned k, m, n;
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
+	for (k = 0; k < nblocks; k++)
+	{
+		starpu_iteration_push(k);
+
+		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+				       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+				       STARPU_RW, data_handles[k][k],
+				       0);
+
+		for (m = k+1; m<nblocks; m++)
+		{
+			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+					       STARPU_R, data_handles[k][k],
+					       STARPU_RW, data_handles[m][k],
+					       0);
+
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
+			if (my_distrib(k, k, nodes) == rank)
+				starpu_data_wont_use(data_handles[k][k]);
+
+			for (n = k+1; n<nblocks; n++)
+			{
+				if (n <= m)
+				{
+					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+							       STARPU_R, data_handles[n][k],
+							       STARPU_R, data_handles[m][k],
+							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+							       0);
+				}
+			}
+
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
+			if (my_distrib(m, k, nodes) == rank)
+				starpu_data_wont_use(data_handles[m][k]);
+		}
+		starpu_iteration_pop();
+	}
+}
+
+/* TODO: generated from compiler polyhedral analysis of classical algorithm */
+static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, int nodes)
+{
+	unsigned k, m, n;
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
+	/* Column */
+	for (n = 0; n<nblocks; n++)
+	{
+		starpu_iteration_push(n);
+
+		/* Row */
+		for (m = n; m<nblocks; m++)
+		{
+			for (k = 0; k < n; k++)
+			{
+				/* Accumulate updates from TRSMs */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       0);
+			}
+			k = n;
+			if (m > n)
+			{
+				/* non-diagonal block, solve */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[k][k],
+						       STARPU_RW, data_handles[m][k],
+						       0);
+			}
+			else
+			{
+				/* diagonal block, factorize */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+						       STARPU_RW, data_handles[k][k],
+						       0);
+			}
+		}
+
+		starpu_iteration_pop();
+	}
+
+	/* Submit flushes, StarPU will fit them according to the progress */
+	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
+	for (m = 0; m < nblocks; m++)
+		for (n = 0; n < nblocks ; n++)
+			starpu_data_wont_use(data_handles[m][n]);
+}
+
+/* TODO: generated from compiler polyhedral analysis of classical algorithm */
+static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int rank, int nodes)
+{
+	unsigned a, b, c;
+	unsigned k, m, n;
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
+	/* double-antidiagonal number:
+	 * - a=0 contains (0,0) plus (1,0)
+	 * - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
+	 * - etc.
+	 */
+	for (a = 0; a < nblocks; a++)
+	{
+		starpu_iteration_push(a);
+
+		unsigned bfirst;
+		if (2*a < nblocks)
+			bfirst = 0;
+		else
+			bfirst = 2*a - (nblocks-1);
+
+		/* column within first antidiagonal for a */
+		for (b = bfirst; b <= a; b++)
+		{
+			/* column */
+			n = b;
+			/* row */
+			m = 2*a-b;
+
+			/* Accumulate updates from TRSMs */
+			for (c = 0; c < n; c++)
+			{
+				k = c;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       0);
+			}
+
+			if (b < a)
+			{
+				/* non-diagonal block, solve */
+				k = n;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[k][k],
+						       STARPU_RW, data_handles[m][k],
+						       0);
+			}
+			else
+			{
+				/* diagonal block, factorize */
+				k = a;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+						       STARPU_RW, data_handles[k][k],
+						       0);
+			}
+		}
+
+		/* column within second antidiagonal for a */
+		for (b = bfirst; b <= a; b++)
+		{
+			/* column */
+			n = b;
+			/* row */
+			m = 2*a-b + 1;
+
+			if (m >= nblocks)
+				/* Skip first item when even number of tiles */
+				continue;
+
+			/* Accumulate updates from TRSMs */
+			for (c = 0; c < n; c++)
+			{
+				k = c;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       0);
+			}
+			/* non-diagonal block, solve */
+			k = n;
+			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+					       STARPU_R, data_handles[k][k],
+					       STARPU_RW, data_handles[m][k],
+					       0);
+		}
+
+		starpu_iteration_pop();
+	}
+
+	/* Submit flushes, StarPU will fit them according to the progress */
+	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
+	for (m = 0; m < nblocks; m++)
+		for (n = 0; n < nblocks ; n++)
+			starpu_data_wont_use(data_handles[m][n]);
+}
+
 /*
  *	code to bootstrap the factorization
  *	and construct the DAG
@@ -79,8 +285,6 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	starpu_data_handle_t **data_handles;
 	unsigned k, m, n;
 
-	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
-
 	/* create all the DAG nodes */
 
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
@@ -91,7 +295,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 		for(n = 0; n < nblocks ; n++)
 		{
 			int mpi_rank = my_distrib(m, n, nodes);
-			if (mpi_rank == rank)
+			if (mpi_rank == rank || (check && rank == 0))
 			{
 				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, n, m);
 				starpu_matrix_data_register(&data_handles[m][n], STARPU_MAIN_RAM, (uintptr_t)matA[m][n],
@@ -119,50 +323,16 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	start = starpu_timing_now();
 
-	for (k = 0; k < nblocks; k++)
+	switch (submission)
 	{
-		starpu_iteration_push(k);
-
-		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
-				       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
-				       STARPU_RW, data_handles[k][k],
-				       0);
-
-		for (m = k+1; m<nblocks; m++)
-		{
-			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
-					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-					       STARPU_R, data_handles[k][k],
-					       STARPU_RW, data_handles[m][k],
-					       0);
-
-			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
-			if (my_distrib(k, k, nodes) == rank)
-				starpu_data_wont_use(data_handles[k][k]);
-
-			for (n = k+1; n<nblocks; n++)
-			{
-				if (n <= m)
-				{
-					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
-							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-							       STARPU_R, data_handles[n][k],
-							       STARPU_R, data_handles[m][k],
-							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
-							       0);
-				}
-			}
-
-			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
-			if (my_distrib(m, k, nodes) == rank)
-				starpu_data_wont_use(data_handles[m][k]);
-		}
-		starpu_iteration_pop();
+		case TRIANGLES:		run_cholesky(data_handles, rank, nodes); break;
+		case COLUMNS:		run_cholesky_column(data_handles, rank, nodes); break;
+		case ANTIDIAGONALS:	run_cholesky_antidiagonal(data_handles, rank, nodes); break;
+		default: STARPU_ABORT();
 	}
 
 	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
 	starpu_mpi_barrier(MPI_COMM_WORLD);
-
 	end = starpu_timing_now();
 
 	for (m = 0; m < nblocks; m++)
@@ -170,7 +340,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 		for(n = 0; n < nblocks ; n++)
 		{
 			/* Get back data on node 0 for the check */
-			if (check)
+			if (check && data_handles[m][n])
 				starpu_mpi_get_data_on_node(MPI_COMM_WORLD, data_handles[m][n], 0);
 
 			if (data_handles[m][n])
@@ -248,24 +418,20 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 	{
 		for (m = 0; m < nblocks; m++)
 		{
-			int mpi_rank = my_distrib(m, n, nodes);
-			if (mpi_rank == rank)
+			for (nn = BLOCKSIZE*n ; nn < BLOCKSIZE*(n+1); nn++)
 			{
-				for (nn = (size/nblocks)*n ; nn < (size/nblocks)*n+(size/nblocks); nn++)
+				for (mm = BLOCKSIZE*m ; mm < BLOCKSIZE*(m+1); mm++)
 				{
-					for (mm = (size/nblocks)*m ; mm < (size/nblocks)*m+(size/nblocks); mm++)
+					if (nn <= mm)
 					{
-						if (nn <= mm)
+						float orig = (1.0f/(1.0f+nn+mm)) + ((nn == mm)?1.0f*size:0.0f);
+						float err = fabsf(test_mat[mm +nn*size] - orig) / orig;
+						if (err > epsilon)
 						{
-							float orig = (1.0f/(1.0f+nn+mm)) + ((nn == mm)?1.0f*size:0.0f);
-							float err = fabsf(test_mat[mm +nn*size] - orig) / orig;
-							if (err > epsilon)
-							{
-								FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.20f != %2.20f (err %2.20f)\n", rank, nn, mm, test_mat[mm +nn*size], orig, err);
-								*correctness = 0;
-								*flops = 0;
-								break;
-							}
+							FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.20f != %2.20f (err %2.20f)\n", rank, nn, mm, test_mat[mm +nn*size], orig, err);
+							*correctness = 0;
+							*flops = 0;
+							break;
 						}
 					}
 				}

+ 12 - 1
mpi/examples/matrix_decomposition/mpi_decomposition_params.c

@@ -43,6 +43,7 @@ unsigned check = 0;
 unsigned display = 0;
 int dblockx = -1;
 int dblocky = -1;
+enum submission submission = TRIANGLES;
 
 void parse_args(int argc, char **argv, int nodes)
 {
@@ -79,6 +80,16 @@ void parse_args(int argc, char **argv, int nodes)
                         nbigblocks = strtol(argv[++i], &argptr, 10);
                 }
 
+                if (strcmp(argv[i], "-columns") == 0)
+                {
+                        submission = COLUMNS;
+                }
+
+                if (strcmp(argv[i], "-antidiagonals") == 0)
+                {
+                        submission = ANTIDIAGONALS;
+                }
+
                 if (strcmp(argv[i], "-no-prio") == 0)
                 {
                         noprio = 1;
@@ -96,7 +107,7 @@ void parse_args(int argc, char **argv, int nodes)
 
                 if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
                 {
-			printf("usage : %s [-size size] [-nblocks nblocks] [-no-prio] [-display] [-check]\n", argv[0]);
+                        printf("usage : %s [-size size] [-nblocks nblocks] [-columns] [-antidiagonals] [-no-prio] [-display] [-check]\n", argv[0]);
                 }
         }
 

+ 8 - 0
mpi/examples/matrix_decomposition/mpi_decomposition_params.h

@@ -28,6 +28,14 @@ extern unsigned display;
 extern int dblockx;
 extern int dblocky;
 
+enum submission
+{
+	TRIANGLES,
+	COLUMNS,
+	ANTIDIAGONALS,
+};
+extern enum submission submission;
+
 void parse_args(int argc, char **argv, int nodes);
 
 #endif // __MPI_CHOLESKY_PARAMS_H__

+ 6 - 0
mpi/include/starpu_mpi.h

@@ -132,6 +132,12 @@ int starpu_mpi_world_size(void);
 */
 int starpu_mpi_comm_get_attr(MPI_Comm comm, int keyval, void *attribute_val, int *flag);
 
+
+/**
+   Get the logical index of the core where the MPI thread is bound.
+*/
+int starpu_mpi_get_thread_cpuid(void);
+
 int starpu_mpi_get_communication_tag(void);
 void starpu_mpi_set_communication_tag(int tag);
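
A possible use of the new call, once StarPU-MPI is initialized, is to report where the MPI progression thread ended up; together with starpu_get_pu_os_index() from this same change, the logical index can be turned into an OS index. This is a sketch, and treating a negative return value as "not bound" is an assumption:

#include <stdio.h>
#include <starpu.h>
#include <starpu_mpi.h>

static void report_mpi_thread_binding(int rank)
{
	int cpu = starpu_mpi_get_thread_cpuid();

	if (cpu < 0) /* assumption: negative means the thread is not bound */
		printf("[%d] MPI thread not bound\n", rank);
	else
		printf("[%d] MPI thread bound to logical PU %d (OS index %d)\n",
		       rank, cpu, starpu_get_pu_os_index(cpu));
}
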
 

+ 2 - 3
mpi/src/starpu_mpi_datatype.c

@@ -26,17 +26,16 @@ struct _starpu_mpi_datatype_funcs
 	UT_hash_handle hh;
 };
 
-static starpu_pthread_mutex_t _starpu_mpi_datatype_funcs_table_mutex;
+/* We want to allow applications calling starpu_mpi_interface_datatype_register/unregister as constructor/destructor */
+static starpu_pthread_mutex_t _starpu_mpi_datatype_funcs_table_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 static struct _starpu_mpi_datatype_funcs *_starpu_mpi_datatype_funcs_table = NULL;
 
 void _starpu_mpi_datatype_init(void)
 {
-	STARPU_PTHREAD_MUTEX_INIT(&_starpu_mpi_datatype_funcs_table_mutex, NULL);
 }
 
 void _starpu_mpi_datatype_shutdown(void)
 {
-	STARPU_PTHREAD_MUTEX_DESTROY(&_starpu_mpi_datatype_funcs_table_mutex);
 }
 
 /*

+ 5 - 0
mpi/src/starpu_mpi_init.c

@@ -336,3 +336,8 @@ int starpu_mpi_world_rank(void)
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	return rank;
 }
+
+int starpu_mpi_get_thread_cpuid(void)
+{
+	return _starpu_mpi_thread_cpuid;
+}

+ 2 - 0
mpi/tests/ring_kernel.cu

@@ -27,5 +27,7 @@ extern "C" void increment_cuda(void *descr[], void *_args)
 	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(tokenptr);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 11 - 4
src/Makefile.am

@@ -411,9 +411,16 @@ endif
 # static inline definition
 dist-hook:
 	failed=0 ; \
-	for i in $$( $(GREP) "static inline" $$(find $(srcdir) -name \*.h) | $(SED) -e 's/.*static inline //g' | $(GREP) -v ENAME | $(SED) -e 's/[^(]* \(\|\*\)\([^ (]*\)(.*/\2/' | $(GREP) -v _starpu_spin_init) ; do \
-		for j in $(shell find . -name \*.o) ; do \
-			nm $$j | $(GREP) "U $$i$$" && { echo $$j ; failed=1 ; } ; \
-		done ; \
+	look=""; \
+	for i in $$( $(GREP) "static inline" $$(find $(srcdir) -name \*.h) | $(SED) -e 's/.*static inline //g' | $(GREP) -v ENAME\#\# | $(SED) -n -e 's/[^(]* \(\|\*\)\([^ (]*\)(.*/\2/' -e 'p;s/^_*//;p' | $(GREP) -v _starpu_spin_init | $(GREP) -v starpu_sched_ctx_worker_is_master_for_child_ctx) ; do \
+		if [ -z "$$look" ] ; then \
+			look="$$i" ; \
+		else \
+			look="$$look\|$$i" ; \
+		fi ; \
+	done ; \
+	echo "$$look" ; \
+	for j in $(shell find . -name \*.o) ; do \
+		nm $$j | $(GREP) -e "U \($$look\)$$" && { echo $$j ; failed=1 ; } ; \
 	done ; \
 	[ $$failed == 0 ]

+ 3 - 0
src/core/jobs.c

@@ -24,10 +24,12 @@
 #include <common/config.h>
 #include <common/utils.h>
 #include <common/graph.h>
+#include <datawizard/memory_nodes.h>
 #include <profiling/profiling.h>
 #include <profiling/bound.h>
 #include <core/debug.h>
 #include <limits.h>
+#include <core/workers.h>
 
 static int max_memory_use;
 static unsigned long njobs, maxnjobs;
@@ -483,6 +485,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 		 * also the callback were executed. */
 		j->terminated = 2;
 	}
+	task->prefetched = 0;
 	STARPU_PTHREAD_COND_BROADCAST(&j->sync_cond);
 	STARPU_AYU_REMOVETASK(j->job_id);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);

+ 5 - 4
src/core/perfmodel/perfmodel_bus.c

@@ -37,6 +37,7 @@
 #include <core/topology.h>
 #include <common/utils.h>
 #include <drivers/mpi/driver_mpi_common.h>
+#include <datawizard/memory_nodes.h>
 
 #ifdef STARPU_USE_OPENCL
 #include <starpu_opencl.h>
@@ -188,7 +189,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 	/* Allocate a buffer on the device */
 	unsigned char *d_buffer;
 	cures = cudaMalloc((void **)&d_buffer, size);
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
@@ -217,7 +218,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 		cudaHostRegister((void *)h_buffer, size, 0);
 	}
 
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
@@ -342,7 +343,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	/* Allocate a buffer on the device */
 	unsigned char *s_buffer;
 	cures = cudaMalloc((void **)&s_buffer, size);
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 	cudaMemset(s_buffer, 0, size);
 	cudaDeviceSynchronize();
 
@@ -368,7 +369,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	/* Allocate a buffer on the device */
 	unsigned char *d_buffer;
 	cures = cudaMalloc((void **)&d_buffer, size);
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 	cudaMemset(d_buffer, 0, size);
 	cudaDeviceSynchronize();
 

+ 1 - 0
src/core/perfmodel/perfmodel_print.c

@@ -19,6 +19,7 @@
 #include <starpu.h>
 #include <starpu_perfmodel.h>
 #include <common/config.h>
+#include <core/workers.h>
 #include "perfmodel.h"
 
 static

+ 1 - 0
src/core/sched_ctx.c

@@ -21,6 +21,7 @@
 #include <common/utils.h>
 #include <stdarg.h>
 #include <core/task.h>
+#include <core/workers.h>
 
 enum _starpu_ctx_change_op
 {

+ 20 - 20
src/core/sched_policy.c

@@ -22,6 +22,7 @@
 #include <common/utils.h>
 #include <core/sched_policy.h>
 #include <profiling/profiling.h>
+#include <datawizard/memory_nodes.h>
 #include <common/barrier.h>
 #include <core/debug.h>
 #include <core/task.h>
@@ -569,32 +570,12 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 	int ret = 0;
 	if (STARPU_UNLIKELY(task->execute_on_a_specific_worker))
 	{
-		if (starpu_get_prefetch_flag())
-			starpu_prefetch_task_input_for(task, task->workerid);
-
 		ret = _starpu_push_task_on_specific_worker(task, task->workerid);
 	}
 	else
 	{
 		struct _starpu_machine_config *config = _starpu_get_machine_config();
 
-		/* When a task can only be executed on a given arch and we have
-		 * only one memory node for that arch, we can systematically
-		 * prefetch before the scheduling decision. */
-		if (starpu_get_prefetch_flag() && starpu_memory_nodes_get_count() > 1)
-		{
-			if (task->where == STARPU_CPU && config->cpus_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->cpus_nodeid);
-			else if (task->where == STARPU_CUDA && config->cuda_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->cuda_nodeid);
-                        else if (task->cl->where == STARPU_FPGA && config->fpga_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->fpga_nodeid);
-			else if (task->where == STARPU_OPENCL && config->opencl_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->opencl_nodeid);
-			else if (task->where == STARPU_MIC && config->mic_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->mic_nodeid);
-		}
-
 		if(!sched_ctx->sched_policy)
 		{
 			/* Note: we have to call that early, or else the task may have
@@ -637,6 +618,25 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 		}
 		else
 		{
+			/* When a task can only be executed on a given arch and we have
+			 * only one memory node for that arch, we can systematically
+			 * prefetch before the scheduling decision. */
+			if (!sched_ctx->sched_policy->prefetches
+				&& starpu_get_prefetch_flag()
+				&& starpu_memory_nodes_get_count() > 1)
+			{
+				if (task->where == STARPU_CPU && config->cpus_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->cpus_nodeid);
+				else if (task->where == STARPU_CUDA && config->cuda_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->cuda_nodeid);
+				else if (task->cl->where == STARPU_FPGA && config->fpga_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->fpga_nodeid);
+				else if (task->where == STARPU_OPENCL && config->opencl_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->opencl_nodeid);
+				else if (task->where == STARPU_MIC && config->mic_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->mic_nodeid);
+			}
+
 			STARPU_ASSERT(sched_ctx->sched_policy->push_task);
 			/* check out if there are any workers in the context */
 			unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);

+ 24 - 2
src/core/simgrid.c

@@ -58,6 +58,8 @@ extern int _starpu_mpi_simgrid_init(int argc, char *argv[]);
 extern void smpi_process_set_user_data(void *);
 #endif
 
+static double _starpu_simgrid_dynamic_energy = 0.0;
+
 /* 1 when MSG_init was done, 2 when initialized through redirected main, 3 when
  * initialized through MSG_process_attach */
 static int simgrid_started;
@@ -629,6 +631,7 @@ struct task
 #else
 	msg_task_t task;
 #endif
+	double energy;
 
 	/* communication termination signalization */
 	unsigned *finished;
@@ -666,6 +669,7 @@ static void *task_execute(void *arg)
 		MSG_task_execute(task->task);
 		MSG_task_destroy(task->task);
 #endif
+		starpu_energy_use(task->energy);
 		_STARPU_DEBUG("task %p finished\n", task);
 
 		*task->finished = 1;
@@ -702,7 +706,7 @@ void _starpu_simgrid_wait_tasks(int workerid)
 }
 
 /* Task execution submitted by StarPU */
-void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length, unsigned *finished)
+void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length, double energy, unsigned *finished)
 {
 	struct starpu_task *starpu_task = j->task;
 	double flops;
@@ -717,13 +721,19 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 
 	if (isnan(length))
 	{
-		length = starpu_task_expected_length(starpu_task, perf_arch, j->nimpl);
+		length = starpu_task_worker_expected_length(starpu_task, workerid, sched_ctx_id, j->nimpl);
 		STARPU_ASSERT_MSG(!_STARPU_IS_ZERO(length) && !isnan(length),
 				  "Codelet %s does not have a perfmodel (in directory %s), or is not calibrated enough, please re-run in non-simgrid mode until it is calibrated",
 				  _starpu_job_get_model_name(j), _starpu_get_perf_model_dir_codelet());
                 /* TODO: option to add variance according to performance model,
                  * to be able to easily check scheduling robustness */
 	}
+	if (isnan(energy))
+	{
+		energy = starpu_task_worker_expected_energy(starpu_task, workerid, sched_ctx_id, j->nimpl);
+		/* TODO: option to add variance according to performance model,
+		 * to be able to easily check scheduling robustness */
+	}
 
 #if defined(HAVE_SG_HOST_SPEED) || defined(sg_host_speed)
 #  if defined(HAVE_SG_HOST_SELF) || defined(sg_host_self)
@@ -754,6 +764,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 		MSG_task_execute(simgrid_task);
 		MSG_task_destroy(simgrid_task);
 #endif
+		starpu_energy_use(energy);
 	}
 	else
 	{
@@ -766,6 +777,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 #else
 		task->task = simgrid_task;
 #endif
+		task->energy = energy;
 		task->finished = finished;
 		*finished = 0;
 		task->next = NULL;
@@ -1391,5 +1403,15 @@ void _starpu_simgrid_data_transfer(size_t size, unsigned src_node, unsigned dst_
 }
 #endif
 
+void starpu_energy_use(float joules)
+{
+	_starpu_simgrid_dynamic_energy += joules;
+}
+
+double starpu_energy_used(void)
+{
+	float idle_power = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0);
+	return _starpu_simgrid_dynamic_energy + idle_power * starpu_timing_now() / 1000000;
+}
 
 #endif
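
Usage sketch for the two energy counters introduced above: in simgrid mode every simulated task execution calls starpu_energy_use() with its energy-model estimate, and the application can read the total back with starpu_energy_used(), which also folds in the STARPU_IDLE_POWER static part. This assumes the functions are the ones exported by the starpu_stdlib.h additions of this commit; the explicit starpu_energy_use() call below merely shows that extra joules can be accounted by hand.

#include <stdio.h>
#include <starpu.h>

void report_energy(void)
{
	starpu_task_wait_for_all();

	/* optionally account some extra dynamic energy by hand (0.5 J here) */
	starpu_energy_use(0.5);

	printf("estimated energy: %f J\n", starpu_energy_used());
}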

+ 1 - 1
src/core/simgrid.h

@@ -66,7 +66,7 @@ void _starpu_simgrid_deinit_late(void);
 void _starpu_simgrid_actor_setup(void);
 void _starpu_simgrid_wait_tasks(int workerid);
 struct _starpu_job;
-void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length, unsigned *finished);
+void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length, double energy, unsigned *finished);
 struct _starpu_data_request;
 int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node, struct _starpu_data_request *req);
 union _starpu_async_channel_event;

+ 1 - 0
src/core/task.c

@@ -30,6 +30,7 @@
 #include <common/utils.h>
 #include <common/fxt.h>
 #include <common/knobs.h>
+#include <datawizard/memory_nodes.h>
 #include <profiling/profiling.h>
 #include <profiling/bound.h>
 #include <math.h>

+ 27 - 0
src/core/topology.c

@@ -2086,7 +2086,11 @@ int _starpu_bind_thread_on_cpu(int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid S
 		{
 			cpu_worker[cpuid] = workerid;
 			if (name)
+			{
+				if (cpu_name[cpuid])
+					free(cpu_name[cpuid]);
 				cpu_name[cpuid] = strdup(name);
+			}
 		}
 	}
 
@@ -3219,3 +3223,26 @@ void starpu_topology_print(FILE *output)
 		fprintf(output, "\n");
 	}
 }
+
+int starpu_get_pu_os_index(unsigned logical_index)
+{
+#ifdef STARPU_HAVE_HWLOC
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	struct _starpu_machine_topology *topology = &config->topology;
+
+	hwloc_topology_t topo = topology->hwtopology;
+
+	return hwloc_get_obj_by_type(topo, HWLOC_OBJ_PU, logical_index)->os_index;
+#else
+	return logical_index;
+#endif
+}
+
+#ifdef STARPU_HAVE_HWLOC
+hwloc_topology_t starpu_get_hwloc_topology(void)
+{
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+
+	return config->topology.hwtopology;
+}
+#endif
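
A short usage sketch for the two new topology helpers, assuming they are declared in the public headers alongside this change: starpu_get_pu_os_index() converts a logical PU index into the OS index that external binding tools expect, and starpu_get_hwloc_topology() hands back the hwloc topology StarPU already discovered so the application does not have to load its own. hwloc_topology_get_depth() is a standard hwloc call.

#include <stdio.h>
#include <starpu.h>
#ifdef STARPU_HAVE_HWLOC
#include <hwloc.h>
#endif

void print_pu_mapping(unsigned logical)
{
	printf("logical PU %u -> OS index %d\n",
	       logical, starpu_get_pu_os_index(logical));

#ifdef STARPU_HAVE_HWLOC
	/* reuse the topology StarPU built instead of loading a new one */
	hwloc_topology_t topo = starpu_get_hwloc_topology();
	printf("topology depth: %d\n", hwloc_topology_get_depth(topo));
#endif
}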

+ 6 - 0
src/core/workers.c

@@ -2692,31 +2692,37 @@ int starpu_worker_get_relax_state(void)
 	return _starpu_worker_get_relax_state();
 }
 
+#undef starpu_worker_lock
 void starpu_worker_lock(int workerid)
 {
 	_starpu_worker_lock(workerid);
 }
 
+#undef starpu_worker_trylock
 int starpu_worker_trylock(int workerid)
 {
 	return _starpu_worker_trylock(workerid);
 }
 
+#undef starpu_worker_unlock
 void starpu_worker_unlock(int workerid)
 {
 	_starpu_worker_unlock(workerid);
 }
 
+#undef starpu_worker_lock_self
 void starpu_worker_lock_self(void)
 {
 	_starpu_worker_lock_self();
 }
 
+#undef starpu_worker_unlock_self
 void starpu_worker_unlock_self(void)
 {
 	_starpu_worker_unlock_self();
 }
 
+#undef starpu_wake_worker_relax
 int starpu_wake_worker_relax(int workerid)
 {
 	return _starpu_wake_worker_relax(workerid);

+ 6 - 0
src/core/workers.h

@@ -1132,6 +1132,7 @@ static inline void _starpu_worker_lock(int workerid)
 		STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
 	}
 }
+#define starpu_worker_lock _starpu_worker_lock
 
 static inline int _starpu_worker_trylock(int workerid)
 {
@@ -1162,6 +1163,7 @@ static inline int _starpu_worker_trylock(int workerid)
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&cur_worker->sched_mutex);
 	return ret;
 }
+#define starpu_worker_trylock _starpu_worker_trylock
 
 static inline void _starpu_worker_unlock(int workerid)
 {
@@ -1174,6 +1176,7 @@ static inline void _starpu_worker_unlock(int workerid)
 		starpu_worker_relax_off();
 	}
 }
+#define starpu_worker_unlock _starpu_worker_unlock
 
 static inline void _starpu_worker_lock_self(void)
 {
@@ -1182,6 +1185,7 @@ static inline void _starpu_worker_lock_self(void)
 	STARPU_ASSERT(worker != NULL);
 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
 }
+#define starpu_worker_lock_self _starpu_worker_lock_self
 
 static inline void _starpu_worker_unlock_self(void)
 {
@@ -1190,6 +1194,7 @@ static inline void _starpu_worker_unlock_self(void)
 	STARPU_ASSERT(worker != NULL);
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
 }
+#define starpu_worker_unlock_self _starpu_worker_unlock_self
 
 static inline int _starpu_wake_worker_relax(int workerid)
 {
@@ -1198,6 +1203,7 @@ static inline int _starpu_wake_worker_relax(int workerid)
 	_starpu_worker_unlock(workerid);
 	return ret;
 }
+#define starpu_wake_worker_relax _starpu_wake_worker_relax
 
 int starpu_wake_worker_relax_light(int workerid);
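
The workers.c/workers.h hunks follow a macro-aliasing pattern: internal callers that include workers.h get the fast inline _starpu_worker_lock() and friends through the #define, while the #undef in workers.c keeps a real exported symbol for code compiled outside the library. A minimal stand-alone illustration of the pattern, with generic names rather than StarPU's:

/* ---- header side ---- */
static inline void _my_lock(int id)
{
	(void)id;	/* fast inline body for internal callers */
}
#define my_lock _my_lock

/* ---- implementation side ---- */
#undef my_lock
void my_lock(int id)
{
	/* exported symbol kept for callers built outside the library */
	_my_lock(id);
}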
 

+ 58 - 17
src/datawizard/coherency.c

@@ -411,7 +411,7 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
 /* handle->lock should be taken. r is returned locked. The node parameter
  * indicate either the source of the request, or the destination for a
  * write-only request. */
-static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, unsigned is_prefetch)
+static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, enum _starpu_is_prefetch is_prefetch)
 {
 	struct _starpu_data_request *r;
 
@@ -474,7 +474,7 @@ static struct _starpu_data_request *_starpu_search_existing_data_request(struct
 
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 								  struct _starpu_data_replicate *dst_replicate,
-								  enum starpu_data_access_mode mode, unsigned is_prefetch,
+								  enum starpu_data_access_mode mode, enum _starpu_is_prefetch is_prefetch,
 								  unsigned async,
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 {
@@ -529,7 +529,13 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 #endif
 
 			if (dst_replicate->mc)
+			{
+				if (is_prefetch == STARPU_TASK_PREFETCH)
+					/* Make sure it stays there */
+					dst_replicate->mc->nb_tasks_prefetch++;
+
 				_starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
+			}
 		}
 
 		_starpu_spin_unlock(&handle->header_lock);
@@ -574,6 +580,9 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
 			{
 				_starpu_update_data_state(handle, dst_replicate, mode);
+				if (is_prefetch == STARPU_TASK_PREFETCH)
+					/* Make sure it stays there */
+					dst_replicate->mc->nb_tasks_prefetch++;
 
 				_starpu_spin_unlock(&handle->header_lock);
 
@@ -652,9 +661,17 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 				STARPU_ASSERT(r->next_req_count <= STARPU_MAXNODES);
 			}
 		}
-		else if (!write_invalidation)
-			/* The last request will perform the callback after termination */
-			_starpu_data_request_append_callback(r, callback_func, callback_arg);
+		else
+		{
+			if (is_prefetch == STARPU_TASK_PREFETCH)
+				/* Make last request add the prefetch count on the mc to keep the data
+				 * there until the task gets to execute.  */
+				r->nb_tasks_prefetch++;
+
+			if (!write_invalidation)
+				/* The last request will perform the callback after termination */
+				_starpu_data_request_append_callback(r, callback_func, callback_arg);
+		}
 
 
 		if (reused_requests[hop])
@@ -719,7 +736,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 }
 
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *dst_replicate,
-			       enum starpu_data_access_mode mode, unsigned detached, unsigned is_prefetch, unsigned async,
+			       enum starpu_data_access_mode mode, unsigned detached, enum _starpu_is_prefetch is_prefetch, unsigned async,
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 {
         _STARPU_LOG_IN();
@@ -733,7 +750,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
 
-	if (is_prefetch > 0)
+	if (is_prefetch > STARPU_FETCH)
 	{
 		unsigned src_node_mask = 0;
 
@@ -751,6 +768,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 		if (src_node_mask == 0)
 		{
 			/* no valid copy, nothing to prefetch */
+			STARPU_ASSERT_MSG(handle->init_cl, "Could not find a valid copy of the data, and no handle initialization function");
 			_starpu_spin_unlock(&handle->header_lock);
 			return 0;
 		}
@@ -789,17 +807,22 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 
 static int idle_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, 2, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_IDLEFETCH, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
 }
 
-static int prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+static int task_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, 1, 1, NULL, NULL, prio, "prefetch_data_on_node");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_TASK_PREFETCH, 1, NULL, NULL, prio, "task_prefetch_data_on_node");
+}
+
+static int STARPU_ATTRIBUTE_UNUSED prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+{
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_PREFETCH, 1, NULL, NULL, prio, "prefetch_data_on_node");
 }
 
 static int fetch_data(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, 0, 0, NULL, NULL, prio, "fetch_data");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, STARPU_FETCH, 0, NULL, NULL, prio, "fetch_data");
 }
 
 uint32_t _starpu_get_data_refcnt(starpu_data_handle_t handle, unsigned node)
@@ -899,7 +922,7 @@ int starpu_prefetch_task_input_on_node_prio(struct starpu_task *task, unsigned t
 	if (j->discontinuous != 0)
 		return 0;
 #endif
-	STARPU_ASSERT(!task->prefetched);
+	STARPU_ASSERT_MSG(!task->prefetched, "Prefetching was already requested for this task! Did you set 'prefetches' to 1 in the starpu_sched_policy structure?");
 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned index;
 
@@ -918,10 +941,11 @@ int starpu_prefetch_task_input_on_node_prio(struct starpu_task *task, unsigned t
 		int node = _starpu_task_data_get_node_on_node(task, index, target_node);
 
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
-		prefetch_data_on_node(handle, node, replicate, mode, prio);
+		task_prefetch_data_on_node(handle, node, replicate, mode, prio);
 
 		_starpu_set_data_requested_flag_if_needed(handle, replicate);
 	}
+	task->prefetched = 1;
 
 	return 0;
 }
@@ -976,7 +1000,7 @@ int starpu_prefetch_task_input_for_prio(struct starpu_task *task, unsigned worke
 	if (j->discontinuous != 0)
 		return 0;
 #endif
-	STARPU_ASSERT(!task->prefetched);
+	STARPU_ASSERT_MSG(!task->prefetched, "Prefetching was already requested for this task! Did you set 'prefetches' to 1 in the starpu_sched_policy structure?");
 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned index;
 
@@ -995,10 +1019,11 @@ int starpu_prefetch_task_input_for_prio(struct starpu_task *task, unsigned worke
 		int node = _starpu_task_data_get_node_on_worker(task, index, worker);
 
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
-		prefetch_data_on_node(handle, node, replicate, mode, prio);
+		task_prefetch_data_on_node(handle, node, replicate, mode, prio);
 
 		_starpu_set_data_requested_flag_if_needed(handle, replicate);
 	}
+	task->prefetched = 1;
 
 	return 0;
 }
@@ -1140,7 +1165,7 @@ int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, in
 
 		if (async)
 		{
-			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, 0, 1,
+			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1,
 					_starpu_fetch_task_input_cb, worker, 0, "_starpu_fetch_task_input");
 #ifdef STARPU_SIMGRID
 			if (_starpu_simgrid_fetching_input_cost())
@@ -1230,7 +1255,23 @@ void _starpu_fetch_task_input_tail(struct starpu_task *task, struct _starpu_job
 		local_replicate = get_replicate(handle, mode, workerid, node);
 		_starpu_spin_lock(&handle->header_lock);
 		if (local_replicate->mc)
+		{
 			local_replicate->mc->diduse = 1;
+			if (task->prefetched && local_replicate->initialized &&
+				/* See prefetch conditions in
+				 * starpu_prefetch_task_input_on_node_prio and the like */
+				!(mode & (STARPU_SCRATCH|STARPU_REDUX)) &&
+				(mode & STARPU_R))
+			{
+				/* Allocations or transfer prefetches should have been done by now and marked
+				 * this mc as needed for us.
+				 * Now that we added a reference for the task, we can relieve that.  */
+				/* Note: the replicate might have been evicted in between, thus not 100% sure
+				 * that our prefetch request is still recorded here.  */
+				if (local_replicate->mc->nb_tasks_prefetch > 0)
+					local_replicate->mc->nb_tasks_prefetch--;
+			}
+		}
 		_starpu_spin_unlock(&handle->header_lock);
 
 		_STARPU_TASK_SET_INTERFACE(task , local_replicate->data_interface, descrs[index].index);
@@ -1379,7 +1420,7 @@ void _starpu_fetch_nowhere_task_input(struct _starpu_job *j)
 
 		local_replicate = get_replicate(handle, mode, -1, node);
 
-		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, 0, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
+		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
 	}
 
 	if (profiling && task->profiling_info)

+ 2 - 2
src/datawizard/coherency.h

@@ -298,7 +298,7 @@ struct _starpu_data_state
  * async means that _starpu_fetch_data_on_node will wait for completion of the request
  */
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate,
-			       enum starpu_data_access_mode mode, unsigned detached, unsigned is_prefetch, unsigned async,
+			       enum starpu_data_access_mode mode, unsigned detached, enum _starpu_is_prefetch is_prefetch, unsigned async,
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 /** This releases a reference on the handle */
 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
@@ -341,7 +341,7 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
  */
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 								  struct _starpu_data_replicate *dst_replicate,
-								  enum starpu_data_access_mode mode, unsigned is_prefetch,
+								  enum starpu_data_access_mode mode, enum _starpu_is_prefetch is_prefetch,
 								  unsigned async,
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 

+ 2 - 2
src/datawizard/copy_driver.c

@@ -203,7 +203,7 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 									unsigned donotread,
 									struct _starpu_data_request *req,
 									unsigned may_alloc,
-									unsigned prefetch STARPU_ATTRIBUTE_UNUSED)
+									enum _starpu_is_prefetch prefetch STARPU_ATTRIBUTE_UNUSED)
 {
 	if (!donotread)
 	{
@@ -221,7 +221,7 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 			/* We're not supposed to allocate there at the moment */
 			return -ENOMEM;
 
-		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, req ? req->prefetch : 0);
+		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, req ? req->prefetch : STARPU_FETCH);
 		if (ret_alloc)
 			return -ENOMEM;
 	}

+ 10 - 1
src/datawizard/copy_driver.h

@@ -47,6 +47,15 @@ extern "C"
 struct _starpu_data_request;
 struct _starpu_data_replicate;
 
+enum _starpu_is_prefetch
+{
+	STARPU_FETCH = 0,		/* A task really needs it now! */
+	STARPU_TASK_PREFETCH = 1,	/* A task will need it soon */
+	STARPU_PREFETCH = 2,		/* It is a good idea to have it asap */
+	STARPU_IDLEFETCH = 3,		/* Get this here when you have time to */
+	STARPU_NFETCH
+};
+
 #ifdef STARPU_USE_MIC
 /** MIC needs memory_node to know which MIC is concerned.
  * mark is used to wait asynchronous request.
@@ -132,7 +141,7 @@ int _starpu_driver_copy_data_1_to_1(starpu_data_handle_t handle,
 				    unsigned donotread,
 				    struct _starpu_data_request *req,
 				    unsigned may_alloc,
-				    unsigned prefetch);
+				    enum _starpu_is_prefetch prefetch);
 
 unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel);
 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel);
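
The new enum replaces the old 0/1/2 prefetch flag with four ordered levels, which later hunks compare with >= to decide how aggressively to allocate and evict. Seen from the public API (as the user_interactions.c hunk below wires it up), the user-visible levels map onto the existing fetch/prefetch/idle-prefetch calls, while STARPU_TASK_PREFETCH is used internally when a scheduler prefetches a task's inputs. A small sketch, assuming handle is a registered data handle and node a valid memory node:

#include <starpu.h>

void warm_up(starpu_data_handle_t handle, unsigned node)
{
	/* STARPU_IDLEFETCH: move it whenever the bus has nothing better to do */
	starpu_data_idle_prefetch_on_node(handle, node, 1 /* async */);

	/* STARPU_PREFETCH: it is a good idea to have it there soon */
	starpu_data_prefetch_on_node(handle, node, 1 /* async */);

	/* STARPU_FETCH: needed right now, wait for the transfer */
	starpu_data_fetch_on_node(handle, node, 0 /* synchronous */);
}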

+ 47 - 29
src/datawizard/data_request.c

@@ -25,8 +25,11 @@
 #include <core/simgrid.h>
 
 /* requests that have not been treated at all */
+#ifdef STARPU_DEVEL
+#warning split into separate out/in queues for each node, so that MAX_PENDING_REQUESTS_PER_NODE is separate for them, since the links are bidirectional
+#endif
 static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES];
-static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES];
+static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES]; /* Contains both task_prefetch and prefetch */
 static struct _starpu_data_request_prio_list idle_requests[STARPU_MAXNODES];
 static starpu_pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES];
 
@@ -121,7 +124,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 							 int handling_node,
 							 enum starpu_data_access_mode mode,
 							 unsigned ndeps,
-							 unsigned is_prefetch,
+							 enum _starpu_is_prefetch is_prefetch,
 							 int prio,
 							 unsigned is_write_invalidation,
 							 const char *origin)
@@ -153,6 +156,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
 	r->completed = 0;
 	r->prefetch = is_prefetch;
+	r->nb_tasks_prefetch = 0;
 	r->prio = prio;
 	r->retval = -1;
 	r->ndeps = ndeps;
@@ -307,9 +311,9 @@ void _starpu_post_data_request(struct _starpu_data_request *r)
 
 	/* insert the request in the proper list */
 	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node]);
-	if (r->prefetch == 2)
+	if (r->prefetch >= STARPU_IDLEFETCH)
 		_starpu_data_request_prio_list_push_back(&idle_requests[handling_node], r);
-	else if (r->prefetch)
+	else if (r->prefetch > STARPU_FETCH)
 		_starpu_data_request_prio_list_push_back(&prefetch_requests[handling_node], r);
 	else
 		_starpu_data_request_prio_list_push_back(&data_requests[handling_node], r);
@@ -410,6 +414,10 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 	/* Remove a reference on the destination replicate for the request */
 	if (dst_replicate)
 	{
+		if (dst_replicate->mc)
+			/* Make sure it stays there for the task.  */
+			dst_replicate->mc->nb_tasks_prefetch += r->nb_tasks_prefetch;
+
 		STARPU_ASSERT(dst_replicate->refcnt > 0);
 		dst_replicate->refcnt--;
 	}
@@ -460,7 +468,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 }
 
 /* TODO : accounting to see how much time was spent working for other people ... */
-static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc, int prefetch)
+static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc, enum _starpu_is_prefetch prefetch)
 {
 	starpu_data_handle_t handle = r->handle;
 
@@ -535,7 +543,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	return 0;
 }
 
-static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list *reqlist, unsigned src_node, unsigned may_alloc, unsigned n, unsigned *pushed, unsigned prefetch)
+static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list *reqlist, unsigned src_node, unsigned may_alloc, unsigned n, unsigned *pushed, enum _starpu_is_prefetch prefetch)
 {
 	struct _starpu_data_request *r;
 	struct _starpu_data_request_prio_list new_data_requests[prefetch + 1]; /* Indexed by prefetch level */
@@ -606,7 +614,7 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 			ret = res;
 			/* Prefetch requests might have gotten promoted while in tmp list */
 			_starpu_data_request_prio_list_push_back(&new_data_requests[r->prefetch], r);
-			if (prefetch)
+			if (prefetch > STARPU_FETCH)
 				/* Prefetching more there would make the situation even worse */
 				break;
 		}
@@ -636,20 +644,25 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 	if (i <= prefetch)
 	{
 		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
-		if (!(_starpu_data_request_prio_list_empty(&new_data_requests[0])))
+		if (!(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_FETCH])))
+		{
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_FETCH], &data_requests[src_node]);
+			data_requests[src_node] = new_data_requests[STARPU_FETCH];
+		}
+		if (prefetch >= STARPU_TASK_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_TASK_PREFETCH])))
 		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[0], &data_requests[src_node]);
-			data_requests[src_node] = new_data_requests[0];
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_TASK_PREFETCH], &prefetch_requests[src_node]);
+			prefetch_requests[src_node] = new_data_requests[STARPU_TASK_PREFETCH];
 		}
-		if (prefetch >= 1 && !(_starpu_data_request_prio_list_empty(&new_data_requests[1])))
+		if (prefetch >= STARPU_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_PREFETCH])))
 		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[1], &prefetch_requests[src_node]);
-			prefetch_requests[src_node] = new_data_requests[1];
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_PREFETCH], &prefetch_requests[src_node]);
+			prefetch_requests[src_node] = new_data_requests[STARPU_PREFETCH];
 		}
-		if (prefetch >= 2 && !(_starpu_data_request_prio_list_empty(&new_data_requests[2])))
+		if (prefetch >= STARPU_IDLEFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_IDLEFETCH])))
 		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[2], &idle_requests[src_node]);
-			idle_requests[src_node] = new_data_requests[2];
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_IDLEFETCH], &idle_requests[src_node]);
+			idle_requests[src_node] = new_data_requests[STARPU_IDLEFETCH];
 		}
 		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
 
@@ -675,17 +688,17 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 
 int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 {
-	return __starpu_handle_node_data_requests(data_requests, src_node, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, 0);
+	return __starpu_handle_node_data_requests(data_requests, src_node, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, STARPU_FETCH);
 }
 
 int _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 {
-	return __starpu_handle_node_data_requests(prefetch_requests, src_node, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, 1);
+	return __starpu_handle_node_data_requests(prefetch_requests, src_node, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, STARPU_PREFETCH);
 }
 
 int _starpu_handle_node_idle_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 {
-	return __starpu_handle_node_data_requests(idle_requests, src_node, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, 2);
+	return __starpu_handle_node_data_requests(idle_requests, src_node, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, STARPU_IDLEFETCH);
 }
 
 static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
@@ -836,11 +849,15 @@ int _starpu_check_that_no_data_request_is_pending(unsigned node)
 }
 
 
-void _starpu_update_prefetch_status(struct _starpu_data_request *r, unsigned prefetch)
+void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum _starpu_is_prefetch prefetch)
 {
 	STARPU_ASSERT(r->prefetch > prefetch);
 	r->prefetch=prefetch;
 
+	if (prefetch >= STARPU_IDLEFETCH)
+		/* No possible actual change */
+		return;
+
 	/* We have to promote chained_request too! */
 	unsigned chained_req;
 	for (chained_req = 0; chained_req < r->next_req_count; chained_req++)
@@ -852,19 +869,20 @@ void _starpu_update_prefetch_status(struct _starpu_data_request *r, unsigned pre
 
 	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[r->handling_node]);
 
+	int found = 1;
+
 	/* The request can be in a different list (handling request or the temp list)
-	 * we have to check that it is really in the prefetch list. */
+	 * we have to check that it is really in the prefetch or idle list. */
 	if (_starpu_data_request_prio_list_ismember(&prefetch_requests[r->handling_node], r))
-	{
-		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node],r);
-		_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node],r);
-	}
-	/* The request can be in a different list (handling request or the temp list)
-	 * we have to check that it is really in the idle list. */
+		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node], r);
 	else if (_starpu_data_request_prio_list_ismember(&idle_requests[r->handling_node], r))
+		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node], r);
+	else
+		found = 0;
+
+	if (found)
 	{
-		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node],r);
-		if (prefetch == 1)
+		if (prefetch > STARPU_FETCH)
 			_starpu_data_request_prio_list_push_back(&prefetch_requests[r->handling_node],r);
 		else
 			_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node],r);

+ 7 - 8
src/datawizard/data_request.h

@@ -79,12 +79,11 @@ LIST_TYPE(_starpu_data_request,
 	/** Whether the transfer is completed. */
 	unsigned completed;
 
-	/** Whether this is just a prefetch request:
-	 * 0 for fetch,
-	 * 1 for prefetch (dependencies have just been released)
-	 * 2 for idle (a good idea to do it some time, but no hurry at all)
-	 */
-	unsigned prefetch;
+	/** Whether this is just a prefetch request */
+	enum _starpu_is_prefetch prefetch;
+
+	/** Number of tasks which used this as a prefetch */
+	unsigned nb_tasks_prefetch;
 
 	/** Priority of the request. Default is 0 */
 	int prio;
@@ -151,7 +150,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 							 int handling_node,
 							 enum starpu_data_access_mode mode,
 							 unsigned ndeps,
-							 unsigned is_prefetch,
+							 enum _starpu_is_prefetch is_prefetch,
 							 int prio,
 							 unsigned is_write_invalidation,
 							 const char *origin) STARPU_ATTRIBUTE_MALLOC;
@@ -162,5 +161,5 @@ void _starpu_data_request_append_callback(struct _starpu_data_request *r,
 					  void (*callback_func)(void *),
 					  void *callback_arg);
 
-void _starpu_update_prefetch_status(struct _starpu_data_request *r, unsigned prefetch);
+void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum _starpu_is_prefetch prefetch);
 #endif // __DATA_REQUEST_H__

+ 2 - 1
src/datawizard/filters.c

@@ -21,6 +21,7 @@
 #include <datawizard/filters.h>
 #include <datawizard/footprint.h>
 #include <datawizard/interfaces/data_interface.h>
+#include <datawizard/memory_nodes.h>
 #include <core/task.h>
 
 /*
@@ -192,7 +193,7 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		int home_node = initial_handle->home_node;
 		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
 			home_node = STARPU_MAIN_RAM;
-		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], 0);
+		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], STARPU_FETCH);
 #ifdef STARPU_DEVEL
 #warning we should reclaim memory if allocation failed
 #endif

+ 3 - 0
src/datawizard/interfaces/bcsr_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 /*
  * BCSR : blocked CSR, we use blocks of size (r x c)

+ 3 - 0
src/datawizard/interfaces/block_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 3 - 0
src/datawizard/interfaces/coo_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int
 copy_any_to_any(void *src_interface, unsigned src_node,

+ 3 - 0
src/datawizard/interfaces/csr_interface.c

@@ -16,6 +16,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 1 - 1
src/datawizard/interfaces/data_interface.c

@@ -693,7 +693,7 @@ void _starpu_check_if_valid_and_fetch_data_on_node(starpu_data_handle_t handle,
 	}
 	if (valid)
 	{
-		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, 0, 0, NULL, NULL, 0, origin);
+		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, STARPU_FETCH, 0, NULL, NULL, 0, origin);
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, handle->home_node, replicate);
 	}

+ 3 - 0
src/datawizard/interfaces/matrix_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 3 - 0
src/datawizard/interfaces/multiformat_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
 #ifdef STARPU_USE_CUDA

+ 3 - 0
src/datawizard/interfaces/tensor_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 3 - 0
src/datawizard/interfaces/variable_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 3 - 0
src/datawizard/interfaces/vector_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 3 - 0
src/datawizard/interfaces/void_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int dummy_copy(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 23 - 18
src/datawizard/memalloc.c

@@ -322,7 +322,7 @@ static int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT transfer_subtree_to_node(starpu_d
 		{
 			/* This is the only copy, push it to destination */
 			struct _starpu_data_request *r;
-			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, 0, 0, NULL, NULL, 0, "transfer_subtree_to_node");
+			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, STARPU_FETCH, 0, NULL, NULL, 0, "transfer_subtree_to_node");
 			/* There is no way we don't need a request, since
 			 * source is OWNER, destination can't be having it */
 			STARPU_ASSERT(r);
@@ -546,7 +546,7 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 /* This function is called for memory chunks that are possibly in used (ie. not
  * in the cache). They should therefore still be associated to a handle. */
 /* mc_lock is held and may be temporarily released! */
-static size_t try_to_throw_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list)
+static size_t try_to_throw_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list, enum _starpu_is_prefetch is_prefetch)
 {
 	size_t freed = 0;
 
@@ -571,6 +571,10 @@ static size_t try_to_throw_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node
 		/* Hasn't been used yet, avoid evicting it */
 		return 0;
 
+	if (mc->nb_tasks_prefetch && is_prefetch >= STARPU_TASK_PREFETCH)
+		/* We have not finished executing the tasks this was prefetched for */
+		return 0;
+
 	/* REDUX memchunk */
 	if (mc->relaxed_coherency == 2)
 	{
@@ -782,7 +786,7 @@ static int try_to_find_reusable_mc(unsigned node, starpu_data_handle_t data, str
 
 /* this function looks for a memory chunk that matches a given footprint in the
  * list of mem chunk that are not important */
-static int try_to_reuse_not_important_mc(unsigned node, starpu_data_handle_t data, struct _starpu_data_replicate *replicate, uint32_t footprint)
+static int try_to_reuse_not_important_mc(unsigned node, starpu_data_handle_t data, struct _starpu_data_replicate *replicate, uint32_t footprint, enum _starpu_is_prefetch is_prefetch)
 {
 	struct _starpu_mem_chunk *mc, *orig_next_mc, *next_mc;
 	int success = 0;
@@ -816,7 +820,7 @@ restart:
 		}
 
 		/* Note: this may unlock mc_list! */
-		success = try_to_throw_mem_chunk(mc, node, replicate, 1);
+		success = try_to_throw_mem_chunk(mc, node, replicate, 1, is_prefetch);
 
 		if (orig_next_mc)
 		{
@@ -841,11 +845,14 @@ restart:
  * Try to find a buffer currently in use on the memory node which has the given
  * footprint.
  */
-static int try_to_reuse_potentially_in_use_mc(unsigned node, starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, uint32_t footprint, int is_prefetch)
+static int try_to_reuse_potentially_in_use_mc(unsigned node, starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, uint32_t footprint, enum _starpu_is_prefetch is_prefetch)
 {
 	struct _starpu_mem_chunk *mc, *next_mc, *orig_next_mc;
 	int success = 0;
 
+	if (is_prefetch >= STARPU_IDLEFETCH)
+		/* Do not evict a MC just for an idle fetch */
+		return 0;
 	/*
 	 * We have to unlock mc_lock before locking header_lock, so we have
 	 * to be careful with the list.  We try to do just one pass, by
@@ -868,14 +875,11 @@ restart:
 		if (mc->remove_notify)
 			/* Somebody already working here, skip */
 			continue;
-		if (is_prefetch > 1)
-			/* Do not evict a MC just for an idle fetch */
-			continue;
-		if (is_prefetch == 1 && !mc->wontuse)
+		if (!mc->wontuse && is_prefetch >= STARPU_PREFETCH)
 			/* Do not evict something that we might reuse, just for a prefetch */
-			/* FIXME: but perhaps we won't have any task using it in
-                         * the close future, we should perhaps rather check
-                         * mc->replicate->refcnt? */
+			continue;
+		if (mc->nb_tasks_prefetch && is_prefetch >= STARPU_TASK_PREFETCH)
+			/* Do not evict something that we will reuse, just for a task prefetch */
 			continue;
 		if (mc->footprint != footprint || _starpu_data_interface_compare(handle->per_node[node].data_interface, handle->ops, mc->data->per_node[node].data_interface, mc->ops) != 1)
 			/* Not the right type of interface, skip */
@@ -889,7 +893,7 @@ restart:
 		}
 
 		/* Note: this may unlock mc_list! */
-		success = try_to_throw_mem_chunk(mc, node, replicate, 1);
+		success = try_to_throw_mem_chunk(mc, node, replicate, 1, is_prefetch);
 
 		if (orig_next_mc)
 		{
@@ -999,7 +1003,7 @@ restart2:
 				next_mc->remove_notify = &next_mc;
 			}
 			/* Note: this may unlock mc_list! */
-			freed += try_to_throw_mem_chunk(mc, node, NULL, 0);
+			freed += try_to_throw_mem_chunk(mc, node, NULL, 0, STARPU_FETCH);
 
 			if (orig_next_mc)
 			{
@@ -1218,7 +1222,7 @@ void starpu_memchunk_tidy(unsigned node)
 			}
 
 			_starpu_spin_unlock(&mc_lock[node]);
-			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, 2, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
+			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, STARPU_IDLEFETCH, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
 			{
 				/* No request was actually needed??
 				 * Odd, but cope with it.  */
@@ -1317,6 +1321,7 @@ static struct _starpu_mem_chunk *_starpu_memchunk_init(struct _starpu_data_repli
 	mc->size_interface = interface_size;
 	mc->remove_notify = NULL;
 	mc->diduse = 0;
+	mc->nb_tasks_prefetch = 0;
 	mc->wontuse = 0;
 
 	return mc;
@@ -1430,7 +1435,7 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
  *
  */
 
-static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, unsigned is_prefetch)
+static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, enum _starpu_is_prefetch is_prefetch)
 {
 	unsigned attempts = 0;
 	starpu_ssize_t allocated_memory;
@@ -1514,7 +1519,7 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 			reclaim -= freed;
 
 			/* Try to reuse an allocated data with the same interface (to avoid spurious free/alloc) */
-			if (_starpu_has_not_important_data && try_to_reuse_not_important_mc(dst_node, handle, replicate, footprint))
+			if (_starpu_has_not_important_data && try_to_reuse_not_important_mc(dst_node, handle, replicate, footprint, is_prefetch))
 				break;
 			if (try_to_reuse_potentially_in_use_mc(dst_node, handle, replicate, footprint, is_prefetch))
 			{
@@ -1596,7 +1601,7 @@ out:
 	return allocated_memory;
 }
 
-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch)
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum _starpu_is_prefetch is_prefetch)
 {
 	starpu_ssize_t allocated_memory;
 

+ 9 - 1
src/datawizard/memalloc.h

@@ -26,6 +26,7 @@
 #include <datawizard/interfaces/data_interface.h>
 #include <datawizard/coherency.h>
 #include <datawizard/copy_driver.h>
+#include <datawizard/data_request.h>
 
 struct _starpu_data_replicate;
 
@@ -59,10 +60,17 @@ LIST_TYPE(_starpu_mem_chunk,
 	/** Whether the memchunk is in the clean part of the mc_list */
 	unsigned clean:1;
 	/** Was this chunk used since it got allocated?  */
+	/* FIXME: probably useless now with nb_tasks_prefetch */
 	unsigned diduse:1;
 	/** Was this chunk marked as "won't use"? */
 	unsigned wontuse:1;
 
+	/** The number of prefetches that we made for this mc for various tasks.
+	 * This is also the number of tasks that we will wait to see use this mc before
+	 * we attempt to evict it.
+	 */
+	unsigned nb_tasks_prefetch;
+
 	/** the size of the data is only set when calling _starpu_request_mem_chunk_removal(),
 	 * it is needed to estimate how much memory is in mc_cache, and by
 	 * free_memory_on_node() which is called when the handle is no longer
@@ -84,7 +92,7 @@ void _starpu_init_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
 void _starpu_mem_chunk_init_last(void);
 void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size);
-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch);
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum _starpu_is_prefetch is_prefetch);
 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
 void _starpu_memchunk_wont_use(struct _starpu_mem_chunk *mc, unsigned node);
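
The nb_tasks_prefetch counter added above is incremented when a task prefetch reaches (or already finds) a valid replicate, decremented in _starpu_fetch_task_input_tail() once the task takes its own reference, and consulted before eviction. A self-contained toy model of that eviction guard (not StarPU code, just the comparison logic from try_to_throw_mem_chunk()):

#include <stdio.h>

enum is_prefetch { FETCH = 0, TASK_PREFETCH, PREFETCH, IDLEFETCH };

struct chunk
{
	unsigned nb_tasks_prefetch;	/* pending task prefetches on this chunk */
};

static int may_evict(const struct chunk *mc, enum is_prefetch why)
{
	if (mc->nb_tasks_prefetch && why >= TASK_PREFETCH)
		return 0;	/* keep it until the prefetched tasks have run */
	return 1;
}

int main(void)
{
	struct chunk mc = { .nb_tasks_prefetch = 2 };
	/* another prefetch may not evict it, a plain fetch still may: prints "0 1" */
	printf("%d %d\n", may_evict(&mc, PREFETCH), may_evict(&mc, FETCH));
	return 0;
}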

+ 1 - 0
src/datawizard/memory_manager.c

@@ -19,6 +19,7 @@
 #include <common/thread.h>
 #include <common/fxt.h>
 #include <datawizard/memory_manager.h>
+#include <datawizard/memory_nodes.h>
 #include <core/workers.h>
 #include <starpu_stdlib.h>
 

+ 1 - 0
src/datawizard/reduction.c

@@ -22,6 +22,7 @@
 #include <datawizard/datawizard.h>
 #include <drivers/mic/driver_mic_source.h>
 #include <drivers/mp_common/source_common.h>
+#include <datawizard/memory_nodes.h>
 
 void starpu_data_set_reduction_methods(starpu_data_handle_t handle,
 				       struct starpu_codelet *redux_cl,

+ 7 - 6
src/datawizard/user_interactions.c

@@ -22,6 +22,7 @@
 #include <datawizard/write_back.h>
 #include <core/dependencies/data_concurrency.h>
 #include <core/sched_policy.h>
+#include <datawizard/memory_nodes.h>
 
 static void _starpu_data_check_initialized(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
@@ -46,7 +47,7 @@ int starpu_data_request_allocation(starpu_data_handle_t handle, unsigned node)
 
 	_starpu_spin_lock(&handle->header_lock);
 
-	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, 1, 0, 0, "starpu_data_request_allocation");
+	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, STARPU_PREFETCH, 0, 0, "starpu_data_request_allocation");
 
 	/* we do not increase the refcnt associated to the request since we are
 	 * not waiting for its termination */
@@ -67,7 +68,7 @@ struct user_interaction_wrapper
 	starpu_pthread_mutex_t lock;
 	unsigned finished;
 	unsigned detached;
-	unsigned prefetch;
+	enum _starpu_is_prefetch prefetch;
 	unsigned async;
 	int prio;
 	void (*callback)(void *);
@@ -535,7 +536,7 @@ static void _prefetch_data_on_node(void *arg)
 }
 
 static
-int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigned node, unsigned async, enum starpu_data_access_mode mode, unsigned prefetch, int prio)
+int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigned node, unsigned async, enum starpu_data_access_mode mode, enum _starpu_is_prefetch prefetch, int prio)
 {
 	STARPU_ASSERT(handle);
 
@@ -595,12 +596,12 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 
 int starpu_data_fetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 {
-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, 0, 0);
+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_FETCH, 0);
 }
 
 int starpu_data_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
 {
-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, 1, prio);
+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_PREFETCH, prio);
 }
 
 int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
@@ -610,7 +611,7 @@ int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, uns
 
 int starpu_data_idle_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
 {
-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, 2, prio);
+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_IDLEFETCH, prio);
 }
 
 int starpu_data_idle_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)

+ 2 - 1
src/datawizard/write_back.c

@@ -17,6 +17,7 @@
 #include <datawizard/datawizard.h>
 #include <datawizard/write_back.h>
 #include <core/dependencies/data_concurrency.h>
+#include <datawizard/memory_nodes.h>
 
 static void wt_callback(void *arg)
 {
@@ -63,7 +64,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 
 				struct _starpu_data_request *r;
 				r = _starpu_create_request_to_fetch_data(handle, &handle->per_node[node],
-									 STARPU_R, 1, 1, wt_callback, handle, 0, "_starpu_write_through_data");
+									 STARPU_R, STARPU_IDLEFETCH, 1, wt_callback, handle, 0, "_starpu_write_through_data");
 
 			        /* If no request was created, the handle was already up-to-date on the
 			         * node */

+ 2 - 2
src/debug/latency.c

@@ -34,7 +34,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		_starpu_spin_unlock(&handle->header_lock);
 
 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
-		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, 0, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
+		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, node0, replicate_0);
 
@@ -44,7 +44,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		_starpu_spin_unlock(&handle->header_lock);
 
 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
-		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, 0, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
+		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, node1, replicate_1);
 	}

+ 10 - 8
src/debug/traces/starpu_fxt.c

@@ -20,6 +20,7 @@
 #include <starpu.h>
 #include <common/config.h>
 #include <common/uthash.h>
+#include <datawizard/copy_driver.h>
 #include <string.h>
 
 #ifdef STARPU_HAVE_POTI
@@ -1194,8 +1195,8 @@ static void handle_new_mem_node(struct fxt_ev_64 *ev, struct starpu_fxt_options
  */
 static int create_ordered_stream_id (int nodeid, int devid)
 {
-	static int stable[MAX_MPI_NODES][STARPU_MAXCUDADEVS];
-	STARPU_ASSERT(nodeid < MAX_MPI_NODES);
+	static int stable[STARPU_FXT_MAX_FILES][STARPU_MAXCUDADEVS];
+	STARPU_ASSERT(nodeid < STARPU_FXT_MAX_FILES);
 	STARPU_ASSERT(devid < STARPU_MAXCUDADEVS);
 	return stable[nodeid][devid]++;
 }
@@ -2268,13 +2269,14 @@ static void handle_mpi_data_set_tag(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 	data->mpi_tag = tag;
 }
 
-static const char *copy_link_type(unsigned prefetch)
+static const char *copy_link_type(enum _starpu_is_prefetch prefetch)
 {
 	switch (prefetch)
 	{
-		case 0: return "F";
-		case 1: return "PF";
-		case 2: return "IF";
+		case STARPU_FETCH: return "F";
+		case STARPU_TASK_PREFETCH: return "TF";
+		case STARPU_PREFETCH: return "PF";
+		case STARPU_IDLEFETCH: return "IF";
 		default: STARPU_ASSERT(0);
 	}
 }
@@ -2285,7 +2287,7 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 	unsigned dst = ev->param[1];
 	unsigned size = ev->param[2];
 	unsigned comid = ev->param[3];
-	unsigned prefetch = ev->param[4];
+	enum _starpu_is_prefetch prefetch = ev->param[4];
 	unsigned long handle = ev->param[5];
 	const char *link_type = copy_link_type(prefetch);
 
@@ -2367,7 +2369,7 @@ static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 	unsigned dst = ev->param[1];
 	unsigned long size = ev->param[2];
 	unsigned comid = ev->param[3];
-	unsigned prefetch = ev->param[4];
+	enum _starpu_is_prefetch prefetch = ev->param[4];
 	const char *link_type = copy_link_type(prefetch);
 
 	char *prefix = options->file_prefix;

+ 0 - 2
src/debug/traces/starpu_fxt.h

@@ -41,8 +41,6 @@
 #include <starpu.h>
 #include "../../../include/starpu_fxt.h"
 
-#define MAX_MPI_NODES 64
-
 extern char _starpu_last_codelet_symbol[STARPU_NMAXWORKERS][(FXT_MAX_PARAMS-5)*sizeof(unsigned long)];
 
 void _starpu_fxt_dag_init(char *dag_filename);

+ 19 - 19
src/debug/traces/starpu_fxt_mpi.c

@@ -103,27 +103,27 @@ int _starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *ke
  */
 
 /* the list of MPI transfers found in the different traces */
-static struct mpi_transfer *mpi_sends[MAX_MPI_NODES] = {NULL};
-static struct mpi_transfer *mpi_recvs[MAX_MPI_NODES] = {NULL};
+static struct mpi_transfer *mpi_sends[STARPU_FXT_MAX_FILES] = {NULL};
+static struct mpi_transfer *mpi_recvs[STARPU_FXT_MAX_FILES] = {NULL};
 
 /* number of available slots in the lists  */
-unsigned mpi_sends_list_size[MAX_MPI_NODES] = {0};
-unsigned mpi_recvs_list_size[MAX_MPI_NODES] = {0};
+unsigned mpi_sends_list_size[STARPU_FXT_MAX_FILES] = {0};
+unsigned mpi_recvs_list_size[STARPU_FXT_MAX_FILES] = {0};
 
 /* number of slots actually used in the list  */
-unsigned mpi_sends_used[MAX_MPI_NODES] = {0};
-unsigned mpi_recvs_used[MAX_MPI_NODES] = {0};
+unsigned mpi_sends_used[STARPU_FXT_MAX_FILES] = {0};
+unsigned mpi_recvs_used[STARPU_FXT_MAX_FILES] = {0};
 
 /* number of slots already matched at the beginning of the list. This permits
  * going through the lists from the beginning to match each and every
  * transfer, thus avoiding a quadratic complexity. */
-unsigned mpi_recvs_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
-unsigned mpi_sends_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
+unsigned mpi_recvs_matched[STARPU_FXT_MAX_FILES][STARPU_FXT_MAX_FILES] = { {0} };
+unsigned mpi_sends_matched[STARPU_FXT_MAX_FILES][STARPU_FXT_MAX_FILES] = { {0} };
 
 void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED, long mpi_tag, size_t size, float date, long jobid, unsigned long handle)
 {
 	STARPU_ASSERT(src >= 0);
-	if (src >= MAX_MPI_NODES)
+	if (src >= STARPU_FXT_MAX_FILES)
 		return;
 	unsigned slot = mpi_sends_used[src]++;
 
@@ -153,7 +153,7 @@ void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED,
 
 void _starpu_fxt_mpi_add_recv_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, long mpi_tag, float date, long jobid, unsigned long handle)
 {
-	if (dst >= MAX_MPI_NODES)
+	if (dst >= STARPU_FXT_MAX_FILES)
 		return;
 	unsigned slot = mpi_recvs_used[dst]++;
 
@@ -220,11 +220,11 @@ static unsigned long mpi_com_id = 0;
 
 static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comms_file, unsigned n)
 {
-	unsigned slot[MAX_MPI_NODES] = { 0 }, node;
+	unsigned slot[STARPU_FXT_MAX_FILES] = { 0 }, node;
 	unsigned nb_wrong_comm_timing = 0;
 	struct mpi_transfer_list pending_receives; /* Sorted list of matches which have not happened yet */
-	double current_out_bandwidth[MAX_MPI_NODES] = { 0. };
-	double current_in_bandwidth[MAX_MPI_NODES] = { 0. };
+	double current_out_bandwidth[STARPU_FXT_MAX_FILES] = { 0. };
+	double current_in_bandwidth[STARPU_FXT_MAX_FILES] = { 0. };
 #ifdef STARPU_HAVE_POTI
 	char mpi_container[STARPU_POTI_STR_LEN];
 #endif
@@ -246,7 +246,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 		else
 			start_date = mpi_transfer_list_front(&pending_receives)->date;
 
-		src = MAX_MPI_NODES;
+		src = STARPU_FXT_MAX_FILES;
 		for (node = 0; node < n; node++)
 		{
 			if (slot[node] < mpi_sends_used[node] && mpi_sends[node][slot[node]].date < start_date)
@@ -260,7 +260,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 			/* No event any more, we're finished! */
 			break;
 
-		if (src == MAX_MPI_NODES)
+		if (src == STARPU_FXT_MAX_FILES)
 		{
 			/* Pending match is earlier than all new sends, finish its communication */
 			match = mpi_transfer_list_pop_front(&pending_receives);
@@ -284,7 +284,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 		size_t size = cur->size;
 		unsigned long send_handle = cur->handle;
 
-		if (dst < MAX_MPI_NODES)
+		if (dst < STARPU_FXT_MAX_FILES)
 			match = try_to_match_send_transfer(src, dst, mpi_tag);
 		else
 			match = NULL;
@@ -377,10 +377,10 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
 void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks STARPU_ATTRIBUTE_UNUSED, FILE *out_paje_file, FILE* out_comms_file)
 {
-	if (options->ninputfiles > MAX_MPI_NODES)
+	if (options->ninputfiles > STARPU_FXT_MAX_FILES)
 	{
-		_STARPU_DISP("Warning: %u files given, maximum %u supported, truncating to %u\n", options->ninputfiles, MAX_MPI_NODES, MAX_MPI_NODES);
-		options->ninputfiles = MAX_MPI_NODES;
+		_STARPU_DISP("Warning: %u files given, maximum %u supported, truncating to %u\n", options->ninputfiles, STARPU_FXT_MAX_FILES, STARPU_FXT_MAX_FILES);
+		options->ninputfiles = STARPU_FXT_MAX_FILES;
 	}
 
 	/* display the MPI transfers if possible */

+ 2 - 0
src/debug/traces/starpu_paje.c

@@ -398,6 +398,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 	/* Link types */
 	poti_DefineLinkType("MPIL", "MPIP", "MPICt", "MPICt", "MPI communication");
 	poti_DefineLinkType("F", "P", "Mm", "Mm", "Intra-node data Fetch");
+	poti_DefineLinkType("TF", "P", "Mm", "Mm", "Intra-node data TaskPreFetch");
 	poti_DefineLinkType("PF", "P", "Mm", "Mm", "Intra-node data PreFetch");
 	poti_DefineLinkType("IF", "P", "Mm", "Mm", "Intra-node data IdleFetch");
 	poti_DefineLinkType("WSL", "P", "W", "W", "Work steal");
@@ -551,6 +552,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 6       No       MS     Nothing         \".0 .0 .0\"		\n\
 5       MPIL     MPIP	MPICt	MPICt   \"MPI communication\"\n\
 5       F       P	Mm	Mm      \"Intra-node data Fetch\"\n\
+5       TF      P	Mm	Mm      \"Intra-node data TaskPreFetch\"\n\
 5       PF      P	Mm	Mm      \"Intra-node data PreFetch\"\n\
 5       IF      P	Mm	Mm      \"Intra-node data IdleFetch\"\n\
 5       WSL     P	W	W       \"Work steal\"\n");

+ 4 - 1
src/drivers/cpu/driver_cpu.c

@@ -109,7 +109,10 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 				_SIMGRID_TIMER_END;
 			}
 			else
-				_starpu_simgrid_submit_job(cpu_args->workerid, j, perf_arch, NAN, NULL);
+			{
+				struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(cpu_args, j);
+				_starpu_simgrid_submit_job(cpu_args->workerid, sched_ctx->id, j, perf_arch, NAN, NAN, NULL);
+			}
 #else
 #  ifdef STARPU_PAPI
 			_starpu_profiling_papi_task_start_counters(task);

+ 29 - 20
src/drivers/cuda/driver_cuda.c

@@ -531,10 +531,13 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 				_SIMGRID_TIMER_END;
 			}
 		else
-			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
+		{
+			struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
+			_starpu_simgrid_submit_job(workerid, sched_ctx->id, j, &worker->perf_arch, NAN, NAN,
 				async ? &task_finished[workerid][pipeline_idx] : NULL);
+		}
 #else
-#ifdef HAVE_LIBNVIDIA_ML
+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
 		unsigned long long energy_start = 0;
 		nvmlReturn_t nvmlRet = -1;
 		if (profiling && task->profiling_info)
@@ -558,7 +561,7 @@ static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *wor
 	int profiling = starpu_profiling_status_get();
 
 
-#ifdef HAVE_LIBNVIDIA_ML
+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
 	if (profiling && j->task->profiling_info && j->task->profiling_info->energy_consumed)
 	{
 		unsigned long long energy_end;
@@ -880,27 +883,33 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 			_starpu_set_local_worker_key(worker);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), worker);
 			/* See next task if any */
-			if (worker->ntasks && worker->current_tasks[worker->first_task] != worker->task_transferring)
+			if (worker->ntasks)
 			{
-				task = worker->current_tasks[worker->first_task];
-				j = _starpu_get_job_associated_to_task(task);
-				if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+				if (worker->current_tasks[worker->first_task] != worker->task_transferring)
 				{
-					/* An asynchronous task, it was already
-					 * queued, it's now running, record its start time.  */
-					_starpu_driver_start_job(worker, j, &worker->perf_arch, 0, starpu_profiling_status_get());
+					task = worker->current_tasks[worker->first_task];
+					j = _starpu_get_job_associated_to_task(task);
+					if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+					{
+						/* An asynchronous task, it was already
+						 * queued, it's now running, record its start time.  */
+						_starpu_driver_start_job(worker, j, &worker->perf_arch, 0, starpu_profiling_status_get());
+					}
+					else
+					{
+						/* A synchronous task, we have finished
+						 * flushing the pipeline, we can now at
+						 * last execute it.  */
+
+						_STARPU_TRACE_EVENT("sync_task");
+						execute_job_on_cuda(task, worker);
+						_STARPU_TRACE_EVENT("end_sync_task");
+						worker->pipeline_stuck = 0;
+					}
 				}
 				else
-				{
-					/* A synchronous task, we have finished
-					 * flushing the pipeline, we can now at
-					 * last execute it.  */
-
-					_STARPU_TRACE_EVENT("sync_task");
-					execute_job_on_cuda(task, worker);
-					_STARPU_TRACE_EVENT("end_sync_task");
-					worker->pipeline_stuck = 0;
-				}
+					/* Data for next task didn't have time to finish transferring :/ */
+					_STARPU_TRACE_WORKER_START_FETCH_INPUT(NULL, workerid);
 			}
 #ifdef STARPU_USE_FXT
 			int k;
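
The CUDA driver hunk tightens the energy-profiling guard from HAVE_LIBNVIDIA_ML to HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION, i.e. it now requires the specific NVML call that the start/finish paths bracket the kernel with. A hedged sketch of that measurement pattern; measure_kernel_energy() is an illustrative name, not a StarPU function, and nvmlDeviceGetTotalEnergyConsumption() reports cumulative millijoules since the driver was loaded:

#include <nvml.h>

double measure_kernel_energy(nvmlDevice_t dev, void (*run_kernel)(void))
{
	unsigned long long start = 0, end = 0;

	if (nvmlDeviceGetTotalEnergyConsumption(dev, &start) != NVML_SUCCESS)
		return 0.0;
	run_kernel();
	if (nvmlDeviceGetTotalEnergyConsumption(dev, &end) != NVML_SUCCESS)
		return 0.0;

	return (double)(end - start) / 1000.0;	/* millijoules -> joules */
}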

+ 1 - 0
src/drivers/disk/driver_disk.c

@@ -21,6 +21,7 @@
 #include <drivers/disk/driver_disk.h>
 #include <drivers/cpu/driver_cpu.h>
 #include <datawizard/coherency.h>
+#include <datawizard/memory_nodes.h>
 
 int _starpu_disk_copy_src_to_disk(void * src, unsigned src_node, void * dst, size_t dst_offset, unsigned dst_node, size_t size, void * async_channel)
 {

+ 0 - 0
src/drivers/driver_common/driver_common.c


Some files were not shown because too many files changed in this diff