Browse Source

Merge branch 'fpga' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu into fpga

Samuel Thibault 5 years ago
parent
commit
584a70d891
100 changed files with 807 additions and 241 deletions
  1. 11 0
      ChangeLog
  2. 15 1
      configure.ac
  3. 1 1
      doc/doxygen/Makefile.am
  4. 2 0
      doc/doxygen/chapters/210_check_list_performance.doxy
  5. 1 1
      doc/doxygen/chapters/301_tasks.doxy
  6. 3 1
      doc/doxygen/chapters/320_scheduling.doxy
  7. 3 0
      doc/doxygen/chapters/350_scheduling_policy_definition.doxy
  8. 9 0
      doc/doxygen/chapters/510_configure_options.doxy
  9. 2 0
      doc/doxygen/chapters/code/vector_scal_cuda.c
  10. 1 1
      doc/doxygen_dev/Makefile.am
  11. 2 0
      doc/tutorial/vector_scal_cuda.cu
  12. 2 0
      examples/basic_examples/block_cuda.cu
  13. 2 0
      examples/basic_examples/multiformat_conversion_codelets_cuda.cu
  14. 2 0
      examples/basic_examples/multiformat_cuda.cu
  15. 2 0
      examples/basic_examples/variable_kernels.cu
  16. 2 0
      examples/basic_examples/vector_scal_cuda.cu
  17. 2 0
      examples/filters/custom_mf/conversion.cu
  18. 2 0
      examples/filters/custom_mf/cuda.cu
  19. 2 0
      examples/filters/fblock_cuda.cu
  20. 4 0
      examples/filters/fmultiple_cuda.cu
  21. 2 0
      examples/incrementer/incrementer_kernels.cu
  22. 2 0
      examples/interface/complex_kernels.cu
  23. 1 0
      examples/mult/double.h
  24. 1 0
      examples/mult/simple.h
  25. 1 1
      examples/mult/xgemm.c
  26. 2 0
      examples/pi/SobolQRNG/sobol_gpu.cu
  27. 4 0
      examples/pi/pi_kernel.cu
  28. 4 0
      examples/pi/pi_redux_kernel.cu
  29. 2 0
      examples/reductions/dot_product_kernels.cu
  30. 2 0
      examples/sched_ctx/axpy_partition_gpu.cu
  31. 2 0
      examples/spmv/spmv_cuda.cu
  32. 2 0
      examples/stencil/life_cuda.cu
  33. 2 0
      examples/stencil/shadow.cu
  34. 9 0
      include/starpu_config.h.in
  35. 1 2
      include/starpu_fxt.h
  36. 17 0
      include/starpu_helper.h
  37. 5 0
      include/starpu_scheduler.h
  38. 23 0
      include/starpu_stdlib.h
  39. 1 1
      include/starpu_task.h
  40. 2 0
      julia/examples/mult/gpu_mult.cu
  41. 2 0
      julia/examples/old_examples/gpu_mult.cu
  42. 2 0
      julia/examples/old_examples/mandelbrot/gpu_mandelbrot.cu
  43. 2 0
      julia/examples/old_examples/mandelbrot/gpu_mandelbrot_between.cu
  44. 2 0
      julia/examples/old_examples/mult/gpu_mult.cu
  45. 4 0
      julia/examples/old_examples/nbody/gpu_nbody.cu
  46. 2 0
      julia/examples/old_examples/nbody/gpu_nbody_between.cu
  47. 4 0
      julia/src/compiler/expressions.jl
  48. 1 1
      mpi/examples/matrix_decomposition/mpi_cholesky.c
  49. 223 57
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
  50. 12 1
      mpi/examples/matrix_decomposition/mpi_decomposition_params.c
  51. 8 0
      mpi/examples/matrix_decomposition/mpi_decomposition_params.h
  52. 6 0
      mpi/include/starpu_mpi.h
  53. 2 3
      mpi/src/starpu_mpi_datatype.c
  54. 5 0
      mpi/src/starpu_mpi_init.c
  55. 2 0
      mpi/tests/ring_kernel.cu
  56. 11 4
      src/Makefile.am
  57. 3 0
      src/core/jobs.c
  58. 5 4
      src/core/perfmodel/perfmodel_bus.c
  59. 1 0
      src/core/perfmodel/perfmodel_print.c
  60. 1 0
      src/core/sched_ctx.c
  61. 20 20
      src/core/sched_policy.c
  62. 24 2
      src/core/simgrid.c
  63. 1 1
      src/core/simgrid.h
  64. 1 0
      src/core/task.c
  65. 27 0
      src/core/topology.c
  66. 6 0
      src/core/workers.c
  67. 6 0
      src/core/workers.h
  68. 58 17
      src/datawizard/coherency.c
  69. 2 2
      src/datawizard/coherency.h
  70. 2 2
      src/datawizard/copy_driver.c
  71. 10 1
      src/datawizard/copy_driver.h
  72. 47 29
      src/datawizard/data_request.c
  73. 7 8
      src/datawizard/data_request.h
  74. 2 1
      src/datawizard/filters.c
  75. 3 0
      src/datawizard/interfaces/bcsr_interface.c
  76. 3 0
      src/datawizard/interfaces/block_interface.c
  77. 3 0
      src/datawizard/interfaces/coo_interface.c
  78. 3 0
      src/datawizard/interfaces/csr_interface.c
  79. 1 1
      src/datawizard/interfaces/data_interface.c
  80. 3 0
      src/datawizard/interfaces/matrix_interface.c
  81. 3 0
      src/datawizard/interfaces/multiformat_interface.c
  82. 3 0
      src/datawizard/interfaces/tensor_interface.c
  83. 3 0
      src/datawizard/interfaces/variable_interface.c
  84. 3 0
      src/datawizard/interfaces/vector_interface.c
  85. 3 0
      src/datawizard/interfaces/void_interface.c
  86. 23 18
      src/datawizard/memalloc.c
  87. 9 1
      src/datawizard/memalloc.h
  88. 1 0
      src/datawizard/memory_manager.c
  89. 1 0
      src/datawizard/reduction.c
  90. 7 6
      src/datawizard/user_interactions.c
  91. 2 1
      src/datawizard/write_back.c
  92. 2 2
      src/debug/latency.c
  93. 10 8
      src/debug/traces/starpu_fxt.c
  94. 0 2
      src/debug/traces/starpu_fxt.h
  95. 19 19
      src/debug/traces/starpu_fxt_mpi.c
  96. 2 0
      src/debug/traces/starpu_paje.c
  97. 4 1
      src/drivers/cpu/driver_cpu.c
  98. 29 20
      src/drivers/cuda/driver_cuda.c
  99. 1 0
      src/drivers/disk/driver_disk.c
  100. 0 0
      src/drivers/driver_common/driver_common.c

+ 11 - 0
ChangeLog

@@ -31,9 +31,20 @@ New features:
     files. This file can be parsed by the new script
     files. This file can be parsed by the new script
     starpu_fxt_number_events_to_names.py to convert event keys to event names.
     starpu_fxt_number_events_to_names.py to convert event keys to event names.
   * New STARPU_PER_WORKER perfmodel.
   * New STARPU_PER_WORKER perfmodel.
+  * Add energy accounting in the simgrid mode: starpu_energy_use() and
+    starpu_energy_used().
+  * New function starpu_mpi_get_thread_cpuid() to know where the MPI
+    thread is bound.
+  * New function starpu_get_pu_os_index() to convert logical index of a PU to
+    its OS index.
+  * New function starpu_get_hwloc_topology() to get the hwloc topology used by
+    StarPU.
+  * Add a task prefetch level, to improve retention of data in accelerators
+    so we can make prefetch more aggressive.
 
 
 Small changes:
 Small changes:
   * Use the S4U interface of Simgrid instead of xbt and MSG.
   * Use the S4U interface of Simgrid instead of xbt and MSG.
+  * Add a synthetic energy efficiency testcase.
 
 
 StarPU 1.3.4 (git revision xxx)
 StarPU 1.3.4 (git revision xxx)
 ==============================================
 ==============================================

+ 15 - 1
configure.ac

@@ -1459,6 +1459,9 @@ if test x$enable_cuda = xyes; then
 	    ]
 	    ]
 	)
 	)
 	if test x$have_valid_nvml = xyes ; then
 	if test x$have_valid_nvml = xyes ; then
+		AC_CHECK_DECLS([nvmlDeviceGetTotalEnergyConsumption], [
+			AC_CHECK_FUNCS([nvmlDeviceGetTotalEnergyConsumption])
+			], [], [[#include <nvml.h>]])
 		AC_DEFINE([HAVE_LIBNVIDIA_ML], [1], [Define to 1 if you have the nvidia-ml library])
 		AC_DEFINE([HAVE_LIBNVIDIA_ML], [1], [Define to 1 if you have the nvidia-ml library])
 		STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lnvidia-ml"
 		STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lnvidia-ml"
 	fi
 	fi
@@ -2321,6 +2324,14 @@ AC_MSG_RESULT($nmaxbuffers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
 AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
 		[how many buffers can be manipulated per task])
 		[how many buffers can be manipulated per task])
 
 
+AC_MSG_CHECKING(how many MPI nodes fxt files can be manipulated when generating traces)
+AC_ARG_ENABLE(fxt-max-files, [AS_HELP_STRING([--enable-fxt-max-files=<nbuffers>],
+			[maximum number of mpi nodes for traces])],
+			nmaxfxtfiles=$enableval, nmaxfxtfiles=64)
+AC_MSG_RESULT($nmaxfxtfiles)
+AC_DEFINE_UNQUOTED(STARPU_FXT_MAX_FILES, [$nmaxfxtfiles],
+		[how many MPI nodes fxt files can be manipulated when generating traces])
+
 AC_MSG_CHECKING(maximum number of memory nodes to use per MPI rank)
 AC_MSG_CHECKING(maximum number of memory nodes to use per MPI rank)
 AC_ARG_ENABLE(maxnodes, [AS_HELP_STRING([--enable-maxnodes=<nnodes>],
 AC_ARG_ENABLE(maxnodes, [AS_HELP_STRING([--enable-maxnodes=<nnodes>],
 			[maximum number of memory nodes per MPI rank])],
 			[maximum number of memory nodes per MPI rank])],
@@ -2645,7 +2656,7 @@ if test "x$enable_build_fortran_requested" = "xyes" ; then
 	fi
 	fi
 	if test "x$enable_build_fortran" = "xyes" ; then
 	if test "x$enable_build_fortran" = "xyes" ; then
 		AC_DEFINE(STARPU_HAVE_FC, [1], [Define this if a Fortran compiler is available])
 		AC_DEFINE(STARPU_HAVE_FC, [1], [Define this if a Fortran compiler is available])
-		if test x$build_mpi_lib = xyes -o x$build_mpi_master_slave = xyes ; then
+		if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes -o x$build_mpi_master_slave = xyes ; then
 			#Check MPIFORT
 			#Check MPIFORT
 			if test x$enable_simgrid = xyes ; then
 			if test x$enable_simgrid = xyes ; then
 				DEFAULT_MPIFORT=smpifort
 				DEFAULT_MPIFORT=smpifort
@@ -3620,6 +3631,9 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh tests/microbenchs/
+  mkdir -p tests/energy
+  test -e tests/energy/static.sh || ln -sf $ac_abs_top_srcdir/tests/energy/static.sh tests/energy/
+  test -e tests/energy/dynamic.sh || ln -sf $ac_abs_top_srcdir/tests/energy/dynamic.sh tests/energy/
   mkdir -p tests/datawizard
   mkdir -p tests/datawizard
   test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
   test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
   mkdir -p tests/overlap
   mkdir -p tests/overlap

+ 1 - 1
doc/doxygen/Makefile.am

@@ -307,5 +307,5 @@ EXTRA_DIST += doxygen.cfg refman.tex \
 # Rule to update documentation on web server. Should only be used locally.
 # Rule to update documentation on web server. Should only be used locally.
 PUBLISHHOST	?= gforge
 PUBLISHHOST	?= gforge
 update-web: $(DOX_PDF)
 update-web: $(DOX_PDF)
-	scp -pr starpu.pdf html $(PUBLISHHOST):/home/groups/starpu/htdocs/doc
+	scp -pr starpu.pdf html $(PUBLISHHOST):/home/groups/starpu/htdocs/files/doc
 
 

+ 2 - 0
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -91,6 +91,8 @@ operations to avoid this issue. For instance:
 
 
 \code{.c}
 \code{.c}
 func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
 func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
+cudaError_t status = cudaGetLastError();
+if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 cudaStreamSynchronize(starpu_cuda_get_local_stream());
 cudaStreamSynchronize(starpu_cuda_get_local_stream());
 \endcode
 \endcode
 
 

+ 1 - 1
doc/doxygen/chapters/301_tasks.doxy

@@ -118,7 +118,7 @@ to delay the termination of a task until the termination of other tasks.
 
 
 \section SettingManyDataHandlesForATask Setting Many Data Handles For a Task
 \section SettingManyDataHandlesForATask Setting Many Data Handles For a Task
 
 
-The maximum number of data a task can manage is fixed by the environment variable
+The maximum number of data a task can manage is fixed by the macro
 \ref STARPU_NMAXBUFS which has a default value which can be changed
 \ref STARPU_NMAXBUFS which has a default value which can be changed
 through the \c configure option \ref enable-maxbuffers "--enable-maxbuffers".
 through the \c configure option \ref enable-maxbuffers "--enable-maxbuffers".
 
 

+ 3 - 1
doc/doxygen/chapters/320_scheduling.doxy

@@ -205,7 +205,9 @@ simply tend to run all computations on the most energy-conservative processing
 unit. To account for the consumption of the whole machine (including idle
 unit. To account for the consumption of the whole machine (including idle
 processing units), the idle power of the machine should be given by setting
 processing units), the idle power of the machine should be given by setting
 <c>export STARPU_IDLE_POWER=200</c> (\ref STARPU_IDLE_POWER) for 200W, for instance. This value can often
 <c>export STARPU_IDLE_POWER=200</c> (\ref STARPU_IDLE_POWER) for 200W, for instance. This value can often
-be obtained from the machine power supplier.
+be obtained from the machine power supplier, e.g. by running
+
+<c>ipmitool -I lanplus -H mymachine-ipmi -U myuser -P mypasswd sdr type Current</c>
 
 
 The energy actually consumed by the total execution can be displayed by setting
 The energy actually consumed by the total execution can be displayed by setting
 <c>export STARPU_PROFILING=1 STARPU_WORKER_STATS=1</c> (\ref STARPU_PROFILING and \ref STARPU_WORKER_STATS).
 <c>export STARPU_PROFILING=1 STARPU_WORKER_STATS=1</c> (\ref STARPU_PROFILING and \ref STARPU_WORKER_STATS).

+ 3 - 0
doc/doxygen/chapters/350_scheduling_policy_definition.doxy

@@ -60,6 +60,9 @@ queue the transfers on the idle prefetch queue, which is only processed when
 there are no non-idle prefetch to process.
 there are no non-idle prefetch to process.
 starpu_get_prefetch_flag() is a convenient helper for checking the value of the 
 starpu_get_prefetch_flag() is a convenient helper for checking the value of the 
 \ref STARPU_PREFETCH environment variable.
 \ref STARPU_PREFETCH environment variable.
+When a scheduler does such prefetching, it should set the <c>prefetches</c>
+field of the <c>starpu_sched_policy</c> to 1, to prevent the core from
+triggering its own prefetching.
 
 
 Usual functions can be used on tasks, for instance one can use the following to
 Usual functions can be used on tasks, for instance one can use the following to
 get the data size for a task.
 get the data size for a task.

+ 9 - 0
doc/doxygen/chapters/510_configure_options.doxy

@@ -527,6 +527,15 @@ Define the maximum number of buffers that tasks will be able to take
 as parameters, then available as the macro ::STARPU_NMAXBUFS.
 as parameters, then available as the macro ::STARPU_NMAXBUFS.
 </dd>
 </dd>
 
 
+<dt>--enable-fxt-max-files=<c>count</c></dt>
+<dd>
+\anchor enable-fxt-max-files
+\addindex __configure__--enable-fxt-max-files
+Use at most <c>count</c> fxt files from MPI nodes when generating traces.  This information is then available as
+the macro ::STARPU_FXT_MAX_FILES.  This information is used by FxT tools when considering multi-node traces.
+Default value is 64.
+</dd>
+
 <dt>--enable-allocation-cache</dt>
 <dt>--enable-allocation-cache</dt>
 <dd>
 <dd>
 \anchor enable-allocation-cache
 \anchor enable-allocation-cache

+ 2 - 0
doc/doxygen/chapters/code/vector_scal_cuda.c

@@ -35,6 +35,8 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
 
         vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(n, val, *factor);
         vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(n, val, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 1 - 1
doc/doxygen_dev/Makefile.am

@@ -248,5 +248,5 @@ EXTRA_DIST += doxygen.cfg refman.tex \
 # Rule to update documentation on web server. Should only be used locally.
 # Rule to update documentation on web server. Should only be used locally.
 PUBLISHHOST	?= gforge
 PUBLISHHOST	?= gforge
 update-web: $(DOX_PDF)
 update-web: $(DOX_PDF)
-	scp -pr starpu_dev.pdf html_dev $(PUBLISHHOST):/home/groups/starpu/htdocs/doc
+	scp -pr starpu_dev.pdf html_dev $(PUBLISHHOST):/home/groups/starpu/htdocs/files/doc
 
 

+ 2 - 0
doc/tutorial/vector_scal_cuda.cu

@@ -35,6 +35,8 @@ extern "C" void vector_scal_cuda(void *buffers[], void *_args)
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
 
         vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);
         vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 2 - 0
examples/basic_examples/block_cuda.cu

@@ -40,5 +40,7 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
         float *multiplier = (float *)_args;
         float *multiplier = (float *)_args;
 
 
         cuda_block<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *multiplier);
         cuda_block<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *multiplier);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 2 - 0
examples/basic_examples/multiformat_conversion_codelets_cuda.cu

@@ -44,4 +44,6 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
 
         cpu_to_cuda_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(src, dst, n);
         cpu_to_cuda_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(src, dst, n);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 2 - 0
examples/basic_examples/multiformat_cuda.cu

@@ -39,6 +39,8 @@ extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         multiformat_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(soa, n);
         multiformat_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(soa, n);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 2 - 0
examples/basic_examples/variable_kernels.cu

@@ -27,5 +27,7 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 	float *val = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	float *val = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 
 	cuda_variable<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
 	cuda_variable<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 2 - 0
examples/basic_examples/vector_scal_cuda.cu

@@ -41,4 +41,6 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
 
         vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(n, val, *factor);
         vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(n, val, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 2 - 0
examples/filters/custom_mf/conversion.cu

@@ -45,4 +45,6 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         custom_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(aop, n, x, y);
         custom_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(aop, n, x, y);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 2 - 0
examples/filters/custom_mf/cuda.cu

@@ -39,4 +39,6 @@ extern "C" void custom_scal_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         scal_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(n, x, y);
         scal_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(n, x, y);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 2 - 0
examples/filters/fblock_cuda.cu

@@ -43,4 +43,6 @@ extern "C" void cuda_func(void *buffers[], void *_args)
 
 
         /* TODO: use more blocks and threads in blocks */
         /* TODO: use more blocks and threads in blocks */
         fblock_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *factor);
         fblock_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 4 - 0
examples/filters/fmultiple_cuda.cu

@@ -44,6 +44,8 @@ extern "C" void fmultiple_check_scale_cuda(void *buffers[], void *cl_arg)
 
 
         /* TODO: use more vals and threads in vals */
         /* TODO: use more vals and threads in vals */
 	_fmultiple_check_scale_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
 	_fmultiple_check_scale_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }
 
 
 static __global__ void _fmultiple_check_cuda(int *val, int nx, int ny, unsigned ld, int start, int factor)
 static __global__ void _fmultiple_check_cuda(int *val, int nx, int ny, unsigned ld, int start, int factor)
@@ -71,4 +73,6 @@ extern "C" void fmultiple_check_cuda(void *buffers[], void *cl_arg)
 
 
         /* TODO: use more vals and threads in vals */
         /* TODO: use more vals and threads in vals */
 	_fmultiple_check_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
 	_fmultiple_check_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 2 - 0
examples/incrementer/incrementer_kernels.cu

@@ -32,4 +32,6 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 2 - 0
examples/interface/complex_kernels.cu

@@ -44,4 +44,6 @@ extern "C" void copy_complex_codelet_cuda(void *descr[], void *_args)
 	unsigned nblocks = (nx + threads_per_block-1) / threads_per_block;
 	unsigned nblocks = (nx + threads_per_block-1) / threads_per_block;
 
 
         complex_copy_cuda<<<nblocks, threads_per_block, 0, starpu_cuda_get_local_stream()>>>(o_real, o_imaginary, i_real, i_imaginary, nx);
         complex_copy_cuda<<<nblocks, threads_per_block, 0, starpu_cuda_get_local_stream()>>>(o_real, o_imaginary, i_real, i_imaginary, nx);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 1 - 0
examples/mult/double.h

@@ -15,6 +15,7 @@
  */
  */
 
 
 #define TYPE	double
 #define TYPE	double
+#define EPSILON	0.000000000001
 
 
 #define CUBLAS_GEMM cublasDgemm
 #define CUBLAS_GEMM cublasDgemm
 #define CPU_GEMM	STARPU_DGEMM
 #define CPU_GEMM	STARPU_DGEMM

+ 1 - 0
examples/mult/simple.h

@@ -15,6 +15,7 @@
  */
  */
 
 
 #define TYPE	float
 #define TYPE	float
+#define EPSILON	0.000001
 
 
 #define CUBLAS_GEMM cublasSgemm
 #define CUBLAS_GEMM cublasSgemm
 #define CPU_GEMM	STARPU_SGEMM
 #define CPU_GEMM	STARPU_SGEMM

+ 1 - 1
examples/mult/xgemm.c

@@ -75,7 +75,7 @@ static int check_output(void)
 	TYPE err;
 	TYPE err;
 	err = CPU_ASUM(xdim*ydim, C, 1);
 	err = CPU_ASUM(xdim*ydim, C, 1);
 
 
-	if (err < xdim*ydim*0.001)
+	if (err < EPSILON*xdim*ydim*zdim)
 	{
 	{
 		FPRINTF(stderr, "Results are OK\n");
 		FPRINTF(stderr, "Results are OK\n");
 		return 0;
 		return 0;

+ 2 - 0
examples/pi/SobolQRNG/sobol_gpu.cu

@@ -165,4 +165,6 @@ extern "C" void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_direct
 
 
     // Execute GPU kernel
     // Execute GPU kernel
     sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
     sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 4 - 0
examples/pi/pi_kernel.cu

@@ -137,12 +137,16 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 	/* each entry of per_block_cnt contains the number of successful shots
 	/* each entry of per_block_cnt contains the number of successful shots
 	 * in the corresponding block. */
 	 * in the corresponding block. */
 	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
 	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 
 
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 
 
 	/* compute the total number of successful shots by adding the elements
 	/* compute the total number of successful shots by adding the elements
 	 * of the per_block_cnt array */
 	 * of the per_block_cnt array */
 	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, cnt);
 	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	if (cures)
 	if (cures)
 		STARPU_CUDA_REPORT_ERROR(cures);
 		STARPU_CUDA_REPORT_ERROR(cures);

+ 4 - 0
examples/pi/pi_redux_kernel.cu

@@ -115,12 +115,16 @@ extern "C" void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned lo
 	/* each entry of per_block_cnt contains the number of successful shots
 	/* each entry of per_block_cnt contains the number of successful shots
 	 * in the corresponding block. */
 	 * in the corresponding block. */
 	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, y, n, per_block_cnt);
 	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, y, n, per_block_cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 
 
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 
 
 	/* compute the total number of successful shots by adding the elements
 	/* compute the total number of successful shots by adding the elements
 	 * of the per_block_cnt array */
 	 * of the per_block_cnt array */
 	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, shot_cnt);
 	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, shot_cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	if (cures)
 	if (cures)
 		STARPU_CUDA_REPORT_ERROR(cures);
 		STARPU_CUDA_REPORT_ERROR(cures);

+ 2 - 0
examples/reductions/dot_product_kernels.cu

@@ -33,4 +33,6 @@ extern "C" void redux_cuda_func(void *descr[], void *_args)
 	DOT_TYPE *dotb = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
 	DOT_TYPE *dotb = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
 
 	cuda_redux<<<1,1, 0, starpu_cuda_get_local_stream()>>>(dota, dotb);
 	cuda_redux<<<1,1, 0, starpu_cuda_get_local_stream()>>>(dota, dotb);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 2 - 0
examples/sched_ctx/axpy_partition_gpu.cu

@@ -73,4 +73,6 @@ extern "C" void cuda_axpy(void *descr[], void *_args)
 	__P_HOSTSETUP(saxpy_partitioned,dim3(dimensions,1,1),dimensions,0,SM_mapping_start,SM_allocation,stream);
 	__P_HOSTSETUP(saxpy_partitioned,dim3(dimensions,1,1),dimensions,0,SM_mapping_start,SM_allocation,stream);
 
 
   	saxpy_partitioned<<<width,dimensions,0,stream>>>(__P_HKARGS,n,a,x,y);
   	saxpy_partitioned<<<width,dimensions,0,stream>>>(__P_HKARGS,n,a,x,y);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 2 - 0
examples/spmv/spmv_cuda.cu

@@ -97,6 +97,8 @@ extern "C" void spmv_kernel_cuda(void *descr[], void *args)
 
 
 	spmv_kernel_3<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>
 	spmv_kernel_3<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>
 		(nnz, nrow, nzval, colind, rowptr, firstentry, vecin, nx_in, vecout, nx_out);
 		(nnz, nrow, nzval, colind, rowptr, firstentry, vecin, nx_in, vecout, nx_out);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }
 
 
 
 

+ 2 - 0
examples/stencil/life_cuda.cu

@@ -73,4 +73,6 @@ extern "C" void cuda_life_update_host(int bz, const TYPE *old, TYPE *newp, int n
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 #endif
 #endif
 	cuda_life_update <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, old, newp, nx, ny, nz, ldy, ldz, iter);
 	cuda_life_update <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, old, newp, nx, ny, nz, ldy, ldz, iter);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 2 - 0
examples/stencil/shadow.cu

@@ -53,4 +53,6 @@ extern "C" void cuda_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 #endif
 #endif
 	cuda_shadow <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, ptr, nx, ny, nz, ldy, ldz, i);
 	cuda_shadow <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, ptr, nx, ny, nz, ldy, ldz, i);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 9 - 0
include/starpu_config.h.in

@@ -187,6 +187,15 @@
 #undef STARPU_NMAXBUFS
 #undef STARPU_NMAXBUFS
 
 
 /**
 /**
+   Define the maximum number of fxt mpi files that can be read when
+   generating traces. The default value is 64, it can be changed by
+   using the configure option \ref enable-fxt-max-files
+   "--enable-fxt-max-files".
+   @ingroup API_MPI_Support
+*/
+#undef STARPU_FXT_MAX_FILES
+
+/**
    Define the maximum number of CPU workers managed by StarPU. The
    Define the maximum number of CPU workers managed by StarPU. The
    default value can be modified at configure by using the option \ref
    default value can be modified at configure by using the option \ref
    enable-maxcpus "--enable-maxcpus".
    enable-maxcpus "--enable-maxcpus".

+ 1 - 2
include/starpu_fxt.h

@@ -20,6 +20,7 @@
 #ifndef __STARPU_FXT_H__
 #ifndef __STARPU_FXT_H__
 #define __STARPU_FXT_H__
 #define __STARPU_FXT_H__
 
 
+#include <starpu_config.h>
 #include <starpu_perfmodel.h>
 #include <starpu_perfmodel.h>
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
@@ -32,8 +33,6 @@ extern "C"
    @{
    @{
 */
 */
 
 
-#define STARPU_FXT_MAX_FILES	64
-
 struct starpu_fxt_codelet_event
 struct starpu_fxt_codelet_event
 {
 {
 	char symbol[256];
 	char symbol[256];

+ 17 - 0
include/starpu_helper.h

@@ -20,6 +20,10 @@
 #include <stdio.h>
 #include <stdio.h>
 #include <starpu.h>
 #include <starpu.h>
 
 
+#ifdef STARPU_HAVE_HWLOC
+#include <hwloc.h>
+#endif
+
 #ifdef __cplusplus
 #ifdef __cplusplus
 extern "C"
 extern "C"
 {
 {
@@ -190,6 +194,19 @@ int starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_ha
 */
 */
 void starpu_display_bindings(void);
 void starpu_display_bindings(void);
 
 
+/**
+   If \c hwloc is used, convert the given \p logical_index of a PU to the OS
+   index of this PU. If \c hwloc is not used, return \p logical_index.
+*/
+int starpu_get_pu_os_index(unsigned logical_index);
+
+#ifdef STARPU_HAVE_HWLOC
+/**
+   Get the hwloc topology used by StarPU. One can use this pointer to get
+   information about topology, but not to change settings related to topology.
+*/
+hwloc_topology_t starpu_get_hwloc_topology(void);
+#endif
 /** @} */
 /** @} */
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus

+ 5 - 0
include/starpu_scheduler.h

@@ -186,6 +186,11 @@ struct starpu_sched_policy
 	*/
 	*/
 	void (*remove_workers)(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
 	void (*remove_workers)(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
 
 
+	/** Whether this scheduling policy does data prefetching, and thus the
+	    core should not try to do it opportunistically.
+	*/
+	int prefetches;
+
 	/**
 	/**
 	   Optional field. Name of the policy.
 	   Optional field. Name of the policy.
 	*/
 	*/

+ 23 - 0
include/starpu_stdlib.h

@@ -239,9 +239,32 @@ void starpu_memory_deallocate(unsigned node, size_t size);
 */
 */
 void starpu_memory_wait_available(unsigned node, size_t size);
 void starpu_memory_wait_available(unsigned node, size_t size);
 
 
+/**
+   Sleep for the given \p nb_sec seconds.
+   In simgrid mode, this only sleeps within virtual time.
+  */
 void starpu_sleep(float nb_sec);
 void starpu_sleep(float nb_sec);
+
+/**
+   Sleep for the given \p nb_micro_sec micro-seconds.
+   In simgrid mode, this only sleeps within virtual time.
+  */
 void starpu_usleep(float nb_micro_sec);
 void starpu_usleep(float nb_micro_sec);
 
 
+/**
+   Account for \p joules J being used.
+   This is support in simgrid mode, to record how much energy was used, and will
+   show up in further call to starpu_energy_used().
+  */
+void starpu_energy_use(float joules);
+
+/**
+   Return the amount of energy having been used in J.
+   This account the amounts passed to starpu_energy_use(), but also the static
+   energy use set by the \ref STARPU_IDLE_POWER environment variable.
+  */
+double starpu_energy_used(void);
+
 /** @} */
 /** @} */
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus

+ 1 - 1
include/starpu_task.h

@@ -563,7 +563,7 @@ struct starpu_codelet
 
 
 	/**
 	/**
 	   Optional pointer to the task energy consumption performance
 	   Optional pointer to the task energy consumption performance
-	   model associated to this codelet. This optional field is
+	   model associated to this codelet (in J). This optional field is
 	   ignored when set to <c>NULL</c> or when its field
 	   ignored when set to <c>NULL</c> or when its field
 	   starpu_perfmodel::symbol is not set. In the case of
 	   starpu_perfmodel::symbol is not set. In the case of
 	   parallel codelets, this has to account for all processing
 	   parallel codelets, this has to account for all processing

+ 2 - 0
julia/examples/mult/gpu_mult.cu

@@ -79,6 +79,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 	gpuMultKernel
 	gpuMultKernel
 		<<< nblocks, THREADS_PER_BLOCK, 0, NULL /*starpu_cuda_get_local_stream()*/
 		<<< nblocks, THREADS_PER_BLOCK, 0, NULL /*starpu_cuda_get_local_stream()*/
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 

+ 2 - 0
julia/examples/old_examples/gpu_mult.cu

@@ -78,6 +78,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 	gpuMultKernel
 	gpuMultKernel
 		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
 		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 

+ 2 - 0
julia/examples/old_examples/mandelbrot/gpu_mandelbrot.cu

@@ -106,6 +106,8 @@ extern "C" void gpu_mandelbrot(void *descr[], void *args)
   nblocks = (nxP * nyP + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
   nblocks = (nxP * nyP + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
 
 
   gpuMandelbrotKernel <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream() >>> (nxP, nyP, ldP, d_subP, *params);
   gpuMandelbrotKernel <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream() >>> (nxP, nyP, ldP, d_subP, *params);
+  cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 2 - 0
julia/examples/old_examples/mandelbrot/gpu_mandelbrot_between.cu

@@ -123,6 +123,8 @@ extern "C" void CUDA_mandelbrot(void** buffers_uwrYFDVe, void* cl_arg_uwrYFDVe)
              ptr_qoUGBRtY, local_height, conv_limit, ptr_A5zD9sJZ, 
              ptr_qoUGBRtY, local_height, conv_limit, ptr_A5zD9sJZ, 
              ld_A5zD9sJZ);
              ld_A5zD9sJZ);
     ;
     ;
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
     cudaStreamSynchronize(starpu_cuda_get_local_stream());
     cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }
 
 

+ 2 - 0
julia/examples/old_examples/mult/gpu_mult.cu

@@ -78,6 +78,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 	gpuMultKernel
 	gpuMultKernel
 		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
 		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 

+ 4 - 0
julia/examples/old_examples/nbody/gpu_nbody.cu

@@ -94,6 +94,8 @@ extern "C" void gpu_nbody(void * descr[], void * args)
   gpuNbodyKernel
   gpuNbodyKernel
     <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
     <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
     >>> (d_P,  d_subA, d_M, nxP, nxA, nxM, ldP, ldA, *params);
     >>> (d_P,  d_subA, d_M, nxP, nxA, nxM, ldP, ldA, *params);
+  cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 
@@ -156,6 +158,8 @@ extern "C" void gpu_nbody2(void * descr[], void *args)
   gpuNbody2Kernel
   gpuNbody2Kernel
     <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
     <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
     >>> (d_subP, d_subV, d_subA, nxP, nxV, nxA, ldP, ldV, ldA, *params);
     >>> (d_subP, d_subV, d_subA, nxP, nxV, nxA, ldP, ldV, ldA, *params);
+  cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 2 - 0
julia/examples/old_examples/nbody/gpu_nbody_between.cu

@@ -161,6 +161,8 @@ extern "C" void CUDA_nbody_updt(void** buffers_gj6UYWT4, void* cl_arg_gj6UYWT4)
              ld_jJ5f8wMA, ptr_piPvdbTs, ld_piPvdbTs, ptr_JBaPgPiT, 
              ld_jJ5f8wMA, ptr_piPvdbTs, ld_piPvdbTs, ptr_JBaPgPiT, 
              ptr_0STm2S4k, ld_0STm2S4k);
              ptr_0STm2S4k, ld_0STm2S4k);
     ;
     ;
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
     cudaStreamSynchronize(starpu_cuda_get_local_stream());
     cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }
 
 

+ 4 - 0
julia/src/compiler/expressions.jl

@@ -335,6 +335,10 @@ function print(io :: IO, expr :: StarpuExprCudaCall ; indent = 0,restrict=false)
 
 
     print(io, ");")
     print(io, ");")
     print_newline(io, indent)
     print_newline(io, indent)
+    print(io, "cudaError_t status = cudaGetLastError();")
+    print_newline(io, indent)
+    print(io, "if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);")
+    print_newline(io, indent)
 
 
 end
 end
 
 

+ 1 - 1
mpi/examples/matrix_decomposition/mpi_cholesky.c

@@ -58,7 +58,7 @@ int main(int argc, char **argv)
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
 	matrix_display(bmat, rank);
 	matrix_display(bmat, rank);
 
 
-	if (check)
+	if (check && rank == 0)
 		dw_cholesky_check_computation(bmat, rank, nodes, &correctness, &flops, 0.001);
 		dw_cholesky_check_computation(bmat, rank, nodes, &correctness, &flops, 0.001);
 #endif
 #endif
 
 

+ 223 - 57
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -68,6 +68,212 @@ static struct starpu_codelet cl22 =
 	.color = 0x00ff00,
 	.color = 0x00ff00,
 };
 };
 
 
+static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int nodes)
+{
+	unsigned k, m, n;
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
+	for (k = 0; k < nblocks; k++)
+	{
+		starpu_iteration_push(k);
+
+		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+				       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+				       STARPU_RW, data_handles[k][k],
+				       0);
+
+		for (m = k+1; m<nblocks; m++)
+		{
+			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+					       STARPU_R, data_handles[k][k],
+					       STARPU_RW, data_handles[m][k],
+					       0);
+
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
+			if (my_distrib(k, k, nodes) == rank)
+				starpu_data_wont_use(data_handles[k][k]);
+
+			for (n = k+1; n<nblocks; n++)
+			{
+				if (n <= m)
+				{
+					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+							       STARPU_R, data_handles[n][k],
+							       STARPU_R, data_handles[m][k],
+							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+							       0);
+				}
+			}
+
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
+			if (my_distrib(m, k, nodes) == rank)
+				starpu_data_wont_use(data_handles[m][k]);
+		}
+		starpu_iteration_pop();
+	}
+}
+
+/* TODO: generated from compiler polyhedral analysis of classical algorithm */
+static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, int nodes)
+{
+	unsigned k, m, n;
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
+	/* Column */
+	for (n = 0; n<nblocks; n++)
+	{
+		starpu_iteration_push(n);
+
+		/* Row */
+		for (m = n; m<nblocks; m++)
+		{
+			for (k = 0; k < n; k++)
+			{
+				/* Accumulate updates from TRSMs */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       0);
+			}
+			k = n;
+			if (m > n)
+			{
+				/* non-diagonal block, solve */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[k][k],
+						       STARPU_RW, data_handles[m][k],
+						       0);
+			}
+			else
+			{
+				/* diagonal block, factorize */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+						       STARPU_RW, data_handles[k][k],
+						       0);
+			}
+		}
+
+		starpu_iteration_pop();
+	}
+
+	/* Submit flushes, StarPU will fit them according to the progress */
+	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
+	for (m = 0; m < nblocks; m++)
+		for (n = 0; n < nblocks ; n++)
+			starpu_data_wont_use(data_handles[m][n]);
+}
+
+/* TODO: generated from compiler polyhedral analysis of classical algorithm */
+static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int rank, int nodes)
+{
+	unsigned a, b, c;
+	unsigned k, m, n;
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
+	/* double-antidiagonal number:
+	 * - a=0 contains (0,0) plus (1,0)
+	 * - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
+	 * - etc.
+	 */
+	for (a = 0; a < nblocks; a++)
+	{
+		starpu_iteration_push(a);
+
+		unsigned bfirst;
+		if (2*a < nblocks)
+			bfirst = 0;
+		else
+			bfirst = 2*a - (nblocks-1);
+
+		/* column within first antidiagonal for a */
+		for (b = bfirst; b <= a; b++)
+		{
+			/* column */
+			n = b;
+			/* row */
+			m = 2*a-b;
+
+			/* Accumulate updates from TRSMs */
+			for (c = 0; c < n; c++)
+			{
+				k = c;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       0);
+			}
+
+			if (b < a)
+			{
+				/* non-diagonal block, solve */
+				k = n;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[k][k],
+						       STARPU_RW, data_handles[m][k],
+						       0);
+			}
+			else
+			{
+				/* diagonal block, factorize */
+				k = a;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+						       STARPU_RW, data_handles[k][k],
+						       0);
+			}
+		}
+
+		/* column within second antidiagonal for a */
+		for (b = bfirst; b <= a; b++)
+		{
+			/* column */
+			n = b;
+			/* row */
+			m = 2*a-b + 1;
+
+			if (m >= nblocks)
+				/* Skip first item when even number of tiles */
+				continue;
+
+			/* Accumulate updates from TRSMs */
+			for (c = 0; c < n; c++)
+			{
+				k = c;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       0);
+			}
+			/* non-diagonal block, solve */
+			k = n;
+			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+					       STARPU_R, data_handles[k][k],
+					       STARPU_RW, data_handles[m][k],
+					       0);
+		}
+
+		starpu_iteration_pop();
+	}
+
+	/* Submit flushes, StarPU will fit them according to the progress */
+	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
+	for (m = 0; m < nblocks; m++)
+		for (n = 0; n < nblocks ; n++)
+			starpu_data_wont_use(data_handles[m][n]);
+}
+
 /*
 /*
  *	code to bootstrap the factorization
  *	code to bootstrap the factorization
  *	and construct the DAG
  *	and construct the DAG
@@ -79,8 +285,6 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	starpu_data_handle_t **data_handles;
 	starpu_data_handle_t **data_handles;
 	unsigned k, m, n;
 	unsigned k, m, n;
 
 
-	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
-
 	/* create all the DAG nodes */
 	/* create all the DAG nodes */
 
 
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
@@ -91,7 +295,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 		for(n = 0; n < nblocks ; n++)
 		for(n = 0; n < nblocks ; n++)
 		{
 		{
 			int mpi_rank = my_distrib(m, n, nodes);
 			int mpi_rank = my_distrib(m, n, nodes);
-			if (mpi_rank == rank)
+			if (mpi_rank == rank || (check && rank == 0))
 			{
 			{
 				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, n, m);
 				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, n, m);
 				starpu_matrix_data_register(&data_handles[m][n], STARPU_MAIN_RAM, (uintptr_t)matA[m][n],
 				starpu_matrix_data_register(&data_handles[m][n], STARPU_MAIN_RAM, (uintptr_t)matA[m][n],
@@ -119,50 +323,16 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	start = starpu_timing_now();
 	start = starpu_timing_now();
 
 
-	for (k = 0; k < nblocks; k++)
+	switch (submission)
 	{
 	{
-		starpu_iteration_push(k);
-
-		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
-				       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
-				       STARPU_RW, data_handles[k][k],
-				       0);
-
-		for (m = k+1; m<nblocks; m++)
-		{
-			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
-					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-					       STARPU_R, data_handles[k][k],
-					       STARPU_RW, data_handles[m][k],
-					       0);
-
-			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
-			if (my_distrib(k, k, nodes) == rank)
-				starpu_data_wont_use(data_handles[k][k]);
-
-			for (n = k+1; n<nblocks; n++)
-			{
-				if (n <= m)
-				{
-					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
-							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-							       STARPU_R, data_handles[n][k],
-							       STARPU_R, data_handles[m][k],
-							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
-							       0);
-				}
-			}
-
-			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
-			if (my_distrib(m, k, nodes) == rank)
-				starpu_data_wont_use(data_handles[m][k]);
-		}
-		starpu_iteration_pop();
+		case TRIANGLES:		run_cholesky(data_handles, rank, nodes); break;
+		case COLUMNS:		run_cholesky_column(data_handles, rank, nodes); break;
+		case ANTIDIAGONALS:	run_cholesky_antidiagonal(data_handles, rank, nodes); break;
+		default: STARPU_ABORT();
 	}
 	}
 
 
 	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
 	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	starpu_mpi_barrier(MPI_COMM_WORLD);
-
 	end = starpu_timing_now();
 	end = starpu_timing_now();
 
 
 	for (m = 0; m < nblocks; m++)
 	for (m = 0; m < nblocks; m++)
@@ -170,7 +340,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 		for(n = 0; n < nblocks ; n++)
 		for(n = 0; n < nblocks ; n++)
 		{
 		{
 			/* Get back data on node 0 for the check */
 			/* Get back data on node 0 for the check */
-			if (check)
+			if (check && data_handles[m][n])
 				starpu_mpi_get_data_on_node(MPI_COMM_WORLD, data_handles[m][n], 0);
 				starpu_mpi_get_data_on_node(MPI_COMM_WORLD, data_handles[m][n], 0);
 
 
 			if (data_handles[m][n])
 			if (data_handles[m][n])
@@ -248,24 +418,20 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 	{
 	{
 		for (m = 0; m < nblocks; m++)
 		for (m = 0; m < nblocks; m++)
 		{
 		{
-			int mpi_rank = my_distrib(m, n, nodes);
-			if (mpi_rank == rank)
+			for (nn = BLOCKSIZE*n ; nn < BLOCKSIZE*(n+1); nn++)
 			{
 			{
-				for (nn = (size/nblocks)*n ; nn < (size/nblocks)*n+(size/nblocks); nn++)
+				for (mm = BLOCKSIZE*m ; mm < BLOCKSIZE*(m+1); mm++)
 				{
 				{
-					for (mm = (size/nblocks)*m ; mm < (size/nblocks)*m+(size/nblocks); mm++)
+					if (nn <= mm)
 					{
 					{
-						if (nn <= mm)
+						float orig = (1.0f/(1.0f+nn+mm)) + ((nn == mm)?1.0f*size:0.0f);
+						float err = fabsf(test_mat[mm +nn*size] - orig) / orig;
+						if (err > epsilon)
 						{
 						{
-							float orig = (1.0f/(1.0f+nn+mm)) + ((nn == mm)?1.0f*size:0.0f);
-							float err = fabsf(test_mat[mm +nn*size] - orig) / orig;
-							if (err > epsilon)
-							{
-								FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.20f != %2.20f (err %2.20f)\n", rank, nn, mm, test_mat[mm +nn*size], orig, err);
-								*correctness = 0;
-								*flops = 0;
-								break;
-							}
+							FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.20f != %2.20f (err %2.20f)\n", rank, nn, mm, test_mat[mm +nn*size], orig, err);
+							*correctness = 0;
+							*flops = 0;
+							break;
 						}
 						}
 					}
 					}
 				}
 				}

+ 12 - 1
mpi/examples/matrix_decomposition/mpi_decomposition_params.c

@@ -43,6 +43,7 @@ unsigned check = 0;
 unsigned display = 0;
 unsigned display = 0;
 int dblockx = -1;
 int dblockx = -1;
 int dblocky = -1;
 int dblocky = -1;
+enum submission submission = TRIANGLES;
 
 
 void parse_args(int argc, char **argv, int nodes)
 void parse_args(int argc, char **argv, int nodes)
 {
 {
@@ -79,6 +80,16 @@ void parse_args(int argc, char **argv, int nodes)
                         nbigblocks = strtol(argv[++i], &argptr, 10);
                         nbigblocks = strtol(argv[++i], &argptr, 10);
                 }
                 }
 
 
+                if (strcmp(argv[i], "-columns") == 0)
+                {
+                        submission = COLUMNS;
+                }
+
+                if (strcmp(argv[i], "-antidiagonals") == 0)
+                {
+                        submission = ANTIDIAGONALS;
+                }
+
                 if (strcmp(argv[i], "-no-prio") == 0)
                 if (strcmp(argv[i], "-no-prio") == 0)
                 {
                 {
                         noprio = 1;
                         noprio = 1;
@@ -96,7 +107,7 @@ void parse_args(int argc, char **argv, int nodes)
 
 
                 if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
                 if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
                 {
                 {
-			printf("usage : %s [-size size] [-nblocks nblocks] [-no-prio] [-display] [-check]\n", argv[0]);
+                        printf("usage : %s [-size size] [-nblocks nblocks] [-columns] [-antidiagonals] [-no-prio] [-display] [-check]\n", argv[0]);
                 }
                 }
         }
         }
 
 

+ 8 - 0
mpi/examples/matrix_decomposition/mpi_decomposition_params.h

@@ -28,6 +28,14 @@ extern unsigned display;
 extern int dblockx;
 extern int dblockx;
 extern int dblocky;
 extern int dblocky;
 
 
+enum submission
+{
+	TRIANGLES,
+	COLUMNS,
+	ANTIDIAGONALS,
+};
+extern enum submission submission;
+
 void parse_args(int argc, char **argv, int nodes);
 void parse_args(int argc, char **argv, int nodes);
 
 
 #endif // __MPI_CHOLESKY_PARAMS_H__
 #endif // __MPI_CHOLESKY_PARAMS_H__

+ 6 - 0
mpi/include/starpu_mpi.h

@@ -132,6 +132,12 @@ int starpu_mpi_world_size(void);
 */
 */
 int starpu_mpi_comm_get_attr(MPI_Comm comm, int keyval, void *attribute_val, int *flag);
 int starpu_mpi_comm_get_attr(MPI_Comm comm, int keyval, void *attribute_val, int *flag);
 
 
+
+/**
+   Get the logical index of the core where the MPI thread is bound.
+*/
+int starpu_mpi_get_thread_cpuid(void);
+
 int starpu_mpi_get_communication_tag(void);
 int starpu_mpi_get_communication_tag(void);
 void starpu_mpi_set_communication_tag(int tag);
 void starpu_mpi_set_communication_tag(int tag);
 
 

+ 2 - 3
mpi/src/starpu_mpi_datatype.c

@@ -26,17 +26,16 @@ struct _starpu_mpi_datatype_funcs
 	UT_hash_handle hh;
 	UT_hash_handle hh;
 };
 };
 
 
-static starpu_pthread_mutex_t _starpu_mpi_datatype_funcs_table_mutex;
+/* We want to allow applications calling starpu_mpi_interface_datatype_register/unregister as constructor/destructor */
+static starpu_pthread_mutex_t _starpu_mpi_datatype_funcs_table_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 static struct _starpu_mpi_datatype_funcs *_starpu_mpi_datatype_funcs_table = NULL;
 static struct _starpu_mpi_datatype_funcs *_starpu_mpi_datatype_funcs_table = NULL;
 
 
 void _starpu_mpi_datatype_init(void)
 void _starpu_mpi_datatype_init(void)
 {
 {
-	STARPU_PTHREAD_MUTEX_INIT(&_starpu_mpi_datatype_funcs_table_mutex, NULL);
 }
 }
 
 
 void _starpu_mpi_datatype_shutdown(void)
 void _starpu_mpi_datatype_shutdown(void)
 {
 {
-	STARPU_PTHREAD_MUTEX_DESTROY(&_starpu_mpi_datatype_funcs_table_mutex);
 }
 }
 
 
 /*
 /*

+ 5 - 0
mpi/src/starpu_mpi_init.c

@@ -336,3 +336,8 @@ int starpu_mpi_world_rank(void)
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	return rank;
 	return rank;
 }
 }
+
+int starpu_mpi_get_thread_cpuid(void)
+{
+	return _starpu_mpi_thread_cpuid;
+}

+ 2 - 0
mpi/tests/ring_kernel.cu

@@ -27,5 +27,7 @@ extern "C" void increment_cuda(void *descr[], void *_args)
 	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(tokenptr);
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(tokenptr);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 11 - 4
src/Makefile.am

@@ -411,9 +411,16 @@ endif
 # static inline definition
 # static inline definition
 dist-hook:
 dist-hook:
 	failed=0 ; \
 	failed=0 ; \
-	for i in $$( $(GREP) "static inline" $$(find $(srcdir) -name \*.h) | $(SED) -e 's/.*static inline //g' | $(GREP) -v ENAME | $(SED) -e 's/[^(]* \(\|\*\)\([^ (]*\)(.*/\2/' | $(GREP) -v _starpu_spin_init) ; do \
-		for j in $(shell find . -name \*.o) ; do \
-			nm $$j | $(GREP) "U $$i$$" && { echo $$j ; failed=1 ; } ; \
-		done ; \
+	look=""; \
+	for i in $$( $(GREP) "static inline" $$(find $(srcdir) -name \*.h) | $(SED) -e 's/.*static inline //g' | $(GREP) -v ENAME\#\# | $(SED) -n -e 's/[^(]* \(\|\*\)\([^ (]*\)(.*/\2/' -e 'p;s/^_*//;p' | $(GREP) -v _starpu_spin_init | $(GREP) -v starpu_sched_ctx_worker_is_master_for_child_ctx) ; do \
+		if [ -z "$$look" ] ; then \
+			look="$$i" ; \
+		else \
+			look="$$look\|$$i" ; \
+		fi ; \
+	done ; \
+	echo "$$look" ; \
+	for j in $(shell find . -name \*.o) ; do \
+		nm $$j | $(GREP) -e "U \($$look\)$$" && { echo $$j ; failed=1 ; } ; \
 	done ; \
 	done ; \
 	[ $$failed == 0 ]
 	[ $$failed == 0 ]

+ 3 - 0
src/core/jobs.c

@@ -24,10 +24,12 @@
 #include <common/config.h>
 #include <common/config.h>
 #include <common/utils.h>
 #include <common/utils.h>
 #include <common/graph.h>
 #include <common/graph.h>
+#include <datawizard/memory_nodes.h>
 #include <profiling/profiling.h>
 #include <profiling/profiling.h>
 #include <profiling/bound.h>
 #include <profiling/bound.h>
 #include <core/debug.h>
 #include <core/debug.h>
 #include <limits.h>
 #include <limits.h>
+#include <core/workers.h>
 
 
 static int max_memory_use;
 static int max_memory_use;
 static unsigned long njobs, maxnjobs;
 static unsigned long njobs, maxnjobs;
@@ -483,6 +485,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 		 * also the callback were executed. */
 		 * also the callback were executed. */
 		j->terminated = 2;
 		j->terminated = 2;
 	}
 	}
+	task->prefetched = 0;
 	STARPU_PTHREAD_COND_BROADCAST(&j->sync_cond);
 	STARPU_PTHREAD_COND_BROADCAST(&j->sync_cond);
 	STARPU_AYU_REMOVETASK(j->job_id);
 	STARPU_AYU_REMOVETASK(j->job_id);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);

+ 5 - 4
src/core/perfmodel/perfmodel_bus.c

@@ -37,6 +37,7 @@
 #include <core/topology.h>
 #include <core/topology.h>
 #include <common/utils.h>
 #include <common/utils.h>
 #include <drivers/mpi/driver_mpi_common.h>
 #include <drivers/mpi/driver_mpi_common.h>
+#include <datawizard/memory_nodes.h>
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 #include <starpu_opencl.h>
 #include <starpu_opencl.h>
@@ -188,7 +189,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 	/* Allocate a buffer on the device */
 	/* Allocate a buffer on the device */
 	unsigned char *d_buffer;
 	unsigned char *d_buffer;
 	cures = cudaMalloc((void **)&d_buffer, size);
 	cures = cudaMalloc((void **)&d_buffer, size);
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
@@ -217,7 +218,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 		cudaHostRegister((void *)h_buffer, size, 0);
 		cudaHostRegister((void *)h_buffer, size, 0);
 	}
 	}
 
 
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
@@ -342,7 +343,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	/* Allocate a buffer on the device */
 	/* Allocate a buffer on the device */
 	unsigned char *s_buffer;
 	unsigned char *s_buffer;
 	cures = cudaMalloc((void **)&s_buffer, size);
 	cures = cudaMalloc((void **)&s_buffer, size);
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 	cudaMemset(s_buffer, 0, size);
 	cudaMemset(s_buffer, 0, size);
 	cudaDeviceSynchronize();
 	cudaDeviceSynchronize();
 
 
@@ -368,7 +369,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	/* Allocate a buffer on the device */
 	/* Allocate a buffer on the device */
 	unsigned char *d_buffer;
 	unsigned char *d_buffer;
 	cures = cudaMalloc((void **)&d_buffer, size);
 	cures = cudaMalloc((void **)&d_buffer, size);
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 	cudaMemset(d_buffer, 0, size);
 	cudaMemset(d_buffer, 0, size);
 	cudaDeviceSynchronize();
 	cudaDeviceSynchronize();
 
 

+ 1 - 0
src/core/perfmodel/perfmodel_print.c

@@ -19,6 +19,7 @@
 #include <starpu.h>
 #include <starpu.h>
 #include <starpu_perfmodel.h>
 #include <starpu_perfmodel.h>
 #include <common/config.h>
 #include <common/config.h>
+#include <core/workers.h>
 #include "perfmodel.h"
 #include "perfmodel.h"
 
 
 static
 static

+ 1 - 0
src/core/sched_ctx.c

@@ -21,6 +21,7 @@
 #include <common/utils.h>
 #include <common/utils.h>
 #include <stdarg.h>
 #include <stdarg.h>
 #include <core/task.h>
 #include <core/task.h>
+#include <core/workers.h>
 
 
 enum _starpu_ctx_change_op
 enum _starpu_ctx_change_op
 {
 {

+ 20 - 20
src/core/sched_policy.c

@@ -22,6 +22,7 @@
 #include <common/utils.h>
 #include <common/utils.h>
 #include <core/sched_policy.h>
 #include <core/sched_policy.h>
 #include <profiling/profiling.h>
 #include <profiling/profiling.h>
+#include <datawizard/memory_nodes.h>
 #include <common/barrier.h>
 #include <common/barrier.h>
 #include <core/debug.h>
 #include <core/debug.h>
 #include <core/task.h>
 #include <core/task.h>
@@ -569,32 +570,12 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 	int ret = 0;
 	int ret = 0;
 	if (STARPU_UNLIKELY(task->execute_on_a_specific_worker))
 	if (STARPU_UNLIKELY(task->execute_on_a_specific_worker))
 	{
 	{
-		if (starpu_get_prefetch_flag())
-			starpu_prefetch_task_input_for(task, task->workerid);
-
 		ret = _starpu_push_task_on_specific_worker(task, task->workerid);
 		ret = _starpu_push_task_on_specific_worker(task, task->workerid);
 	}
 	}
 	else
 	else
 	{
 	{
 		struct _starpu_machine_config *config = _starpu_get_machine_config();
 		struct _starpu_machine_config *config = _starpu_get_machine_config();
 
 
-		/* When a task can only be executed on a given arch and we have
-		 * only one memory node for that arch, we can systematically
-		 * prefetch before the scheduling decision. */
-		if (starpu_get_prefetch_flag() && starpu_memory_nodes_get_count() > 1)
-		{
-			if (task->where == STARPU_CPU && config->cpus_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->cpus_nodeid);
-			else if (task->where == STARPU_CUDA && config->cuda_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->cuda_nodeid);
-                        else if (task->cl->where == STARPU_FPGA && config->fpga_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->fpga_nodeid);
-			else if (task->where == STARPU_OPENCL && config->opencl_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->opencl_nodeid);
-			else if (task->where == STARPU_MIC && config->mic_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->mic_nodeid);
-		}
-
 		if(!sched_ctx->sched_policy)
 		if(!sched_ctx->sched_policy)
 		{
 		{
 			/* Note: we have to call that early, or else the task may have
 			/* Note: we have to call that early, or else the task may have
@@ -637,6 +618,25 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 		}
 		}
 		else
 		else
 		{
 		{
+			/* When a task can only be executed on a given arch and we have
+			 * only one memory node for that arch, we can systematically
+			 * prefetch before the scheduling decision. */
+			if (!sched_ctx->sched_policy->prefetches
+				&& starpu_get_prefetch_flag()
+				&& starpu_memory_nodes_get_count() > 1)
+			{
+				if (task->where == STARPU_CPU && config->cpus_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->cpus_nodeid);
+				else if (task->where == STARPU_CUDA && config->cuda_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->cuda_nodeid);
+				else if (task->cl->where == STARPU_FPGA && config->fpga_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->fpga_nodeid);
+				else if (task->where == STARPU_OPENCL && config->opencl_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->opencl_nodeid);
+				else if (task->where == STARPU_MIC && config->mic_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->mic_nodeid);
+			}
+
 			STARPU_ASSERT(sched_ctx->sched_policy->push_task);
 			STARPU_ASSERT(sched_ctx->sched_policy->push_task);
 			/* check out if there are any workers in the context */
 			/* check out if there are any workers in the context */
 			unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);
 			unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);

+ 24 - 2
src/core/simgrid.c

@@ -58,6 +58,8 @@ extern int _starpu_mpi_simgrid_init(int argc, char *argv[]);
 extern void smpi_process_set_user_data(void *);
 extern void smpi_process_set_user_data(void *);
 #endif
 #endif
 
 
+static double _starpu_simgrid_dynamic_energy = 0.0;
+
 /* 1 when MSG_init was done, 2 when initialized through redirected main, 3 when
 /* 1 when MSG_init was done, 2 when initialized through redirected main, 3 when
  * initialized through MSG_process_attach */
  * initialized through MSG_process_attach */
 static int simgrid_started;
 static int simgrid_started;
@@ -629,6 +631,7 @@ struct task
 #else
 #else
 	msg_task_t task;
 	msg_task_t task;
 #endif
 #endif
+	double energy;
 
 
 	/* communication termination signalization */
 	/* communication termination signalization */
 	unsigned *finished;
 	unsigned *finished;
@@ -666,6 +669,7 @@ static void *task_execute(void *arg)
 		MSG_task_execute(task->task);
 		MSG_task_execute(task->task);
 		MSG_task_destroy(task->task);
 		MSG_task_destroy(task->task);
 #endif
 #endif
+		starpu_energy_use(task->energy);
 		_STARPU_DEBUG("task %p finished\n", task);
 		_STARPU_DEBUG("task %p finished\n", task);
 
 
 		*task->finished = 1;
 		*task->finished = 1;
@@ -702,7 +706,7 @@ void _starpu_simgrid_wait_tasks(int workerid)
 }
 }
 
 
 /* Task execution submitted by StarPU */
 /* Task execution submitted by StarPU */
-void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length, unsigned *finished)
+void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length, double energy, unsigned *finished)
 {
 {
 	struct starpu_task *starpu_task = j->task;
 	struct starpu_task *starpu_task = j->task;
 	double flops;
 	double flops;
@@ -717,13 +721,19 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 
 
 	if (isnan(length))
 	if (isnan(length))
 	{
 	{
-		length = starpu_task_expected_length(starpu_task, perf_arch, j->nimpl);
+		length = starpu_task_worker_expected_length(starpu_task, workerid, sched_ctx_id, j->nimpl);
 		STARPU_ASSERT_MSG(!_STARPU_IS_ZERO(length) && !isnan(length),
 		STARPU_ASSERT_MSG(!_STARPU_IS_ZERO(length) && !isnan(length),
 				  "Codelet %s does not have a perfmodel (in directory %s), or is not calibrated enough, please re-run in non-simgrid mode until it is calibrated",
 				  "Codelet %s does not have a perfmodel (in directory %s), or is not calibrated enough, please re-run in non-simgrid mode until it is calibrated",
 				  _starpu_job_get_model_name(j), _starpu_get_perf_model_dir_codelet());
 				  _starpu_job_get_model_name(j), _starpu_get_perf_model_dir_codelet());
                 /* TODO: option to add variance according to performance model,
                 /* TODO: option to add variance according to performance model,
                  * to be able to easily check scheduling robustness */
                  * to be able to easily check scheduling robustness */
 	}
 	}
+	if (isnan(energy))
+	{
+		energy = starpu_task_worker_expected_energy(starpu_task, workerid, sched_ctx_id, j->nimpl);
+		/* TODO: option to add variance according to performance model,
+		 * to be able to easily check scheduling robustness */
+	}
 
 
 #if defined(HAVE_SG_HOST_SPEED) || defined(sg_host_speed)
 #if defined(HAVE_SG_HOST_SPEED) || defined(sg_host_speed)
 #  if defined(HAVE_SG_HOST_SELF) || defined(sg_host_self)
 #  if defined(HAVE_SG_HOST_SELF) || defined(sg_host_self)
@@ -754,6 +764,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 		MSG_task_execute(simgrid_task);
 		MSG_task_execute(simgrid_task);
 		MSG_task_destroy(simgrid_task);
 		MSG_task_destroy(simgrid_task);
 #endif
 #endif
+		starpu_energy_use(energy);
 	}
 	}
 	else
 	else
 	{
 	{
@@ -766,6 +777,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 #else
 #else
 		task->task = simgrid_task;
 		task->task = simgrid_task;
 #endif
 #endif
+		task->energy = energy;
 		task->finished = finished;
 		task->finished = finished;
 		*finished = 0;
 		*finished = 0;
 		task->next = NULL;
 		task->next = NULL;
@@ -1391,5 +1403,15 @@ void _starpu_simgrid_data_transfer(size_t size, unsigned src_node, unsigned dst_
 }
 }
 #endif
 #endif
 
 
+void starpu_energy_use(float joules)
+{
+	_starpu_simgrid_dynamic_energy += joules;
+}
+
+double starpu_energy_used(void)
+{
+	float idle_power = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0);
+	return _starpu_simgrid_dynamic_energy + idle_power * starpu_timing_now() / 1000000;
+}
 
 
 #endif
 #endif

+ 1 - 1
src/core/simgrid.h

@@ -66,7 +66,7 @@ void _starpu_simgrid_deinit_late(void);
 void _starpu_simgrid_actor_setup(void);
 void _starpu_simgrid_actor_setup(void);
 void _starpu_simgrid_wait_tasks(int workerid);
 void _starpu_simgrid_wait_tasks(int workerid);
 struct _starpu_job;
 struct _starpu_job;
-void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length, unsigned *finished);
+void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length, double energy, unsigned *finished);
 struct _starpu_data_request;
 struct _starpu_data_request;
 int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node, struct _starpu_data_request *req);
 int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node, struct _starpu_data_request *req);
 union _starpu_async_channel_event;
 union _starpu_async_channel_event;

+ 1 - 0
src/core/task.c

@@ -30,6 +30,7 @@
 #include <common/utils.h>
 #include <common/utils.h>
 #include <common/fxt.h>
 #include <common/fxt.h>
 #include <common/knobs.h>
 #include <common/knobs.h>
+#include <datawizard/memory_nodes.h>
 #include <profiling/profiling.h>
 #include <profiling/profiling.h>
 #include <profiling/bound.h>
 #include <profiling/bound.h>
 #include <math.h>
 #include <math.h>

+ 27 - 0
src/core/topology.c

@@ -2086,7 +2086,11 @@ int _starpu_bind_thread_on_cpu(int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid S
 		{
 		{
 			cpu_worker[cpuid] = workerid;
 			cpu_worker[cpuid] = workerid;
 			if (name)
 			if (name)
+			{
+				if (cpu_name[cpuid])
+					free(cpu_name[cpuid]);
 				cpu_name[cpuid] = strdup(name);
 				cpu_name[cpuid] = strdup(name);
+			}
 		}
 		}
 	}
 	}
 
 
@@ -3219,3 +3223,26 @@ void starpu_topology_print(FILE *output)
 		fprintf(output, "\n");
 		fprintf(output, "\n");
 	}
 	}
 }
 }
+
+int starpu_get_pu_os_index(unsigned logical_index)
+{
+#ifdef STARPU_HAVE_HWLOC
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	struct _starpu_machine_topology *topology = &config->topology;
+
+	hwloc_topology_t topo = topology->hwtopology;
+
+	return hwloc_get_obj_by_type(topo, HWLOC_OBJ_PU, logical_index)->os_index;
+#else
+	return logical_index;
+#endif
+}
+
+#ifdef STARPU_HAVE_HWLOC
+hwloc_topology_t starpu_get_hwloc_topology(void)
+{
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+
+	return config->topology.hwtopology;
+}
+#endif

+ 6 - 0
src/core/workers.c

@@ -2692,31 +2692,37 @@ int starpu_worker_get_relax_state(void)
 	return _starpu_worker_get_relax_state();
 	return _starpu_worker_get_relax_state();
 }
 }
 
 
+#undef starpu_worker_lock
 void starpu_worker_lock(int workerid)
 void starpu_worker_lock(int workerid)
 {
 {
 	_starpu_worker_lock(workerid);
 	_starpu_worker_lock(workerid);
 }
 }
 
 
+#undef starpu_worker_trylock
 int starpu_worker_trylock(int workerid)
 int starpu_worker_trylock(int workerid)
 {
 {
 	return _starpu_worker_trylock(workerid);
 	return _starpu_worker_trylock(workerid);
 }
 }
 
 
+#undef starpu_worker_unlock
 void starpu_worker_unlock(int workerid)
 void starpu_worker_unlock(int workerid)
 {
 {
 	_starpu_worker_unlock(workerid);
 	_starpu_worker_unlock(workerid);
 }
 }
 
 
+#undef starpu_worker_lock_self
 void starpu_worker_lock_self(void)
 void starpu_worker_lock_self(void)
 {
 {
 	_starpu_worker_lock_self();
 	_starpu_worker_lock_self();
 }
 }
 
 
+#undef starpu_worker_unlock_self
 void starpu_worker_unlock_self(void)
 void starpu_worker_unlock_self(void)
 {
 {
 	_starpu_worker_unlock_self();
 	_starpu_worker_unlock_self();
 }
 }
 
 
+#undef starpu_wake_worker_relax
 int starpu_wake_worker_relax(int workerid)
 int starpu_wake_worker_relax(int workerid)
 {
 {
 	return _starpu_wake_worker_relax(workerid);
 	return _starpu_wake_worker_relax(workerid);

+ 6 - 0
src/core/workers.h

@@ -1132,6 +1132,7 @@ static inline void _starpu_worker_lock(int workerid)
 		STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
 		STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
 	}
 	}
 }
 }
+#define starpu_worker_lock _starpu_worker_lock
 
 
 static inline int _starpu_worker_trylock(int workerid)
 static inline int _starpu_worker_trylock(int workerid)
 {
 {
@@ -1162,6 +1163,7 @@ static inline int _starpu_worker_trylock(int workerid)
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&cur_worker->sched_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&cur_worker->sched_mutex);
 	return ret;
 	return ret;
 }
 }
+#define starpu_worker_trylock _starpu_worker_trylock
 
 
 static inline void _starpu_worker_unlock(int workerid)
 static inline void _starpu_worker_unlock(int workerid)
 {
 {
@@ -1174,6 +1176,7 @@ static inline void _starpu_worker_unlock(int workerid)
 		starpu_worker_relax_off();
 		starpu_worker_relax_off();
 	}
 	}
 }
 }
+#define starpu_worker_unlock _starpu_worker_unlock
 
 
 static inline void _starpu_worker_lock_self(void)
 static inline void _starpu_worker_lock_self(void)
 {
 {
@@ -1182,6 +1185,7 @@ static inline void _starpu_worker_lock_self(void)
 	STARPU_ASSERT(worker != NULL);
 	STARPU_ASSERT(worker != NULL);
 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
 }
 }
+#define starpu_worker_lock_self _starpu_worker_lock_self
 
 
 static inline void _starpu_worker_unlock_self(void)
 static inline void _starpu_worker_unlock_self(void)
 {
 {
@@ -1190,6 +1194,7 @@ static inline void _starpu_worker_unlock_self(void)
 	STARPU_ASSERT(worker != NULL);
 	STARPU_ASSERT(worker != NULL);
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
 }
 }
+#define starpu_worker_unlock_self _starpu_worker_unlock_self
 
 
 static inline int _starpu_wake_worker_relax(int workerid)
 static inline int _starpu_wake_worker_relax(int workerid)
 {
 {
@@ -1198,6 +1203,7 @@ static inline int _starpu_wake_worker_relax(int workerid)
 	_starpu_worker_unlock(workerid);
 	_starpu_worker_unlock(workerid);
 	return ret;
 	return ret;
 }
 }
+#define starpu_wake_worker_relax _starpu_wake_worker_relax
 
 
 int starpu_wake_worker_relax_light(int workerid);
 int starpu_wake_worker_relax_light(int workerid);
 
 

+ 58 - 17
src/datawizard/coherency.c

@@ -411,7 +411,7 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
 /* handle->lock should be taken. r is returned locked. The node parameter
 /* handle->lock should be taken. r is returned locked. The node parameter
  * indicate either the source of the request, or the destination for a
  * indicate either the source of the request, or the destination for a
  * write-only request. */
  * write-only request. */
-static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, unsigned is_prefetch)
+static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, enum _starpu_is_prefetch is_prefetch)
 {
 {
 	struct _starpu_data_request *r;
 	struct _starpu_data_request *r;
 
 
@@ -474,7 +474,7 @@ static struct _starpu_data_request *_starpu_search_existing_data_request(struct
 
 
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 								  struct _starpu_data_replicate *dst_replicate,
 								  struct _starpu_data_replicate *dst_replicate,
-								  enum starpu_data_access_mode mode, unsigned is_prefetch,
+								  enum starpu_data_access_mode mode, enum _starpu_is_prefetch is_prefetch,
 								  unsigned async,
 								  unsigned async,
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 {
 {
@@ -529,7 +529,13 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 #endif
 #endif
 
 
 			if (dst_replicate->mc)
 			if (dst_replicate->mc)
+			{
+				if (is_prefetch == STARPU_TASK_PREFETCH)
+					/* Make sure it stays there */
+					dst_replicate->mc->nb_tasks_prefetch++;
+
 				_starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
 				_starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
+			}
 		}
 		}
 
 
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
@@ -574,6 +580,9 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
 			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
 			{
 			{
 				_starpu_update_data_state(handle, dst_replicate, mode);
 				_starpu_update_data_state(handle, dst_replicate, mode);
+				if (is_prefetch == STARPU_TASK_PREFETCH)
+					/* Make sure it stays there */
+					dst_replicate->mc->nb_tasks_prefetch++;
 
 
 				_starpu_spin_unlock(&handle->header_lock);
 				_starpu_spin_unlock(&handle->header_lock);
 
 
@@ -652,9 +661,17 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 				STARPU_ASSERT(r->next_req_count <= STARPU_MAXNODES);
 				STARPU_ASSERT(r->next_req_count <= STARPU_MAXNODES);
 			}
 			}
 		}
 		}
-		else if (!write_invalidation)
-			/* The last request will perform the callback after termination */
-			_starpu_data_request_append_callback(r, callback_func, callback_arg);
+		else
+		{
+			if (is_prefetch == STARPU_TASK_PREFETCH)
+				/* Make last request add the prefetch count on the mc to keep the data
+				 * there until the task gets to execute.  */
+				r->nb_tasks_prefetch++;
+
+			if (!write_invalidation)
+				/* The last request will perform the callback after termination */
+				_starpu_data_request_append_callback(r, callback_func, callback_arg);
+		}
 
 
 
 
 		if (reused_requests[hop])
 		if (reused_requests[hop])
@@ -719,7 +736,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 }
 }
 
 
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *dst_replicate,
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *dst_replicate,
-			       enum starpu_data_access_mode mode, unsigned detached, unsigned is_prefetch, unsigned async,
+			       enum starpu_data_access_mode mode, unsigned detached, enum _starpu_is_prefetch is_prefetch, unsigned async,
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 {
 {
         _STARPU_LOG_IN();
         _STARPU_LOG_IN();
@@ -733,7 +750,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 	if (cpt == STARPU_SPIN_MAXTRY)
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
 		_starpu_spin_lock(&handle->header_lock);
 
 
-	if (is_prefetch > 0)
+	if (is_prefetch > STARPU_FETCH)
 	{
 	{
 		unsigned src_node_mask = 0;
 		unsigned src_node_mask = 0;
 
 
@@ -751,6 +768,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 		if (src_node_mask == 0)
 		if (src_node_mask == 0)
 		{
 		{
 			/* no valid copy, nothing to prefetch */
 			/* no valid copy, nothing to prefetch */
+			STARPU_ASSERT_MSG(handle->init_cl, "Could not find a valid copy of the data, and no handle initialization function");
 			_starpu_spin_unlock(&handle->header_lock);
 			_starpu_spin_unlock(&handle->header_lock);
 			return 0;
 			return 0;
 		}
 		}
@@ -789,17 +807,22 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 
 
 static int idle_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
 static int idle_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
 {
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, 2, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_IDLEFETCH, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
 }
 }
 
 
-static int prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+static int task_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
 {
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, 1, 1, NULL, NULL, prio, "prefetch_data_on_node");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_TASK_PREFETCH, 1, NULL, NULL, prio, "task_prefetch_data_on_node");
+}
+
+static int STARPU_ATTRIBUTE_UNUSED prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+{
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_PREFETCH, 1, NULL, NULL, prio, "prefetch_data_on_node");
 }
 }
 
 
 static int fetch_data(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
 static int fetch_data(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
 {
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, 0, 0, NULL, NULL, prio, "fetch_data");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, STARPU_FETCH, 0, NULL, NULL, prio, "fetch_data");
 }
 }
 
 
 uint32_t _starpu_get_data_refcnt(starpu_data_handle_t handle, unsigned node)
 uint32_t _starpu_get_data_refcnt(starpu_data_handle_t handle, unsigned node)
@@ -899,7 +922,7 @@ int starpu_prefetch_task_input_on_node_prio(struct starpu_task *task, unsigned t
 	if (j->discontinuous != 0)
 	if (j->discontinuous != 0)
 		return 0;
 		return 0;
 #endif
 #endif
-	STARPU_ASSERT(!task->prefetched);
+	STARPU_ASSERT_MSG(!task->prefetched, "Prefetching was already requested for this task! Did you set 'prefetches' to 1 in the starpu_sched_policy structure?");
 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned index;
 	unsigned index;
 
 
@@ -918,10 +941,11 @@ int starpu_prefetch_task_input_on_node_prio(struct starpu_task *task, unsigned t
 		int node = _starpu_task_data_get_node_on_node(task, index, target_node);
 		int node = _starpu_task_data_get_node_on_node(task, index, target_node);
 
 
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
-		prefetch_data_on_node(handle, node, replicate, mode, prio);
+		task_prefetch_data_on_node(handle, node, replicate, mode, prio);
 
 
 		_starpu_set_data_requested_flag_if_needed(handle, replicate);
 		_starpu_set_data_requested_flag_if_needed(handle, replicate);
 	}
 	}
+	task->prefetched = 1;
 
 
 	return 0;
 	return 0;
 }
 }
@@ -976,7 +1000,7 @@ int starpu_prefetch_task_input_for_prio(struct starpu_task *task, unsigned worke
 	if (j->discontinuous != 0)
 	if (j->discontinuous != 0)
 		return 0;
 		return 0;
 #endif
 #endif
-	STARPU_ASSERT(!task->prefetched);
+	STARPU_ASSERT_MSG(!task->prefetched, "Prefetching was already requested for this task! Did you set 'prefetches' to 1 in the starpu_sched_policy structure?");
 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned index;
 	unsigned index;
 
 
@@ -995,10 +1019,11 @@ int starpu_prefetch_task_input_for_prio(struct starpu_task *task, unsigned worke
 		int node = _starpu_task_data_get_node_on_worker(task, index, worker);
 		int node = _starpu_task_data_get_node_on_worker(task, index, worker);
 
 
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
-		prefetch_data_on_node(handle, node, replicate, mode, prio);
+		task_prefetch_data_on_node(handle, node, replicate, mode, prio);
 
 
 		_starpu_set_data_requested_flag_if_needed(handle, replicate);
 		_starpu_set_data_requested_flag_if_needed(handle, replicate);
 	}
 	}
+	task->prefetched = 1;
 
 
 	return 0;
 	return 0;
 }
 }
@@ -1140,7 +1165,7 @@ int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, in
 
 
 		if (async)
 		if (async)
 		{
 		{
-			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, 0, 1,
+			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1,
 					_starpu_fetch_task_input_cb, worker, 0, "_starpu_fetch_task_input");
 					_starpu_fetch_task_input_cb, worker, 0, "_starpu_fetch_task_input");
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 			if (_starpu_simgrid_fetching_input_cost())
 			if (_starpu_simgrid_fetching_input_cost())
@@ -1230,7 +1255,23 @@ void _starpu_fetch_task_input_tail(struct starpu_task *task, struct _starpu_job
 		local_replicate = get_replicate(handle, mode, workerid, node);
 		local_replicate = get_replicate(handle, mode, workerid, node);
 		_starpu_spin_lock(&handle->header_lock);
 		_starpu_spin_lock(&handle->header_lock);
 		if (local_replicate->mc)
 		if (local_replicate->mc)
+		{
 			local_replicate->mc->diduse = 1;
 			local_replicate->mc->diduse = 1;
+			if (task->prefetched && local_replicate->initialized &&
+				/* See prefetch conditions in
+				 * starpu_prefetch_task_input_on_node_prio and alike */
+				!(mode & (STARPU_SCRATCH|STARPU_REDUX)) &&
+				(mode & STARPU_R))
+			{
+				/* Allocations or transfer prefetchs should have been done by now and marked
+				 * this mc as needed for us.
+				 * Now that we added a reference for the task, we can relieve that.  */
+				/* Note: the replicate might have been evicted in between, thus not 100% sure
+				 * that our prefetch request is still recorded here.  */
+				if (local_replicate->mc->nb_tasks_prefetch > 0)
+					local_replicate->mc->nb_tasks_prefetch--;
+			}
+		}
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 
 
 		_STARPU_TASK_SET_INTERFACE(task , local_replicate->data_interface, descrs[index].index);
 		_STARPU_TASK_SET_INTERFACE(task , local_replicate->data_interface, descrs[index].index);
@@ -1379,7 +1420,7 @@ void _starpu_fetch_nowhere_task_input(struct _starpu_job *j)
 
 
 		local_replicate = get_replicate(handle, mode, -1, node);
 		local_replicate = get_replicate(handle, mode, -1, node);
 
 
-		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, 0, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
+		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
 	}
 	}
 
 
 	if (profiling && task->profiling_info)
 	if (profiling && task->profiling_info)

+ 2 - 2
src/datawizard/coherency.h

@@ -298,7 +298,7 @@ struct _starpu_data_state
  * async means that _starpu_fetch_data_on_node will wait for completion of the request
  * async means that _starpu_fetch_data_on_node will wait for completion of the request
  */
  */
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate,
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate,
-			       enum starpu_data_access_mode mode, unsigned detached, unsigned is_prefetch, unsigned async,
+			       enum starpu_data_access_mode mode, unsigned detached, enum _starpu_is_prefetch is_prefetch, unsigned async,
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 /** This releases a reference on the handle */
 /** This releases a reference on the handle */
 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
@@ -341,7 +341,7 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
  */
  */
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 								  struct _starpu_data_replicate *dst_replicate,
 								  struct _starpu_data_replicate *dst_replicate,
-								  enum starpu_data_access_mode mode, unsigned is_prefetch,
+								  enum starpu_data_access_mode mode, enum _starpu_is_prefetch is_prefetch,
 								  unsigned async,
 								  unsigned async,
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 
 

+ 2 - 2
src/datawizard/copy_driver.c

@@ -203,7 +203,7 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 									unsigned donotread,
 									unsigned donotread,
 									struct _starpu_data_request *req,
 									struct _starpu_data_request *req,
 									unsigned may_alloc,
 									unsigned may_alloc,
-									unsigned prefetch STARPU_ATTRIBUTE_UNUSED)
+									enum _starpu_is_prefetch prefetch STARPU_ATTRIBUTE_UNUSED)
 {
 {
 	if (!donotread)
 	if (!donotread)
 	{
 	{
@@ -221,7 +221,7 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 			/* We're not supposed to allocate there at the moment */
 			/* We're not supposed to allocate there at the moment */
 			return -ENOMEM;
 			return -ENOMEM;
 
 
-		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, req ? req->prefetch : 0);
+		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, req ? req->prefetch : STARPU_FETCH);
 		if (ret_alloc)
 		if (ret_alloc)
 			return -ENOMEM;
 			return -ENOMEM;
 	}
 	}

+ 10 - 1
src/datawizard/copy_driver.h

@@ -47,6 +47,15 @@ extern "C"
 struct _starpu_data_request;
 struct _starpu_data_request;
 struct _starpu_data_replicate;
 struct _starpu_data_replicate;
 
 
+enum _starpu_is_prefetch
+{
+	STARPU_FETCH = 0,		/* A task really needs it now! */
+	STARPU_TASK_PREFETCH = 1,	/* A task will need it soon */
+	STARPU_PREFETCH = 2,		/* It is a good idea to have it asap */
+	STARPU_IDLEFETCH = 3,		/* Get this here when you have time to */
+	STARPU_NFETCH
+};
+
 #ifdef STARPU_USE_MIC
 #ifdef STARPU_USE_MIC
 /** MIC needs memory_node to know which MIC is concerned.
 /** MIC needs memory_node to know which MIC is concerned.
  * mark is used to wait asynchronous request.
  * mark is used to wait asynchronous request.
@@ -132,7 +141,7 @@ int _starpu_driver_copy_data_1_to_1(starpu_data_handle_t handle,
 				    unsigned donotread,
 				    unsigned donotread,
 				    struct _starpu_data_request *req,
 				    struct _starpu_data_request *req,
 				    unsigned may_alloc,
 				    unsigned may_alloc,
-				    unsigned prefetch);
+				    enum _starpu_is_prefetch prefetch);
 
 
 unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel);
 unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel);
 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel);
 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel);

+ 47 - 29
src/datawizard/data_request.c

@@ -25,8 +25,11 @@
 #include <core/simgrid.h>
 #include <core/simgrid.h>
 
 
 /* requests that have not been treated at all */
 /* requests that have not been treated at all */
+#ifdef STARPU_DEVEL
+#warning split into separate out/in queues for each node, so that MAX_PENDING_REQUESTS_PER_NODE is separate for them, since the links are bidirectionnal
+#endif
 static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES];
 static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES];
-static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES];
+static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES]; /* Contains both task_prefetch and prefetch */
 static struct _starpu_data_request_prio_list idle_requests[STARPU_MAXNODES];
 static struct _starpu_data_request_prio_list idle_requests[STARPU_MAXNODES];
 static starpu_pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES];
 static starpu_pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES];
 
 
@@ -121,7 +124,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 							 int handling_node,
 							 int handling_node,
 							 enum starpu_data_access_mode mode,
 							 enum starpu_data_access_mode mode,
 							 unsigned ndeps,
 							 unsigned ndeps,
-							 unsigned is_prefetch,
+							 enum _starpu_is_prefetch is_prefetch,
 							 int prio,
 							 int prio,
 							 unsigned is_write_invalidation,
 							 unsigned is_write_invalidation,
 							 const char *origin)
 							 const char *origin)
@@ -153,6 +156,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
 	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
 	r->completed = 0;
 	r->completed = 0;
 	r->prefetch = is_prefetch;
 	r->prefetch = is_prefetch;
+	r->nb_tasks_prefetch = 0;
 	r->prio = prio;
 	r->prio = prio;
 	r->retval = -1;
 	r->retval = -1;
 	r->ndeps = ndeps;
 	r->ndeps = ndeps;
@@ -307,9 +311,9 @@ void _starpu_post_data_request(struct _starpu_data_request *r)
 
 
 	/* insert the request in the proper list */
 	/* insert the request in the proper list */
 	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node]);
 	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node]);
-	if (r->prefetch == 2)
+	if (r->prefetch >= STARPU_IDLEFETCH)
 		_starpu_data_request_prio_list_push_back(&idle_requests[handling_node], r);
 		_starpu_data_request_prio_list_push_back(&idle_requests[handling_node], r);
-	else if (r->prefetch)
+	else if (r->prefetch > STARPU_FETCH)
 		_starpu_data_request_prio_list_push_back(&prefetch_requests[handling_node], r);
 		_starpu_data_request_prio_list_push_back(&prefetch_requests[handling_node], r);
 	else
 	else
 		_starpu_data_request_prio_list_push_back(&data_requests[handling_node], r);
 		_starpu_data_request_prio_list_push_back(&data_requests[handling_node], r);
@@ -410,6 +414,10 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 	/* Remove a reference on the destination replicate for the request */
 	/* Remove a reference on the destination replicate for the request */
 	if (dst_replicate)
 	if (dst_replicate)
 	{
 	{
+		if (dst_replicate->mc)
+			/* Make sure it stays there for the task.  */
+			dst_replicate->mc->nb_tasks_prefetch += r->nb_tasks_prefetch;
+
 		STARPU_ASSERT(dst_replicate->refcnt > 0);
 		STARPU_ASSERT(dst_replicate->refcnt > 0);
 		dst_replicate->refcnt--;
 		dst_replicate->refcnt--;
 	}
 	}
@@ -460,7 +468,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 }
 }
 
 
 /* TODO : accounting to see how much time was spent working for other people ... */
 /* TODO : accounting to see how much time was spent working for other people ... */
-static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc, int prefetch)
+static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc, enum _starpu_is_prefetch prefetch)
 {
 {
 	starpu_data_handle_t handle = r->handle;
 	starpu_data_handle_t handle = r->handle;
 
 
@@ -535,7 +543,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	return 0;
 	return 0;
 }
 }
 
 
-static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list *reqlist, unsigned src_node, unsigned may_alloc, unsigned n, unsigned *pushed, unsigned prefetch)
+static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list *reqlist, unsigned src_node, unsigned may_alloc, unsigned n, unsigned *pushed, enum _starpu_is_prefetch prefetch)
 {
 {
 	struct _starpu_data_request *r;
 	struct _starpu_data_request *r;
 	struct _starpu_data_request_prio_list new_data_requests[prefetch + 1]; /* Indexed by prefetch level */
 	struct _starpu_data_request_prio_list new_data_requests[prefetch + 1]; /* Indexed by prefetch level */
@@ -606,7 +614,7 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 			ret = res;
 			ret = res;
 			/* Prefetch requests might have gotten promoted while in tmp list */
 			/* Prefetch requests might have gotten promoted while in tmp list */
 			_starpu_data_request_prio_list_push_back(&new_data_requests[r->prefetch], r);
 			_starpu_data_request_prio_list_push_back(&new_data_requests[r->prefetch], r);
-			if (prefetch)
+			if (prefetch > STARPU_FETCH)
 				/* Prefetching more there would make the situation even worse */
 				/* Prefetching more there would make the situation even worse */
 				break;
 				break;
 		}
 		}
@@ -636,20 +644,25 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 	if (i <= prefetch)
 	if (i <= prefetch)
 	{
 	{
 		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
 		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
-		if (!(_starpu_data_request_prio_list_empty(&new_data_requests[0])))
+		if (!(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_FETCH])))
+		{
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_FETCH], &data_requests[src_node]);
+			data_requests[src_node] = new_data_requests[STARPU_FETCH];
+		}
+		if (prefetch >= STARPU_TASK_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_TASK_PREFETCH])))
 		{
 		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[0], &data_requests[src_node]);
-			data_requests[src_node] = new_data_requests[0];
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_TASK_PREFETCH], &prefetch_requests[src_node]);
+			prefetch_requests[src_node] = new_data_requests[STARPU_TASK_PREFETCH];
 		}
 		}
-		if (prefetch >= 1 && !(_starpu_data_request_prio_list_empty(&new_data_requests[1])))
+		if (prefetch >= STARPU_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_PREFETCH])))
 		{
 		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[1], &prefetch_requests[src_node]);
-			prefetch_requests[src_node] = new_data_requests[1];
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_PREFETCH], &prefetch_requests[src_node]);
+			prefetch_requests[src_node] = new_data_requests[STARPU_PREFETCH];
 		}
 		}
-		if (prefetch >= 2 && !(_starpu_data_request_prio_list_empty(&new_data_requests[2])))
+		if (prefetch >= STARPU_IDLEFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_IDLEFETCH])))
 		{
 		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[2], &idle_requests[src_node]);
-			idle_requests[src_node] = new_data_requests[2];
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_IDLEFETCH], &idle_requests[src_node]);
+			idle_requests[src_node] = new_data_requests[STARPU_IDLEFETCH];
 		}
 		}
 		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
 
 
@@ -675,17 +688,17 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 
 
 int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 {
 {
-	return __starpu_handle_node_data_requests(data_requests, src_node, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, 0);
+	return __starpu_handle_node_data_requests(data_requests, src_node, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, STARPU_FETCH);
 }
 }
 
 
 int _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 int _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 {
 {
-	return __starpu_handle_node_data_requests(prefetch_requests, src_node, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, 1);
+	return __starpu_handle_node_data_requests(prefetch_requests, src_node, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, STARPU_PREFETCH);
 }
 }
 
 
 int _starpu_handle_node_idle_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 int _starpu_handle_node_idle_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 {
 {
-	return __starpu_handle_node_data_requests(idle_requests, src_node, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, 2);
+	return __starpu_handle_node_data_requests(idle_requests, src_node, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, STARPU_IDLEFETCH);
 }
 }
 
 
 static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
 static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
@@ -836,11 +849,15 @@ int _starpu_check_that_no_data_request_is_pending(unsigned node)
 }
 }
 
 
 
 
-void _starpu_update_prefetch_status(struct _starpu_data_request *r, unsigned prefetch)
+void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum _starpu_is_prefetch prefetch)
 {
 {
 	STARPU_ASSERT(r->prefetch > prefetch);
 	STARPU_ASSERT(r->prefetch > prefetch);
 	r->prefetch=prefetch;
 	r->prefetch=prefetch;
 
 
+	if (prefetch >= STARPU_IDLEFETCH)
+		/* No possible actual change */
+		return;
+
 	/* We have to promote chained_request too! */
 	/* We have to promote chained_request too! */
 	unsigned chained_req;
 	unsigned chained_req;
 	for (chained_req = 0; chained_req < r->next_req_count; chained_req++)
 	for (chained_req = 0; chained_req < r->next_req_count; chained_req++)
@@ -852,19 +869,20 @@ void _starpu_update_prefetch_status(struct _starpu_data_request *r, unsigned pre
 
 
 	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[r->handling_node]);
 	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[r->handling_node]);
 
 
+	int found = 1;
+
 	/* The request can be in a different list (handling request or the temp list)
 	/* The request can be in a different list (handling request or the temp list)
-	 * we have to check that it is really in the prefetch list. */
+	 * we have to check that it is really in the prefetch or idle list. */
 	if (_starpu_data_request_prio_list_ismember(&prefetch_requests[r->handling_node], r))
 	if (_starpu_data_request_prio_list_ismember(&prefetch_requests[r->handling_node], r))
-	{
-		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node],r);
-		_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node],r);
-	}
-	/* The request can be in a different list (handling request or the temp list)
-	 * we have to check that it is really in the idle list. */
+		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node], r);
 	else if (_starpu_data_request_prio_list_ismember(&idle_requests[r->handling_node], r))
 	else if (_starpu_data_request_prio_list_ismember(&idle_requests[r->handling_node], r))
+		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node], r);
+	else
+		found = 0;
+
+	if (found)
 	{
 	{
-		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node],r);
-		if (prefetch == 1)
+		if (prefetch > STARPU_FETCH)
 			_starpu_data_request_prio_list_push_back(&prefetch_requests[r->handling_node],r);
 			_starpu_data_request_prio_list_push_back(&prefetch_requests[r->handling_node],r);
 		else
 		else
 			_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node],r);
 			_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node],r);

+ 7 - 8
src/datawizard/data_request.h

@@ -79,12 +79,11 @@ LIST_TYPE(_starpu_data_request,
 	/** Whether the transfer is completed. */
 	/** Whether the transfer is completed. */
 	unsigned completed;
 	unsigned completed;
 
 
-	/** Whether this is just a prefetch request:
-	 * 0 for fetch,
-	 * 1 for prefetch (dependencies have just been released)
-	 * 2 for idle (a good idea to do it some time, but no hurry at all)
-	 */
-	unsigned prefetch;
+	/** Whether this is just a prefetch request */
+	enum _starpu_is_prefetch prefetch;
+
+	/** Number of tasks which used this as a prefetch */
+	unsigned nb_tasks_prefetch;
 
 
 	/** Priority of the request. Default is 0 */
 	/** Priority of the request. Default is 0 */
 	int prio;
 	int prio;
@@ -151,7 +150,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 							 int handling_node,
 							 int handling_node,
 							 enum starpu_data_access_mode mode,
 							 enum starpu_data_access_mode mode,
 							 unsigned ndeps,
 							 unsigned ndeps,
-							 unsigned is_prefetch,
+							 enum _starpu_is_prefetch is_prefetch,
 							 int prio,
 							 int prio,
 							 unsigned is_write_invalidation,
 							 unsigned is_write_invalidation,
 							 const char *origin) STARPU_ATTRIBUTE_MALLOC;
 							 const char *origin) STARPU_ATTRIBUTE_MALLOC;
@@ -162,5 +161,5 @@ void _starpu_data_request_append_callback(struct _starpu_data_request *r,
 					  void (*callback_func)(void *),
 					  void (*callback_func)(void *),
 					  void *callback_arg);
 					  void *callback_arg);
 
 
-void _starpu_update_prefetch_status(struct _starpu_data_request *r, unsigned prefetch);
+void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum _starpu_is_prefetch prefetch);
 #endif // __DATA_REQUEST_H__
 #endif // __DATA_REQUEST_H__

+ 2 - 1
src/datawizard/filters.c

@@ -21,6 +21,7 @@
 #include <datawizard/filters.h>
 #include <datawizard/filters.h>
 #include <datawizard/footprint.h>
 #include <datawizard/footprint.h>
 #include <datawizard/interfaces/data_interface.h>
 #include <datawizard/interfaces/data_interface.h>
+#include <datawizard/memory_nodes.h>
 #include <core/task.h>
 #include <core/task.h>
 
 
 /*
 /*
@@ -192,7 +193,7 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		int home_node = initial_handle->home_node;
 		int home_node = initial_handle->home_node;
 		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
 		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
 			home_node = STARPU_MAIN_RAM;
 			home_node = STARPU_MAIN_RAM;
-		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], 0);
+		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], STARPU_FETCH);
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL
 #warning we should reclaim memory if allocation failed
 #warning we should reclaim memory if allocation failed
 #endif
 #endif

+ 3 - 0
src/datawizard/interfaces/bcsr_interface.c

@@ -15,6 +15,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 /*
 /*
  * BCSR : blocked CSR, we use blocks of size (r x c)
  * BCSR : blocked CSR, we use blocks of size (r x c)

+ 3 - 0
src/datawizard/interfaces/block_interface.c

@@ -15,6 +15,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 

+ 3 - 0
src/datawizard/interfaces/coo_interface.c

@@ -15,6 +15,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 static int
 static int
 copy_any_to_any(void *src_interface, unsigned src_node,
 copy_any_to_any(void *src_interface, unsigned src_node,

+ 3 - 0
src/datawizard/interfaces/csr_interface.c

@@ -16,6 +16,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 

+ 1 - 1
src/datawizard/interfaces/data_interface.c

@@ -693,7 +693,7 @@ void _starpu_check_if_valid_and_fetch_data_on_node(starpu_data_handle_t handle,
 	}
 	}
 	if (valid)
 	if (valid)
 	{
 	{
-		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, 0, 0, NULL, NULL, 0, origin);
+		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, STARPU_FETCH, 0, NULL, NULL, 0, origin);
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, handle->home_node, replicate);
 		_starpu_release_data_on_node(handle, handle->home_node, replicate);
 	}
 	}

+ 3 - 0
src/datawizard/interfaces/matrix_interface.c

@@ -15,6 +15,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 

+ 3 - 0
src/datawizard/interfaces/multiformat_interface.c

@@ -15,6 +15,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA

+ 3 - 0
src/datawizard/interfaces/tensor_interface.c

@@ -15,6 +15,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 

+ 3 - 0
src/datawizard/interfaces/variable_interface.c

@@ -15,6 +15,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 

+ 3 - 0
src/datawizard/interfaces/vector_interface.c

@@ -15,6 +15,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 

+ 3 - 0
src/datawizard/interfaces/void_interface.c

@@ -15,6 +15,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 static int dummy_copy(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int dummy_copy(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 

+ 23 - 18
src/datawizard/memalloc.c

@@ -322,7 +322,7 @@ static int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT transfer_subtree_to_node(starpu_d
 		{
 		{
 			/* This is the only copy, push it to destination */
 			/* This is the only copy, push it to destination */
 			struct _starpu_data_request *r;
 			struct _starpu_data_request *r;
-			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, 0, 0, NULL, NULL, 0, "transfer_subtree_to_node");
+			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, STARPU_FETCH, 0, NULL, NULL, 0, "transfer_subtree_to_node");
 			/* There is no way we don't need a request, since
 			/* There is no way we don't need a request, since
 			 * source is OWNER, destination can't be having it */
 			 * source is OWNER, destination can't be having it */
 			STARPU_ASSERT(r);
 			STARPU_ASSERT(r);
@@ -546,7 +546,7 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 /* This function is called for memory chunks that are possibly in used (ie. not
 /* This function is called for memory chunks that are possibly in used (ie. not
  * in the cache). They should therefore still be associated to a handle. */
  * in the cache). They should therefore still be associated to a handle. */
 /* mc_lock is held and may be temporarily released! */
 /* mc_lock is held and may be temporarily released! */
-static size_t try_to_throw_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list)
+static size_t try_to_throw_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list, enum _starpu_is_prefetch is_prefetch)
 {
 {
 	size_t freed = 0;
 	size_t freed = 0;
 
 
@@ -571,6 +571,10 @@ static size_t try_to_throw_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node
 		/* Hasn't been used yet, avoid evicting it */
 		/* Hasn't been used yet, avoid evicting it */
 		return 0;
 		return 0;
 
 
+	if (mc->nb_tasks_prefetch && is_prefetch >= STARPU_TASK_PREFETCH)
+		/* We have not finished executing the tasks this was prefetched for */
+		return 0;
+
 	/* REDUX memchunk */
 	/* REDUX memchunk */
 	if (mc->relaxed_coherency == 2)
 	if (mc->relaxed_coherency == 2)
 	{
 	{
@@ -782,7 +786,7 @@ static int try_to_find_reusable_mc(unsigned node, starpu_data_handle_t data, str
 
 
 /* this function looks for a memory chunk that matches a given footprint in the
 /* this function looks for a memory chunk that matches a given footprint in the
  * list of mem chunk that are not important */
  * list of mem chunk that are not important */
-static int try_to_reuse_not_important_mc(unsigned node, starpu_data_handle_t data, struct _starpu_data_replicate *replicate, uint32_t footprint)
+static int try_to_reuse_not_important_mc(unsigned node, starpu_data_handle_t data, struct _starpu_data_replicate *replicate, uint32_t footprint, enum _starpu_is_prefetch is_prefetch)
 {
 {
 	struct _starpu_mem_chunk *mc, *orig_next_mc, *next_mc;
 	struct _starpu_mem_chunk *mc, *orig_next_mc, *next_mc;
 	int success = 0;
 	int success = 0;
@@ -816,7 +820,7 @@ restart:
 		}
 		}
 
 
 		/* Note: this may unlock mc_list! */
 		/* Note: this may unlock mc_list! */
-		success = try_to_throw_mem_chunk(mc, node, replicate, 1);
+		success = try_to_throw_mem_chunk(mc, node, replicate, 1, is_prefetch);
 
 
 		if (orig_next_mc)
 		if (orig_next_mc)
 		{
 		{
@@ -841,11 +845,14 @@ restart:
  * Try to find a buffer currently in use on the memory node which has the given
  * Try to find a buffer currently in use on the memory node which has the given
  * footprint.
  * footprint.
  */
  */
-static int try_to_reuse_potentially_in_use_mc(unsigned node, starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, uint32_t footprint, int is_prefetch)
+static int try_to_reuse_potentially_in_use_mc(unsigned node, starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, uint32_t footprint, enum _starpu_is_prefetch is_prefetch)
 {
 {
 	struct _starpu_mem_chunk *mc, *next_mc, *orig_next_mc;
 	struct _starpu_mem_chunk *mc, *next_mc, *orig_next_mc;
 	int success = 0;
 	int success = 0;
 
 
+	if (is_prefetch >= STARPU_IDLEFETCH)
+		/* Do not evict a MC just for an idle fetch */
+		return 0;
 	/*
 	/*
 	 * We have to unlock mc_lock before locking header_lock, so we have
 	 * We have to unlock mc_lock before locking header_lock, so we have
 	 * to be careful with the list.  We try to do just one pass, by
 	 * to be careful with the list.  We try to do just one pass, by
@@ -868,14 +875,11 @@ restart:
 		if (mc->remove_notify)
 		if (mc->remove_notify)
 			/* Somebody already working here, skip */
 			/* Somebody already working here, skip */
 			continue;
 			continue;
-		if (is_prefetch > 1)
-			/* Do not evict a MC just for an idle fetch */
-			continue;
-		if (is_prefetch == 1 && !mc->wontuse)
+		if (!mc->wontuse && is_prefetch >= STARPU_PREFETCH)
 			/* Do not evict something that we might reuse, just for a prefetch */
 			/* Do not evict something that we might reuse, just for a prefetch */
-			/* FIXME: but perhaps we won't have any task using it in
-                         * the close future, we should perhaps rather check
-                         * mc->replicate->refcnt? */
+			continue;
+		if (mc->nb_tasks_prefetch && is_prefetch >= STARPU_TASK_PREFETCH)
+			/* Do not evict something that we will reuse, just for a task prefetch */
 			continue;
 			continue;
 		if (mc->footprint != footprint || _starpu_data_interface_compare(handle->per_node[node].data_interface, handle->ops, mc->data->per_node[node].data_interface, mc->ops) != 1)
 		if (mc->footprint != footprint || _starpu_data_interface_compare(handle->per_node[node].data_interface, handle->ops, mc->data->per_node[node].data_interface, mc->ops) != 1)
 			/* Not the right type of interface, skip */
 			/* Not the right type of interface, skip */
@@ -889,7 +893,7 @@ restart:
 		}
 		}
 
 
 		/* Note: this may unlock mc_list! */
 		/* Note: this may unlock mc_list! */
-		success = try_to_throw_mem_chunk(mc, node, replicate, 1);
+		success = try_to_throw_mem_chunk(mc, node, replicate, 1, is_prefetch);
 
 
 		if (orig_next_mc)
 		if (orig_next_mc)
 		{
 		{
@@ -999,7 +1003,7 @@ restart2:
 				next_mc->remove_notify = &next_mc;
 				next_mc->remove_notify = &next_mc;
 			}
 			}
 			/* Note: this may unlock mc_list! */
 			/* Note: this may unlock mc_list! */
-			freed += try_to_throw_mem_chunk(mc, node, NULL, 0);
+			freed += try_to_throw_mem_chunk(mc, node, NULL, 0, STARPU_FETCH);
 
 
 			if (orig_next_mc)
 			if (orig_next_mc)
 			{
 			{
@@ -1218,7 +1222,7 @@ void starpu_memchunk_tidy(unsigned node)
 			}
 			}
 
 
 			_starpu_spin_unlock(&mc_lock[node]);
 			_starpu_spin_unlock(&mc_lock[node]);
-			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, 2, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
+			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, STARPU_IDLEFETCH, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
 			{
 			{
 				/* No request was actually needed??
 				/* No request was actually needed??
 				 * Odd, but cope with it.  */
 				 * Odd, but cope with it.  */
@@ -1317,6 +1321,7 @@ static struct _starpu_mem_chunk *_starpu_memchunk_init(struct _starpu_data_repli
 	mc->size_interface = interface_size;
 	mc->size_interface = interface_size;
 	mc->remove_notify = NULL;
 	mc->remove_notify = NULL;
 	mc->diduse = 0;
 	mc->diduse = 0;
+	mc->nb_tasks_prefetch = 0;
 	mc->wontuse = 0;
 	mc->wontuse = 0;
 
 
 	return mc;
 	return mc;
@@ -1430,7 +1435,7 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
  *
  *
  */
  */
 
 
-static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, unsigned is_prefetch)
+static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, enum _starpu_is_prefetch is_prefetch)
 {
 {
 	unsigned attempts = 0;
 	unsigned attempts = 0;
 	starpu_ssize_t allocated_memory;
 	starpu_ssize_t allocated_memory;
@@ -1514,7 +1519,7 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 			reclaim -= freed;
 			reclaim -= freed;
 
 
 			/* Try to reuse an allocated data with the same interface (to avoid spurious free/alloc) */
 			/* Try to reuse an allocated data with the same interface (to avoid spurious free/alloc) */
-			if (_starpu_has_not_important_data && try_to_reuse_not_important_mc(dst_node, handle, replicate, footprint))
+			if (_starpu_has_not_important_data && try_to_reuse_not_important_mc(dst_node, handle, replicate, footprint, is_prefetch))
 				break;
 				break;
 			if (try_to_reuse_potentially_in_use_mc(dst_node, handle, replicate, footprint, is_prefetch))
 			if (try_to_reuse_potentially_in_use_mc(dst_node, handle, replicate, footprint, is_prefetch))
 			{
 			{
@@ -1596,7 +1601,7 @@ out:
 	return allocated_memory;
 	return allocated_memory;
 }
 }
 
 
-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch)
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum _starpu_is_prefetch is_prefetch)
 {
 {
 	starpu_ssize_t allocated_memory;
 	starpu_ssize_t allocated_memory;
 
 

+ 9 - 1
src/datawizard/memalloc.h

@@ -26,6 +26,7 @@
 #include <datawizard/interfaces/data_interface.h>
 #include <datawizard/interfaces/data_interface.h>
 #include <datawizard/coherency.h>
 #include <datawizard/coherency.h>
 #include <datawizard/copy_driver.h>
 #include <datawizard/copy_driver.h>
+#include <datawizard/data_request.h>
 
 
 struct _starpu_data_replicate;
 struct _starpu_data_replicate;
 
 
@@ -59,10 +60,17 @@ LIST_TYPE(_starpu_mem_chunk,
 	/** Whether the memchunk is in the clean part of the mc_list */
 	/** Whether the memchunk is in the clean part of the mc_list */
 	unsigned clean:1;
 	unsigned clean:1;
 	/** Was this chunk used since it got allocated?  */
 	/** Was this chunk used since it got allocated?  */
+	/* FIXME: probably useless now with nb_tasks_prefetch */
 	unsigned diduse:1;
 	unsigned diduse:1;
 	/** Was this chunk marked as "won't use"? */
 	/** Was this chunk marked as "won't use"? */
 	unsigned wontuse:1;
 	unsigned wontuse:1;
 
 
+	/** The number of prefetches that we made for this mc for various tasks
+	 * This is also the number of tasks that we will wait to see use this mc before
+	 * we attempt to evict it.
+	 */
+	unsigned nb_tasks_prefetch;
+
 	/** the size of the data is only set when calling _starpu_request_mem_chunk_removal(),
 	/** the size of the data is only set when calling _starpu_request_mem_chunk_removal(),
 	 * it is needed to estimate how much memory is in mc_cache, and by
 	 * it is needed to estimate how much memory is in mc_cache, and by
 	 * free_memory_on_node() which is called when the handle is no longer
 	 * free_memory_on_node() which is called when the handle is no longer
@@ -84,7 +92,7 @@ void _starpu_init_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
 void _starpu_mem_chunk_init_last(void);
 void _starpu_mem_chunk_init_last(void);
 void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size);
 void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size);
-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch);
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum _starpu_is_prefetch is_prefetch);
 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
 void _starpu_memchunk_wont_use(struct _starpu_mem_chunk *m, unsigned nodec);
 void _starpu_memchunk_wont_use(struct _starpu_mem_chunk *m, unsigned nodec);

+ 1 - 0
src/datawizard/memory_manager.c

@@ -19,6 +19,7 @@
 #include <common/thread.h>
 #include <common/thread.h>
 #include <common/fxt.h>
 #include <common/fxt.h>
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_manager.h>
+#include <datawizard/memory_nodes.h>
 #include <core/workers.h>
 #include <core/workers.h>
 #include <starpu_stdlib.h>
 #include <starpu_stdlib.h>
 
 

+ 1 - 0
src/datawizard/reduction.c

@@ -22,6 +22,7 @@
 #include <datawizard/datawizard.h>
 #include <datawizard/datawizard.h>
 #include <drivers/mic/driver_mic_source.h>
 #include <drivers/mic/driver_mic_source.h>
 #include <drivers/mp_common/source_common.h>
 #include <drivers/mp_common/source_common.h>
+#include <datawizard/memory_nodes.h>
 
 
 void starpu_data_set_reduction_methods(starpu_data_handle_t handle,
 void starpu_data_set_reduction_methods(starpu_data_handle_t handle,
 				       struct starpu_codelet *redux_cl,
 				       struct starpu_codelet *redux_cl,

+ 7 - 6
src/datawizard/user_interactions.c

@@ -22,6 +22,7 @@
 #include <datawizard/write_back.h>
 #include <datawizard/write_back.h>
 #include <core/dependencies/data_concurrency.h>
 #include <core/dependencies/data_concurrency.h>
 #include <core/sched_policy.h>
 #include <core/sched_policy.h>
+#include <datawizard/memory_nodes.h>
 
 
 static void _starpu_data_check_initialized(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 static void _starpu_data_check_initialized(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
 {
@@ -46,7 +47,7 @@ int starpu_data_request_allocation(starpu_data_handle_t handle, unsigned node)
 
 
 	_starpu_spin_lock(&handle->header_lock);
 	_starpu_spin_lock(&handle->header_lock);
 
 
-	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, 1, 0, 0, "starpu_data_request_allocation");
+	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, STARPU_PREFETCH, 0, 0, "starpu_data_request_allocation");
 
 
 	/* we do not increase the refcnt associated to the request since we are
 	/* we do not increase the refcnt associated to the request since we are
 	 * not waiting for its termination */
 	 * not waiting for its termination */
@@ -67,7 +68,7 @@ struct user_interaction_wrapper
 	starpu_pthread_mutex_t lock;
 	starpu_pthread_mutex_t lock;
 	unsigned finished;
 	unsigned finished;
 	unsigned detached;
 	unsigned detached;
-	unsigned prefetch;
+	enum _starpu_is_prefetch prefetch;
 	unsigned async;
 	unsigned async;
 	int prio;
 	int prio;
 	void (*callback)(void *);
 	void (*callback)(void *);
@@ -535,7 +536,7 @@ static void _prefetch_data_on_node(void *arg)
 }
 }
 
 
 static
 static
-int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigned node, unsigned async, enum starpu_data_access_mode mode, unsigned prefetch, int prio)
+int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigned node, unsigned async, enum starpu_data_access_mode mode, enum _starpu_is_prefetch prefetch, int prio)
 {
 {
 	STARPU_ASSERT(handle);
 	STARPU_ASSERT(handle);
 
 
@@ -595,12 +596,12 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 
 
 int starpu_data_fetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 int starpu_data_fetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 {
 {
-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, 0, 0);
+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_FETCH, 0);
 }
 }
 
 
 int starpu_data_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
 int starpu_data_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
 {
 {
-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, 1, prio);
+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_PREFETCH, prio);
 }
 }
 
 
 int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
@@ -610,7 +611,7 @@ int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, uns
 
 
 int starpu_data_idle_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
 int starpu_data_idle_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
 {
 {
-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, 2, prio);
+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_IDLEFETCH, prio);
 }
 }
 
 
 int starpu_data_idle_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 int starpu_data_idle_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)

+ 2 - 1
src/datawizard/write_back.c

@@ -17,6 +17,7 @@
 #include <datawizard/datawizard.h>
 #include <datawizard/datawizard.h>
 #include <datawizard/write_back.h>
 #include <datawizard/write_back.h>
 #include <core/dependencies/data_concurrency.h>
 #include <core/dependencies/data_concurrency.h>
+#include <datawizard/memory_nodes.h>
 
 
 static void wt_callback(void *arg)
 static void wt_callback(void *arg)
 {
 {
@@ -63,7 +64,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 
 
 				struct _starpu_data_request *r;
 				struct _starpu_data_request *r;
 				r = _starpu_create_request_to_fetch_data(handle, &handle->per_node[node],
 				r = _starpu_create_request_to_fetch_data(handle, &handle->per_node[node],
-									 STARPU_R, 1, 1, wt_callback, handle, 0, "_starpu_write_through_data");
+									 STARPU_R, STARPU_IDLEFETCH, 1, wt_callback, handle, 0, "_starpu_write_through_data");
 
 
 			        /* If no request was created, the handle was already up-to-date on the
 			        /* If no request was created, the handle was already up-to-date on the
 			         * node */
 			         * node */

+ 2 - 2
src/debug/latency.c

@@ -34,7 +34,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 
 
 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
-		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, 0, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
+		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, node0, replicate_0);
 		_starpu_release_data_on_node(handle, node0, replicate_0);
 
 
@@ -44,7 +44,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 
 
 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
-		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, 0, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
+		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, node1, replicate_1);
 		_starpu_release_data_on_node(handle, node1, replicate_1);
 	}
 	}

+ 10 - 8
src/debug/traces/starpu_fxt.c

@@ -20,6 +20,7 @@
 #include <starpu.h>
 #include <starpu.h>
 #include <common/config.h>
 #include <common/config.h>
 #include <common/uthash.h>
 #include <common/uthash.h>
+#include <datawizard/copy_driver.h>
 #include <string.h>
 #include <string.h>
 
 
 #ifdef STARPU_HAVE_POTI
 #ifdef STARPU_HAVE_POTI
@@ -1194,8 +1195,8 @@ static void handle_new_mem_node(struct fxt_ev_64 *ev, struct starpu_fxt_options
  */
  */
 static int create_ordered_stream_id (int nodeid, int devid)
 static int create_ordered_stream_id (int nodeid, int devid)
 {
 {
-	static int stable[MAX_MPI_NODES][STARPU_MAXCUDADEVS];
-	STARPU_ASSERT(nodeid < MAX_MPI_NODES);
+	static int stable[STARPU_FXT_MAX_FILES][STARPU_MAXCUDADEVS];
+	STARPU_ASSERT(nodeid < STARPU_FXT_MAX_FILES);
 	STARPU_ASSERT(devid < STARPU_MAXCUDADEVS);
 	STARPU_ASSERT(devid < STARPU_MAXCUDADEVS);
 	return stable[nodeid][devid]++;
 	return stable[nodeid][devid]++;
 }
 }
@@ -2268,13 +2269,14 @@ static void handle_mpi_data_set_tag(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 	data->mpi_tag = tag;
 	data->mpi_tag = tag;
 }
 }
 
 
-static const char *copy_link_type(unsigned prefetch)
+static const char *copy_link_type(enum _starpu_is_prefetch prefetch)
 {
 {
 	switch (prefetch)
 	switch (prefetch)
 	{
 	{
-		case 0: return "F";
-		case 1: return "PF";
-		case 2: return "IF";
+		case STARPU_FETCH: return "F";
+		case STARPU_TASK_PREFETCH: return "TF";
+		case STARPU_PREFETCH: return "PF";
+		case STARPU_IDLEFETCH: return "IF";
 		default: STARPU_ASSERT(0);
 		default: STARPU_ASSERT(0);
 	}
 	}
 }
 }
@@ -2285,7 +2287,7 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 	unsigned dst = ev->param[1];
 	unsigned dst = ev->param[1];
 	unsigned size = ev->param[2];
 	unsigned size = ev->param[2];
 	unsigned comid = ev->param[3];
 	unsigned comid = ev->param[3];
-	unsigned prefetch = ev->param[4];
+	enum _starpu_is_prefetch prefetch = ev->param[4];
 	unsigned long handle = ev->param[5];
 	unsigned long handle = ev->param[5];
 	const char *link_type = copy_link_type(prefetch);
 	const char *link_type = copy_link_type(prefetch);
 
 
@@ -2367,7 +2369,7 @@ static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 	unsigned dst = ev->param[1];
 	unsigned dst = ev->param[1];
 	unsigned long size = ev->param[2];
 	unsigned long size = ev->param[2];
 	unsigned comid = ev->param[3];
 	unsigned comid = ev->param[3];
-	unsigned prefetch = ev->param[4];
+	enum _starpu_is_prefetch prefetch = ev->param[4];
 	const char *link_type = copy_link_type(prefetch);
 	const char *link_type = copy_link_type(prefetch);
 
 
 	char *prefix = options->file_prefix;
 	char *prefix = options->file_prefix;

+ 0 - 2
src/debug/traces/starpu_fxt.h

@@ -41,8 +41,6 @@
 #include <starpu.h>
 #include <starpu.h>
 #include "../../../include/starpu_fxt.h"
 #include "../../../include/starpu_fxt.h"
 
 
-#define MAX_MPI_NODES 64
-
 extern char _starpu_last_codelet_symbol[STARPU_NMAXWORKERS][(FXT_MAX_PARAMS-5)*sizeof(unsigned long)];
 extern char _starpu_last_codelet_symbol[STARPU_NMAXWORKERS][(FXT_MAX_PARAMS-5)*sizeof(unsigned long)];
 
 
 void _starpu_fxt_dag_init(char *dag_filename);
 void _starpu_fxt_dag_init(char *dag_filename);

+ 19 - 19
src/debug/traces/starpu_fxt_mpi.c

@@ -103,27 +103,27 @@ int _starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *ke
  */
  */
 
 
 /* the list of MPI transfers found in the different traces */
 /* the list of MPI transfers found in the different traces */
-static struct mpi_transfer *mpi_sends[MAX_MPI_NODES] = {NULL};
-static struct mpi_transfer *mpi_recvs[MAX_MPI_NODES] = {NULL};
+static struct mpi_transfer *mpi_sends[STARPU_FXT_MAX_FILES] = {NULL};
+static struct mpi_transfer *mpi_recvs[STARPU_FXT_MAX_FILES] = {NULL};
 
 
 /* number of available slots in the lists  */
 /* number of available slots in the lists  */
-unsigned mpi_sends_list_size[MAX_MPI_NODES] = {0};
-unsigned mpi_recvs_list_size[MAX_MPI_NODES] = {0};
+unsigned mpi_sends_list_size[STARPU_FXT_MAX_FILES] = {0};
+unsigned mpi_recvs_list_size[STARPU_FXT_MAX_FILES] = {0};
 
 
 /* number of slots actually used in the list  */
 /* number of slots actually used in the list  */
-unsigned mpi_sends_used[MAX_MPI_NODES] = {0};
-unsigned mpi_recvs_used[MAX_MPI_NODES] = {0};
+unsigned mpi_sends_used[STARPU_FXT_MAX_FILES] = {0};
+unsigned mpi_recvs_used[STARPU_FXT_MAX_FILES] = {0};
 
 
 /* number of slots already matched at the beginning of the list. This permits
 /* number of slots already matched at the beginning of the list. This permits
  * going through the lists from the beginning to match each and every
  * going through the lists from the beginning to match each and every
  * transfer, thus avoiding a quadratic complexity. */
  * transfer, thus avoiding a quadratic complexity. */
-unsigned mpi_recvs_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
-unsigned mpi_sends_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
+unsigned mpi_recvs_matched[STARPU_FXT_MAX_FILES][STARPU_FXT_MAX_FILES] = { {0} };
+unsigned mpi_sends_matched[STARPU_FXT_MAX_FILES][STARPU_FXT_MAX_FILES] = { {0} };
 
 
 void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED, long mpi_tag, size_t size, float date, long jobid, unsigned long handle)
 void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED, long mpi_tag, size_t size, float date, long jobid, unsigned long handle)
 {
 {
 	STARPU_ASSERT(src >= 0);
 	STARPU_ASSERT(src >= 0);
-	if (src >= MAX_MPI_NODES)
+	if (src >= STARPU_FXT_MAX_FILES)
 		return;
 		return;
 	unsigned slot = mpi_sends_used[src]++;
 	unsigned slot = mpi_sends_used[src]++;
 
 
@@ -153,7 +153,7 @@ void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED,
 
 
 void _starpu_fxt_mpi_add_recv_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, long mpi_tag, float date, long jobid, unsigned long handle)
 void _starpu_fxt_mpi_add_recv_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, long mpi_tag, float date, long jobid, unsigned long handle)
 {
 {
-	if (dst >= MAX_MPI_NODES)
+	if (dst >= STARPU_FXT_MAX_FILES)
 		return;
 		return;
 	unsigned slot = mpi_recvs_used[dst]++;
 	unsigned slot = mpi_recvs_used[dst]++;
 
 
@@ -220,11 +220,11 @@ static unsigned long mpi_com_id = 0;
 
 
 static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comms_file, unsigned n)
 static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comms_file, unsigned n)
 {
 {
-	unsigned slot[MAX_MPI_NODES] = { 0 }, node;
+	unsigned slot[STARPU_FXT_MAX_FILES] = { 0 }, node;
 	unsigned nb_wrong_comm_timing = 0;
 	unsigned nb_wrong_comm_timing = 0;
 	struct mpi_transfer_list pending_receives; /* Sorted list of matches which have not happened yet */
 	struct mpi_transfer_list pending_receives; /* Sorted list of matches which have not happened yet */
-	double current_out_bandwidth[MAX_MPI_NODES] = { 0. };
-	double current_in_bandwidth[MAX_MPI_NODES] = { 0. };
+	double current_out_bandwidth[STARPU_FXT_MAX_FILES] = { 0. };
+	double current_in_bandwidth[STARPU_FXT_MAX_FILES] = { 0. };
 #ifdef STARPU_HAVE_POTI
 #ifdef STARPU_HAVE_POTI
 	char mpi_container[STARPU_POTI_STR_LEN];
 	char mpi_container[STARPU_POTI_STR_LEN];
 #endif
 #endif
@@ -246,7 +246,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 		else
 		else
 			start_date = mpi_transfer_list_front(&pending_receives)->date;
 			start_date = mpi_transfer_list_front(&pending_receives)->date;
 
 
-		src = MAX_MPI_NODES;
+		src = STARPU_FXT_MAX_FILES;
 		for (node = 0; node < n; node++)
 		for (node = 0; node < n; node++)
 		{
 		{
 			if (slot[node] < mpi_sends_used[node] && mpi_sends[node][slot[node]].date < start_date)
 			if (slot[node] < mpi_sends_used[node] && mpi_sends[node][slot[node]].date < start_date)
@@ -260,7 +260,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 			/* No event any more, we're finished! */
 			/* No event any more, we're finished! */
 			break;
 			break;
 
 
-		if (src == MAX_MPI_NODES)
+		if (src == STARPU_FXT_MAX_FILES)
 		{
 		{
 			/* Pending match is earlier than all new sends, finish its communication */
 			/* Pending match is earlier than all new sends, finish its communication */
 			match = mpi_transfer_list_pop_front(&pending_receives);
 			match = mpi_transfer_list_pop_front(&pending_receives);
@@ -284,7 +284,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 		size_t size = cur->size;
 		size_t size = cur->size;
 		unsigned long send_handle = cur->handle;
 		unsigned long send_handle = cur->handle;
 
 
-		if (dst < MAX_MPI_NODES)
+		if (dst < STARPU_FXT_MAX_FILES)
 			match = try_to_match_send_transfer(src, dst, mpi_tag);
 			match = try_to_match_send_transfer(src, dst, mpi_tag);
 		else
 		else
 			match = NULL;
 			match = NULL;
@@ -377,10 +377,10 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
 
 void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks STARPU_ATTRIBUTE_UNUSED, FILE *out_paje_file, FILE* out_comms_file)
 void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks STARPU_ATTRIBUTE_UNUSED, FILE *out_paje_file, FILE* out_comms_file)
 {
 {
-	if (options->ninputfiles > MAX_MPI_NODES)
+	if (options->ninputfiles > STARPU_FXT_MAX_FILES)
 	{
 	{
-		_STARPU_DISP("Warning: %u files given, maximum %u supported, truncating to %u\n", options->ninputfiles, MAX_MPI_NODES, MAX_MPI_NODES);
-		options->ninputfiles = MAX_MPI_NODES;
+		_STARPU_DISP("Warning: %u files given, maximum %u supported, truncating to %u\n", options->ninputfiles, STARPU_FXT_MAX_FILES, STARPU_FXT_MAX_FILES);
+		options->ninputfiles = STARPU_FXT_MAX_FILES;
 	}
 	}
 
 
 	/* display the MPI transfers if possible */
 	/* display the MPI transfers if possible */

+ 2 - 0
src/debug/traces/starpu_paje.c

@@ -398,6 +398,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 	/* Link types */
 	/* Link types */
 	poti_DefineLinkType("MPIL", "MPIP", "MPICt", "MPICt", "MPI communication");
 	poti_DefineLinkType("MPIL", "MPIP", "MPICt", "MPICt", "MPI communication");
 	poti_DefineLinkType("F", "P", "Mm", "Mm", "Intra-node data Fetch");
 	poti_DefineLinkType("F", "P", "Mm", "Mm", "Intra-node data Fetch");
+	poti_DefineLinkType("TF", "P", "Mm", "Mm", "Intra-node data TaskPreFetch");
 	poti_DefineLinkType("PF", "P", "Mm", "Mm", "Intra-node data PreFetch");
 	poti_DefineLinkType("PF", "P", "Mm", "Mm", "Intra-node data PreFetch");
 	poti_DefineLinkType("IF", "P", "Mm", "Mm", "Intra-node data IdleFetch");
 	poti_DefineLinkType("IF", "P", "Mm", "Mm", "Intra-node data IdleFetch");
 	poti_DefineLinkType("WSL", "P", "W", "W", "Work steal");
 	poti_DefineLinkType("WSL", "P", "W", "W", "Work steal");
@@ -551,6 +552,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 6       No       MS     Nothing         \".0 .0 .0\"		\n\
 6       No       MS     Nothing         \".0 .0 .0\"		\n\
 5       MPIL     MPIP	MPICt	MPICt   \"MPI communication\"\n\
 5       MPIL     MPIP	MPICt	MPICt   \"MPI communication\"\n\
 5       F       P	Mm	Mm      \"Intra-node data Fetch\"\n\
 5       F       P	Mm	Mm      \"Intra-node data Fetch\"\n\
+5       TF      P	Mm	Mm      \"Intra-node data TaskPreFetch\"\n\
 5       PF      P	Mm	Mm      \"Intra-node data PreFetch\"\n\
 5       PF      P	Mm	Mm      \"Intra-node data PreFetch\"\n\
 5       IF      P	Mm	Mm      \"Intra-node data IdleFetch\"\n\
 5       IF      P	Mm	Mm      \"Intra-node data IdleFetch\"\n\
 5       WSL     P	W	W       \"Work steal\"\n");
 5       WSL     P	W	W       \"Work steal\"\n");

+ 4 - 1
src/drivers/cpu/driver_cpu.c

@@ -109,7 +109,10 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 				_SIMGRID_TIMER_END;
 				_SIMGRID_TIMER_END;
 			}
 			}
 			else
 			else
-				_starpu_simgrid_submit_job(cpu_args->workerid, j, perf_arch, NAN, NULL);
+			{
+				struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(cpu_args, j);
+				_starpu_simgrid_submit_job(cpu_args->workerid, sched_ctx->id, j, perf_arch, NAN, NAN, NULL);
+			}
 #else
 #else
 #  ifdef STARPU_PAPI
 #  ifdef STARPU_PAPI
 			_starpu_profiling_papi_task_start_counters(task);
 			_starpu_profiling_papi_task_start_counters(task);

+ 29 - 20
src/drivers/cuda/driver_cuda.c

@@ -531,10 +531,13 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 				_SIMGRID_TIMER_END;
 				_SIMGRID_TIMER_END;
 			}
 			}
 		else
 		else
-			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
+		{
+			struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
+			_starpu_simgrid_submit_job(workerid, sched_ctx->id, j, &worker->perf_arch, NAN, NAN,
 				async ? &task_finished[workerid][pipeline_idx] : NULL);
 				async ? &task_finished[workerid][pipeline_idx] : NULL);
+		}
 #else
 #else
-#ifdef HAVE_LIBNVIDIA_ML
+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
 		unsigned long long energy_start = 0;
 		unsigned long long energy_start = 0;
 		nvmlReturn_t nvmlRet = -1;
 		nvmlReturn_t nvmlRet = -1;
 		if (profiling && task->profiling_info)
 		if (profiling && task->profiling_info)
@@ -558,7 +561,7 @@ static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *wor
 	int profiling = starpu_profiling_status_get();
 	int profiling = starpu_profiling_status_get();
 
 
 
 
-#ifdef HAVE_LIBNVIDIA_ML
+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
 	if (profiling && j->task->profiling_info && j->task->profiling_info->energy_consumed)
 	if (profiling && j->task->profiling_info && j->task->profiling_info->energy_consumed)
 	{
 	{
 		unsigned long long energy_end;
 		unsigned long long energy_end;
@@ -880,27 +883,33 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 			_starpu_set_local_worker_key(worker);
 			_starpu_set_local_worker_key(worker);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), worker);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), worker);
 			/* See next task if any */
 			/* See next task if any */
-			if (worker->ntasks && worker->current_tasks[worker->first_task] != worker->task_transferring)
+			if (worker->ntasks)
 			{
 			{
-				task = worker->current_tasks[worker->first_task];
-				j = _starpu_get_job_associated_to_task(task);
-				if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+				if (worker->current_tasks[worker->first_task] != worker->task_transferring)
 				{
 				{
-					/* An asynchronous task, it was already
-					 * queued, it's now running, record its start time.  */
-					_starpu_driver_start_job(worker, j, &worker->perf_arch, 0, starpu_profiling_status_get());
+					task = worker->current_tasks[worker->first_task];
+					j = _starpu_get_job_associated_to_task(task);
+					if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+					{
+						/* An asynchronous task, it was already
+						 * queued, it's now running, record its start time.  */
+						_starpu_driver_start_job(worker, j, &worker->perf_arch, 0, starpu_profiling_status_get());
+					}
+					else
+					{
+						/* A synchronous task, we have finished
+						 * flushing the pipeline, we can now at
+						 * last execute it.  */
+
+						_STARPU_TRACE_EVENT("sync_task");
+						execute_job_on_cuda(task, worker);
+						_STARPU_TRACE_EVENT("end_sync_task");
+						worker->pipeline_stuck = 0;
+					}
 				}
 				}
 				else
 				else
-				{
-					/* A synchronous task, we have finished
-					 * flushing the pipeline, we can now at
-					 * last execute it.  */
-
-					_STARPU_TRACE_EVENT("sync_task");
-					execute_job_on_cuda(task, worker);
-					_STARPU_TRACE_EVENT("end_sync_task");
-					worker->pipeline_stuck = 0;
-				}
+					/* Data for next task didn't have time to finish transferring :/ */
+					_STARPU_TRACE_WORKER_START_FETCH_INPUT(NULL, workerid);
 			}
 			}
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
 			int k;
 			int k;

+ 1 - 0
src/drivers/disk/driver_disk.c

@@ -21,6 +21,7 @@
 #include <drivers/disk/driver_disk.h>
 #include <drivers/disk/driver_disk.h>
 #include <drivers/cpu/driver_cpu.h>
 #include <drivers/cpu/driver_cpu.h>
 #include <datawizard/coherency.h>
 #include <datawizard/coherency.h>
+#include <datawizard/memory_nodes.h>
 
 
 int _starpu_disk_copy_src_to_disk(void * src, unsigned src_node, void * dst, size_t dst_offset, unsigned dst_node, size_t size, void * async_channel)
 int _starpu_disk_copy_src_to_disk(void * src, unsigned src_node, void * dst, size_t dst_offset, unsigned dst_node, size_t size, void * async_channel)
 {
 {

+ 0 - 0
src/drivers/driver_common/driver_common.c


Some files were not shown because too many files changed in this diff