
merge trunk

Corentin Salingue, 12 years ago
commit 021d380faa
59 changed files with 4396 additions and 231 deletions
  1. ChangeLog (+8, -0)
  2. configure.ac (+13, -7)
  3. doc/doxygen/Makefile.am (+12, -2)
  4. doc/doxygen/chapters/advanced_examples.doxy (+16, -1)
  5. doc/doxygen/chapters/api/data_out_of_core.doxy (+1, -0)
  6. doc/doxygen/chapters/api/mpi.doxy (+16, -0)
  7. doc/doxygen/chapters/building.doxy (+5, -2)
  8. doc/doxygen/chapters/environment_variables.doxy (+5, -5)
  9. doc/doxygen/chapters/optimize_performance.doxy (+17, -13)
  10. doc/doxygen/chapters/performance_feedback.doxy (+3, -0)
  11. doc/doxygen/chapters/starpu_non_linear_memset_regression_based.eps (+1138, -0)
  12. doc/doxygen/chapters/starpu_non_linear_memset_regression_based.pdf (BIN)
  13. doc/doxygen/chapters/starpu_non_linear_memset_regression_based.png (BIN)
  14. doc/doxygen/chapters/starpu_starpu_slu_lu_model_11.eps (+1036, -0)
  15. doc/doxygen/chapters/starpu_starpu_slu_lu_model_11.pdf (BIN)
  16. doc/doxygen/chapters/starpu_starpu_slu_lu_model_11.png (BIN)
  17. doc/doxygen/chapters/tasks_size_overhead.eps (+1253, -0)
  18. doc/doxygen/chapters/tasks_size_overhead.pdf (BIN)
  19. doc/doxygen/chapters/tasks_size_overhead.png (BIN)
  20. doc/doxygen/doxygen-config.cfg.in (+1, -0)
  21. doc/doxygen/doxygen.cfg (+2, -1)
  22. doc/texinfo/chapters/configuration.texi (+3, -3)
  23. examples/lu/lu_example.c (+2, -1)
  24. include/starpu_util.h (+2, -2)
  25. mpi/include/starpu_mpi.h (+2, -0)
  26. mpi/src/starpu_mpi.c (+44, -26)
  27. mpi/src/starpu_mpi_private.h (+2, -0)
  28. mpi/tests/Makefile.am (+4, -0)
  29. mpi/tests/datatypes.c (+310, -0)
  30. mpi/tests/helper.h (+1, -1)
  31. mpi/tests/mpi_earlyrecv.c (+4, -7)
  32. mpi/tests/mpi_earlyrecv2.c (+5, -1)
  33. sc_hypervisor/src/hypervisor_policies/debit_lp_policy.c (+72, -45)
  34. sc_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c (+83, -58)
  35. sc_hypervisor/src/policies_utils/policy_tools.c (+7, -2)
  36. sc_hypervisor/src/policies_utils/speed.c (+0, -3)
  37. sc_hypervisor/src/sc_hypervisor.c (+1, -0)
  38. src/Makefile.am (+5, -1)
  39. src/core/disk_ops/disk_stdio.c (+23, -1)
  40. src/core/disk_ops/unistd/disk_unistd_global.c (+41, -9)
  41. src/core/perfmodel/perfmodel_bus.c (+2, -2)
  42. src/core/perfmodel/perfmodel_history.c (+1, -1)
  43. src/core/sched_policy.c (+0, -21)
  44. src/datawizard/coherency.c (+3, -2)
  45. src/datawizard/data_request.c (+4, -0)
  46. src/datawizard/filters.c (+3, -0)
  47. src/datawizard/interfaces/block_interface.c (+65, -0)
  48. src/datawizard/interfaces/matrix_interface.c (+54, -0)
  49. src/datawizard/interfaces/variable_interface.c (+35, -0)
  50. src/datawizard/interfaces/vector_interface.c (+35, -0)
  51. src/drivers/cpu/driver_cpu.c (+3, -0)
  52. src/drivers/cuda/driver_cuda.c (+1, -1)
  53. src/drivers/driver_common/driver_common.c (+27, -0)
  54. src/sched_policies/parallel_eager.c (+1, -3)
  55. tests/datawizard/commute.c (+3, -1)
  56. tests/disk/disk_compute.c (+16, -4)
  57. tests/main/starpu_init.c (+1, -1)
  58. tests/main/tag_task_data_deps.c (+3, -3)
  59. tools/gdbinit (+2, -1)

+ 8 - 0
ChangeLog

@@ -34,6 +34,12 @@ New features:
     let starpu commute write accesses.
   * Out-of-core support, through registration of disk areas as additional memory
     nodes.
+  * StarPU-MPI: new function
+    starpu_mpi_irecv_detached_sequential_consistency which allows
+    enabling or disabling sequential consistency for the given data
+    handle (sequential consistency is enabled or disabled based both
+    on the value of the function parameter and on the sequential
+    consistency setting defined for the given data)
 
 Small features:
   * Add cl_arg_free field to enable automatic free(cl_arg) on task
@@ -48,6 +54,8 @@ Changes:
   * Fix of the livelock issue discovered while executing applications
     on a CPU+GPU cluster of machines by adding a maximum trylock 
     threshold before a blocking lock.
+  * Data interfaces (variable, vector, matrix and block) now define
+    pack and unpack functions
 
 StarPU 1.1.0 (svn revision xxxx)
 ==============================================
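To make the new pack and unpack support concrete, here is a hypothetical C sketch (not part of this changeset) that copies a handle's content through a flat buffer using starpu_data_pack() and starpu_data_unpack(), which dispatch to the per-interface methods listed above; the helper name is invented and the prototypes are assumed from the StarPU API of this period.

#include <starpu.h>

/* Copy the content of 'src' into 'dst' through a packed flat buffer.
 * Both handles are assumed to describe data of the same interface and size. */
static void copy_through_pack(starpu_data_handle_t src, starpu_data_handle_t dst)
{
	void *buffer = NULL;
	starpu_ssize_t count = 0;

	/* Allocate 'buffer' and serialize the content of 'src' into it. */
	starpu_data_pack(src, &buffer, &count);

	/* Deserialize the buffer into 'dst'; as noted in the MPI code further
	 * down in this commit, the buffer is freed by starpu_data_unpack(). */
	starpu_data_unpack(dst, buffer, count);
}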

+ 13 - 7
configure.ac

@@ -134,8 +134,13 @@ case "$target" in
   libext=a
   AC_DEFINE(STARPU_HAVE_WINDOWS, [], [Define this on windows.])
   ;;
+*-*-linux*)
+  starpu_linux=yes
+  AC_DEFINE(STARPU_LINUX_SYS, 1, [Define to 1 on Linux])
+  ;;
 esac
 AM_CONDITIONAL([STARPU_HAVE_WINDOWS], [test "x$starpu_windows" = "xyes"])
+AM_CONDITIONAL([STARPU_LINUX_SYS], [test "x$starpu_linux" = "xyes"])
 
 # on Darwin, GCC targets i386 by default, so we don't have atomic ops
 AC_CHECK_SIZEOF([void *])
@@ -959,13 +964,13 @@ AC_MSG_RESULT($nmaxmicdev)
 AC_DEFINE_UNQUOTED(STARPU_MAXMICDEVS, [$nmaxmicdev],
 	[maximum number of MIC devices])
 
-AC_MSG_CHECKING(maximum number of MIC cores)
-AC_ARG_ENABLE(maxmicdev, [AS_HELP_STRING([--enable-maxmiccore=<number>],
-			[maximum number of MIC cores])],
-			nmaxmiccore=$enableval, nmaxmiccore=128)
-AC_MSG_RESULT($nmaxmiccore)
+AC_MSG_CHECKING(maximum number of MIC threads)
+AC_ARG_ENABLE(maxmicthreads, [AS_HELP_STRING([--enable-maxmicthreads=<number>],
+			[maximum number of MIC threads])],
+			nmaxmicthreads=$enableval, nmaxmicthreads=128)
+AC_MSG_RESULT($nmaxmicthreads)
 
-AC_DEFINE_UNQUOTED(STARPU_MAXMICCORES, [$nmaxmiccore],
+AC_DEFINE_UNQUOTED(STARPU_MAXMICCORES, [$nmaxmicthreads],
 	[maximum number of MIC cores])
 
 AC_ARG_WITH(coi-dir,
@@ -1466,7 +1471,7 @@ AC_CHECK_FUNCS([clock_gettime])
 
 # Compute the maximum number of workers (we round it to 16 for alignment
 # purposes).
-nmaxworkers=`expr 16 \* \( \( $maxcpus + $nmaxcudadev + $nmaxopencldev + $nmaxmiccore + $nmaxsccdev + 15 \) / 16 \) `
+nmaxworkers=`expr 16 \* \( \( $maxcpus + $nmaxcudadev + $nmaxopencldev + $nmaxmicthreads + $nmaxsccdev + 15 \) / 16 \) `
 AC_MSG_CHECKING(Maximum number of workers)
 AC_MSG_RESULT($nmaxworkers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
@@ -2283,6 +2288,7 @@ AC_MSG_NOTICE([
 	Maximum number of CPUs:           $maxcpus
 	Maximum number of CUDA devices:   $nmaxcudadev
 	Maximum number of OpenCL devices: $nmaxopencldev
+	Maximum number of MIC threads:    $nmaxmicthreads
 	Maximum number of memory nodes:   $maxnodes
 	Maximum number of task buffers:   $nmaxbuffers
 

+ 12 - 2
doc/doxygen/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009, 2011  Université de Bordeaux 1
+# Copyright (C) 2009, 2011, 2013  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 #
 # Permission is granted to copy, distribute and/or modify this document
@@ -109,7 +109,7 @@ chapters/version.sty: $(chapters)
 	@if test -s timestamp_updated ; then \
 		echo "\newcommand{\STARPUUPDATED}{"`cat timestamp_updated`"}" > $(top_srcdir)/doc/doxygen/chapters/version.sty;\
 	else \
-		echo "\newcommand{\STARPUUPDATED}{unknown_date}" > $(top_srcdir)/doc/doxygen/chapters/version.sty;\
+		echo "\newcommand{\STARPUUPDATED}{unknown date}" > $(top_srcdir)/doc/doxygen/chapters/version.sty;\
 	fi
 	@echo "\newcommand{\STARPUVERSION}{$(VERSION)}" >> $(top_srcdir)/doc/doxygen/chapters/version.sty
 	@-for f in timestamp timestamp_updated timestamp_updated_month ; do \
@@ -138,6 +138,15 @@ EXTRA_DIST	= 		\
 	$(chapters) 		\
 	chapters/version.sty	\
 	chapters/version.html	\
+	chapters/tasks_size_overhead.png	\
+	chapters/tasks_size_overhead.eps	\
+	chapters/tasks_size_overhead.pdf	\
+	chapters/starpu_non_linear_memset_regression_based.png	\
+	chapters/starpu_non_linear_memset_regression_based.eps	\
+	chapters/starpu_non_linear_memset_regression_based.pdf	\
+	chapters/starpu_starpu_slu_lu_model_11.png	\
+	chapters/starpu_starpu_slu_lu_model_11.eps	\
+	chapters/starpu_starpu_slu_lu_model_11.pdf	\
 	doxygen.cfg 		\
 	refman.tex
 
@@ -194,6 +203,7 @@ dist_pdf_DATA = $(DOX_PDF)
 
 $(DOX_PDF): $(DOX_TAG) refman.tex
 	cp $(top_srcdir)/doc/doxygen/chapters/version.sty $(DOX_LATEX_DIR)
+	cp $(top_srcdir)/doc/doxygen/chapters/*pdf $(DOX_LATEX_DIR)
 	cd $(DOX_LATEX_DIR); \
 	rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \
 	sed -i -e 's/__env__/\\_Environment Variables!/' -e 's/\\-\\_\\-\\-\\_\\-env\\-\\_\\-\\-\\_\\-//' ExecutionConfigurationThroughEnvironmentVariables.tex ;\

+ 16 - 1
doc/doxygen/chapters/advanced_examples.doxy

@@ -390,10 +390,25 @@ starpu_perfmodel::size_base however permits the application to
 override that, when for instance some of the data do not matter for
 task cost (e.g. mere reference table), or when using sparse
 structures (in which case it is the number of non-zeros which matter), or when
-there is some hidden parameter such as the number of iterations, etc.
+there is some hidden parameter such as the number of iterations, or when the application
+actually has a very good idea of the complexity of the algorithm, just not of
+the speed of the processor, etc.
 The example in the directory <c>examples/pi</c> uses this to include
 the number of iterations in the base.
 
+StarPU will automatically determine when the performance model is calibrated,
+or rather, it will assume the performance model is calibrated until the
+application submits a task for which the performance can not be predicted. For
+::STARPU_HISTORY_BASED, StarPU will require 10 (::_STARPU_CALIBRATION_MINIMUM)
+measurements for a given size before considering that their average can be used
+as an estimation for further executions with the same size. For
+::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED, StarPU will require
+10 (::_STARPU_CALIBRATION_MINIMUM) measurements, and that the minimum measured
+data size is smaller than 90% of the maximum measured data size (i.e. the
+measurement interval is large enough for a regression to be meaningful).
+Calibration can also be forced by setting the \ref STARPU_CALIBRATE environment
+variable to <c>1</c>, or even reset by setting it to <c>2</c>.
+
 How to use schedulers which can benefit from such performance model is explained
 in \ref TaskSchedulingPolicy.
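As an illustration of the calibration rules documented in this hunk, here is a minimal, hypothetical C sketch (not part of this commit) attaching a ::STARPU_HISTORY_BASED model to a codelet and forcing calibration through the STARPU_CALIBRATE environment variable; the kernel, symbol name and sizes are invented for the example.

#include <stdlib.h>
#include <string.h>
#include <starpu.h>

/* Invented kernel: clears a vector. */
static void memset_cpu(void *buffers[], void *cl_arg)
{
	(void)cl_arg;
	struct starpu_vector_interface *v = buffers[0];
	memset((void *)STARPU_VECTOR_GET_PTR(v), 0,
	       STARPU_VECTOR_GET_NX(v) * STARPU_VECTOR_GET_ELEMSIZE(v));
}

/* History-based model: StarPU keeps per-footprint measurements and considers
 * the model calibrated once enough samples exist for the submitted sizes. */
static struct starpu_perfmodel memset_model =
{
	.type = STARPU_HISTORY_BASED,
	.symbol = "example_memset"
};

static struct starpu_codelet memset_cl =
{
	.cpu_funcs = { memset_cpu },
	.nbuffers = 1,
	.modes = { STARPU_W },
	.model = &memset_model
};

int main(void)
{
	float data[1024];
	starpu_data_handle_t handle;

	/* Equivalent to "export STARPU_CALIBRATE=1": force calibration. */
	setenv("STARPU_CALIBRATE", "1", 1);

	if (starpu_init(NULL) != 0)
		return 1;

	starpu_vector_data_register(&handle, STARPU_MAIN_RAM,
				    (uintptr_t)data, 1024, sizeof(data[0]));
	starpu_insert_task(&memset_cl, STARPU_W, handle, 0);
	starpu_data_unregister(handle);
	starpu_shutdown();
	return 0;
}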
 

+ 1 - 0
doc/doxygen/chapters/api/data_out_of_core.doxy

@@ -43,5 +43,6 @@ This set uses the unistd library (write, read...) to read/write on disk. <br />
 \ingroup API_Out_Of_Core
 This set uses the unistd library (write, read...) to read/write on disk with the O_DIRECT flag. <br />
 <strong>Warning: It creates one file per allocation !</strong>  <br />
+Only available on Linux.
 
 */
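A hedged usage sketch (not part of this commit) of the disk operation sets documented in this file, assuming the starpu_disk_register() entry point with an (ops, path, size) signature; the path and size are arbitrary and the call is expected to be made after starpu_init().

#include <starpu.h>

/* Register 1 GiB of disk space as an additional memory node, using the
 * unistd O_DIRECT operation set (one file per allocation, Linux only). */
static int register_scratch_disk(void)
{
	int node = starpu_disk_register(&starpu_disk_unistd_o_direct_ops,
					(void *)"/tmp/starpu_ooc",
					(starpu_ssize_t)1024*1024*1024);
	return node; /* memory node identifier, or a negative value on error */
}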

+ 16 - 0
doc/doxygen/chapters/api/mpi.doxy

@@ -98,6 +98,22 @@ communication completes, its resources are automatically released back
 to the system, there is no need to test or to wait for the completion
 of the request.
 
+\fn int starpu_mpi_irecv_detached_sequential_consistency(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg, int sequential_consistency)
+\ingroup API_MPI_Support
+Posts a nonblocking receive in \p data_handle from the node \p source
+using the message tag \p mpi_tag within the communicator \p comm. On
+completion, the \p callback function is called with the argument \p
+arg.
+The parameter \p sequential_consistency allows enabling or disabling
+sequential consistency for \p data_handle (sequential consistency
+will be enabled or disabled based both on the value of the parameter \p
+sequential_consistency and on the sequential consistency setting
+defined for \p data_handle).
+Similarly to the pthread detached functionality, when a detached
+communication completes, its resources are automatically released back
+to the system, there is no need to test or to wait for the completion
+of the request.
+
 \fn int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status)
 \ingroup API_MPI_Support
 Returns when the operation identified by request \p req is complete.
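A minimal usage sketch of starpu_mpi_irecv_detached_sequential_consistency() as documented in this hunk; the handle, source rank, tag and callback are illustrative.

#include <starpu_mpi.h>

static void recv_done(void *arg)
{
	(void)arg;
	/* called once the detached receive has completed */
}

static void post_recv(starpu_data_handle_t handle)
{
	/* Post a detached receive from rank 0 with tag 42; passing 0 as the
	 * last argument disables sequential consistency for this receive,
	 * whatever the setting attached to the handle is. */
	starpu_mpi_irecv_detached_sequential_consistency(handle, 0, 42,
			MPI_COMM_WORLD, recv_done, NULL, 0);
}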

+ 5 - 2
doc/doxygen/chapters/building.doxy

@@ -266,11 +266,14 @@ schedulers, for instance <c>STARPU_SCHED=dmda</c>.
 
 \subsection TaskSizeOverhead Task Size Overhead
 
-This benchmark gives a glimpse into how big a size should be for StarPU overhead
-to be low enough.  Run <c>tasks_size_overhead.sh</c>, it will generate a plot
+This benchmark gives a glimpse into how long a task should be (in µs) for StarPU overhead
+to be low enough to keep efficiency.  Run <c>tasks_size_overhead.sh</c>; it will generate a plot
 of the speedup of tasks of various sizes, depending on the number of CPUs being
 used.
 
+\image html tasks_size_overhead.png
+\image latex tasks_size_overhead.eps "" width=\textwidth
+
 \subsection DataTransferLatency Data Transfer Latency
 
 <c>local_pingpong</c> performs a ping-pong between the first two CUDA nodes, and

+ 5 - 5
doc/doxygen/chapters/environment_variables.doxy

@@ -217,12 +217,12 @@ it is therefore necessary to disable asynchronous data transfers.
 Disable asynchronous copies between CPU and MIC devices.
 </dd>
 
-<dt>STARPU_DISABLE_CUDA_GPU_GPU_DIRECT</dt>
+<dt>STARPU_ENABLE_CUDA_GPU_GPU_DIRECT</dt>
 <dd>
-\anchor STARPU_DISABLE_CUDA_GPU_GPU_DIRECT
-\addindex __env__STARPU_DISABLE_CUDA_GPU_GPU_DIRECT
-Disable direct CUDA transfers from GPU to GPU, and let CUDA copy through RAM
-instead. This permits to test the performance effect of GPU-Direct.
+\anchor STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
+\addindex __env__STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
+Enable direct CUDA transfers from GPU to GPU, without copying through RAM.
+This permits testing the performance effect of GPU-Direct.
 </dd>
 
 </dl>

+ 17 - 13
doc/doxygen/chapters/optimize_performance.doxy

@@ -198,28 +198,32 @@ option lists the available performance models, and the <c>-s</c> option permits
 to choose the performance model to be displayed. The result looks like:
 
 \verbatim
-$ starpu_perfmodel_display -s starpu_dlu_lu_model_22
-performance model for cpu
-# hash    size     mean          dev           n
-880805ba  98304    2.731309e+02  6.010210e+01  1240
-b50b6605  393216   1.469926e+03  1.088828e+02  1240
-5c6c3401  1572864  1.125983e+04  3.265296e+03  1240
+$ starpu_perfmodel_display -s starpu_slu_lu_model_11
+performance model for cpu_impl_0
+# hash    size     flops         mean          dev           n
+914f3bef  1048576  0.000000e+00  2.503577e+04  1.982465e+02  8
+3e921964  65536    0.000000e+00  5.527003e+02  1.848114e+01  7
+e5a07e31  4096     0.000000e+00  1.717457e+01  5.190038e+00  14
+...
 \endverbatim
 
-Which shows that for the LU 22 kernel with a 1.5MiB matrix, the average
-execution time on CPUs was about 11ms, with a 3ms standard deviation, over
-1240 samples. It is a good idea to check this before doing actual performance
+Which shows that for the LU 11 kernel with a 1MiB matrix, the average
+execution time on CPUs was about 25ms, with a 0.2ms standard deviation, over
+8 samples. It is a good idea to check this before doing actual performance
 measurements.
 
 A graph can be drawn by using the tool <c>starpu_perfmodel_plot</c>:
 
 \verbatim
-$ starpu_perfmodel_plot -s starpu_dlu_lu_model_22
-98304 393216 1572864
-$ gnuplot starpu_starpu_dlu_lu_model_22.gp
-$ gv starpu_starpu_dlu_lu_model_22.eps
+$ starpu_perfmodel_plot -s starpu_slu_lu_model_11
+4096 16384 65536 262144 1048576 4194304 
+$ gnuplot starpu_starpu_slu_lu_model_11.gp
+$ gv starpu_starpu_slu_lu_model_11.eps
 \endverbatim
 
+\image html starpu_starpu_slu_lu_model_11.png
+\image latex starpu_starpu_slu_lu_model_11.eps "" width=\textwidth
+
 If a kernel source code was modified (e.g. performance improvement), the
 calibration information is stale and should be dropped, to re-calibrate from
 start. This can be done by using <c>export STARPU_CALIBRATE=2</c>.

+ 3 - 0
doc/doxygen/chapters/performance_feedback.doxy

@@ -420,6 +420,9 @@ The tool <c>starpu_perfmodel_plot</c> can be used to draw performance
 models. It writes a <c>.gp</c> file in the current directory, to be
 run with the tool <c>gnuplot</c>, which shows the corresponding curve.
 
+\image html starpu_non_linear_memset_regression_based.png
+\image latex starpu_non_linear_memset_regression_based.eps "" width=\textwidth
+
 When the field starpu_task::flops is set, <c>starpu_perfmodel_plot</c> can
 directly draw a GFlops curve, by simply adding the <c>-f</c> option:
 

File diff suppressed because it is too large
+ 1138 - 0
doc/doxygen/chapters/starpu_non_linear_memset_regression_based.eps


BIN
doc/doxygen/chapters/starpu_non_linear_memset_regression_based.pdf


BIN
doc/doxygen/chapters/starpu_non_linear_memset_regression_based.png


File diff suppressed because it is too large
+ 1036 - 0
doc/doxygen/chapters/starpu_starpu_slu_lu_model_11.eps


BIN
doc/doxygen/chapters/starpu_starpu_slu_lu_model_11.pdf


BIN
doc/doxygen/chapters/starpu_starpu_slu_lu_model_11.png


File diff suppressed because it is too large
+ 1253 - 0
doc/doxygen/chapters/tasks_size_overhead.eps


BIN
doc/doxygen/chapters/tasks_size_overhead.pdf


BIN
doc/doxygen/chapters/tasks_size_overhead.png


+ 1 - 0
doc/doxygen/doxygen-config.cfg.in

@@ -63,3 +63,4 @@ INPUT_FILTER           = @top_builddir@/doc/doxygen/doxygen_filter.sh
 
 LATEX_HEADER           = @top_srcdir@/doc/doxygen/refman.tex
 
+IMAGE_PATH             = @top_srcdir@/doc/doxygen/chapters

+ 2 - 1
doc/doxygen/doxygen.cfg

@@ -774,7 +774,8 @@ EXAMPLE_RECURSIVE      = NO
 # directories that contain image that are included in the documentation (see
 # the \image command).
 
-IMAGE_PATH             =
+# From @INCLUDE, above
+#IMAGE_PATH             =
 
 # The INPUT_FILTER tag can be used to specify a program that doxygen should
 # invoke to filter for each input file. Doxygen will invoke the filter program

+ 3 - 3
doc/texinfo/chapters/configuration.texi

@@ -477,9 +477,9 @@ it is therefore necessary to disable asynchronous data transfers.
 Disable asynchronous copies between CPU and MIC devices.
 @end defvr
 
-@defvr {Environment variable} STARPU_DISABLE_CUDA_GPU_GPU_DIRECT
-Disable direct CUDA transfers from GPU to GPU, and let CUDA copy through RAM
-instead. This permits to test the performance effect of GPU-Direct.
+@defvr {Environment variable} STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
+Enable direct CUDA transfers from GPU to GPU, without copying through RAM.
+This permits testing the performance effect of GPU-Direct.
 @end defvr
 
 @node Scheduling

+ 2 - 1
examples/lu/lu_example.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -96,6 +96,7 @@ static void parse_args(int argc, char **argv)
 		else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
 		{
 			fprintf(stderr,"usage: lu [-size n] [-nblocks b] [-piv] [-no-stride] [-profile] [-bound] [-bounddeps] [-bounddepsprio]\n");
+			fprintf(stderr,"Default is size %lu and nblocks %u\n", size, nblocks);
 			exit(0);
 		}
 	}

+ 2 - 2
include/starpu_util.h

@@ -82,10 +82,10 @@ extern "C"
 #else
 #  if defined(__CUDACC__) && defined(STARPU_HAVE_WINDOWS)
 #    define STARPU_ASSERT(x)		do { if (STARPU_UNLIKELY(!(x))) *(int*)NULL = 0; } while(0)
-#    define STARPU_ASSERT_MSG(x, msg, ...)	do { if (STARPU_UNLIKELY(!(x))) { fprintf(stderr, "[starpu][%s][assert failure] " msg "\n", __starpu_func__, ## __VA_ARGS__); *(int*)NULL = 0; }} while(0)
+#    define STARPU_ASSERT_MSG(x, msg, ...)	do { if (STARPU_UNLIKELY(!(x))) { fprintf(stderr, "\n[starpu][%s][assert failure] " msg "\n", __starpu_func__, ## __VA_ARGS__); *(int*)NULL = 0; }} while(0)
 #  else
 #    define STARPU_ASSERT(x)		assert(x)
-#    define STARPU_ASSERT_MSG(x, msg, ...)	do { if (STARPU_UNLIKELY(!(x))) { fprintf(stderr, "[starpu][%s][assert failure] " msg "\n", __starpu_func__, ## __VA_ARGS__); } ; assert(x); } while(0)
+#    define STARPU_ASSERT_MSG(x, msg, ...)	do { if (STARPU_UNLIKELY(!(x))) { fprintf(stderr, "\n[starpu][%s][assert failure] " msg "\n", __starpu_func__, ## __VA_ARGS__); } ; assert(x); } while(0)
 
 #  endif
 #endif

+ 2 - 0
mpi/include/starpu_mpi.h

@@ -40,6 +40,8 @@ int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status);
 int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status);
 int starpu_mpi_barrier(MPI_Comm comm);
 
+int starpu_mpi_irecv_detached_sequential_consistency(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg, int sequential_consistency);
+
 int starpu_mpi_init(int *argc, char ***argv, int initialize_mpi);
 int starpu_mpi_initialize(void) STARPU_DEPRECATED;
 int starpu_mpi_initialize_extended(int *rank, int *world_size) STARPU_DEPRECATED;

+ 44 - 26
mpi/src/starpu_mpi.c

@@ -32,8 +32,12 @@ static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type
 #endif
 static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle,
 							int dest, int mpi_tag, MPI_Comm comm,
-							unsigned detached, void (*callback)(void *), void *arg);
-static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg);
+							unsigned detached, void (*callback)(void *), void *arg,
+							int sequential_consistency);
+static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle,
+							int source, int mpi_tag, MPI_Comm comm,
+							unsigned detached, void (*callback)(void *), void *arg,
+							int sequential_consistency);
 static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req);
 
 /* The list of requests that have been newly submitted by the application */
@@ -73,6 +77,7 @@ struct _starpu_mpi_copy_handle
  /********************************************************/
 
 static struct _starpu_mpi_req *_starpu_mpi_req_hashmap = NULL;
+/** stores data which have been received by MPI but have not been requested by the application */
 static struct _starpu_mpi_copy_handle *_starpu_mpi_copy_handle_hashmap = NULL;
 
 static struct _starpu_mpi_req* find_req(int mpi_tag)
@@ -99,7 +104,7 @@ static void add_req(struct _starpu_mpi_req *req)
 	{
 		_STARPU_MPI_DEBUG(3, "Error add_req : request %p with tag %d already in the hashmap. \n", req, req->mpi_tag);
 		int seq_const = starpu_data_get_sequential_consistency_flag(req->data_handle);
-		if (seq_const)
+		if (seq_const &&  req->sequential_consistency)
 		{
 			STARPU_ASSERT_MSG(!test_req, "Error add_req : request %p with tag %d wanted to be added to the hashmap, while another request %p with the same tag is already in it. \n Sequential consistency is activated : this is not supported by StarPU.", req, req->mpi_tag, test_req);
 		}
@@ -213,6 +218,7 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 	req->internal_req = NULL;
 	req->is_internal_req = 0;
 	req->envelope = NULL;
+	req->sequential_consistency = 1;
  }
 
  /********************************************************/
@@ -225,7 +231,8 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 							       int srcdst, int mpi_tag, MPI_Comm comm,
 							       unsigned detached, void (*callback)(void *), void *arg,
 							       enum _starpu_mpi_request_type request_type, void (*func)(struct _starpu_mpi_req *),
-							       enum starpu_data_access_mode mode)
+							       enum starpu_data_access_mode mode,
+							       int sequential_consistency)
  {
 
 	 _STARPU_MPI_LOG_IN();
@@ -245,11 +252,12 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 	 req->callback = callback;
 	 req->callback_arg = arg;
 	 req->func = func;
+	 req->sequential_consistency = sequential_consistency;
 
 	 /* Asynchronously request StarPU to fetch the data in main memory: when
 	  * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
 	  * the request is actually submitted */
-	 starpu_data_acquire_cb(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req);
+	 starpu_data_acquire_cb_sequential_consistency(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req, sequential_consistency);
 
 	 _STARPU_MPI_LOG_OUT();
 	 return req;
@@ -343,9 +351,10 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 
 static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle,
 							int dest, int mpi_tag, MPI_Comm comm,
-							unsigned detached, void (*callback)(void *), void *arg)
+							unsigned detached, void (*callback)(void *), void *arg,
+							int sequential_consistency)
 {
-	return _starpu_mpi_isend_irecv_common(data_handle, dest, mpi_tag, comm, detached, callback, arg, SEND_REQ, _starpu_mpi_isend_size_func, STARPU_R);
+	return _starpu_mpi_isend_irecv_common(data_handle, dest, mpi_tag, comm, detached, callback, arg, SEND_REQ, _starpu_mpi_isend_size_func, STARPU_R, sequential_consistency);
 }
 
 int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int mpi_tag, MPI_Comm comm)
@@ -354,7 +363,7 @@ int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 	STARPU_ASSERT_MSG(public_req, "starpu_mpi_isend needs a valid starpu_mpi_req");
 
 	struct _starpu_mpi_req *req;
-	req = _starpu_mpi_isend_common(data_handle, dest, mpi_tag, comm, 0, NULL, NULL);
+	req = _starpu_mpi_isend_common(data_handle, dest, mpi_tag, comm, 0, NULL, NULL, 1);
 
 	STARPU_ASSERT_MSG(req, "Invalid return for _starpu_mpi_isend_common");
 	*public_req = req;
@@ -367,7 +376,7 @@ int starpu_mpi_isend_detached(starpu_data_handle_t data_handle,
 			      int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 {
 	_STARPU_MPI_LOG_IN();
-	_starpu_mpi_isend_common(data_handle, dest, mpi_tag, comm, 1, callback, arg);
+	_starpu_mpi_isend_common(data_handle, dest, mpi_tag, comm, 1, callback, arg, 1);
 
 	_STARPU_MPI_LOG_OUT();
 	return 0;
@@ -420,9 +429,9 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 	_STARPU_MPI_LOG_OUT();
 }
 
-static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg)
+static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg, int sequential_consistency)
 {
-	return _starpu_mpi_isend_irecv_common(data_handle, source, mpi_tag, comm, detached, callback, arg, RECV_REQ, _starpu_mpi_irecv_data_func, STARPU_W);
+	return _starpu_mpi_isend_irecv_common(data_handle, source, mpi_tag, comm, detached, callback, arg, RECV_REQ, _starpu_mpi_irecv_data_func, STARPU_W, sequential_consistency);
 }
 
 int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, int mpi_tag, MPI_Comm comm)
@@ -438,7 +447,7 @@ int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 		starpu_data_set_tag(data_handle, mpi_tag);
 
 	struct _starpu_mpi_req *req;
-	req = _starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 0, NULL, NULL);
+	req = _starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 0, NULL, NULL, 1);
 
 	STARPU_ASSERT_MSG(req, "Invalid return for _starpu_mpi_irecv_common");
 	*public_req = req;
@@ -458,7 +467,15 @@ int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int
 	if (tag == -1)
 		starpu_data_set_tag(data_handle, mpi_tag);
 
-	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg);
+	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg, 1);
+	_STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+int starpu_mpi_irecv_detached_sequential_consistency(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg, int sequential_consistency)
+{
+	_STARPU_MPI_LOG_IN();
+	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg, sequential_consistency);
 
 	_STARPU_MPI_LOG_OUT();
 	return 0;
@@ -527,8 +544,7 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&(req->req_mutex));
 
 	/* Initialize the request structure */
-	STARPU_PTHREAD_MUTEX_INIT(&(waiting_req->req_mutex), NULL);
-	STARPU_PTHREAD_COND_INIT(&(waiting_req->req_cond), NULL);
+	 _starpu_mpi_request_init(waiting_req);
 	waiting_req->status = status;
 	waiting_req->other_request = req;
 	waiting_req->func = _starpu_mpi_wait_func;
@@ -759,12 +775,12 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 		{
 			if (req->request_type == SEND_REQ)
 			{
-				// We already know the request to send the size is completed, we just call MPI_Test to make sure that the request object is deallocated
-				MPI_Status status;
-				int flag;
-				ret = MPI_Test(&req->size_req, &flag, &status);
-				STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Test returning %d", ret);
-				STARPU_ASSERT_MSG(flag, "MPI_Test returning flag %d", flag);
+				// We need to make sure the communication for sending the size
+				// has completed; since MPI can re-order messages, let's call
+				// MPI_Wait to make sure data have been sent
+				ret = MPI_Wait(&req->size_req, MPI_STATUS_IGNORE);
+				STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Wait returning %d", ret);
+
 			}
 			if (req->request_type == RECV_REQ)
 				// req->ptr is freed by starpu_data_unpack
@@ -1036,14 +1052,14 @@ static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req)
 {
 	if (req->detached)
 	{
-		STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+		/* put the submitted request into the list of pending requests
+		 * so that it can be handled by the progression mechanisms */
+		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
 		_starpu_mpi_req_list_push_front(detached_requests, req);
-		STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 
 		starpu_wake_all_blocked_workers();
 
-		/* put the submitted request into the list of pending requests
-		 * so that it can be handled by the progression mechanisms */
 		STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		STARPU_PTHREAD_COND_SIGNAL(&cond_progression);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
@@ -1141,7 +1157,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		unsigned block = _starpu_mpi_req_list_empty(new_requests) && (HASH_COUNT(_starpu_mpi_req_hashmap) == 0);
 
 #ifndef STARPU_MPI_ACTIVITY
+		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
 		block = block && _starpu_mpi_req_list_empty(detached_requests);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 #endif /* STARPU_MPI_ACTIVITY */
 
 		if (block)
@@ -1230,7 +1248,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					add_chandle(chandle);
 
 					_STARPU_MPI_DEBUG(3, "Posting internal detached irecv on copy_handle with tag %d from src %d ..\n", chandle->mpi_tag, status.MPI_SOURCE);
-					chandle->req = _starpu_mpi_irecv_common(chandle->handle, status.MPI_SOURCE, chandle->mpi_tag, MPI_COMM_WORLD, 1, NULL, NULL);
+					chandle->req = _starpu_mpi_irecv_common(chandle->handle, status.MPI_SOURCE, chandle->mpi_tag, MPI_COMM_WORLD, 1, NULL, NULL, 1);
 					chandle->req->is_internal_req = 1;
 
 					// We wait until the request is pushed in the

+ 2 - 0
mpi/src/starpu_mpi_private.h

@@ -144,6 +144,8 @@ LIST_TYPE(_starpu_mpi_req,
 
 	int is_internal_req;
 	struct _starpu_mpi_req *internal_req;
+
+	int sequential_consistency;
 );
 
 #ifdef __cplusplus

+ 4 - 0
mpi/tests/Makefile.am

@@ -77,6 +77,7 @@ AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
 ########################
 
 starpu_mpi_TESTS =				\
+	datatypes				\
 	pingpong				\
 	mpi_test				\
 	mpi_isend				\
@@ -104,6 +105,7 @@ starpu_mpi_TESTS =				\
 	user_defined_datatype
 
 noinst_PROGRAMS =				\
+	datatypes				\
 	pingpong				\
 	mpi_test				\
 	mpi_isend				\
@@ -146,6 +148,8 @@ mpi_detached_tag_LDADD =				\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 mpi_redux_LDADD =					\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+datatypes_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 pingpong_LDADD =					\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 mpi_test_LDADD =					\

+ 310 - 0
mpi/tests/datatypes.c

@@ -0,0 +1,310 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <stdlib.h>
+#include "helper.h"
+
+typedef void (*check_func)(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error);
+
+void check_variable(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	int ret;
+	float *v_s, *v_r;
+
+	STARPU_ASSERT(starpu_variable_get_elemsize(handle_s) == starpu_variable_get_elemsize(handle_r));
+
+	v_s = (float *)starpu_variable_get_local_ptr(handle_s);
+	v_r = (float *)starpu_variable_get_local_ptr(handle_r);
+
+	if (*v_s == *v_r)
+	{
+		FPRINTF_MPI("Success with variable value: %f == %f\n", *v_s, *v_r);
+	}
+	else
+	{
+		*error = 1;
+		FPRINTF_MPI("Error with variable value: %f != %f\n", *v_s, *v_r);
+	}
+}
+
+void check_vector(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	int ret, i;
+	int nx;
+	int *v_r, *v_s;
+
+	STARPU_ASSERT(starpu_vector_get_elemsize(handle_s) == starpu_vector_get_elemsize(handle_r));
+	STARPU_ASSERT(starpu_vector_get_nx(handle_s) == starpu_vector_get_nx(handle_r));
+
+	nx = starpu_vector_get_nx(handle_r);
+	v_r = (int *)starpu_vector_get_local_ptr(handle_r);
+	v_s = (int *)starpu_vector_get_local_ptr(handle_s);
+
+	for(i=0 ; i<nx ; i++)
+	{
+		if (v_s[i] == v_r[i])
+		{
+			FPRINTF_MPI("Success with vector[%d] value: %d == %d\n", i, v_s[i], v_r[i]);
+		}
+		else
+		{
+			*error = 1;
+			FPRINTF_MPI("Error with vector[%d] value: %d != %d\n", i, v_s[i], v_r[i]);
+		}
+	}
+}
+
+void check_matrix(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	STARPU_ASSERT(starpu_matrix_get_elemsize(handle_s) == starpu_matrix_get_elemsize(handle_r));
+	STARPU_ASSERT(starpu_matrix_get_nx(handle_s) == starpu_matrix_get_nx(handle_r));
+	STARPU_ASSERT(starpu_matrix_get_ny(handle_s) == starpu_matrix_get_ny(handle_r));
+	STARPU_ASSERT(starpu_matrix_get_local_ld(handle_s) == starpu_matrix_get_local_ld(handle_r));
+
+	char *matrix_s = (char *)starpu_matrix_get_local_ptr(handle_s);
+	char *matrix_r = (char *)starpu_matrix_get_local_ptr(handle_r);
+
+	int nx = starpu_matrix_get_nx(handle_s);
+	int ny = starpu_matrix_get_ny(handle_s);
+	int ldy = starpu_matrix_get_local_ld(handle_s);
+
+	int x, y;
+
+	for(y=0 ; y<ny ; y++)
+		for(x=0 ; x<nx ; x++)
+		{
+			int index=(y*ldy)+x;
+			if (matrix_s[index] == matrix_r[index])
+			{
+				FPRINTF_MPI("Success with matrix[%d,%d --> %d] value: %c == %c\n", x, y, index, matrix_s[index], matrix_r[index]);
+			}
+			else
+			{
+				*error = 1;
+				FPRINTF_MPI("Error with matrix[%d,%d --> %d] value: %c != %c\n", x, y, index, matrix_s[index], matrix_r[index]);
+			}
+		}
+}
+
+void check_block(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	STARPU_ASSERT(starpu_block_get_elemsize(handle_s) == starpu_block_get_elemsize(handle_r));
+	STARPU_ASSERT(starpu_block_get_nx(handle_s) == starpu_block_get_nx(handle_r));
+	STARPU_ASSERT(starpu_block_get_ny(handle_s) == starpu_block_get_ny(handle_r));
+	STARPU_ASSERT(starpu_block_get_nz(handle_s) == starpu_block_get_nz(handle_r));
+	STARPU_ASSERT(starpu_block_get_local_ldy(handle_s) == starpu_block_get_local_ldy(handle_r));
+	STARPU_ASSERT(starpu_block_get_local_ldz(handle_s) == starpu_block_get_local_ldz(handle_r));
+
+	float *block_s = (float *)starpu_block_get_local_ptr(handle_s);
+	float *block_r = (float *)starpu_block_get_local_ptr(handle_r);
+
+	int nx = starpu_block_get_nx(handle_s);
+	int ny = starpu_block_get_ny(handle_s);
+	int nz = starpu_block_get_nz(handle_s);
+
+	int ldy = starpu_block_get_local_ldy(handle_s);
+	int ldz = starpu_block_get_local_ldz(handle_s);
+
+	int x, y, z;
+
+	for(z=0 ; z<nz ; z++)
+		for(y=0 ; y<ny ; y++)
+			for(x=0 ; x<nx ; x++)
+			{
+				int index=(z*ldz)+(y*ldy)+x;
+				if (block_s[index] == block_r[index])
+				{
+					FPRINTF_MPI("Success with block[%d,%d,%d --> %d] value: %f == %f\n", x, y, z, index, block_s[index], block_r[index]);
+				}
+				else
+				{
+					*error = 1;
+					FPRINTF_MPI("Error with block[%d,%d,%d --> %d] value: %f != %f\n", x, y, z, index, block_s[index], block_r[index]);
+				}
+			}
+}
+
+void send_recv_and_check(int rank, int node, starpu_data_handle_t handle_s, int tag_s, starpu_data_handle_t handle_r, int tag_r, int *error, check_func func)
+{
+	int ret;
+	MPI_Status status;
+
+	if (rank == 0)
+	{
+		ret = starpu_mpi_send(handle_s, node, tag_s, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+		ret = starpu_mpi_recv(handle_r, node, tag_r, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		func(handle_s, handle_r, error);
+	}
+	else
+	{
+		ret = starpu_mpi_recv(handle_s, node, tag_s, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+		ret = starpu_mpi_send(handle_s, node, tag_r, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+	}
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+	int error=0;
+
+	int nx=3;
+	int ny=2;
+	int nz=4;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	if (rank == 0)
+	{
+		MPI_Status status;
+
+		{
+			float v = 42.12;
+			starpu_data_handle_t variable_handle[2];
+			starpu_variable_data_register(&variable_handle[0], 0, (uintptr_t)&v, sizeof(v));
+			starpu_variable_data_register(&variable_handle[1], -1, (uintptr_t)NULL, sizeof(v));
+
+			send_recv_and_check(rank, 1, variable_handle[0], 0x42, variable_handle[1], 0x1337, &error, check_variable);
+
+			starpu_data_unregister(variable_handle[0]);
+			starpu_data_unregister(variable_handle[1]);
+		}
+
+		{
+			int vector[4] = {1, 2, 3, 4};
+			starpu_data_handle_t vector_handle[2];
+
+			starpu_vector_data_register(&vector_handle[0], 0, (uintptr_t)vector, 4, sizeof(vector[0]));
+			starpu_vector_data_register(&vector_handle[1], -1, (uintptr_t)NULL, 4, sizeof(vector[0]));
+
+			send_recv_and_check(rank, 1, vector_handle[0], 0x43, vector_handle[1], 0x2337, &error, check_vector);
+
+			starpu_data_unregister(vector_handle[0]);
+			starpu_data_unregister(vector_handle[1]);
+		}
+
+		{
+			char *matrix, n='a';
+			int x, y;
+			starpu_data_handle_t matrix_handle[2];
+
+			matrix = (char*)malloc(nx*ny*nz*sizeof(char));
+			assert(matrix);
+			for(y=0 ; y<ny ; y++)
+			{
+				for(x=0 ; x<nx ; x++)
+				{
+					matrix[(y*nx)+x] = n++;
+				}
+			}
+
+			starpu_matrix_data_register(&matrix_handle[0], 0, (uintptr_t)matrix, nx, nx, ny, sizeof(char));
+			starpu_matrix_data_register(&matrix_handle[1], -1, (uintptr_t)NULL, nx, nx, ny, sizeof(char));
+
+			send_recv_and_check(rank, 1, matrix_handle[0], 0x75, matrix_handle[1], 0x8555, &error, check_matrix);
+
+			starpu_data_unregister(matrix_handle[0]);
+			starpu_data_unregister(matrix_handle[1]);
+		}
+
+		{
+			float *block, n=1.0;
+			int x, y, z;
+			starpu_data_handle_t block_handle[2];
+
+			block = (float*)malloc(nx*ny*nz*sizeof(float));
+			assert(block);
+			for(z=0 ; z<nz ; z++)
+			{
+				for(y=0 ; y<ny ; y++)
+				{
+					for(x=0 ; x<nx ; x++)
+					{
+						block[(z*nx*ny)+(y*nx)+x] = n++;
+					}
+				}
+			}
+
+			starpu_block_data_register(&block_handle[0], 0, (uintptr_t)block, nx, nx*ny, nx, ny, nz, sizeof(float));
+			starpu_block_data_register(&block_handle[1], -1, (uintptr_t)NULL, nx, nx*ny, nx, ny, nz, sizeof(float));
+
+			send_recv_and_check(rank, 1, block_handle[0], 0x73, block_handle[1], 0x8337, &error, check_block);
+
+			starpu_data_unregister(block_handle[0]);
+			starpu_data_unregister(block_handle[1]);
+		}
+	}
+	else if (rank == 1)
+	{
+		MPI_Status status;
+
+		{
+			starpu_data_handle_t variable_handle;
+			starpu_variable_data_register(&variable_handle, -1, (uintptr_t)NULL, sizeof(float));
+			send_recv_and_check(rank, 0, variable_handle, 0x42, NULL, 0x1337, NULL, NULL);
+			starpu_data_unregister(variable_handle);
+		}
+
+		{
+			starpu_data_handle_t vector_handle;
+			starpu_vector_data_register(&vector_handle, -1, (uintptr_t)NULL, 4, sizeof(int));
+			send_recv_and_check(rank, 0, vector_handle, 0x43, NULL, 0x2337, NULL, NULL);
+			starpu_data_unregister(vector_handle);
+		}
+
+		{
+			starpu_data_handle_t matrix_handle;
+			starpu_matrix_data_register(&matrix_handle, -1, (uintptr_t)NULL, nx, nx, ny, sizeof(char));
+			send_recv_and_check(rank, 0, matrix_handle, 0x75, NULL, 0x8555, NULL, NULL);
+			starpu_data_unregister(matrix_handle);
+		}
+
+		{
+			starpu_data_handle_t block_handle;
+			starpu_block_data_register(&block_handle, -1, (uintptr_t)NULL, nx, nx*ny, nx, ny, nz, sizeof(float));
+			send_recv_and_check(rank, 0, block_handle, 0x73, NULL, 0x8337, NULL, NULL);
+			starpu_data_unregister(block_handle);
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return rank == 0 ? error : 0;
+}

+ 1 - 1
mpi/tests/helper.h

@@ -19,7 +19,7 @@
 #define STARPU_TEST_SKIPPED 77
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
-#define FPRINTF_MPI(fmt, ...) do { if (!getenv("STARPU_SILENT")) { \
+#define FPRINTF_MPI(fmt, ...) do { if (!getenv("STARPU_SSILENT")) { \
     						int _disp_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_disp_rank);       \
                                                 fprintf(stderr, "[%d][starpu_mpi][%s] " fmt , _disp_rank, __starpu_func__ ,## __VA_ARGS__); \
                                                 fflush(stderr); }} while(0);

+ 4 - 7
mpi/tests/mpi_earlyrecv.c

@@ -19,14 +19,11 @@
 #include "helper.h"
 #include <unistd.h>
 
-//#define NB 1000
-#define NB 10
-
 int main(int argc, char **argv)
 {
 	int ret, rank, size, i, nb_requests;
-	starpu_data_handle_t tab_handle[NB];
-	starpu_mpi_req request[NB];
+	starpu_data_handle_t tab_handle[3];
+	starpu_mpi_req request[3];
 
 	MPI_Init(NULL, NULL);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
@@ -46,7 +43,7 @@ int main(int argc, char **argv)
 	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
-	for(i=0 ; i<NB ; i++)
+	for(i=0 ; i<3 ; i++)
 	{
 		starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&rank, sizeof(int));
 		starpu_data_set_tag(tab_handle[i], i);
@@ -90,7 +87,7 @@ int main(int argc, char **argv)
 		for(i=1 ; i<nb_requests ; i++) finished = finished && request[i] == NULL;
 	}
 
-	for(i=0 ; i<NB ; i++)
+	for(i=0 ; i<3 ; i++)
 		starpu_data_unregister(tab_handle[i]);
 
 	starpu_mpi_shutdown();

+ 5 - 1
mpi/tests/mpi_earlyrecv2.c

@@ -26,6 +26,7 @@ int main(int argc, char **argv)
 {
 	int ret, rank, size, i;
 	starpu_data_handle_t tab_handle[NB];
+	int value[NB];
 
 	MPI_Init(NULL, NULL);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
@@ -47,7 +48,8 @@ int main(int argc, char **argv)
 
 	for(i=0 ; i<NB ; i++)
 	{
-		starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&rank, sizeof(int));
+		value[i]=i*rank;
+		starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&value[i], sizeof(int));
 		starpu_data_set_tag(tab_handle[i], i);
 	}
 
@@ -79,6 +81,8 @@ int main(int argc, char **argv)
 		for(i=0 ; i<NB ; i++)
 		{
 			starpu_mpi_wait(&req[i], NULL);
+			int *rvalue = (int *)starpu_data_get_local_ptr(tab_handle[i]);
+			STARPU_ASSERT_MSG(*rvalue==i*other_rank, "Incorrect received value: %d != %d\n", *rvalue, i*other_rank);
 		}
 	}
 

+ 72 - 45
sc_hypervisor/src/hypervisor_policies/debit_lp_policy.c

@@ -225,67 +225,94 @@ static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double w_in_
 }
 
 
-static void debit_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
+static void _try_resizing(void)
 {
-	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
-	sc_hypervisor_get_velocity_per_worker(sc_w, worker);
-	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
-	if(ret != EBUSY)
+	int ns = sc_hypervisor_get_nsched_ctxs();
+	int nw = starpu_worker_get_count(); /* Number of different workers */
+	
+	double w_in_s[ns][nw];
+	unsigned found_sol = _compute_max_velocity(ns, nw,  w_in_s, NULL, NULL);
+	/* if we did find at least one solution redistribute the resources */
+	if(found_sol)
 	{
-		if(sc_hypervisor_criteria_fulfilled(sched_ctx, worker))
+		int w, s;
+		double nworkers[ns][2];
+		int nworkers_rounded[ns][2];
+		for(s = 0; s < ns; s++)
 		{
-			int ns = sc_hypervisor_get_nsched_ctxs();
-			int nw = starpu_worker_get_count(); /* Number of different workers */
-
-			double w_in_s[ns][nw];
-			unsigned found_sol = _compute_max_velocity(ns, nw,  w_in_s, NULL, NULL);
-			/* if we did find at least one solution redistribute the resources */
-			if(found_sol)
+			nworkers[s][0] = 0.0;
+			nworkers[s][1] = 0.0;
+			nworkers_rounded[s][0] = 0;
+			nworkers_rounded[s][1] = 0;
+			
+		}
+		
+		for(s = 0; s < ns; s++)
+		{
+			for(w = 0; w < nw; w++)
 			{
-				int w, s;
-				double nworkers[ns][2];
-				int nworkers_rounded[ns][2];
-				for(s = 0; s < ns; s++)
+				enum starpu_worker_archtype arch = starpu_worker_get_type(w);
+				
+				if(arch == STARPU_CUDA_WORKER)
 				{
-					nworkers[s][0] = 0.0;
-					nworkers[s][1] = 0.0;
-					nworkers_rounded[s][0] = 0;
-					nworkers_rounded[s][1] = 0;
-
+					nworkers[s][0] += w_in_s[s][w];
+					if(w_in_s[s][w] >= 0.3)
+						nworkers_rounded[s][0]++;
 				}
-
-				for(s = 0; s < ns; s++)
+				else
 				{
-					for(w = 0; w < nw; w++)
-					{
-						enum starpu_worker_archtype arch = starpu_worker_get_type(w);
-
-						if(arch == STARPU_CUDA_WORKER)
-						{
-							nworkers[s][0] += w_in_s[s][w];
-							if(w_in_s[s][w] >= 0.3)
-								nworkers_rounded[s][0]++;
-						}
-						else
-						{
-							nworkers[s][1] += w_in_s[s][w];
-							if(w_in_s[s][w] > 0.5)
-								nworkers_rounded[s][1]++;
-						}
-					}
+					nworkers[s][1] += w_in_s[s][w];
+					if(w_in_s[s][w] > 0.5)
+						nworkers_rounded[s][1]++;
 				}
+			}
+		}
 /* 				for(s = 0; s < ns; s++) */
 /* 					printf("%d: cpus = %lf gpus = %lf cpus_round = %d gpus_round = %d\n", s, nworkers[s][1], nworkers[s][0], */
 /* 					       nworkers_rounded[s][1], nworkers_rounded[s][0]); */
+		
+		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
+		
+	}
+}
 
-				sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
-
+static void debit_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
+{
+	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
+        if(ret != EBUSY)
+	{
+		unsigned criteria = sc_hypervisor_get_resize_criteria();
+		if(criteria != SC_NOTHING && criteria == SC_VELOCITY)
+		{
+			if(sc_hypervisor_check_velocity_gap_btw_ctxs())
+			{
+				_try_resizing();
 			}
 		}
-		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+                starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 	}
 }
 
+static void debit_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
+{
+	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
+        if(ret != EBUSY)
+	{
+                unsigned criteria = sc_hypervisor_get_resize_criteria();
+                if(criteria != SC_NOTHING && criteria == SC_IDLE)
+                {
+
+			if(sc_hypervisor_check_idle(sched_ctx, worker))
+                        {
+                                _try_resizing();
+//                              sc_hypervisor_move_workers(sched_ctx, 3 - sched_ctx, &worker, 1, 1);                                                                                                               \
+                                                                                                                                                                                                                    
+                        }
+                }
+                starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+        }
+}
+
 static void debit_lp_end_ctx(unsigned sched_ctx)
 {
 	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
@@ -300,7 +327,7 @@ struct sc_hypervisor_policy debit_lp_policy = {
 	.size_ctxs = NULL,
 	.handle_poped_task = debit_lp_handle_poped_task,
 	.handle_pushed_task = NULL,
-	.handle_idle_cycle = NULL,
+	.handle_idle_cycle = debit_lp_handle_idle_cycle,
 	.handle_idle_end = NULL,
 	.handle_post_exec_hook = NULL,
 	.handle_submitted_job = NULL,

+ 83 - 58
sc_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c

@@ -325,77 +325,102 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 	return found_sol;
 }
 
-
-
-static void ispeed_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
+static void _try_resizing(void)
 {
-	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
-	sc_hypervisor_get_velocity_per_worker(sc_w, worker);
-	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
-	if(ret != EBUSY)
+	int ns = sc_hypervisor_get_nsched_ctxs();
+	int nw = starpu_worker_get_count(); /* Number of different workers */
+	
+	double w_in_s[ns][nw];
+//			double flops_on_w[ns][nw];
+	double **flops_on_w = (double**)malloc(ns*sizeof(double*));
+	int i;
+	for(i = 0; i < ns; i++)
+		flops_on_w[i] = (double*)malloc(nw*sizeof(double));
+	
+	unsigned found_sol = _compute_flops_distribution_over_ctxs(ns, nw,  w_in_s, flops_on_w, NULL, NULL);
+	/* if we did find at least one solution redistribute the resources */
+	if(found_sol)
 	{
-		if(sc_hypervisor_criteria_fulfilled(sched_ctx, worker))
+		int w, s;
+		double nworkers[ns][2];
+		int nworkers_rounded[ns][2];
+		for(s = 0; s < ns; s++)
 		{
-			int ns = sc_hypervisor_get_nsched_ctxs();
-			int nw = starpu_worker_get_count(); /* Number of different workers */
-
-			double w_in_s[ns][nw];
-//			double flops_on_w[ns][nw];
-			double **flops_on_w = (double**)malloc(ns*sizeof(double*));
-			int i;
-			for(i = 0; i < ns; i++)
-				flops_on_w[i] = (double*)malloc(nw*sizeof(double));
-
-			unsigned found_sol = _compute_flops_distribution_over_ctxs(ns, nw,  w_in_s, flops_on_w, NULL, NULL);
-			/* if we did find at least one solution redistribute the resources */
-			if(found_sol)
+			nworkers[s][0] = 0.0;
+			nworkers[s][1] = 0.0;
+			nworkers_rounded[s][0] = 0;
+			nworkers_rounded[s][1] = 0;
+			
+		}
+		
+		for(s = 0; s < ns; s++)
+		{
+			for(w = 0; w < nw; w++)
 			{
-				int w, s;
-				double nworkers[ns][2];
-				int nworkers_rounded[ns][2];
-				for(s = 0; s < ns; s++)
+				enum starpu_worker_archtype arch = starpu_worker_get_type(w);
+				
+				if(arch == STARPU_CUDA_WORKER)
 				{
-					nworkers[s][0] = 0.0;
-					nworkers[s][1] = 0.0;
-					nworkers_rounded[s][0] = 0;
-					nworkers_rounded[s][1] = 0;
-
+					nworkers[s][0] += w_in_s[s][w];
+					if(w_in_s[s][w] >= 0.3)
+						nworkers_rounded[s][0]++;
 				}
-
-				for(s = 0; s < ns; s++)
+				else
 				{
-					for(w = 0; w < nw; w++)
-					{
-						enum starpu_worker_archtype arch = starpu_worker_get_type(w);
-
-						if(arch == STARPU_CUDA_WORKER)
-						{
-							nworkers[s][0] += w_in_s[s][w];
-							if(w_in_s[s][w] >= 0.3)
-								nworkers_rounded[s][0]++;
-						}
-						else
-						{
-							nworkers[s][1] += w_in_s[s][w];
-							if(w_in_s[s][w] > 0.5)
-								nworkers_rounded[s][1]++;
-						}
-					}
+					nworkers[s][1] += w_in_s[s][w];
+					if(w_in_s[s][w] > 0.5)
+						nworkers_rounded[s][1]++;
 				}
+			}
+		}
 /* 				for(s = 0; s < ns; s++) */
 /* 					printf("%d: cpus = %lf gpus = %lf cpus_round = %d gpus_round = %d\n", s, nworkers[s][1], nworkers[s][0], */
 /* 					       nworkers_rounded[s][1], nworkers_rounded[s][0]); */
-
-				sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
-			}
-			for(i = 0; i < ns; i++)
-				free(flops_on_w[i]);
-			free(flops_on_w);
-		}
-		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+		
+		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
 	}
+	for(i = 0; i < ns; i++)
+		free(flops_on_w[i]);
+	free(flops_on_w);
+}
+
+static void ispeed_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
+{
+        int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
+        if(ret != EBUSY)
+        {
+                unsigned criteria = sc_hypervisor_get_resize_criteria();
+                if(criteria != SC_NOTHING && criteria == SC_VELOCITY)
+                {
+                        if(sc_hypervisor_check_velocity_gap_btw_ctxs())
+                        {
+                                _try_resizing();
+                        }
+                }
+                starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+        }
 }
 
+static void ispeed_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
+{
+        int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
+        if(ret != EBUSY)
+        {
+                unsigned criteria = sc_hypervisor_get_resize_criteria();
+                if(criteria != SC_NOTHING && criteria == SC_IDLE)
+                {
+
+			if(sc_hypervisor_check_idle(sched_ctx, worker))
+                        {
+                                _try_resizing();
+//                              sc_hypervisor_move_workers(sched_ctx, 3 - sched_ctx, &worker, 1, 1);                                                                                                                
+                        }
+                }
+                starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+        }
+}
+
+
 static void ispeed_lp_end_ctx(unsigned sched_ctx)
 {
 	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
@@ -410,7 +435,7 @@ struct sc_hypervisor_policy ispeed_lp_policy = {
 	.size_ctxs = NULL,
 	.handle_poped_task = ispeed_lp_handle_poped_task,
 	.handle_pushed_task = NULL,
-	.handle_idle_cycle = NULL,
+	.handle_idle_cycle = ispeed_lp_handle_idle_cycle,
 	.handle_idle_end = NULL,
 	.handle_post_exec_hook = NULL,
 	.handle_submitted_job = NULL,

+ 7 - 2
sc_hypervisor/src/policies_utils/policy_tools.c

@@ -498,6 +498,7 @@ unsigned sc_hypervisor_check_velocity_gap_btw_ctxs(void)
 		}
 	}
 
+/* if an optimal speed has not been computed yet, do it now */
 	if(!has_opt_v)
 	{
 		int nw = 1;
@@ -533,6 +534,8 @@ unsigned sc_hypervisor_check_velocity_gap_btw_ctxs(void)
 		}
 	}
 
+/* if we have an optimal speed for each type of worker compare the monitored one with the 
+   theoretical one */
 	if(has_opt_v)
 	{
 		for(i = 0; i < nsched_ctxs; i++)
@@ -553,7 +556,9 @@ unsigned sc_hypervisor_check_velocity_gap_btw_ctxs(void)
 				return 1;
 		}
 	}
-	else
+	else /* if we have not been able to compute a theoretical velocity, consider the env variable
+		SC_MAX_VELOCITY_GAP and compare the speeds of the contexts; whenever the difference
+		between them is greater than the max value the function returns true */
 	{
 		for(i = 0; i < nsched_ctxs; i++)
 		{
@@ -575,7 +580,7 @@ unsigned sc_hypervisor_check_velocity_gap_btw_ctxs(void)
 						{
 							double gap = ctx_v < other_ctx_v ? other_ctx_v / ctx_v : ctx_v / other_ctx_v;
 							double max_vel = _get_max_velocity_gap();
-							if(gap > max_vel-1 && gap < max_vel+1)
+							if(gap > max_vel)
 								return 1;
 						}
 					}

+ 0 - 3
sc_hypervisor/src/policies_utils/speed.c

@@ -155,16 +155,13 @@ double sc_hypervisor_get_velocity(struct sc_hypervisor_wrapper *sc_w, enum starp
 {
 
 	double velocity = sc_hypervisor_get_velocity_per_worker_type(sc_w, arch);
-	printf("arch %d vel %lf\n", arch, velocity);
 	if(velocity == -1.0)
 	{
 		velocity = sc_hypervisor_get_ref_velocity_per_worker_type(sc_w, arch);
-		printf("arch %d ref_vel %lf\n", arch, velocity);
 	}
 	if(velocity == -1.0)
 	{
 		velocity = arch == STARPU_CPU_WORKER ? 5.0 : 100.0;
-		printf("arch %d default_vel %lf\n", arch, velocity);
 	}
        
 	return velocity;

+ 1 - 0
sc_hypervisor/src/sc_hypervisor.c

@@ -632,6 +632,7 @@ static unsigned _ack_resize_completed(unsigned sched_ctx, int worker)
 					if(sc_w->resize_ack.moved_workers[j] == worker)
 					{
 						only_remove = 1;
+						starpu_pthread_mutex_unlock(&sc_w->mutex);
 						break;
 					}
 			}

+ 5 - 1
src/Makefile.am

@@ -159,7 +159,6 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
 	core/dependencies/tags.c				\
 	core/dependencies/task_deps.c				\
 	core/dependencies/data_concurrency.c			\
-	core/disk_ops/disk_unistd_o_direct.c			\
 	core/disk_ops/disk_stdio.c				\
 	core/disk_ops/disk_unistd.c                             \
 	core/disk_ops/unistd/disk_unistd_global.c		\
@@ -269,6 +268,11 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/scc/driver_scc_sink.c
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/scc/driver_scc_utils.c
 endif
 
+if STARPU_LINUX_SYS
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += core/disk_ops/disk_unistd_o_direct.c
+endif
+
+
 
 #########################################
 #										#

+ 23 - 1
src/core/disk_ops/disk_stdio.c

@@ -24,6 +24,10 @@
 #include <core/disk.h>
 #include <core/perfmodel/perfmodel.h>
 
+#ifdef STARPU_HAVE_WINDOWS
+        #include <io.h>
+#endif
+
 #define NITER	64
 
 /* ------------------- use STDIO to write on disk -------------------  */
@@ -38,7 +42,7 @@ struct starpu_stdio_obj {
 
 /* allocation memory on disk */
 static void * 
-starpu_stdio_alloc (void *base, size_t size STARPU_ATTRIBUTE_UNUSED)
+starpu_stdio_alloc (void *base, size_t size)
 {
 	
 	struct starpu_stdio_obj * obj = malloc(sizeof(struct starpu_stdio_obj));
@@ -58,7 +62,13 @@ starpu_stdio_alloc (void *base, size_t size STARPU_ATTRIBUTE_UNUSED)
 	strcpy(baseCpy, (char *) base);
 	strcat(baseCpy,tmp);
 
+#ifdef STARPU_HAVE_WINDOWS
+        _mktemp(baseCpy);
+        id = open(baseCpy, O_RDWR | O_CREAT, 0666);
+#else
 	id = mkstemp(baseCpy);
+
+#endif
 	/* fail */
 	if (id < 0)
 	{
@@ -78,7 +88,11 @@ starpu_stdio_alloc (void *base, size_t size STARPU_ATTRIBUTE_UNUSED)
 		return NULL;
 	}
 
+#ifdef STARPU_HAVE_WINDOWS
+	int val = _chsize(id, size);
+#else
 	int val = ftruncate(id,size);
+#endif
 	/* fail */
 	if (val < 0)
 	{
@@ -245,7 +259,11 @@ get_stdio_bandwidth_between_disk_and_main_ram(unsigned node)
 		int res = fflush (tmp->file);
 		STARPU_ASSERT_MSG(res == 0, "Slowness computation failed \n");
 
+#ifdef STARPU_HAVE_WINDOWS
+		res = _commit(tmp->descriptor);
+#else
 		res = fsync(tmp->descriptor);
+#endif
 		STARPU_ASSERT_MSG(res == 0, "Slowness computation failed \n");
 	}
 	gettimeofday(&end, NULL);
@@ -267,7 +285,11 @@ get_stdio_bandwidth_between_disk_and_main_ram(unsigned node)
 		int res = fflush (tmp->file);
 		STARPU_ASSERT_MSG(res == 0, "Latency computation failed");
 
+#ifdef STARPU_HAVE_WINDOWS
+		res = _commit(tmp->descriptor);
+#else
 		res = fsync(tmp->descriptor);
+#endif
 		STARPU_ASSERT_MSG(res == 0, "Latency computation failed");
 	}
 	gettimeofday(&end, NULL);

+ 41 - 9
src/core/disk_ops/unistd/disk_unistd_global.c

@@ -26,6 +26,10 @@
 #include <core/perfmodel/perfmodel.h>
 #include <core/disk_ops/unistd/disk_unistd_global.h>
 
+#ifdef STARPU_HAVE_WINDOWS
+        #include <io.h>
+#endif
+
 #define NITER	64
 #define SIZE_BENCH (4*getpagesize())
 
@@ -33,24 +37,30 @@
 
 /* allocation memory on disk */
  void * 
-starpu_unistd_global_alloc (struct starpu_unistd_global_obj * obj, void *base, size_t size STARPU_ATTRIBUTE_UNUSED)
+starpu_unistd_global_alloc (struct starpu_unistd_global_obj * obj, void *base, size_t size)
 {
 	int id = -1;
+	const char *template = "STARPU_XXXXXX";
 
 	/* create template for mkstemp */
-	unsigned int sizeBase = 16;
-	while(sizeBase < (strlen(base)+7))
-		sizeBase *= 2;
+	unsigned int sizeBase = strlen(base) + strlen(template)+1;
 
 	char * baseCpy = malloc(sizeBase*sizeof(char));
 	STARPU_ASSERT(baseCpy != NULL);
 
-	char * tmp = "STARPU_XXXXXX";
-
 	strcpy(baseCpy, (char *) base);
-	strcat(baseCpy,tmp);
-
+	strcat(baseCpy,template);
+#ifdef STARPU_LINUX_SYS
 	id = mkostemp(baseCpy, obj->flags);
+#elif defined(STARPU_HAVE_WINDOWS)
+	/* Windows has no mkstemp(): generate a unique name and create the file explicitly */
+	_mktemp(baseCpy);
+	id = open(baseCpy, obj->flags | O_CREAT, 0666);
+#else
+	STARPU_ASSERT(obj->flags == O_RDWR);
+	id = mkstemp(baseCpy);
+#endif
+
 	/* fail */
 	if (id < 0)
 	{
@@ -59,7 +69,11 @@ starpu_unistd_global_alloc (struct starpu_unistd_global_obj * obj, void *base, s
 		return NULL;
 	}
 	
+#ifdef STARPU_HAVE_WINDOWS
+	int val = _chsize(id, size);
+#else 
 	int val = ftruncate(id,size);
+#endif
 	/* fail */
 	if (val < 0)
 	{
@@ -189,7 +203,7 @@ starpu_unistd_global_unplug (void *base)
  int
 get_unistd_global_bandwidth_between_disk_and_main_ram(unsigned node)
 {
-
+	int res;
 	unsigned iter;
 	double timing_slowness, timing_latency;
 	struct timeval start;
@@ -205,12 +219,22 @@ get_unistd_global_bandwidth_between_disk_and_main_ram(unsigned node)
 	/* fail to alloc */
 	if (mem == NULL)
 		return 0;
+	
+	struct starpu_unistd_global_obj * tmp = (struct starpu_unistd_global_obj *) mem;
 
 	/* Measure upload slowness */
 	gettimeofday(&start, NULL);
 	for (iter = 0; iter < NITER; ++iter)
 	{
 		_starpu_disk_write(node, mem, buf, 0, SIZE_BENCH, NULL);
+
+#ifdef STARPU_HAVE_WINDOWS
+		res = _commit(tmp->descriptor);
+#else
+		res = fsync(tmp->descriptor);
+#endif
+
+		STARPU_ASSERT_MSG(res == 0, "bandwidth computation failed");
 	}
 	gettimeofday(&end, NULL);
 	timing_slowness = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
@@ -228,6 +252,14 @@ get_unistd_global_bandwidth_between_disk_and_main_ram(unsigned node)
 	for (iter = 0; iter < NITER; ++iter)
 	{
 		_starpu_disk_write(node, mem, buf, rand() % (SIZE_BENCH -1) , getpagesize(), NULL);
+
+#ifdef STARPU_HAVE_WINDOWS
+		res = _commit(tmp->descriptor);
+#else
+		res = fsync(tmp->descriptor);
+#endif
+
+		STARPU_ASSERT_MSG(res == 0, "Latency computation failed");
 	}
 	gettimeofday(&end, NULL);
 	timing_latency = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));

+ 2 - 2
src/core/perfmodel/perfmodel_bus.c

@@ -238,7 +238,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	 * since we cleanly shutdown CUDA before returning. */
 	cudaSetDevice(src);
 
-	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0)
+	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") > 0)
 	{
 		cures = cudaDeviceCanAccessPeer(&can, src, dst);
 		if (!cures && can)
@@ -260,7 +260,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	 * since we cleanly shutdown CUDA before returning. */
 	cudaSetDevice(dst);
 
-	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0)
+	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") > 0)
 	{
 		cures = cudaDeviceCanAccessPeer(&can, dst, src);
 		if (!cures && can)

+ 1 - 1
src/core/perfmodel/perfmodel_history.c

@@ -1078,7 +1078,7 @@ double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model
 
 	regmodel = &model->per_arch[arch][nimpl].regression;
 
-	if (regmodel->valid)
+	if (regmodel->valid && size >= regmodel->minx * 0.9 && size <= regmodel->maxx * 1.1)
                 exp = regmodel->alpha*pow((double)size, regmodel->beta);
 
 	return exp;
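
The added condition keeps the non-linear regression from being extrapolated far outside the sizes it was calibrated on: the alpha*size^beta estimate is now only used within 10% of the calibrated range [minx, maxx]. A standalone sketch of the guarded lookup (parameter values come from the caller; using NAN as the "no prediction" sentinel is an assumption of this sketch):

#include <math.h>

/* Only trust the regression alpha * size^beta for sizes within 10% of the
 * calibrated range [minx, maxx]; return NAN to signal "no prediction". */
static double regression_estimate(double alpha, double beta,
                                  double minx, double maxx, double size)
{
	if (size >= minx * 0.9 && size <= maxx * 1.1)
		return alpha * pow(size, beta);
	return NAN;
}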

+ 0 - 21
src/core/sched_policy.c

@@ -638,27 +638,6 @@ pick:
 		}
 	  }
 
-#ifdef STARPU_USE_SC_HYPERVISOR
-	struct _starpu_sched_ctx *sched_ctx = NULL;
-	struct starpu_sched_ctx_performance_counters *perf_counters = NULL;
-	int j;
-	for(j = 0; j < STARPU_NMAX_SCHED_CTXS; j++)
-	{
-		sched_ctx = worker->sched_ctx[j];
-		if(sched_ctx != NULL && sched_ctx->id != 0 && sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
-		{
-			perf_counters = sched_ctx->perf_counters;
-			if(perf_counters != NULL && perf_counters->notify_idle_cycle && perf_counters->notify_idle_end)
-			{
-				if(!task)
-					perf_counters->notify_idle_cycle(sched_ctx->id, worker->workerid, 1.0);
-				else
-					perf_counters->notify_idle_end(sched_ctx->id, worker->workerid);
-			}
-		}
-	}
-#endif //STARPU_USE_SC_HYPERVISOR
-
 
 	if (!task)
 		return NULL;

+ 3 - 2
src/datawizard/coherency.c

@@ -241,6 +241,7 @@ static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned
 	if ((starpu_node_get_kind(src_node) == STARPU_DISK_RAM && starpu_node_get_kind(dst_node) == STARPU_CPU_RAM) ||
 	    (starpu_node_get_kind(src_node) == STARPU_CPU_RAM && starpu_node_get_kind(dst_node) == STARPU_DISK_RAM))
 	{
+		/* FIXME: not necessarily a worker :/ */
 		*handling_node = STARPU_MAIN_RAM;
 		return 1;
 	}
@@ -284,12 +285,12 @@ static int determine_request_path(starpu_data_handle_t handle,
 		/* GPU -> RAM */
 		src_nodes[0] = src_node;
 		dst_nodes[0] = STARPU_MAIN_RAM;
-		handling_nodes[0] = src_node;
+		handling_nodes[0] = starpu_node_get_kind(src_node) == STARPU_DISK_RAM ? dst_node : src_node;
 
 		/* RAM -> GPU */
 		src_nodes[1] = STARPU_MAIN_RAM;
 		dst_nodes[1] = dst_node;
-		handling_nodes[1] = dst_node;
+		handling_nodes[1] = starpu_node_get_kind(dst_node) == STARPU_DISK_RAM ? src_node : dst_node;
 
 		return 2;
 	}

+ 4 - 0
src/datawizard/data_request.c

@@ -182,6 +182,10 @@ int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigne
 /* this is non blocking */
 void _starpu_post_data_request(struct _starpu_data_request *r, unsigned handling_node)
 {
+	/* We don't have a worker for disk nodes; such requests should have been posted to a main RAM node */
+	STARPU_ASSERT(starpu_node_get_kind(handling_node) != STARPU_DISK_RAM);
+	STARPU_ASSERT(_starpu_memory_node_get_nworkers(handling_node));
+
 //	_STARPU_DEBUG("POST REQUEST\n");
 
 	/* If some dependencies are not fulfilled yet, we don't actually post the request */

+ 3 - 0
src/datawizard/filters.c

@@ -148,6 +148,9 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 		/* This is lazy allocation, allocate it now in main RAM, so as
 		 * to have somewhere to gather pieces later */
 		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[0], 0);
+#ifdef STARPU_DEVEL
+#warning we should reclaim memory if allocation failed
+#endif
 		STARPU_ASSERT(!ret);
 	}
 

+ 65 - 0
src/datawizard/interfaces/block_interface.c

@@ -75,6 +75,8 @@ static size_t block_interface_get_size(starpu_data_handle_t handle);
 static uint32_t footprint_block_interface_crc32(starpu_data_handle_t handle);
 static int block_compare(void *data_interface_a, void *data_interface_b);
 static void display_block_interface(starpu_data_handle_t handle, FILE *f);
+static int pack_block_handle(starpu_data_handle_t handle, unsigned node, void **ptr, ssize_t *count);
+static int unpack_block_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
 
 struct starpu_data_interface_ops starpu_interface_block_ops =
 {
@@ -89,6 +91,8 @@ struct starpu_data_interface_ops starpu_interface_block_ops =
 	.interfaceid = STARPU_BLOCK_INTERFACE_ID,
 	.interface_size = sizeof(struct starpu_block_interface),
 	.display = display_block_interface,
+	.pack_data = pack_block_handle,
+	.unpack_data = unpack_block_handle
 };
 
 static void *block_handle_to_pointer(starpu_data_handle_t handle, unsigned node)
@@ -195,6 +199,67 @@ static void display_block_interface(starpu_data_handle_t handle, FILE *f)
 	fprintf(f, "%u\t%u\t%u\t", block_interface->nx, block_interface->ny, block_interface->nz);
 }
 
+static int pack_block_handle(starpu_data_handle_t handle, unsigned node, void **ptr, ssize_t *count)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	*count = block_interface->nx*block_interface->ny*block_interface->nz*block_interface->elemsize;
+
+	if (ptr != NULL)
+	{
+		uint32_t z, y;
+		void *block = (void *)block_interface->ptr;
+
+		*ptr = malloc(*count);
+
+		void *cur = *ptr;
+		for(z=0 ; z<block_interface->nz ; z++)
+		{
+			void *block_z = block;
+			for(y=0 ; y<block_interface->ny ; y++)
+			{
+				memcpy(cur, block, block_interface->nx*block_interface->elemsize);
+				cur += block_interface->nx*block_interface->elemsize;
+				block += block_interface->ldy * block_interface->elemsize;
+			}
+			block = block_z + block_interface->ldz * block_interface->elemsize;
+		}
+	}
+
+	return 0;
+}
+
+static int unpack_block_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	STARPU_ASSERT(count == block_interface->elemsize * block_interface->nx * block_interface->ny * block_interface->nz);
+
+	uint32_t z, y;
+	void *cur = ptr;
+	void *block = (void *)block_interface->ptr;
+	for(z=0 ; z<block_interface->nz ; z++)
+	{
+		void *block_z = block;
+		for(y=0 ; y<block_interface->ny ; y++)
+		{
+			memcpy(block, cur, block_interface->nx*block_interface->elemsize);
+			cur += block_interface->nx*block_interface->elemsize;
+			block += block_interface->ldy * block_interface->elemsize;
+		}
+		block = block_z + block_interface->ldz * block_interface->elemsize;
+	}
+
+	return 0;
+}
+
+
 static size_t block_interface_get_size(starpu_data_handle_t handle)
 {
 	size_t size;
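
A hedged usage sketch for the new hooks (not part of the patch), going through the public starpu_data_pack() helper, assumed here to have the prototype int starpu_data_pack(starpu_data_handle_t, void **, ssize_t *). As the code above shows, the contiguous buffer is malloc()ed by the pack hook, so freeing it is left to the caller.

#include <starpu.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/types.h>

/* Register a 2x2x2 block with padded leading dimensions (ldy = 3, ldz = 9)
 * and pack it into a contiguous 8-element buffer. */
int main(void)
{
	double data[3*3*2];
	starpu_data_handle_t handle;
	void *buffer;
	ssize_t count;
	unsigned i;

	for (i = 0; i < 3*3*2; i++)
		data[i] = (double) i;

	if (starpu_init(NULL) != 0)
		return 77;

	starpu_block_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t) data,
				   3, 9, 2, 2, 2, sizeof(double));

	starpu_data_pack(handle, &buffer, &count);  /* count == 2*2*2*sizeof(double) */
	/* ... ship the contiguous buffer somewhere, e.g. over MPI ... */
	free(buffer);

	starpu_data_unregister(handle);
	starpu_shutdown();
	return 0;
}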

+ 54 - 0
src/datawizard/interfaces/matrix_interface.c

@@ -88,6 +88,8 @@ static size_t matrix_interface_get_size(starpu_data_handle_t handle);
 static uint32_t footprint_matrix_interface_crc32(starpu_data_handle_t handle);
 static int matrix_compare(void *data_interface_a, void *data_interface_b);
 static void display_matrix_interface(starpu_data_handle_t handle, FILE *f);
+static int pack_matrix_handle(starpu_data_handle_t handle, unsigned node, void **ptr, ssize_t *count);
+static int unpack_matrix_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
 
 struct starpu_data_interface_ops starpu_interface_matrix_ops =
 {
@@ -102,6 +104,8 @@ struct starpu_data_interface_ops starpu_interface_matrix_ops =
 	.interfaceid = STARPU_MATRIX_INTERFACE_ID,
 	.interface_size = sizeof(struct starpu_matrix_interface),
 	.display = display_matrix_interface,
+	.pack_data = pack_matrix_handle,
+	.unpack_data = unpack_matrix_handle
 };
 
 static void register_matrix_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
@@ -196,6 +200,56 @@ static void display_matrix_interface(starpu_data_handle_t handle, FILE *f)
 	fprintf(f, "%u\t%u\t", matrix_interface->nx, matrix_interface->ny);
 }
 
+static int pack_matrix_handle(starpu_data_handle_t handle, unsigned node, void **ptr, ssize_t *count)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	*count = matrix_interface->nx*matrix_interface->ny*matrix_interface->elemsize;
+
+	if (ptr != NULL)
+	{
+		uint32_t y;
+		void *matrix = (void *)matrix_interface->ptr;
+
+		*ptr = malloc(*count);
+
+		void *cur = *ptr;
+		for(y=0 ; y<matrix_interface->ny ; y++)
+		{
+			memcpy(cur, matrix, matrix_interface->nx*matrix_interface->elemsize);
+			cur += matrix_interface->nx*matrix_interface->elemsize;
+			matrix += matrix_interface->ld * matrix_interface->elemsize;
+		}
+	}
+
+	return 0;
+}
+
+static int unpack_matrix_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	STARPU_ASSERT(count == matrix_interface->elemsize * matrix_interface->nx * matrix_interface->ny);
+
+	uint32_t y;
+	void *cur = ptr;
+	void *matrix = (void *)matrix_interface->ptr;
+	for(y=0 ; y<matrix_interface->ny ; y++)
+	{
+		memcpy(matrix, cur, matrix_interface->nx*matrix_interface->elemsize);
+		cur += matrix_interface->nx*matrix_interface->elemsize;
+		matrix += matrix_interface->ld * matrix_interface->elemsize;
+	}
+
+	return 0;
+}
+
 static size_t matrix_interface_get_size(starpu_data_handle_t handle)
 {
 	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)

+ 35 - 0
src/datawizard/interfaces/variable_interface.c

@@ -42,6 +42,8 @@ static size_t variable_interface_get_size(starpu_data_handle_t handle);
 static uint32_t footprint_variable_interface_crc32(starpu_data_handle_t handle);
 static int variable_compare(void *data_interface_a, void *data_interface_b);
 static void display_variable_interface(starpu_data_handle_t handle, FILE *f);
+static int pack_variable_handle(starpu_data_handle_t handle, unsigned node, void **ptr, ssize_t *count);
+static int unpack_variable_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
 
 struct starpu_data_interface_ops starpu_interface_variable_ops =
 {
@@ -56,6 +58,8 @@ struct starpu_data_interface_ops starpu_interface_variable_ops =
 	.interfaceid = STARPU_VARIABLE_INTERFACE_ID,
 	.interface_size = sizeof(struct starpu_variable_interface),
 	.display = display_variable_interface,
+	.pack_data = pack_variable_handle,
+	.unpack_data = unpack_variable_handle
 };
 
 static void *variable_handle_to_pointer(starpu_data_handle_t handle, unsigned node)
@@ -136,6 +140,37 @@ static void display_variable_interface(starpu_data_handle_t handle, FILE *f)
 	fprintf(f, "%ld\t", (long)variable_interface->elemsize);
 }
 
+static int pack_variable_handle(starpu_data_handle_t handle, unsigned node, void **ptr, ssize_t *count)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct starpu_variable_interface *variable_interface = (struct starpu_variable_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	*count = variable_interface->elemsize;
+
+	if (ptr != NULL)
+	{
+		*ptr = malloc(*count);
+		memcpy(*ptr, (void*)variable_interface->ptr, variable_interface->elemsize);
+	}
+
+	return 0;
+}
+
+static int unpack_variable_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct starpu_variable_interface *variable_interface = (struct starpu_variable_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	STARPU_ASSERT(count == variable_interface->elemsize);
+
+	memcpy((void*)variable_interface->ptr, ptr, variable_interface->elemsize);
+	return 0;
+}
+
 static size_t variable_interface_get_size(starpu_data_handle_t handle)
 {
 	struct starpu_variable_interface *variable_interface = (struct starpu_variable_interface *)

+ 35 - 0
src/datawizard/interfaces/vector_interface.c

@@ -42,6 +42,8 @@ static size_t vector_interface_get_size(starpu_data_handle_t handle);
 static uint32_t footprint_vector_interface_crc32(starpu_data_handle_t handle);
 static int vector_compare(void *data_interface_a, void *data_interface_b);
 static void display_vector_interface(starpu_data_handle_t handle, FILE *f);
+static int pack_vector_handle(starpu_data_handle_t handle, unsigned node, void **ptr, ssize_t *count);
+static int unpack_vector_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
 
 struct starpu_data_interface_ops starpu_interface_vector_ops =
 {
@@ -56,6 +58,8 @@ struct starpu_data_interface_ops starpu_interface_vector_ops =
 	.interfaceid = STARPU_VECTOR_INTERFACE_ID,
 	.interface_size = sizeof(struct starpu_vector_interface),
 	.display = display_vector_interface,
+	.pack_data = pack_vector_handle,
+	.unpack_data = unpack_vector_handle
 };
 
 static void *vector_handle_to_pointer(starpu_data_handle_t handle, unsigned node)
@@ -142,6 +146,37 @@ static void display_vector_interface(starpu_data_handle_t handle, FILE *f)
 	fprintf(f, "%u\t", vector_interface->nx);
 }
 
+static int pack_vector_handle(starpu_data_handle_t handle, unsigned node, void **ptr, ssize_t *count)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	*count = vector_interface->nx*vector_interface->elemsize;
+
+	if (ptr != NULL)
+	{
+		*ptr = malloc(*count);
+		memcpy(*ptr, (void*)vector_interface->ptr, vector_interface->elemsize*vector_interface->nx);
+	}
+
+	return 0;
+}
+
+static int unpack_vector_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	STARPU_ASSERT(count == vector_interface->elemsize * vector_interface->nx);
+	memcpy((void*)vector_interface->ptr, ptr, count);
+
+	return 0;
+}
+
 static size_t vector_interface_get_size(starpu_data_handle_t handle)
 {
 	size_t size;

+ 3 - 0
src/drivers/cpu/driver_cpu.c

@@ -207,6 +207,9 @@ static size_t _starpu_cpu_get_global_mem_size(int devid, struct _starpu_machine_
 	if (depth_node == HWLOC_TYPE_DEPTH_UNKNOWN)
 	     global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
 	else
+#ifdef STARPU_DEVEL
+#warning devid looks wrong
+#endif
 	     global_mem = hwloc_get_obj_by_depth(topology->hwtopology, depth_node, devid)->memory.local_memory;
 
 #else /* STARPU_HAVE_HWLOC */

+ 1 - 1
src/drivers/cuda/driver_cuda.c

@@ -200,7 +200,7 @@ static void init_context(unsigned devid)
 	starpu_cuda_set_device(devid);
 
 #ifdef HAVE_CUDA_MEMCPY_PEER
-	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") == 0)
+	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") > 0)
 	{
 		int nworkers = starpu_worker_get_count();
 		for (workerid = 0; workerid < nworkers; workerid++)

+ 27 - 0
src/drivers/driver_common/driver_common.c

@@ -203,11 +203,38 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 
 		STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
 
+#ifdef STARPU_USE_SC_HYPERVISOR
+		struct _starpu_sched_ctx *sched_ctx = NULL;
+		struct starpu_sched_ctx_performance_counters *perf_counters = NULL;
+		int j;
+		for(j = 0; j < STARPU_NMAX_SCHED_CTXS; j++)
+		{
+			sched_ctx = args->sched_ctx[j];
+			if(sched_ctx != NULL && sched_ctx->id != 0 && sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
+			{
+				perf_counters = sched_ctx->perf_counters;
+				if(perf_counters != NULL && perf_counters->notify_idle_cycle)
+				{
+					perf_counters->notify_idle_cycle(sched_ctx->id, args->workerid, 1.0);
+					
+				}
+			}
+		}
+#endif //STARPU_USE_SC_HYPERVISOR
+
 		return NULL;
 	}
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
 
+#ifdef STARPU_USE_SC_HYPERVISOR
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
+	struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx->perf_counters;
+
+	if(sched_ctx->id != 0 && perf_counters != NULL && perf_counters->notify_idle_end)
+		perf_counters->notify_idle_end(task->sched_ctx, args->workerid);
+#endif //STARPU_USE_SC_HYPERVISOR
+
 	if (_starpu_worker_get_status(workerid) == STATUS_SLEEPING)
 	{
 		_STARPU_TRACE_WORKER_SLEEP_END;

+ 1 - 3
src/sched_policies/parallel_eager.c

@@ -44,8 +44,6 @@ static void peager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned n
 {
 	struct _starpu_peager_data *data = (struct _starpu_peager_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	unsigned nbasic_workers = starpu_worker_get_count();
-	unsigned ncombined_workers = starpu_combined_worker_get_count();
-	unsigned ntotal_workers = nbasic_workers + ncombined_workers;
 		
 	_starpu_sched_find_worker_combinations(workerids, nworkers);
 
@@ -71,7 +69,7 @@ static void peager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned n
 
 	for (i = 0; i < ncombinedworkers; i++)
 	{
-		workerid = ntotal_workers + i;
+		workerid = nbasic_workers + i;
 
 		/* Note that we ASSUME that the workers are sorted by size ! */
 		int *workers;

+ 3 - 1
tests/datawizard/commute.c

@@ -102,6 +102,7 @@ static starpu_data_handle_t x_handle, f_handle;
 static void test(enum starpu_data_access_mode begin_mode, enum starpu_data_access_mode end_mode, int order)
 {
 	struct starpu_task *begin_t, *commute1_t, *commute2_t, *end_t;
+	int ret;
 
 	codelet_begin.modes[0] = begin_mode;
 	codelet_end.modes[0] = end_mode;
@@ -140,7 +141,8 @@ static void test(enum starpu_data_access_mode begin_mode, enum starpu_data_acces
 	if (starpu_task_submit(end_t) == -ENODEV)
 		exit(STARPU_TEST_SKIPPED);
 
-	starpu_task_wait(end_t);
+	ret = starpu_task_wait(end_t);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
 	starpu_data_acquire(x_handle, STARPU_R);
 	if (x != 1 + order + !!(end_mode & STARPU_W))
 		exit(EXIT_FAILURE);

+ 16 - 4
tests/disk/disk_compute.c

@@ -27,6 +27,10 @@
 #include <unistd.h>
 #include <math.h>
 
+#ifdef STARPU_HAVE_WINDOWS
+        #include <io.h>
+#endif 
+
 #define NX (1024)
 
 int main(int argc, char **argv)
@@ -61,7 +65,7 @@ int main(int argc, char **argv)
 
 
 	/* register a disk */
-	int new_dd = starpu_disk_register(&starpu_disk_unistd_o_direct_ops, (void *) base, 1024*1024*1);
+	int new_dd = starpu_disk_register(&starpu_disk_unistd_ops, (void *) base, 1024*1024*1);
 	/* can't write on /tmp/ */
 	if (new_dd == -ENOENT) goto enoent;
 	
@@ -99,7 +103,11 @@ int main(int argc, char **argv)
 	fclose(f);
 
 	int descriptor = open(path_file_start, O_RDWR);
-	fdatasync(descriptor);
+#ifdef STARPU_HAVE_WINDOWS
+	_commit(descriptor);
+#else
+	fsync(descriptor);
+#endif
 	close(descriptor);
 
 	/* create a file to store result */
@@ -114,8 +122,12 @@ int main(int argc, char **argv)
 	fclose(f);
 
         descriptor = open(path_file_end, O_RDWR);
-        fdatasync(descriptor);
-        close(descriptor);
+#ifdef STARPU_HAVE_WINDOWS
+        _commit(descriptor);
+#else
+        fsync(descriptor);
+#endif
+	close(descriptor);
 
 	/* And now, you want to use your datas in StarPU */
 	/* Open the file ON the disk */
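
For reference, a minimal sketch of the disk registration this test now performs with the plain unistd operations; it assumes <starpu.h> pulls in the out-of-core API and uses 77 as the usual "test skipped" return code.

#include <starpu.h>
#include <errno.h>

int main(void)
{
	if (starpu_init(NULL) != 0)
		return 77;

	/* register /tmp/ as a 1 MiB out-of-core memory node */
	int disk_node = starpu_disk_register(&starpu_disk_unistd_ops, (void *) "/tmp/", 1024*1024*1);
	if (disk_node == -ENOENT)   /* cannot write there, skip the test */
		return 77;

	/* ... register data, submit tasks, let StarPU evict to disk ... */

	starpu_shutdown();
	return 0;
}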

+ 1 - 1
tests/main/starpu_init.c

@@ -30,7 +30,7 @@ int main(int argc, char **argv)
 static int check_cpu(int env_cpu, int conf_cpu, int expected_cpu, int *cpu)
 {
 	int ret;
-	char *string;
+	char *string = NULL;
 
 	FPRINTF(stderr, "Testing with env=%d - conf=%d\n", env_cpu, conf_cpu);
 

+ 3 - 3
tests/main/tag_task_data_deps.c

@@ -243,13 +243,13 @@ int main(int argc, char **argv)
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 		ret = starpu_task_wait(taskA);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
 
 		ret = starpu_task_wait(taskB);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
 
 		ret = starpu_task_wait(taskC);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
 	} while(0);
 
 	starpu_shutdown();

+ 2 - 1
tools/gdbinit

@@ -20,7 +20,7 @@ define starpu-print-job
   set $job = (struct _starpu_job *)$arg0
 
   printf "StarPU Job (%p)\n", $job
-  if ($job != NULL)
+  if ($job != 0)
     printf "\ttask:\t\t\t\t<%p>\n", $job->task
     printf "\tsubmitted:\t\t\t<%d>\n", $job->submitted
     printf "\tterminated:\t\t\t<%d>\n", $job->terminated
@@ -41,6 +41,7 @@ define starpu-print-task
   set language c
   set $task = (struct starpu_task *)$arg0
   set $job = (struct _starpu_job *)$task->starpu_private
+  set $status=0
   if $task->status == 0
     set $status="STARPU_TASK_INVALID"
   end