Browse Source

- merge trunk

Olivier Aumage 11 years ago
parent
commit
721ce5f920
57 changed files with 1059 additions and 643 deletions
  1. 6 0
      ChangeLog
  2. 5 1
      configure.ac
  3. 1 1
      doc/doxygen/chapters/41configure_options.doxy
  4. 4 0
      doc/doxygen/chapters/api/scheduling_contexts.doxy
  5. 1 1
      examples/axpy/axpy.c
  6. 10 10
      examples/cg/cg.h
  7. 1 1
      examples/cholesky/cholesky_grain_tag.c
  8. 1 1
      examples/cholesky/cholesky_implicit.c
  9. 5 6
      examples/cholesky/cholesky_kernels.c
  10. 1 1
      examples/cholesky/cholesky_tag.c
  11. 79 79
      examples/common/blas.c
  12. 27 27
      examples/common/blas.h
  13. 1 1
      examples/heat/dw_factolu.h
  14. 5 5
      examples/heat/dw_factolu_kernels.c
  15. 7 7
      examples/heat/dw_sparse_cg_kernels.c
  16. 7 7
      examples/heat/heat.c
  17. 9 9
      examples/lu/lu-double.h
  18. 10 10
      examples/lu/lu-float.h
  19. 1 1
      examples/lu/xlu.h
  20. 3 3
      examples/mult/double.h
  21. 3 3
      examples/mult/simple.h
  22. 2 2
      examples/pipeline/pipeline.c
  23. 58 24
      examples/sched_ctx/sched_ctx.c
  24. 8 4
      examples/sched_ctx/sched_ctx_without_sched_policy.c
  25. 6 1
      examples/worker_collections/worker_tree_example.c
  26. 1 1
      gcc-plugin/examples/cholesky/cholesky.c
  27. 4 4
      gcc-plugin/examples/cholesky/cholesky_kernels.c
  28. 2 0
      include/starpu_sched_ctx.h
  29. 1 1
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
  30. 4 4
      mpi/examples/matrix_decomposition/mpi_cholesky_kernels.c
  31. 10 10
      mpi/examples/mpi_lu/mpi_lu-double.h
  32. 10 10
      mpi/examples/mpi_lu/mpi_lu-float.h
  33. 6 2
      mpi/src/Makefile.am
  34. 138 348
      mpi/src/starpu_mpi.c
  35. 54 5
      mpi/src/starpu_mpi_cache.c
  36. 168 0
      mpi/src/starpu_mpi_early_data.c
  37. 55 0
      mpi/src/starpu_mpi_early_data.h
  38. 104 0
      mpi/src/starpu_mpi_early_request.c
  39. 44 0
      mpi/src/starpu_mpi_early_request.h
  40. 26 17
      mpi/tests/mpi_earlyrecv.c
  41. 3 3
      mpi/tests/mpi_earlyrecv2.c
  42. 1 1
      sc_hypervisor/examples/cholesky/cholesky_grain_tag.c
  43. 1 1
      sc_hypervisor/examples/cholesky/cholesky_implicit.c
  44. 5 5
      sc_hypervisor/examples/cholesky/cholesky_kernels.c
  45. 1 1
      sc_hypervisor/examples/cholesky/cholesky_tag.c
  46. 17 2
      src/common/fxt.h
  47. 4 4
      src/common/utils.h
  48. 16 0
      src/core/sched_ctx.c
  49. 8 1
      src/core/sched_policy.c
  50. 26 10
      src/core/workers.c
  51. 2 2
      src/datawizard/malloc.c
  52. 64 0
      src/debug/traces/starpu_fxt.c
  53. 2 2
      src/debug/traces/starpu_paje.c
  54. 10 0
      src/drivers/driver_common/driver_common.c
  55. 0 4
      src/sched_policies/deque_modeling_policy_data_aware.c
  56. 10 0
      tests/main/starpu_worker_exists.c
  57. 1 0
      tests/perfmodels/regression_based.c

+ 6 - 0
ChangeLog

@@ -17,6 +17,12 @@
 StarPU 1.2.0 (svn revision xxxx)
 ==============================================
 
+Small features:
+  * New function starpu_sched_ctx_display_workers() to display worker
+    information belonging to a given scheduler context
+  * The option --enable-verbose can be called with
+    --enable-verbose=extra to increase the verbosity
+
 StarPU 1.1.2 (svn revision xxxx)
 ==============================================
 The scheduling context release

+ 5 - 1
configure.ac

@@ -1393,12 +1393,16 @@ fi
 
 AC_MSG_CHECKING(whether debug messages should be displayed)
 AC_ARG_ENABLE(verbose, [AS_HELP_STRING([--enable-verbose],
-			[display verbose debug messages])],
+			[display verbose debug messages (--enable-verbose=extra increase the verbosity)])],
 			enable_verbose=$enableval, enable_verbose=no)
 AC_MSG_RESULT($enable_verbose)
 if test x$enable_verbose = xyes; then
 	AC_DEFINE(STARPU_VERBOSE, [1], [display verbose debug messages])
 fi
+if test x$enable_verbose = xextra; then
+	AC_DEFINE(STARPU_VERBOSE, [1], [display verbose debug messages])
+	AC_DEFINE(STARPU_EXTRA_VERBOSE, [1], [display verbose debug messages])
+fi
 
 AC_MSG_CHECKING(whether coverage testing should be enabled)
 AC_ARG_ENABLE(coverage, [AS_HELP_STRING([--enable-coverage],

+ 1 - 1
doc/doxygen/chapters/41configure_options.doxy

@@ -42,7 +42,7 @@ Disable assertion checks, which saves computation time.
 \addindex __configure__--enable-verbose
 Increase the verbosity of the debugging messages.  This can be disabled
 at runtime by setting the environment variable \ref STARPU_SILENT to
-any value.
+any value. <c>--enable-verbose=extra</c> increase even more the verbosity.
 
 \verbatim
 $ STARPU_SILENT=1 ./vector_scal

+ 4 - 0
doc/doxygen/chapters/api/scheduling_contexts.doxy

@@ -108,6 +108,10 @@ This function removes the workers in \p workerids_ctx from the context
 \p sched_ctx_id. The last argument cannot be greater than
 STARPU_NMAX_SCHED_CTXS.
 
+\fn void starpu_sched_ctx_display_workers(unsigned sched_ctx_id, FILE *f)
+\ingroup API_Scheduling_Contexts
+This function prints on the file \p f the worker names belonging to the context \p sched_ctx_id
+
 \fn void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
 Delete scheduling context \p sched_ctx_id and transfer remaining

+ 1 - 1
examples/axpy/axpy.c

@@ -31,7 +31,7 @@
 
 #include "axpy.h"
 
-#define AXPY	SAXPY
+#define AXPY	STARPU_SAXPY
 #define CUBLASAXPY	cublasSaxpy
 
 #define N	(16*1024*1024)

+ 10 - 10
examples/cg/cg.h

@@ -30,11 +30,11 @@
 
 #ifdef DOUBLE
 #define TYPE	double
-#define GEMV	DGEMV
-#define DOT	DDOT
-#define GEMV	DGEMV
-#define AXPY	DAXPY
-#define SCAL	DSCAL
+#define GEMV	STARPU_DGEMV
+#define DOT	STARPU_DDOT
+#define GEMV	STARPU_DGEMV
+#define AXPY	STARPU_DAXPY
+#define SCAL	STARPU_DSCAL
 #define cublasdot	cublasDdot
 #define cublasscal	cublasDscal
 #define cublasaxpy	cublasDaxpy
@@ -42,11 +42,11 @@
 #define cublasscal	cublasDscal
 #else
 #define TYPE	float
-#define GEMV	SGEMV
-#define DOT	SDOT
-#define GEMV	SGEMV
-#define AXPY	SAXPY
-#define SCAL	SSCAL
+#define GEMV	STARPU_SGEMV
+#define DOT	STARPU_SDOT
+#define GEMV	STARPU_SGEMV
+#define AXPY	STARPU_SAXPY
+#define SCAL	STARPU_SSCAL
 #define cublasdot	cublasSdot
 #define cublasscal	cublasSscal
 #define cublasaxpy	cublasSaxpy

+ 1 - 1
examples/cholesky/cholesky_grain_tag.c

@@ -392,7 +392,7 @@ int main(int argc, char **argv)
 	float *test_mat = malloc(size*size*sizeof(float));
 	STARPU_ASSERT(test_mat);
 
-	SSYRK("L", "N", size, size, 1.0f,
+	STARPU_SSYRK("L", "N", size, size, 1.0f,
 				mat, size, 0.0f, test_mat, size);
 
 	FPRINTF(stderr, "comparing results ...\n");

+ 1 - 1
examples/cholesky/cholesky_implicit.c

@@ -243,7 +243,7 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 		float *test_mat = malloc(size*size*sizeof(float));
 		STARPU_ASSERT(test_mat);
 
-		SSYRK("L", "N", size, size, 1.0f,
+		STARPU_SSYRK("L", "N", size, size, 1.0f,
 					mat, size, 0.0f, test_mat, size);
 
 		FPRINTF(stderr, "comparing results ...\n");

+ 5 - 6
examples/cholesky/cholesky_kernels.c

@@ -49,7 +49,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 		if (worker_size == 1)
 		{
 			/* Sequential CPU kernel */
-			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
+			STARPU_SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
 				right, ld12, 1.0f, center, ld22);
 		}
 		else
@@ -63,7 +63,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 			float *new_left = &left[block_size*rank];
 			float *new_center = &center[block_size*rank];
 
-			SGEMM("N", "T", dy, new_dx, dz, -1.0f, new_left, ld21, 
+			STARPU_SGEMM("N", "T", dy, new_dx, dz, -1.0f, new_left, ld21, 
 				right, ld12, 1.0f, new_center, ld22);
 		}
 	}
@@ -113,7 +113,7 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 	switch (s)
 	{
 		case 0:
-			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			STARPU_STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
@@ -172,9 +172,9 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 
 				STARPU_ASSERT(lambda11 != 0.0f);
 		
-				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+				STARPU_SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
 		
-				SSYR("L", nx - z - 1, -1.0f, 
+				STARPU_SSYR("L", nx - z - 1, -1.0f, 
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}
@@ -191,7 +191,6 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 				fprintf(stderr, "Error in Magma: %d\n", ret);
 				STARPU_ABORT();
 			}
-			STARPU_ASSERT(!cures);
 			}
 #else
 			{

+ 1 - 1
examples/cholesky/cholesky_tag.c

@@ -368,7 +368,7 @@ int main(int argc, char **argv)
 	float *test_mat = malloc(size*size*sizeof(float));
 	STARPU_ASSERT(test_mat);
 
-	SSYRK("L", "N", size, size, 1.0f,
+	STARPU_SSYRK("L", "N", size, size, 1.0f,
 				mat, size, 0.0f, test_mat, size);
 
 	FPRINTF(stderr, "comparing results ...\n");

+ 79 - 79
examples/common/blas.c

@@ -23,13 +23,13 @@
 
 /*
     This files contains BLAS wrappers for the different BLAS implementations
-  (eg. REFBLAS, STARPU_ATLAS, GOTOBLAS ...). We assume a Fortran orientation as most
+  (eg. REFBLAS, ATLAS, GOTOBLAS ...). We assume a Fortran orientation as most
   libraries do not supply C-based ordering.
  */
 
 #ifdef STARPU_ATLAS
 
-inline void SGEMM(char *transa, char *transb, int M, int N, int K, 
+inline void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, 
 			float alpha, const float *A, int lda, const float *B, int ldb, 
 			float beta, float *C, int ldc)
 {
@@ -40,7 +40,7 @@ inline void SGEMM(char *transa, char *transb, int M, int N, int K,
 			M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);				
 }
 
-inline void DGEMM(char *transa, char *transb, int M, int N, int K, 
+inline void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
 			double alpha, double *A, int lda, double *B, int ldb, 
 			double beta, double *C, int ldc)
 {
@@ -51,7 +51,7 @@ inline void DGEMM(char *transa, char *transb, int M, int N, int K,
 			M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);				
 }
 
-inline void SGEMV(char *transa, int M, int N, float alpha, float *A, int lda, float *X, int incX, float beta, float *Y, int incY)
+inline void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda, float *X, int incX, float beta, float *Y, int incY)
 {
 	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
 
@@ -59,7 +59,7 @@ inline void SGEMV(char *transa, int M, int N, float alpha, float *A, int lda, fl
 					X, incX, beta, Y, incY);
 }
 
-inline void DGEMV(char *transa, int M, int N, double alpha, double *A, int lda, double *X, int incX, double beta, double *Y, int incY)
+inline void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda, double *X, int incX, double beta, double *Y, int incY)
 {
 	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
 
@@ -67,27 +67,27 @@ inline void DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
 					X, incX, beta, Y, incY);
 }
 
-inline float SASUM(int N, float *X, int incX)
+inline float STARPU_SASUM(int N, float *X, int incX)
 {
 	return cblas_sasum(N, X, incX);
 }
 
-inline double DASUM(int N, double *X, int incX)
+inline double STARPU_DASUM(int N, double *X, int incX)
 {
 	return cblas_dasum(N, X, incX);
 }
 
-void SSCAL(int N, float alpha, float *X, int incX)
+void STARPU_SSCAL(int N, float alpha, float *X, int incX)
 {
 	cblas_sscal(N, alpha, X, incX);
 }
 
-void DSCAL(int N, double alpha, double *X, int incX)
+void STARPU_DSCAL(int N, double alpha, double *X, int incX)
 {
 	cblas_dscal(N, alpha, X, incX);
 }
 
-void STRSM (const char *side, const char *uplo, const char *transa,
+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int m, const int n,
                    const float alpha, const float *A, const int lda,
                    float *B, const int ldb)
@@ -100,7 +100,7 @@ void STRSM (const char *side, const char *uplo, const char *transa,
 	cblas_strsm(CblasColMajor, side_, uplo_, transa_, diag_, m, n, alpha, A, lda, B, ldb);
 }
 
-void DTRSM (const char *side, const char *uplo, const char *transa,
+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int m, const int n,
                    const double alpha, const double *A, const int lda,
                    double *B, const int ldb)
@@ -113,7 +113,7 @@ void DTRSM (const char *side, const char *uplo, const char *transa,
 	cblas_dtrsm(CblasColMajor, side_, uplo_, transa_, diag_, m, n, alpha, A, lda, B, ldb);
 }
 
-void SSYR (const char *uplo, const int n, const float alpha,
+void STARPU_SSYR (const char *uplo, const int n, const float alpha,
                   const float *x, const int incx, float *A, const int lda)
 {
 	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
@@ -121,7 +121,7 @@ void SSYR (const char *uplo, const int n, const float alpha,
 	cblas_ssyr(CblasColMajor, uplo_, n, alpha, x, incx, A, lda); 
 }
 
-void SSYRK (const char *uplo, const char *trans, const int n,
+void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
                    const int k, const float alpha, const float *A,
                    const int lda, const float beta, float *C,
                    const int ldc)
@@ -132,21 +132,21 @@ void SSYRK (const char *uplo, const char *trans, const int n,
 	cblas_ssyrk(CblasColMajor, uplo_, trans_, n, k, alpha, A, lda, beta, C, ldc); 
 }
 
-void SGER(const int m, const int n, const float alpha,
+void STARPU_SGER(const int m, const int n, const float alpha,
                   const float *x, const int incx, const float *y,
                   const int incy, float *A, const int lda)
 {
 	cblas_sger(CblasColMajor, m, n, alpha, x, incx, y, incy, A, lda);
 }
 
-void DGER(const int m, const int n, const double alpha,
+void STARPU_DGER(const int m, const int n, const double alpha,
                   const double *x, const int incx, const double *y,
                   const int incy, double *A, const int lda)
 {
 	cblas_dger(CblasColMajor, m, n, alpha, x, incx, y, incy, A, lda);
 }
 
-void STRSV (const char *uplo, const char *trans, const char *diag, 
+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
                    const int n, const float *A, const int lda, float *x, 
                    const int incx)
 {
@@ -157,7 +157,7 @@ void STRSV (const char *uplo, const char *trans, const char *diag,
 	cblas_strsv(CblasColMajor, uplo_, trans_, diag_, n, A, lda, x, incx);
 }
 
-void STRMM(const char *side, const char *uplo, const char *transA,
+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
                  const char *diag, const int m, const int n,
                  const float alpha, const float *A, const int lda,
                  float *B, const int ldb)
@@ -170,7 +170,7 @@ void STRMM(const char *side, const char *uplo, const char *transA,
 	cblas_strmm(CblasColMajor, side_, uplo_, transA_, diag_, m, n, alpha, A, lda, B, ldb);
 }
 
-void DTRMM(const char *side, const char *uplo, const char *transA,
+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
                  const char *diag, const int m, const int n,
                  const double alpha, const double *A, const int lda,
                  double *B, const int ldb)
@@ -183,7 +183,7 @@ void DTRMM(const char *side, const char *uplo, const char *transA,
 	cblas_dtrmm(CblasColMajor, side_, uplo_, transA_, diag_, m, n, alpha, A, lda, B, ldb);
 }
 
-void STRMV(const char *uplo, const char *transA, const char *diag,
+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
                  const int n, const float *A, const int lda, float *X,
                  const int incX)
 {
@@ -194,53 +194,53 @@ void STRMV(const char *uplo, const char *transA, const char *diag,
 	cblas_strmv(CblasColMajor, uplo_, transA_, diag_, n, A, lda, X, incX);
 }
 
-void SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
+void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
 {
 	cblas_saxpy(n, alpha, X, incX, Y, incY);
 }
 
-void DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
+void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
 {
 	cblas_daxpy(n, alpha, X, incX, Y, incY);
 }
 
-int ISAMAX (const int n, float *X, const int incX)
+int STARPU_ISAMAX (const int n, float *X, const int incX)
 {
     int retVal;
     retVal = cblas_isamax(n, X, incX);
     return retVal;
 }
 
-int IDAMAX (const int n, double *X, const int incX)
+int STARPU_IDAMAX (const int n, double *X, const int incX)
 {
     int retVal;
     retVal = cblas_idamax(n, X, incX);
     return retVal;
 }
 
-float SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
+float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
 {
 	return cblas_sdot(n, x, incx, y, incy);
 }
 
-double DDOT(const int n, const double *x, const int incx, const double *y, const int incy)
+double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy)
 {
 	return cblas_ddot(n, x, incx, y, incy);
 }
 
-void SSWAP(const int n, float *x, const int incx, float *y, const int incy)
+void STARPU_SSWAP(const int n, float *x, const int incx, float *y, const int incy)
 {
 	cblas_sswap(n, x, incx, y, incy);
 }
 
-void DSWAP(const int n, double *x, const int incx, double *y, const int incy)
+void STARPU_DSWAP(const int n, double *x, const int incx, double *y, const int incy)
 {
 	cblas_dswap(n, x, incx, y, incy);
 }
 
 #elif defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS) || defined(STARPU_MKL)
 
-inline void SGEMM(char *transa, char *transb, int M, int N, int K, 
+inline void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, 
 			float alpha, const float *A, int lda, const float *B, int ldb, 
 			float beta, float *C, int ldc)
 {
@@ -249,7 +249,7 @@ inline void SGEMM(char *transa, char *transb, int M, int N, int K,
 			 &beta, C, &ldc);	
 }
 
-inline void DGEMM(char *transa, char *transb, int M, int N, int K, 
+inline void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
 			double alpha, double *A, int lda, double *B, int ldb, 
 			double beta, double *C, int ldc)
 {
@@ -259,39 +259,39 @@ inline void DGEMM(char *transa, char *transb, int M, int N, int K,
 }
 
 
-inline void SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
+inline void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
 		float *X, int incX, float beta, float *Y, int incY)
 {
 	sgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
 }
 
-inline void DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
+inline void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
 		double *X, int incX, double beta, double *Y, int incY)
 {
 	dgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
 }
 
-inline float SASUM(int N, float *X, int incX)
+inline float STARPU_SASUM(int N, float *X, int incX)
 {
 	return sasum_(&N, X, &incX);
 }
 
-inline double DASUM(int N, double *X, int incX)
+inline double STARPU_DASUM(int N, double *X, int incX)
 {
 	return dasum_(&N, X, &incX);
 }
 
-void SSCAL(int N, float alpha, float *X, int incX)
+void STARPU_SSCAL(int N, float alpha, float *X, int incX)
 {
 	sscal_(&N, &alpha, X, &incX);
 }
 
-void DSCAL(int N, double alpha, double *X, int incX)
+void STARPU_DSCAL(int N, double alpha, double *X, int incX)
 {
 	dscal_(&N, &alpha, X, &incX);
 }
 
-void STRSM (const char *side, const char *uplo, const char *transa,
+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int m, const int n,
                    const float alpha, const float *A, const int lda,
                    float *B, const int ldb)
@@ -299,7 +299,7 @@ void STRSM (const char *side, const char *uplo, const char *transa,
 	strsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 }
 
-void DTRSM (const char *side, const char *uplo, const char *transa,
+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int m, const int n,
                    const double alpha, const double *A, const int lda,
                    double *B, const int ldb)
@@ -307,13 +307,13 @@ void DTRSM (const char *side, const char *uplo, const char *transa,
 	dtrsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 }
 
-void SSYR (const char *uplo, const int n, const float alpha,
+void STARPU_SSYR (const char *uplo, const int n, const float alpha,
                   const float *x, const int incx, float *A, const int lda)
 {
 	ssyr_(uplo, &n, &alpha, x, &incx, A, &lda); 
 }
 
-void SSYRK (const char *uplo, const char *trans, const int n,
+void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
                    const int k, const float alpha, const float *A,
                    const int lda, const float beta, float *C,
                    const int ldc)
@@ -321,28 +321,28 @@ void SSYRK (const char *uplo, const char *trans, const int n,
 	ssyrk_(uplo, trans, &n, &k, &alpha, A, &lda, &beta, C, &ldc); 
 }
 
-void SGER(const int m, const int n, const float alpha,
+void STARPU_SGER(const int m, const int n, const float alpha,
                   const float *x, const int incx, const float *y,
                   const int incy, float *A, const int lda)
 {
 	sger_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
 }
 
-void DGER(const int m, const int n, const double alpha,
+void STARPU_DGER(const int m, const int n, const double alpha,
                   const double *x, const int incx, const double *y,
                   const int incy, double *A, const int lda)
 {
 	dger_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
 }
 
-void STRSV (const char *uplo, const char *trans, const char *diag, 
+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
                    const int n, const float *A, const int lda, float *x, 
                    const int incx)
 {
 	strsv_(uplo, trans, diag, &n, A, &lda, x, &incx);
 }
 
-void STRMM(const char *side, const char *uplo, const char *transA,
+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
                  const char *diag, const int m, const int n,
                  const float alpha, const float *A, const int lda,
                  float *B, const int ldb)
@@ -350,7 +350,7 @@ void STRMM(const char *side, const char *uplo, const char *transA,
 	strmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 }
 
-void DTRMM(const char *side, const char *uplo, const char *transA,
+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
                  const char *diag, const int m, const int n,
                  const double alpha, const double *A, const int lda,
                  double *B, const int ldb)
@@ -358,38 +358,38 @@ void DTRMM(const char *side, const char *uplo, const char *transA,
 	dtrmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 }
 
-void STRMV(const char *uplo, const char *transA, const char *diag,
+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
                  const int n, const float *A, const int lda, float *X,
                  const int incX)
 {
 	strmv_(uplo, transA, diag, &n, A, &lda, X, &incX);
 }
 
-void SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
+void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
 {
 	saxpy_(&n, &alpha, X, &incX, Y, &incY);
 }
 
-void DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
+void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
 {
 	daxpy_(&n, &alpha, X, &incX, Y, &incY);
 }
 
-int ISAMAX (const int n, float *X, const int incX)
+int STARPU_ISAMAX (const int n, float *X, const int incX)
 {
     int retVal;
     retVal = isamax_ (&n, X, &incX);
     return retVal;
 }
 
-int IDAMAX (const int n, double *X, const int incX)
+int STARPU_IDAMAX (const int n, double *X, const int incX)
 {
     int retVal;
     retVal = idamax_ (&n, X, &incX);
     return retVal;
 }
 
-float SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
+float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
 {
 	float retVal = 0;
 
@@ -399,104 +399,104 @@ float SDOT(const int n, const float *x, const int incx, const float *y, const in
 	return retVal;
 }
 
-double DDOT(const int n, const double *x, const int incx, const double *y, const int incy)
+double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy)
 {
 	return ddot_(&n, x, &incx, y, &incy);
 }
 
-void SSWAP(const int n, float *X, const int incX, float *Y, const int incY)
+void STARPU_SSWAP(const int n, float *X, const int incX, float *Y, const int incY)
 {
 	sswap_(&n, X, &incX, Y, &incY);
 }
 
-void DSWAP(const int n, double *X, const int incX, double *Y, const int incY)
+void STARPU_DSWAP(const int n, double *X, const int incX, double *Y, const int incY)
 {
 	dswap_(&n, X, &incX, Y, &incY);
 }
 
 
 #elif defined(STARPU_SIMGRID)
-inline void SGEMM(char *transa, char *transb, int M, int N, int K, 
+inline void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, 
 			float alpha, const float *A, int lda, const float *B, int ldb, 
 			float beta, float *C, int ldc) { }
 
-inline void DGEMM(char *transa, char *transb, int M, int N, int K, 
+inline void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
 			double alpha, double *A, int lda, double *B, int ldb, 
 			double beta, double *C, int ldc) { }
 
-inline void SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
+inline void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
 		float *X, int incX, float beta, float *Y, int incY) { }
 
-inline void DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
+inline void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
 		double *X, int incX, double beta, double *Y, int incY) { }
 
-inline float SASUM(int N, float *X, int incX) { }
+inline float STARPU_SASUM(int N, float *X, int incX) { }
 
-inline double DASUM(int N, double *X, int incX) { }
+inline double STARPU_DASUM(int N, double *X, int incX) { }
 
-void SSCAL(int N, float alpha, float *X, int incX) { }
+void STARPU_SSCAL(int N, float alpha, float *X, int incX) { }
 
-void DSCAL(int N, double alpha, double *X, int incX) { }
+void STARPU_DSCAL(int N, double alpha, double *X, int incX) { }
 
-void STRSM (const char *side, const char *uplo, const char *transa,
+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int m, const int n,
                    const float alpha, const float *A, const int lda,
                    float *B, const int ldb) { }
 
-void DTRSM (const char *side, const char *uplo, const char *transa,
+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int m, const int n,
                    const double alpha, const double *A, const int lda,
                    double *B, const int ldb) { }
 
-void SSYR (const char *uplo, const int n, const float alpha,
+void STARPU_SSYR (const char *uplo, const int n, const float alpha,
                   const float *x, const int incx, float *A, const int lda) { }
 
-void SSYRK (const char *uplo, const char *trans, const int n,
+void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
                    const int k, const float alpha, const float *A,
                    const int lda, const float beta, float *C,
                    const int ldc) { }
 
-void SGER(const int m, const int n, const float alpha,
+void STARPU_SGER(const int m, const int n, const float alpha,
                   const float *x, const int incx, const float *y,
                   const int incy, float *A, const int lda) { }
 
-void DGER(const int m, const int n, const double alpha,
+void STARPU_DGER(const int m, const int n, const double alpha,
                   const double *x, const int incx, const double *y,
                   const int incy, double *A, const int lda) { }
 
-void STRSV (const char *uplo, const char *trans, const char *diag, 
+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
                    const int n, const float *A, const int lda, float *x, 
                    const int incx) { }
 
-void STRMM(const char *side, const char *uplo, const char *transA,
+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
                  const char *diag, const int m, const int n,
                  const float alpha, const float *A, const int lda,
                  float *B, const int ldb) { }
 
-void DTRMM(const char *side, const char *uplo, const char *transA,
+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
                  const char *diag, const int m, const int n,
                  const double alpha, const double *A, const int lda,
                  double *B, const int ldb) { }
 
-void STRMV(const char *uplo, const char *transA, const char *diag,
+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
                  const int n, const float *A, const int lda, float *X,
                  const int incX) { }
 
-void SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY) { }
+void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY) { }
 
-void DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY) { }
+void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY) { }
 
-int ISAMAX (const int n, float *X, const int incX) { }
+int STARPU_ISAMAX (const int n, float *X, const int incX) { }
 
-int IDAMAX (const int n, double *X, const int incX) { }
+int STARPU_IDAMAX (const int n, double *X, const int incX) { }
 
-float SDOT(const int n, const float *x, const int incx, const float *y, const int incy) { }
+float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy) { }
 
-double DDOT(const int n, const double *x, const int incx, const double *y, const int incy) { }
+double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy) { }
 
-void SSWAP(const int n, float *X, const int incX, float *Y, const int incY) { }
+void STARPU_SSWAP(const int n, float *X, const int incX, float *Y, const int incY) { }
 
-void DSWAP(const int n, double *X, const int incX, double *Y, const int incY) { }
+void STARPU_DSWAP(const int n, double *X, const int incX, double *Y, const int incY) { }
 
 
 #else

+ 27 - 27
examples/common/blas.h

@@ -24,63 +24,63 @@
 #include <cblas.h>
 #endif
 
-void SGEMM(char *transa, char *transb, int M, int N, int K, float alpha, const float *A, int lda, 
+void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, float alpha, const float *A, int lda, 
 		const float *B, int ldb, float beta, float *C, int ldc);
-void DGEMM(char *transa, char *transb, int M, int N, int K, double alpha, double *A, int lda, 
+void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, double alpha, double *A, int lda, 
 		double *B, int ldb, double beta, double *C, int ldc);
-void SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
+void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
 		float *X, int incX, float beta, float *Y, int incY);
-void DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
+void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
 		double *X, int incX, double beta, double *Y, int incY);
-float SASUM(int N, float *X, int incX);
-double DASUM(int N, double *X, int incX);
-void SSCAL(int N, float alpha, float *X, int incX);
-void DSCAL(int N, double alpha, double *X, int incX);
-void STRSM (const char *side, const char *uplo, const char *transa,
+float STARPU_SASUM(int N, float *X, int incX);
+double STARPU_DASUM(int N, double *X, int incX);
+void STARPU_SSCAL(int N, float alpha, float *X, int incX);
+void STARPU_DSCAL(int N, double alpha, double *X, int incX);
+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int m, const int n,
                    const float alpha, const float *A, const int lda,
                    float *B, const int ldb);
-void DTRSM (const char *side, const char *uplo, const char *transa,
+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int m, const int n,
                    const double alpha, const double *A, const int lda,
                    double *B, const int ldb);
-void DGEMM(char *transa, char *transb, int M, int N, int K, 
+void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
 			double alpha, double *A, int lda, double *B, int ldb, 
 			double beta, double *C, int ldc);
-void SSYR (const char *uplo, const int n, const float alpha,
+void STARPU_SSYR (const char *uplo, const int n, const float alpha,
                   const float *x, const int incx, float *A, const int lda);
-void SSYRK (const char *uplo, const char *trans, const int n,
+void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
                    const int k, const float alpha, const float *A,
                    const int lda, const float beta, float *C,
                    const int ldc);
-void SGER (const int m, const int n, const float alpha,
+void STARPU_SGER (const int m, const int n, const float alpha,
                   const float *x, const int incx, const float *y,
                   const int incy, float *A, const int lda);
-void DGER(const int m, const int n, const double alpha,
+void STARPU_DGER(const int m, const int n, const double alpha,
                   const double *x, const int incx, const double *y,
                   const int incy, double *A, const int lda);
-void STRSV (const char *uplo, const char *trans, const char *diag, 
+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
                    const int n, const float *A, const int lda, float *x, 
                    const int incx);
-void STRMM(const char *side, const char *uplo, const char *transA,
+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
                  const char *diag, const int m, const int n,
                  const float alpha, const float *A, const int lda,
                  float *B, const int ldb);
-void DTRMM(const char *side, const char *uplo, const char *transA,
+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
                  const char *diag, const int m, const int n,
                  const double alpha, const double *A, const int lda,
                  double *B, const int ldb);
-void STRMV(const char *uplo, const char *transA, const char *diag,
+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
                  const int n, const float *A, const int lda, float *X,
                  const int incX);
-void SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incy);
-void DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY);
-int ISAMAX (const int n, float *X, const int incX);
-int IDAMAX (const int n, double *X, const int incX);
-float SDOT(const int n, const float *x, const int incx, const float *y, const int incy);
-double DDOT(const int n, const double *x, const int incx, const double *y, const int incy);
-void SSWAP(const int n, float *x, const int incx, float *y, const int incy);
-void DSWAP(const int n, double *x, const int incx, double *y, const int incy);
+void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incy);
+void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY);
+int STARPU_ISAMAX (const int n, float *X, const int incX);
+int STARPU_IDAMAX (const int n, double *X, const int incX);
+float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy);
+double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy);
+void STARPU_SSWAP(const int n, float *x, const int incx, float *y, const int incy);
+void STARPU_DSWAP(const int n, double *x, const int incx, double *y, const int incy);
 
 #if defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS) || defined(STARPU_MKL)
 

+ 1 - 1
examples/heat/dw_factolu.h

@@ -141,7 +141,7 @@ static void STARPU_ATTRIBUTE_UNUSED compare_A_LU(float *A, float *LU,
 
 
         /* now A_err = L, compute L*U */
-	STRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
+	STARPU_STRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
 
 	float max_err = 0.0f;
 	for (i = 0; i < size ; i++)

+ 5 - 5
examples/heat/dw_factolu_kernels.c

@@ -124,7 +124,7 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU
 	switch (s)
 	{
 		case 0:
-			SGEMM("N", "N",	dy, dx, dz, 
+			STARPU_SGEMM("N", "N",	dy, dx, dz, 
 				-1.0f, left, ld21, right, ld12,
 					     1.0f, center, ld22);
 			break;
@@ -189,7 +189,7 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, STARPU_ATT
 	switch (s)
 	{
 		case 0:
-			STRSM("L", "L", "N", "N",
+			STARPU_STRSM("L", "L", "N", "N",
 					 nx12, ny12, 1.0f, sub11, ld11, sub12, ld12);
 			break;
 #ifdef STARPU_USE_CUDA
@@ -251,7 +251,7 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, STARPU_ATT
 	switch (s)
 	{
 		case 0:
-			STRSM("R", "U", "N", "U", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			STARPU_STRSM("R", "U", "N", "U", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
@@ -325,9 +325,9 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATT
 				pivot = sub11[z+z*ld];
 				STARPU_ASSERT(pivot != 0.0f);
 		
-				SSCAL(nx - z - 1, (1.0f/pivot), &sub11[z+(z+1)*ld], ld);
+				STARPU_SSCAL(nx - z - 1, (1.0f/pivot), &sub11[z+(z+1)*ld], ld);
 		
-				SGER(nx - z - 1, nx - z - 1, -1.0f,
+				STARPU_SGER(nx - z - 1, nx - z - 1, -1.0f,
 						&sub11[z+(z+1)*ld], ld,
 						&sub11[(z+1)+z*ld], 1,
 						&sub11[(z+1) + (z+1)*ld],ld);

+ 7 - 7
examples/heat/dw_sparse_cg_kernels.c

@@ -126,7 +126,7 @@ void cpu_codelet_func_3(void *descr[], void *arg)
 	vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = (int)STARPU_VECTOR_GET_NX(descr[0]);
 
-	dot = SDOT(size, vec, 1, vec, 1);
+	dot = STARPU_SDOT(size, vec, 1, vec, 1);
 
 	fprintf(stderr, "func 3 : DOT = %f\n", dot);
 
@@ -218,7 +218,7 @@ void cpu_codelet_func_5(void *descr[], void *arg)
 	STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	dot = SDOT(size, vecd, 1, vecq, 1);
+	dot = STARPU_SDOT(size, vecd, 1, vecq, 1);
 
 	pb->alpha = pb->delta_new / dot;
 }
@@ -265,7 +265,7 @@ void cpu_codelet_func_6(void *descr[], void *arg)
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	SAXPY(size, pb->alpha, vecd, 1, vecx, 1);
+	STARPU_SAXPY(size, pb->alpha, vecd, 1, vecx, 1);
 }
 
 #ifdef STARPU_USE_CUDA
@@ -304,7 +304,7 @@ void cpu_codelet_func_7(void *descr[], void *arg)
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	SAXPY(size, -pb->alpha, vecq, 1, vecr, 1);
+	STARPU_SAXPY(size, -pb->alpha, vecq, 1, vecr, 1);
 }
 
 #ifdef STARPU_USE_CUDA
@@ -344,7 +344,7 @@ void cpu_codelet_func_8(void *descr[], void *arg)
 	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	dot = SDOT(size, vecr, 1, vecr, 1);
+	dot = STARPU_SDOT(size, vecr, 1, vecr, 1);
 
 	pb->delta_old = pb->delta_new;
 	pb->delta_new = dot;
@@ -392,10 +392,10 @@ void cpu_codelet_func_9(void *descr[], void *arg)
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
 	/* d = beta d */
-	SSCAL(size, pb->beta, vecd, 1);
+	STARPU_SSCAL(size, pb->beta, vecd, 1);
 
 	/* d = r + d */
-	SAXPY (size, 1.0f, vecr, 1, vecd, 1);
+	STARPU_SAXPY (size, 1.0f, vecr, 1, vecd, 1);
 }
 
 #ifdef STARPU_USE_CUDA

+ 7 - 7
examples/heat/heat.c

@@ -362,10 +362,10 @@ static void solve_system(unsigned size, unsigned subsize, float *result, int *Re
 	}
 
 		/* L */
-		STRSV("L", "N", "N", subsize, A, subsize, B, 1);
+		STARPU_STRSV("L", "N", "N", subsize, A, subsize, B, 1);
 	
 		/* U */
-	        STRSV("U", "N", "U", subsize, A, subsize, B, 1);
+	        STARPU_STRSV("U", "N", "U", subsize, A, subsize, B, 1);
 	
 		STARPU_ASSERT(DIM == size);
 	
@@ -378,19 +378,19 @@ static void solve_system(unsigned size, unsigned subsize, float *result, int *Re
 	
 	
 		/* LUB = U * LUB */
-		STRMV("U", "N", "U", subsize, A, subsize, LUB, 1);
+		STARPU_STRMV("U", "N", "U", subsize, A, subsize, LUB, 1);
 		
 		/* LUB = L * LUB */
-		STRMV("L", "N", "N", subsize, A, subsize, LUB, 1);
+		STARPU_STRMV("L", "N", "N", subsize, A, subsize, LUB, 1);
 	
 		/* LUB -= B */
-		SAXPY(subsize, -1.0f, savedB, 1, LUB, 1);
+		STARPU_SAXPY(subsize, -1.0f, savedB, 1, LUB, 1);
 	
 		/* check if LUB is close to the 0 vector */
-		int maxind = ISAMAX(subsize, LUB, 1);
+		int maxind = STARPU_ISAMAX(subsize, LUB, 1);
 		FPRINTF(stderr, "max error (LUX - B) = %e\n",LUB[maxind - 1]);
 
-		float sum = SASUM(subsize, LUB, 1);
+		float sum = STARPU_SASUM(subsize, LUB, 1);
 		FPRINTF(stderr,"avg. error %e\n", sum/subsize);
 	
 		free(LUB);

+ 9 - 9
examples/lu/lu-double.h

@@ -33,16 +33,16 @@
 #define CUBLAS_SWAP	cublasDswap
 #define CUBLAS_IAMAX	cublasIdamax
 
-#define CPU_GEMM	DGEMM
-#define CPU_TRSM	DTRSM
-#define CPU_SCAL	DSCAL
-#define CPU_GER		DGER
-#define CPU_SWAP	DSWAP
+#define CPU_GEMM	STARPU_DGEMM
+#define CPU_TRSM	STARPU_DTRSM
+#define CPU_SCAL	STARPU_DSCAL
+#define CPU_GER		STARPU_DGER
+#define CPU_SWAP	STARPU_DSWAP
 
-#define CPU_TRMM	DTRMM
-#define CPU_AXPY	DAXPY
-#define CPU_ASUM	DASUM
-#define CPU_IAMAX	IDAMAX
+#define CPU_TRMM	STARPU_DTRMM
+#define CPU_AXPY	STARPU_DAXPY
+#define CPU_ASUM	STARPU_DASUM
+#define CPU_IAMAX	STARPU_IDAMAX
 
 #define PIVOT_THRESHHOLD	10e-10
 

+ 10 - 10
examples/lu/lu-float.h

@@ -35,16 +35,16 @@
 #define CUBLAS_SWAP	cublasSswap
 #define CUBLAS_IAMAX	cublasIsamax
 
-#define CPU_GEMM	SGEMM
-#define CPU_TRSM	STRSM
-#define CPU_SCAL	SSCAL
-#define CPU_GER		SGER
-#define CPU_SWAP	SSWAP
-
-#define CPU_TRMM	STRMM
-#define CPU_AXPY	SAXPY
-#define CPU_ASUM	SASUM
-#define CPU_IAMAX	ISAMAX
+#define CPU_GEMM	STARPU_SGEMM
+#define CPU_TRSM	STARPU_STRSM
+#define CPU_SCAL	STARPU_SSCAL
+#define CPU_GER		STARPU_SGER
+#define CPU_SWAP	STARPU_SSWAP
+
+#define CPU_TRMM	STARPU_STRMM
+#define CPU_AXPY	STARPU_SAXPY
+#define CPU_ASUM	STARPU_SASUM
+#define CPU_IAMAX	STARPU_ISAMAX
 
 #define PIVOT_THRESHHOLD	10e-5
 

+ 1 - 1
examples/lu/xlu.h

@@ -60,7 +60,7 @@ static void STARPU_ATTRIBUTE_UNUSED compare_A_LU(float *A, float *LU,
 	}
 
         /* now A_err = L, compute L*U */
-	STRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
+	STARPU_STRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
 
 	float max_err = 0.0f;
 	for (i = 0; i < size ; i++)

+ 3 - 3
examples/mult/double.h

@@ -17,9 +17,9 @@
 #define TYPE	double
 
 #define CUBLAS_GEMM cublasDgemm
-#define CPU_GEMM	DGEMM
-#define CPU_ASUM	DASUM
-#define CPU_IAMAX	IDAMAX
+#define CPU_GEMM	STARPU_DGEMM
+#define CPU_ASUM	STARPU_DASUM
+#define CPU_IAMAX	STARPU_IDAMAX
 #define STARPU_GEMM(name)	starpu_dgemm_##name
 
 #define str(s) #s

+ 3 - 3
examples/mult/simple.h

@@ -17,9 +17,9 @@
 #define TYPE	float
 
 #define CUBLAS_GEMM cublasSgemm
-#define CPU_GEMM	SGEMM
-#define CPU_ASUM	SASUM
-#define CPU_IAMAX	ISAMAX
+#define CPU_GEMM	STARPU_SGEMM
+#define CPU_ASUM	STARPU_SASUM
+#define CPU_IAMAX	STARPU_ISAMAX
 #define STARPU_GEMM(name)	starpu_sgemm_##name
 
 #define str(s) #s

+ 2 - 2
examples/pipeline/pipeline.c

@@ -90,7 +90,7 @@ void pipeline_cpu_axpy(void *descr[], void *arg)
 	float *y = (float *) STARPU_VECTOR_GET_PTR(descr[1]);
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
 
-	SAXPY(n, 1., x, 1, y, 1);
+	STARPU_SAXPY(n, 1., x, 1, y, 1);
 }
 
 #ifdef STARPU_USE_CUDA
@@ -129,7 +129,7 @@ void pipeline_cpu_sum(void *descr[], void *_args)
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
 	float y;
 
-	y = SASUM(n, x, 1);
+	y = STARPU_SASUM(n, x, 1);
 
 	FPRINTF(stderr,"CPU finished with %f\n", y);
 }

+ 58 - 24
examples/sched_ctx/sched_ctx.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2013  Université de Bordeaux 1
- * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  *
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
+OB */
 
 #include <starpu.h>
 
@@ -26,16 +26,20 @@
 int tasks_executed = 0;
 starpu_pthread_mutex_t mut;
 
-static void sched_ctx_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg STARPU_ATTRIBUTE_UNUSED)
+static void sched_ctx_cpu_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg STARPU_ATTRIBUTE_UNUSED)
 {
 	starpu_pthread_mutex_lock(&mut);
 	tasks_executed++;
 	starpu_pthread_mutex_unlock(&mut);
 }
 
-static struct starpu_codelet sched_ctx_codelet =
+static void sched_ctx_cuda_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg STARPU_ATTRIBUTE_UNUSED)
 {
-	.cpu_funcs = {sched_ctx_func, NULL},
+}
+
+static struct starpu_codelet sched_ctx_codelet1 =
+{
+	.cpu_funcs = {sched_ctx_cpu_func, NULL},
 	.cuda_funcs = {NULL},
 	.opencl_funcs = {NULL},
 	.model = NULL,
@@ -43,11 +47,25 @@ static struct starpu_codelet sched_ctx_codelet =
 	.name = "sched_ctx"
 };
 
+static struct starpu_codelet sched_ctx_codelet2 =
+{
+	.cpu_funcs = {sched_ctx_cpu_func, NULL},
+	.cuda_funcs = {sched_ctx_cuda_func, NULL},
+	.opencl_funcs = {NULL},
+	.model = NULL,
+	.nbuffers = 0,
+	.name = "sched_ctx"
+};
+
 
 int main(int argc, char **argv)
 {
 	int ntasks = NTASKS;
 	int ret;
+	unsigned ncuda = 0;
+	int nprocs1 = 0;
+	int nprocs2 = 0;
+	int procs1[STARPU_NMAXWORKERS], procs2[STARPU_NMAXWORKERS];
 
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV)
@@ -55,25 +73,23 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	starpu_pthread_mutex_init(&mut, NULL);
-	int nprocs1 = 1;
-	int nprocs2 = 1;
-	int procs1[STARPU_NMAXWORKERS], procs2[STARPU_NMAXWORKERS];
-	procs1[0] = 0;
-	procs2[0] = 0;
 
 #ifdef STARPU_USE_CPU
-	unsigned ncpus =  starpu_cpu_worker_get_count();
-	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
-
-	nprocs1 = ncpus;
+	nprocs1 = starpu_cpu_worker_get_count();
+	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, nprocs1);
 #endif
+	// if there is no cpu, skip
+	if (nprocs1 == 0) goto enodev;
 
 #ifdef STARPU_USE_CUDA
-	unsigned ncuda = starpu_cuda_worker_get_count();
-	starpu_worker_get_ids_by_type(STARPU_CUDA_WORKER, procs2, ncuda);
-
-	nprocs2 = ncuda == 0 ? 1 : ncuda;
+	ncuda = nprocs2 = starpu_cuda_worker_get_count();
+	starpu_worker_get_ids_by_type(STARPU_CUDA_WORKER, procs2, nprocs2);
 #endif
+	if (nprocs2 == 0)
+	{
+	     nprocs2 = 1;
+	     procs2[0] = procs1[0];
+	}
 
 	/*create contexts however you want*/
 	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "eager", 0);
@@ -82,17 +98,18 @@ int main(int argc, char **argv)
 	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
 	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
 
+	starpu_sched_ctx_display_workers(sched_ctx2, stderr);
+
 	int i;
 	for (i = 0; i < ntasks/2; i++)
 	{
 		struct starpu_task *task = starpu_task_create();
 
-		task->cl = &sched_ctx_codelet;
+		task->cl = &sched_ctx_codelet1;
 		task->cl_arg = NULL;
 
 		/*submit tasks to context*/
 		ret = starpu_task_submit_to_ctx(task,sched_ctx1);
-
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
@@ -102,11 +119,27 @@ int main(int argc, char **argv)
 
 	starpu_sched_ctx_finished_submit(sched_ctx1);
 
+	/* task with no cuda impl submitted to a ctx with gpus only */
+	struct starpu_task *task2 = starpu_task_create();
+	task2->cl = &sched_ctx_codelet1;
+	task2->cl_arg = NULL;
+
+	/*submit tasks to context*/
+	ret = starpu_task_submit_to_ctx(task2,sched_ctx2);
+	if (ncuda == 0)
+	{
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	else
+	{
+		STARPU_ASSERT_MSG(ret == -ENODEV, "submit task should ret enodev when the ctx does not have the PUs needed by the task");
+	}
+
 	for (i = 0; i < ntasks/2; i++)
 	{
 		struct starpu_task *task = starpu_task_create();
 
-		task->cl = &sched_ctx_codelet;
+		task->cl = &sched_ctx_codelet2;
 		task->cl_arg = NULL;
 
 		ret = starpu_task_submit_to_ctx(task,sched_ctx2);
@@ -121,8 +154,9 @@ int main(int argc, char **argv)
 
 	starpu_sched_ctx_delete(sched_ctx1);
 	starpu_sched_ctx_delete(sched_ctx2);
-	printf("tasks executed %d out of %d\n", tasks_executed, ntasks);
-	starpu_shutdown();
+	printf("tasks executed %d out of %d\n", tasks_executed, ntasks/2);
 
-	return 0;
+enodev:
+	starpu_shutdown();
+	return nprocs1 == 0 ? 77 : 0;
 }

+ 8 - 4
examples/sched_ctx/sched_ctx_without_sched_policy.c

@@ -73,6 +73,7 @@ int main(int argc, char **argv)
 	tasks_executed[1] = 0;
 	int ntasks = NTASKS;
 	int ret, j, k;
+	unsigned ncpus = 0;
 
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV)
@@ -85,7 +86,7 @@ int main(int argc, char **argv)
 	int *procs1, *procs2;
 
 #ifdef STARPU_USE_CPU
-	unsigned ncpus =  starpu_cpu_worker_get_count();
+	ncpus = starpu_cpu_worker_get_count();
 	procs1 = (int*)malloc(ncpus*sizeof(int));
 	procs2 = (int*)malloc(ncpus*sizeof(int));
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
@@ -96,7 +97,7 @@ int main(int argc, char **argv)
 		nprocs2 =  ncpus-nprocs1;
 		k = 0;
 		for(j = nprocs1; j < nprocs1+nprocs2; j++)
-			procs2[k++] = j;
+			procs2[k++] = procs1[j];
 	}
 	else
 	{
@@ -113,6 +114,8 @@ int main(int argc, char **argv)
 	procs2[0] = 0;
 #endif
 
+	if (ncpus == 0) goto enodev;
+
 	/*create contexts however you want*/
 	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", 0);
 	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", 0);
@@ -158,7 +161,8 @@ int main(int argc, char **argv)
 	starpu_sched_ctx_delete(sched_ctx2);
 	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx1, tasks_executed[0], NTASKS*NTASKS);
 	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS*NTASKS);
-	starpu_shutdown();
 
-	return 0;
+enodev:
+	starpu_shutdown();
+	return ncpus == 0 ? 77 : 0;
 }

+ 6 - 1
examples/worker_collections/worker_tree_example.c

@@ -30,7 +30,12 @@ int main(int argc, char **argv)
 
 int main()
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	int procs[STARPU_NMAXWORKERS];
 	unsigned ncpus =  starpu_cpu_worker_get_count();

+ 1 - 1
gcc-plugin/examples/cholesky/cholesky.c

@@ -203,7 +203,7 @@ int main(int argc, char **argv)
 		}
 	}
 	float test_mat[size * size] __heap;
-	SSYRK("L", "N", size, size, 1.0f,
+	STARPU_SSYRK("L", "N", size, size, 1.0f,
 	      rmat, size, 0.0f, test_mat, size);
 
 	fprintf(stderr, "comparing results ...\n");

+ 4 - 4
gcc-plugin/examples/cholesky/cholesky_kernels.c

@@ -42,7 +42,7 @@ static inline void chol_common_cpu_codelet_update_u22(const float *left, const f
 
 	switch (s) {
 		case 0:
-			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
+			STARPU_SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
 				right, ld12, 1.0f, center, ld22);
 			break;
 #ifdef STARPU_USE_CUDA
@@ -95,7 +95,7 @@ static inline void chol_common_codelet_update_u21(const float *sub11, float *sub
 {
 	switch (s) {
 		case 0:
-			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			STARPU_STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
@@ -153,9 +153,9 @@ static inline void chol_common_codelet_update_u11(float *sub11, unsigned nx, uns
 
 				STARPU_ASSERT(lambda11 != 0.0f);
 
-				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+				STARPU_SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
 
-				SSYR("L", nx - z - 1, -1.0f,
+				STARPU_SSYR("L", nx - z - 1, -1.0f,
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}

+ 2 - 0
include/starpu_sched_ctx.h

@@ -40,6 +40,8 @@ void starpu_sched_ctx_add_workers(int *workerids_ctx, int nworkers_ctx, unsigned
 
 void starpu_sched_ctx_remove_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id);
 
+void starpu_sched_ctx_display_workers(unsigned sched_ctx_id, FILE *f);
+
 void starpu_sched_ctx_delete(unsigned sched_ctx_id);
 
 void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor);

+ 1 - 1
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -203,7 +203,7 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 	float *test_mat = malloc(size*size*sizeof(float));
 	STARPU_ASSERT(test_mat);
 
-	SSYRK("L", "N", size, size, 1.0f,
+	STARPU_SSYRK("L", "N", size, size, 1.0f,
 			rmat, size, 0.0f, test_mat, size);
 
 	FPRINTF(stderr, "[%d] comparing results ...\n", rank);

+ 4 - 4
mpi/examples/matrix_decomposition/mpi_cholesky_kernels.c

@@ -55,7 +55,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 	switch (s)
 	{
 		case 0:
-			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
+			STARPU_SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
 				right, ld12, 1.0f, center, ld22);
 			break;
 #ifdef STARPU_USE_CUDA
@@ -111,7 +111,7 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 	switch (s)
 	{
 		case 0:
-			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			STARPU_STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
@@ -171,9 +171,9 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 
 				STARPU_ASSERT(lambda11 != 0.0f);
 
-				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+				STARPU_SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
 
-				SSYR("L", nx - z - 1, -1.0f,
+				STARPU_SSYR("L", nx - z - 1, -1.0f,
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}

+ 10 - 10
mpi/examples/mpi_lu/mpi_lu-double.h

@@ -27,16 +27,16 @@
 #define CUBLAS_SWAP	cublasDswap
 #define CUBLAS_IAMAX	cublasIdamax
 
-#define CPU_GEMM	DGEMM
-#define CPU_GEMV	DGEMV
-#define CPU_TRSM	DTRSM
-#define CPU_SCAL	DSCAL
-#define CPU_GER		DGER
-#define CPU_SWAP	DSWAP
+#define CPU_GEMM	STARPU_DGEMM
+#define CPU_GEMV	STARPU_DGEMV
+#define CPU_TRSM	STARPU_DTRSM
+#define CPU_SCAL	STARPU_DSCAL
+#define CPU_GER		STARPU_DGER
+#define CPU_SWAP	STARPU_DSWAP
 
-#define CPU_TRMM	DTRMM
-#define CPU_AXPY	DAXPY
-#define CPU_ASUM	DASUM
-#define CPU_IAMAX	IDAMAX
+#define CPU_TRMM	STARPU_DTRMM
+#define CPU_AXPY	STARPU_DAXPY
+#define CPU_ASUM	STARPU_DASUM
+#define CPU_IAMAX	STARPU_IDAMAX
 
 #define PIVOT_THRESHHOLD	10e-10

+ 10 - 10
mpi/examples/mpi_lu/mpi_lu-float.h

@@ -27,16 +27,16 @@
 #define CUBLAS_SWAP	cublasSswap
 #define CUBLAS_IAMAX	cublasIsamax
 
-#define CPU_GEMM	SGEMM
-#define CPU_GEMV	SGEMV
-#define CPU_TRSM	STRSM
-#define CPU_SCAL	SSCAL
-#define CPU_GER		SGER
-#define CPU_SWAP	SSWAP
+#define CPU_GEMM	STARPU_SGEMM
+#define CPU_GEMV	STARPU_SGEMV
+#define CPU_TRSM	STARPU_STRSM
+#define CPU_SCAL	STARPU_SSCAL
+#define CPU_GER		STARPU_SGER
+#define CPU_SWAP	STARPU_SSWAP
 
-#define CPU_TRMM	STRMM
-#define CPU_AXPY	SAXPY
-#define CPU_ASUM	SASUM
-#define CPU_IAMAX	ISAMAX
+#define CPU_TRMM	STARPU_STRMM
+#define CPU_AXPY	STARPU_SAXPY
+#define CPU_ASUM	STARPU_SASUM
+#define CPU_IAMAX	STARPU_ISAMAX
 
 #define PIVOT_THRESHHOLD	10e-5

+ 6 - 2
mpi/src/Makefile.am

@@ -39,7 +39,9 @@ noinst_HEADERS =					\
 	starpu_mpi_task_insert.h			\
 	starpu_mpi_datatype.h				\
 	starpu_mpi_cache.h				\
-	starpu_mpi_cache_stats.h
+	starpu_mpi_cache_stats.h			\
+	starpu_mpi_early_data.h				\
+	starpu_mpi_early_request.h
 
 libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
 	starpu_mpi.c					\
@@ -50,7 +52,9 @@ libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
 	starpu_mpi_stats.c				\
 	starpu_mpi_private.c				\
 	starpu_mpi_cache.c				\
-	starpu_mpi_cache_stats.c
+	starpu_mpi_cache_stats.c			\
+	starpu_mpi_early_data.c				\
+	starpu_mpi_early_request.c
 
 showcheck:
 	-cat /dev/null

+ 138 - 348
mpi/src/starpu_mpi.c

@@ -22,6 +22,8 @@
 #include <starpu_profiling.h>
 #include <starpu_mpi_stats.h>
 #include <starpu_mpi_task_insert.h>
+#include <starpu_mpi_early_data.h>
+#include <starpu_mpi_early_request.h>
 #include <common/config.h>
 #include <common/thread.h>
 #include <datawizard/interfaces/data_interface.h>
@@ -65,234 +67,52 @@ static int posted_requests = 0, newer_requests, barrier_running = 0;
 
 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
 
-LIST_TYPE(_starpu_mpi_copy_handle,
-	  starpu_data_handle_t handle;
-	  struct _starpu_mpi_envelope *env;
-	  struct _starpu_mpi_req *req;
-	  void *buffer;
-	  int mpi_tag;
-	  int source;
-	  int req_ready;
-	  starpu_pthread_mutex_t req_mutex;
-	  starpu_pthread_cond_t req_cond;
-);
-
-struct _starpu_mpi_copy_handle_hashlist
+static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 {
-	struct _starpu_mpi_copy_handle_list *list;
-	UT_hash_handle hh;
-	int mpi_tag;
-};
-
-/********************************************************/
-/*                                                      */
-/*  Hashmap's requests functionalities                  */
-/*                                                      */
-/********************************************************/
-
-/** stores application requests for which data have not been received yet */
-static struct _starpu_mpi_req **_starpu_mpi_app_req_hashmap = NULL;
-static int _starpu_mpi_app_req_hashmap_count = 0;
-/** stores data which have been received by MPI but have not been requested by the application */
-static struct _starpu_mpi_copy_handle_hashlist **_starpu_mpi_copy_handle_hashmap = NULL;
-static int _starpu_mpi_copy_handle_hashmap_count = 0;
-
-static struct _starpu_mpi_req* find_app_req(int mpi_tag, int source)
-{
-	struct _starpu_mpi_req* req;
-
-	HASH_FIND_INT(_starpu_mpi_app_req_hashmap[source], &mpi_tag, req);
-
-	return req;
-}
-
-static void add_app_req(struct _starpu_mpi_req *req)
-{
-	struct _starpu_mpi_req *test_req;
-
-	test_req = find_app_req(req->mpi_tag, req->srcdst);
-
-	if (test_req == NULL)
-	{
-		HASH_ADD_INT(_starpu_mpi_app_req_hashmap[req->srcdst], mpi_tag, req);
-		_starpu_mpi_app_req_hashmap_count ++;
-		_STARPU_MPI_DEBUG(3, "Adding request %p with tag %d in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
-	}
-	else
-	{
-		_STARPU_MPI_DEBUG(3, "[Error] request %p with tag %d already in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
-		int seq_const = starpu_data_get_sequential_consistency_flag(req->data_handle);
-		if (seq_const &&  req->sequential_consistency)
-		{
-			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap[%d], while another request %p with the same tag is already in it. \n Sequential consistency is activated : this is not supported by StarPU.", req, req->mpi_tag, req->srcdst, test_req);
-		}
-		else
-		{
-			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap[%d], while another request %p with the same tag is already in it. \n Sequential consistency isn't activated for this handle : you should want to add dependencies between requests for which the sequential consistency is deactivated.", req, req->mpi_tag, req->srcdst, test_req);
-		}
-	}
-}
-
-static void delete_app_req(struct _starpu_mpi_req *req)
-{
-	struct _starpu_mpi_req *test_req;
+	*req = malloc(sizeof(struct _starpu_mpi_req));
+	STARPU_ASSERT_MSG(*req, "Invalid request");
 
-	test_req = find_app_req(req->mpi_tag, req->srcdst);
-
-	if (test_req != NULL)
-	{
-		HASH_DEL(_starpu_mpi_app_req_hashmap[req->srcdst], req);
-		_starpu_mpi_app_req_hashmap_count --;
-		_STARPU_MPI_DEBUG(3, "Deleting application request %p with tag %d from the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
-	}
-	else
-	{
-		_STARPU_MPI_DEBUG(3, "[Warning] request %p with tag %d is NOT in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
-	}
-}
-
-#ifdef STARPU_VERBOSE
-static void _starpu_mpi_copy_handle_display_hash(int source, int tag)
-{
-	struct _starpu_mpi_copy_handle_hashlist *hashlist;
-	HASH_FIND_INT(_starpu_mpi_copy_handle_hashmap[source], &tag, hashlist);
-
-	if (hashlist == NULL)
-	{
-		_STARPU_MPI_DEBUG(60, "Hashlist for source %d and tag %d does not exist\n", source, tag);
-	}
-	else if (_starpu_mpi_copy_handle_list_empty(hashlist->list))
-	{
-		_STARPU_MPI_DEBUG(60, "Hashlist for source %d and tag %d is empty\n", source, tag);
-	}
-	else
-	{
-		struct _starpu_mpi_copy_handle *cur;
-		for (cur = _starpu_mpi_copy_handle_list_begin(hashlist->list) ;
-		     cur != _starpu_mpi_copy_handle_list_end(hashlist->list);
-		     cur = _starpu_mpi_copy_handle_list_next(cur))
-		{
-			_STARPU_MPI_DEBUG(60, "Element for source %d and tag %d: %p\n", source, tag, cur);
-		}
-	}
-}
-#endif
-
-static struct _starpu_mpi_copy_handle *pop_chandle(int mpi_tag, int source, int delete)
-{
-	struct _starpu_mpi_copy_handle_hashlist *hashlist;
-	struct _starpu_mpi_copy_handle *chandle;
-
-	_STARPU_MPI_DEBUG(60, "Looking for chandle with tag %d in the hashmap[%d]\n", mpi_tag, source);
-	HASH_FIND_INT(_starpu_mpi_copy_handle_hashmap[source], &mpi_tag, hashlist);
-	if (hashlist == NULL)
-	{
-		chandle = NULL;
-	}
-	else
-	{
-		if (_starpu_mpi_copy_handle_list_empty(hashlist->list))
-		{
-			chandle = NULL;
-		}
-		else
-		{
-			if (delete == 1)
-			{
-				chandle = _starpu_mpi_copy_handle_list_pop_front(hashlist->list);
-			}
-			else
-			{
-				chandle = _starpu_mpi_copy_handle_list_front(hashlist->list);
-			}
-		}
-	}
-	_STARPU_MPI_DEBUG(60, "Found chandle %p with tag %d in the hashmap[%d]\n", chandle, mpi_tag, source);
-	return chandle;
-}
-
-static struct _starpu_mpi_copy_handle *find_chandle(int mpi_tag, int source)
-{
-	return pop_chandle(mpi_tag, source, 0);
-}
-
-static void add_chandle(struct _starpu_mpi_copy_handle *chandle)
-{
-	_STARPU_MPI_DEBUG(60, "Trying to add chandle %p with tag %d in the hashmap[%d]\n", chandle, chandle->mpi_tag, chandle->source);
-
-	struct _starpu_mpi_copy_handle_hashlist *hashlist;
-	HASH_FIND_INT(_starpu_mpi_copy_handle_hashmap[chandle->source], &chandle->mpi_tag, hashlist);
-	if (hashlist == NULL)
-	{
-		hashlist = malloc(sizeof(struct _starpu_mpi_copy_handle_hashlist));
-		hashlist->list = _starpu_mpi_copy_handle_list_new();
-		hashlist->mpi_tag = chandle->mpi_tag;
-		HASH_ADD_INT(_starpu_mpi_copy_handle_hashmap[chandle->source], mpi_tag, hashlist);
-	}
-	_starpu_mpi_copy_handle_list_push_back(hashlist->list, chandle);
-	_starpu_mpi_copy_handle_hashmap_count ++;
-#ifdef STARPU_VERBOSE
-	_starpu_mpi_copy_handle_display_hash(chandle->source, chandle->mpi_tag);
-#endif
-}
-
-static void delete_chandle(struct _starpu_mpi_copy_handle *chandle)
-{
-	_STARPU_MPI_DEBUG(60, "Trying to delete chandle %p with tag %d in the hashmap[%d]\n", chandle, chandle->mpi_tag, chandle->source);
-	struct _starpu_mpi_copy_handle *found = pop_chandle(chandle->mpi_tag, chandle->source, 1);
-
-	STARPU_ASSERT_MSG(found == chandle,
-			  "Error delete_chandle : chandle %p with tag %d is NOT in the hashmap[%d]\n", chandle, chandle->mpi_tag, chandle->source);
-
-	_starpu_mpi_copy_handle_hashmap_count --;
-#ifdef STARPU_VERBOSE
-	_starpu_mpi_copy_handle_display_hash(chandle->source, chandle->mpi_tag);
-#endif
-}
-
-static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
-{
 	/* Initialize the request structure */
-	req->data_handle = NULL;
+	(*req)->data_handle = NULL;
 
-	req->datatype = 0;
-	req->ptr = NULL;
-	req->count = -1;
-	req->user_datatype = -1;
+	(*req)->datatype = 0;
+	(*req)->ptr = NULL;
+	(*req)->count = -1;
+	(*req)->user_datatype = -1;
 
-	req->srcdst = -1;
-	req->mpi_tag = -1;
-	req->comm = 0;
+	(*req)->srcdst = -1;
+	(*req)->mpi_tag = -1;
+	(*req)->comm = 0;
 
-	req->func = NULL;
+	(*req)->func = NULL;
 
-	req->status = NULL;
-	req->request = 0;
-	req->flag = NULL;
+	(*req)->status = NULL;
+	(*req)->request = 0;
+	(*req)->flag = NULL;
 
-	req->ret = -1;
-	STARPU_PTHREAD_MUTEX_INIT(&req->req_mutex, NULL);
-	STARPU_PTHREAD_COND_INIT(&req->req_cond, NULL);
-	STARPU_PTHREAD_MUTEX_INIT(&req->posted_mutex, NULL);
-	STARPU_PTHREAD_COND_INIT(&req->posted_cond, NULL);
+	(*req)->ret = -1;
+	STARPU_PTHREAD_MUTEX_INIT(&((*req)->req_mutex), NULL);
+	STARPU_PTHREAD_COND_INIT(&((*req)->req_cond), NULL);
+	STARPU_PTHREAD_MUTEX_INIT(&((*req)->posted_mutex), NULL);
+	STARPU_PTHREAD_COND_INIT(&((*req)->posted_cond), NULL);
 
-	req->request_type = UNKNOWN_REQ;
+	(*req)->request_type = UNKNOWN_REQ;
 
-	req->submitted = 0;
-	req->completed = 0;
-	req->posted = 0;
+	(*req)->submitted = 0;
+	(*req)->completed = 0;
+	(*req)->posted = 0;
 
-	req->other_request = NULL;
+	(*req)->other_request = NULL;
 
-	req->detached = -1;
-	req->callback = NULL;
-	req->callback_arg = NULL;
+	(*req)->detached = -1;
+	(*req)->callback = NULL;
+	(*req)->callback_arg = NULL;
 
-	req->size_req = 0;
-	req->internal_req = NULL;
-	req->is_internal_req = 0;
-	req->envelope = NULL;
-	req->sequential_consistency = 1;
+	(*req)->size_req = 0;
+	(*req)->internal_req = NULL;
+	(*req)->is_internal_req = 0;
+	(*req)->envelope = NULL;
+	(*req)->sequential_consistency = 1;
 }
 
  /********************************************************/
@@ -310,35 +130,33 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 							       int is_internal_req,
 							       ssize_t count)
 {
+	struct _starpu_mpi_req *req;
 
-	 _STARPU_MPI_LOG_IN();
-	 struct _starpu_mpi_req *req = malloc(sizeof(struct _starpu_mpi_req));
-	 STARPU_ASSERT_MSG(req, "Invalid request");
-
-	 _STARPU_MPI_INC_POSTED_REQUESTS(1);
-
-	 /* Initialize the request structure */
-	 _starpu_mpi_request_init(req);
-	 req->request_type = request_type;
-	 req->data_handle = data_handle;
-	 req->srcdst = srcdst;
-	 req->mpi_tag = mpi_tag;
-	 req->comm = comm;
-	 req->detached = detached;
-	 req->callback = callback;
-	 req->callback_arg = arg;
-	 req->func = func;
-	 req->sequential_consistency = sequential_consistency;
-	 req->is_internal_req = is_internal_req;
-	 req->count = count;
-
-	 /* Asynchronously request StarPU to fetch the data in main memory: when
-	  * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
-	  * the request is actually submitted */
-	 starpu_data_acquire_cb_sequential_consistency(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req, sequential_consistency);
+	_STARPU_MPI_LOG_IN();
+	_STARPU_MPI_INC_POSTED_REQUESTS(1);
 
-	 _STARPU_MPI_LOG_OUT();
-	 return req;
+	/* Initialize the request structure */
+	_starpu_mpi_request_init(&req);
+	req->request_type = request_type;
+	req->data_handle = data_handle;
+	req->srcdst = srcdst;
+	req->mpi_tag = mpi_tag;
+	req->comm = comm;
+	req->detached = detached;
+	req->callback = callback;
+	req->callback_arg = arg;
+	req->func = func;
+	req->sequential_consistency = sequential_consistency;
+	req->is_internal_req = is_internal_req;
+	req->count = count;
+
+	/* Asynchronously request StarPU to fetch the data in main memory: when
+	 * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
+	 * the request is actually submitted */
+	starpu_data_acquire_cb_sequential_consistency(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req, sequential_consistency);
+
+	_STARPU_MPI_LOG_OUT();
+	return req;
  }
 
  /********************************************************/
@@ -608,15 +426,11 @@ static void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 
 int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 {
-	_STARPU_MPI_LOG_IN();
 	int ret;
-
-	struct _starpu_mpi_req *waiting_req = malloc(sizeof(struct _starpu_mpi_req));
-	_starpu_mpi_request_init(waiting_req);
-	STARPU_ASSERT_MSG(waiting_req, "Allocation failed");
-
 	struct _starpu_mpi_req *req = *public_req;
+	struct _starpu_mpi_req *waiting_req;
 
+	_STARPU_MPI_LOG_IN();
 	_STARPU_MPI_INC_POSTED_REQUESTS(1);
 
 	/* We cannot try to complete a MPI request that was not actually posted
@@ -627,7 +441,7 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&(req->req_mutex));
 
 	/* Initialize the request structure */
-	 _starpu_mpi_request_init(waiting_req);
+	 _starpu_mpi_request_init(&waiting_req);
 	waiting_req->status = status;
 	waiting_req->other_request = req;
 	waiting_req->func = _starpu_mpi_wait_func;
@@ -704,9 +518,8 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 
 	if (submitted)
 	{
-		struct _starpu_mpi_req *testing_req = malloc(sizeof(struct _starpu_mpi_req));
-		STARPU_ASSERT_MSG(testing_req, "allocation failed");
-		_starpu_mpi_request_init(testing_req);
+		struct _starpu_mpi_req *testing_req;
+		_starpu_mpi_request_init(&testing_req);
 
 		/* Initialize the request structure */
 		STARPU_PTHREAD_MUTEX_INIT(&(testing_req->req_mutex), NULL);
@@ -768,11 +581,11 @@ static void _starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
 
 int starpu_mpi_barrier(MPI_Comm comm)
 {
-	_STARPU_MPI_LOG_IN();
 	int ret;
-	struct _starpu_mpi_req *barrier_req = malloc(sizeof(struct _starpu_mpi_req));
-	STARPU_ASSERT_MSG(barrier_req, "allocation failed");
-	_starpu_mpi_request_init(barrier_req);
+	struct _starpu_mpi_req *barrier_req;
+
+	_STARPU_MPI_LOG_IN();
+	_starpu_mpi_request_init(&barrier_req);
 
 	/* First wait for *both* all tasks and MPI requests to finish, in case
 	 * some tasks generate MPI requests, MPI requests generate tasks, etc.
@@ -855,11 +668,11 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 
 	if (req->internal_req)
 	{
-		struct _starpu_mpi_copy_handle *chandle = find_chandle(req->mpi_tag, req->srcdst);
-		STARPU_ASSERT_MSG(chandle, "Could not find a copy data handle with the tag %d and the node %d\n", req->mpi_tag, req->srcdst);
-		_STARPU_MPI_DEBUG(3, "Handling deleting of copy_handle structure from the hashmap..\n");
-		delete_chandle(chandle);
-		free(chandle);
+		struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(req->mpi_tag, req->srcdst);
+		STARPU_ASSERT_MSG(early_data_handle, "Could not find a copy data handle with the tag %d and the node %d\n", req->mpi_tag, req->srcdst);
+		_STARPU_MPI_DEBUG(3, "Handling deleting of early_data structure from the hashmap..\n");
+		_starpu_mpi_early_data_delete(early_data_handle);
+		free(early_data_handle);
 	}
 	else
 	{
@@ -911,17 +724,17 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 	_STARPU_MPI_LOG_OUT();
 }
 
-struct _starpu_mpi_copy_cb_args
+struct _starpu_mpi_early_data_cb_args
 {
 	starpu_data_handle_t data_handle;
-	starpu_data_handle_t copy_handle;
+	starpu_data_handle_t early_handle;
 	struct _starpu_mpi_req *req;
 	void *buffer;
 };
 
-static void _starpu_mpi_copy_cb(void* arg)
+static void _starpu_mpi_early_data_cb(void* arg)
 {
-	struct _starpu_mpi_copy_cb_args *args = arg;
+	struct _starpu_mpi_early_data_cb_args *args = arg;
 
 	// We store in the application request the internal MPI
 	// request so that it can be used by starpu_mpi_wait
@@ -931,16 +744,16 @@ static void _starpu_mpi_copy_cb(void* arg)
 	if (args->buffer)
 	{
 		/* Data has been received as a raw memory, it has to be unpacked */
-		struct starpu_data_interface_ops *itf_src = starpu_data_get_interface_ops(args->copy_handle);
+		struct starpu_data_interface_ops *itf_src = starpu_data_get_interface_ops(args->early_handle);
 		struct starpu_data_interface_ops *itf_dst = starpu_data_get_interface_ops(args->data_handle);
 		STARPU_ASSERT_MSG(itf_dst->unpack_data, "The data interface does not define an unpack function\n");
-		itf_dst->unpack_data(args->data_handle, STARPU_MAIN_RAM, args->buffer, itf_src->get_size(args->copy_handle));
+		itf_dst->unpack_data(args->data_handle, STARPU_MAIN_RAM, args->buffer, itf_src->get_size(args->early_handle));
 		free(args->buffer);
 	}
 	else
 	{
-		struct starpu_data_interface_ops *itf = starpu_data_get_interface_ops(args->copy_handle);
-		void* itf_src = starpu_data_get_interface_on_node(args->copy_handle, STARPU_MAIN_RAM);
+		struct starpu_data_interface_ops *itf = starpu_data_get_interface_ops(args->early_handle);
+		void* itf_src = starpu_data_get_interface_on_node(args->early_handle, STARPU_MAIN_RAM);
 		void* itf_dst = starpu_data_get_interface_on_node(args->data_handle, STARPU_MAIN_RAM);
 
 		if (!itf->copy_methods->ram_to_ram)
@@ -955,11 +768,11 @@ static void _starpu_mpi_copy_cb(void* arg)
 		}
 	}
 
-	_STARPU_MPI_DEBUG(3, "Done, handling release of copy_handle..\n");
-	starpu_data_release(args->copy_handle);
+	_STARPU_MPI_DEBUG(3, "Done, handling release of early_handle..\n");
+	starpu_data_release(args->early_handle);
 
-	_STARPU_MPI_DEBUG(3, "Done, handling unregister of copy_handle..\n");
-	starpu_data_unregister_submit(args->copy_handle);
+	_STARPU_MPI_DEBUG(3, "Done, handling unregister of early_handle..\n");
+	starpu_data_unregister_submit(args->early_handle);
 
 	_STARPU_MPI_DEBUG(3, "Done, handling request %p termination of the already received request\n",args->req);
 	// If the request is detached, we need to call _starpu_mpi_handle_request_termination
@@ -1019,41 +832,41 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 		else
 		{
 			/* test whether the receive request has already been submitted internally by StarPU-MPI*/
-			struct _starpu_mpi_copy_handle *chandle = find_chandle(req->mpi_tag, req->srcdst);
+			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(req->mpi_tag, req->srcdst);
 
 			/* Case : the request has already been submitted internally by StarPU.
 			 * We'll asynchronously ask a Read permission over the temporary handle, so as when
-			 * the internal receive will be over, the _starpu_mpi_copy_cb function will be called to
+			 * the internal receive will be over, the _starpu_mpi_early_data_cb function will be called to
 			 * bring the data back to the original data handle associated to the request.*/
-			if (chandle)
+			if (early_data_handle)
 			{
 				STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-				STARPU_PTHREAD_MUTEX_LOCK(&(chandle->req_mutex));
-				while (!(chandle->req_ready))
-					STARPU_PTHREAD_COND_WAIT(&(chandle->req_cond), &(chandle->req_mutex));
-				STARPU_PTHREAD_MUTEX_UNLOCK(&(chandle->req_mutex));
+				STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req_mutex));
+				while (!(early_data_handle->req_ready))
+					STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req_cond), &(early_data_handle->req_mutex));
+				STARPU_PTHREAD_MUTEX_UNLOCK(&(early_data_handle->req_mutex));
 				STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 				_STARPU_MPI_DEBUG(3, "The RECV request %p with tag %d has already been received, copying previously received data into handle's pointer..\n", req, req->mpi_tag);
-				STARPU_ASSERT(req->data_handle != chandle->handle);
+				STARPU_ASSERT(req->data_handle != early_data_handle->handle);
 
-				req->internal_req = chandle->req;
+				req->internal_req = early_data_handle->req;
 
-				struct _starpu_mpi_copy_cb_args *cb_args = malloc(sizeof(struct _starpu_mpi_copy_cb_args));
+				struct _starpu_mpi_early_data_cb_args *cb_args = malloc(sizeof(struct _starpu_mpi_early_data_cb_args));
 				cb_args->data_handle = req->data_handle;
-				cb_args->copy_handle = chandle->handle;
-				cb_args->buffer = chandle->buffer;
+				cb_args->early_handle = early_data_handle->handle;
+				cb_args->buffer = early_data_handle->buffer;
 				cb_args->req = req;
 
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
-				starpu_data_acquire_cb(chandle->handle,STARPU_R,_starpu_mpi_copy_cb,(void*) cb_args);
+				starpu_data_acquire_cb(early_data_handle->handle,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
 			}
 			/* Case : a classic receive request with no send received earlier than expected.
 			 * We just add the pending receive request to the requests' hashmap. */
 			else
 			{
 				_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %d) into the request hashmap\n", req, req->srcdst, req->mpi_tag);
-				add_app_req(req);
+				_starpu_mpi_early_request_add(req);
 			}
 		}
 	}
@@ -1252,14 +1065,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
 	_starpu_mpi_cache_init(MPI_COMM_WORLD);
 
-	{
-		int nb_nodes, k;
-		MPI_Comm_size(MPI_COMM_WORLD, &nb_nodes);
-		_starpu_mpi_app_req_hashmap = malloc(nb_nodes * sizeof(struct _starpu_mpi_req *));
-		for(k=0 ; k<nb_nodes ; k++) _starpu_mpi_app_req_hashmap[k] = NULL;
-		_starpu_mpi_copy_handle_hashmap = malloc(nb_nodes * sizeof(struct _starpu_mpi_copy_handle_hash_list *));
-		for(k=0 ; k<nb_nodes ; k++) _starpu_mpi_copy_handle_hashmap[k] = NULL;
-	}
+	_starpu_mpi_early_request_init(worldsize);
+	_starpu_mpi_early_data_init(worldsize);
 
 	/* notify the main thread that the progression thread is ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
@@ -1276,7 +1083,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
 	{
 		/* shall we block ? */
-		unsigned block = _starpu_mpi_req_list_empty(new_requests) && (_starpu_mpi_app_req_hashmap_count == 0);
+		unsigned block = _starpu_mpi_req_list_empty(new_requests) && _starpu_mpi_early_request_count() == 0;
 
 #ifndef STARPU_MPI_ACTIVITY
 		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
@@ -1316,7 +1123,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		/* If there is no currently submitted header_req submitted to catch envelopes from senders, and there is some pending receive
 		 * requests in our side, we resubmit a header request. */
 		MPI_Request header_req;
-		if ((_starpu_mpi_app_req_hashmap_count > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_copy_handle_hashmap) == 0))
+		if ((_starpu_mpi_early_request_count() > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_early_data_handle_hashmap) == 0))
 		{
 			_STARPU_MPI_DEBUG(3, "Posting a receive to get a data envelop\n");
 			MPI_Irecv(recv_env, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, MPI_ANY_SOURCE, _starpu_mpi_tag, MPI_COMM_WORLD, &header_req);
@@ -1342,14 +1149,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 			{
 				_STARPU_MPI_DEBUG(3, "Searching for application request with tag %d and source %d (size %ld)\n", recv_env->mpi_tag, status.MPI_SOURCE, recv_env->size);
 
-				struct _starpu_mpi_req *found_req = find_app_req(recv_env->mpi_tag, status.MPI_SOURCE);
+				struct _starpu_mpi_req *found_req = _starpu_mpi_early_request_find(recv_env->mpi_tag, status.MPI_SOURCE);
 
 				/* Case : a data will arrive before the matching receive has been submitted in our side of the application.
 				 * We will allow a temporary handle to store the incoming data, by submitting a starpu_mpi_irecv_detached
 				 * on this handle, and register this so as the StarPU-MPI layer can remember it.*/
 				if (!found_req)
 				{
-					_STARPU_MPI_DEBUG(3, "Request with tag %d and source %d not found, creating a copy_handle to receive incoming data..\n", recv_env->mpi_tag, status.MPI_SOURCE);
+					_STARPU_MPI_DEBUG(3, "Request with tag %d and source %d not found, creating a early_handle to receive incoming data..\n", recv_env->mpi_tag, status.MPI_SOURCE);
 
 					starpu_data_handle_t data_handle = NULL;
 
@@ -1357,19 +1164,19 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					data_handle = _starpu_data_get_data_handle_from_tag(recv_env->mpi_tag);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
-					struct _starpu_mpi_copy_handle* chandle = calloc(1, sizeof(struct _starpu_mpi_copy_handle));
-					STARPU_ASSERT(chandle);
-					STARPU_PTHREAD_MUTEX_INIT(&chandle->req_mutex, NULL);
-					STARPU_PTHREAD_COND_INIT(&chandle->req_cond, NULL);
-					chandle->mpi_tag = recv_env->mpi_tag;
-					chandle->env = recv_env;
-					chandle->source = status.MPI_SOURCE;
+					struct _starpu_mpi_early_data_handle* early_data_handle = calloc(1, sizeof(struct _starpu_mpi_early_data_handle));
+					STARPU_ASSERT(early_data_handle);
+					STARPU_PTHREAD_MUTEX_INIT(&early_data_handle->req_mutex, NULL);
+					STARPU_PTHREAD_COND_INIT(&early_data_handle->req_cond, NULL);
+					early_data_handle->mpi_tag = recv_env->mpi_tag;
+					early_data_handle->env = recv_env;
+					early_data_handle->source = status.MPI_SOURCE;
 
 					if (data_handle)
 					{
-						chandle->buffer = NULL;
-						starpu_data_register_same(&chandle->handle, data_handle);
-						add_chandle(chandle);
+						early_data_handle->buffer = NULL;
+						starpu_data_register_same(&early_data_handle->handle, data_handle);
+						_starpu_mpi_early_data_add(early_data_handle);
 					}
 					else
 					{
@@ -1377,15 +1184,17 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 						 * we are going to receive the data as a raw memory, and give it
 						 * to the application when it post a receive for this tag
 						 */
-						_STARPU_MPI_DEBUG(20, "Posting a receive for a data of size %d which has not yet been registered\n", (int)chandle->env->size);
-						chandle->buffer = malloc(chandle->env->size);
-						starpu_vector_data_register(&chandle->handle, STARPU_MAIN_RAM, (uintptr_t) chandle->buffer, chandle->env->size, 1);
-						add_chandle(chandle);
+						_STARPU_MPI_DEBUG(20, "Posting a receive for a data of size %d which has not yet been registered\n", (int)early_data_handle->env->size);
+						early_data_handle->buffer = malloc(early_data_handle->env->size);
+						starpu_vector_data_register(&early_data_handle->handle, STARPU_MAIN_RAM, (uintptr_t) early_data_handle->buffer, early_data_handle->env->size, 1);
+						_starpu_mpi_early_data_add(early_data_handle);
 					}
 
-					_STARPU_MPI_DEBUG(20, "Posting internal detached irecv on copy_handle with tag %d from src %d ..\n", chandle->mpi_tag, status.MPI_SOURCE);
+					_STARPU_MPI_DEBUG(20, "Posting internal detached irecv on early_handle with tag %d from src %d ..\n", early_data_handle->mpi_tag, status.MPI_SOURCE);
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-					chandle->req = _starpu_mpi_irecv_common(chandle->handle, status.MPI_SOURCE, chandle->mpi_tag, MPI_COMM_WORLD, 1, NULL, NULL, 1, 1, recv_env->size);
+					early_data_handle->req = _starpu_mpi_irecv_common(early_data_handle->handle, status.MPI_SOURCE,
+											  early_data_handle->mpi_tag, MPI_COMM_WORLD, 1,
+											  NULL, NULL, 1, 1, recv_env->size);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 					// We wait until the request is pushed in the
@@ -1394,15 +1203,15 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					// on the request and post the corresponding mpi_irecv,
 					// otherwise, it may lead to read data as envelop
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-					STARPU_PTHREAD_MUTEX_LOCK(&(chandle->req->posted_mutex));
-					while (!(chandle->req->posted))
-					     STARPU_PTHREAD_COND_WAIT(&(chandle->req->posted_cond), &(chandle->req->posted_mutex));
-					STARPU_PTHREAD_MUTEX_UNLOCK(&(chandle->req->posted_mutex));
-
-					STARPU_PTHREAD_MUTEX_LOCK(&chandle->req_mutex);
-					chandle->req_ready = 1;
-					STARPU_PTHREAD_COND_BROADCAST(&chandle->req_cond);
-					STARPU_PTHREAD_MUTEX_UNLOCK(&chandle->req_mutex);
+					STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req->posted_mutex));
+					while (!(early_data_handle->req->posted))
+					     STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req->posted_cond), &(early_data_handle->req->posted_mutex));
+					STARPU_PTHREAD_MUTEX_UNLOCK(&(early_data_handle->req->posted_mutex));
+
+					STARPU_PTHREAD_MUTEX_LOCK(&early_data_handle->req_mutex);
+					early_data_handle->req_ready = 1;
+					STARPU_PTHREAD_COND_BROADCAST(&early_data_handle->req_cond);
+					STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_handle->req_mutex);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 				}
 				/* Case : a matching receive has been found for the incoming data, we handle the correct allocation of the pointer associated to
@@ -1411,7 +1220,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 				{
 					_STARPU_MPI_DEBUG(3, "A matching receive has been found for the incoming data with tag %d\n", recv_env->mpi_tag);
 
-					delete_app_req(found_req);
+					_starpu_mpi_early_request_delete(found_req);
 
 					_starpu_mpi_handle_allocate_datatype(found_req->data_handle, &found_req->datatype, &found_req->user_datatype);
 					if (found_req->user_datatype == 0)
@@ -1448,8 +1257,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(detached_requests), "List of detached requests not empty");
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(new_requests), "List of new requests not empty");
 	STARPU_ASSERT_MSG(posted_requests == 0, "Number of posted request is not zero");
-	STARPU_ASSERT_MSG(_starpu_mpi_app_req_hashmap_count == 0, "Number of receive requests left is not zero");
-	STARPU_ASSERT_MSG(_starpu_mpi_copy_handle_hashmap_count == 0, "Number of copy requests left is not zero");
+	_starpu_mpi_early_request_check_termination();
+	_starpu_mpi_early_data_check_termination();
 
 	if (argc_argv->initialize_mpi)
 	{
@@ -1459,27 +1268,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
-	{
-		int n;
-		struct _starpu_mpi_copy_handle_hashlist *hashlist;
-
-		for(n=0 ; n<worldsize; n++)
-		{
-			for(hashlist=_starpu_mpi_copy_handle_hashmap[n]; hashlist != NULL; hashlist=hashlist->hh.next)
-			{
-				_starpu_mpi_copy_handle_list_delete(hashlist->list);
-			}
-			struct _starpu_mpi_copy_handle_hashlist *current, *tmp;
-			HASH_ITER(hh, _starpu_mpi_copy_handle_hashmap[n], current, tmp)
-			{
-				HASH_DEL(_starpu_mpi_copy_handle_hashmap[n], current);
-				free(current);
-			}
-		}
-	}
-
-	free(_starpu_mpi_app_req_hashmap);
-	free(_starpu_mpi_copy_handle_hashmap);
+	_starpu_mpi_early_data_free(worldsize);
+	_starpu_mpi_early_request_free();
 	free(argc_argv);
 	free(recv_env);
 

+ 54 - 5
mpi/src/starpu_mpi_cache.c

@@ -30,6 +30,8 @@ struct _starpu_data_entry
 	starpu_data_handle_t data;
 };
 
+static starpu_pthread_mutex_t *_cache_sent_mutex;
+static starpu_pthread_mutex_t *_cache_received_mutex;
 static struct _starpu_data_entry **_cache_sent_data = NULL;
 static struct _starpu_data_entry **_cache_received_data = NULL;
 int _cache_enabled=1;
@@ -53,11 +55,19 @@ void _starpu_mpi_cache_init(MPI_Comm comm)
 
 	MPI_Comm_size(comm, &nb_nodes);
 	_STARPU_MPI_DEBUG(2, "Initialising htable for cache\n");
+
 	_cache_sent_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
-	for(i=0 ; i<nb_nodes ; i++) _cache_sent_data[i] = NULL;
 	_cache_received_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
-	for(i=0 ; i<nb_nodes ; i++) _cache_received_data[i] = NULL;
-	_starpu_mpi_cache_stats_init(comm);
+	_cache_sent_mutex = malloc(nb_nodes * sizeof(starpu_pthread_mutex_t));
+	_cache_received_mutex = malloc(nb_nodes * sizeof(starpu_pthread_mutex_t));
+
+	for(i=0 ; i<nb_nodes ; i++)
+	{
+		_cache_sent_data[i] = NULL;
+		_cache_received_data[i] = NULL;
+		STARPU_PTHREAD_MUTEX_INIT(&_cache_sent_mutex[i], NULL);
+		STARPU_PTHREAD_MUTEX_INIT(&_cache_received_mutex[i], NULL);
+	}
 }
 
 static
@@ -72,27 +82,44 @@ void _starpu_mpi_cache_empty_tables(int world_size)
 	for(i=0 ; i<world_size ; i++)
 	{
 		struct _starpu_data_entry *entry, *tmp;
+
+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[i]);
 		HASH_ITER(hh, _cache_sent_data[i], entry, tmp)
 		{
 			HASH_DEL(_cache_sent_data[i], entry);
 			free(entry);
 		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[i]);
+
+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[i]);
 		HASH_ITER(hh, _cache_received_data[i], entry, tmp)
 		{
 			HASH_DEL(_cache_received_data[i], entry);
 			_starpu_mpi_cache_stats_dec(-1, i, entry->data);
 			free(entry);
 		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[i]);
 	}
 }
 
 void _starpu_mpi_cache_free(int world_size)
 {
+	int i;
+
 	if (_cache_enabled == 0) return;
 
 	_starpu_mpi_cache_empty_tables(world_size);
 	free(_cache_sent_data);
 	free(_cache_received_data);
+
+	for(i=0 ; i<world_size ; i++)
+	{
+		STARPU_PTHREAD_MUTEX_DESTROY(&_cache_sent_mutex[i]);
+		STARPU_PTHREAD_MUTEX_DESTROY(&_cache_received_mutex[i]);
+	}
+	free(_cache_sent_mutex);
+	free(_cache_received_mutex);
+
 	_starpu_mpi_cache_stats_free();
 }
 
@@ -104,6 +131,8 @@ void _starpu_mpi_cache_flush_sent(MPI_Comm comm, starpu_data_handle_t data)
 	for(n=0 ; n<size ; n++)
 	{
 		struct _starpu_data_entry *already_sent;
+
+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[n]);
 		HASH_FIND_PTR(_cache_sent_data[n], &data, already_sent);
 		if (already_sent)
 		{
@@ -111,6 +140,7 @@ void _starpu_mpi_cache_flush_sent(MPI_Comm comm, starpu_data_handle_t data)
 			HASH_DEL(_cache_sent_data[n], already_sent);
 			free(already_sent);
 		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[n]);
 	}
 }
 
@@ -119,6 +149,7 @@ void _starpu_mpi_cache_flush_recv(starpu_data_handle_t data, int me)
 	int mpi_rank = starpu_data_get_rank(data);
 	struct _starpu_data_entry *already_received;
 
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[mpi_rank]);
 	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
 	if (already_received)
 	{
@@ -131,6 +162,7 @@ void _starpu_mpi_cache_flush_recv(starpu_data_handle_t data, int me)
 		free(already_received);
 		starpu_data_invalidate_submit(data);
 	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[mpi_rank]);
 }
 
 void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
@@ -146,6 +178,8 @@ void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
 	for(i=0 ; i<nb_nodes ; i++)
 	{
 		struct _starpu_data_entry *entry, *tmp;
+
+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[i]);
 		HASH_ITER(hh, _cache_sent_data[i], entry, tmp)
 		{
 			mpi_rank = starpu_data_get_rank(entry->data);
@@ -154,6 +188,9 @@ void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
 			HASH_DEL(_cache_sent_data[i], entry);
 			free(entry);
 		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[i]);
+
+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[i]);
 		HASH_ITER(hh, _cache_received_data[i], entry, tmp)
 		{
 			mpi_rank = starpu_data_get_rank(entry->data);
@@ -163,6 +200,7 @@ void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
 			_starpu_mpi_cache_stats_dec(my_rank, i, entry->data);
 			free(entry);
 		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[i]);
 	}
 }
 
@@ -180,6 +218,7 @@ void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 
 	for(i=0 ; i<nb_nodes ; i++)
 	{
+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[i]);
 		HASH_FIND_PTR(_cache_sent_data[i], &data_handle, avail);
 		if (avail)
 		{
@@ -187,6 +226,9 @@ void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 			HASH_DEL(_cache_sent_data[i], avail);
 			free(avail);
 		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[i]);
+
+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[i]);
 		HASH_FIND_PTR(_cache_received_data[i], &data_handle, avail);
 		if (avail)
 		{
@@ -195,6 +237,7 @@ void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 			_starpu_mpi_cache_stats_dec(my_rank, i, data_handle);
 			free(avail);
 		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[i]);
 	}
 
 	if (mpi_rank != my_rank && mpi_rank != -1)
@@ -203,9 +246,11 @@ void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 
 void *_starpu_mpi_already_received(int src, starpu_data_handle_t data, int mpi_rank)
 {
+	struct _starpu_data_entry *already_received;
+
 	if (_cache_enabled == 0) return NULL;
 
-	struct _starpu_data_entry *already_received;
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[mpi_rank]);
 	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
 	if (already_received == NULL)
 	{
@@ -218,14 +263,17 @@ void *_starpu_mpi_already_received(int src, starpu_data_handle_t data, int mpi_r
 	{
 		_STARPU_MPI_DEBUG(2, "Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
 	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[mpi_rank]);
 	return already_received;
 }
 
 void *_starpu_mpi_already_sent(starpu_data_handle_t data, int dest)
 {
+	struct _starpu_data_entry *already_sent;
+
 	if (_cache_enabled == 0) return NULL;
 
-	struct _starpu_data_entry *already_sent;
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[dest]);
 	HASH_FIND_PTR(_cache_sent_data[dest], &data, already_sent);
 	if (already_sent == NULL)
 	{
@@ -238,6 +286,7 @@ void *_starpu_mpi_already_sent(starpu_data_handle_t data, int dest)
 	{
 		_STARPU_MPI_DEBUG(2, "Do not send data %p to node %d as it has already been sent\n", data, dest);
 	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[dest]);
 	return already_sent;
 }
 

+ 168 - 0
mpi/src/starpu_mpi_early_data.c

@@ -0,0 +1,168 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <starpu_mpi.h>
+#include <starpu_mpi_early_data.h>
+#include <starpu_mpi_private.h>
+#include <common/uthash.h>
+
+struct _starpu_mpi_early_data_handle_hashlist
+{
+	struct _starpu_mpi_early_data_handle_list *list;
+	UT_hash_handle hh;
+	int mpi_tag;
+};
+
+/** stores data which have been received by MPI but have not been requested by the application */
+static struct _starpu_mpi_early_data_handle_hashlist **_starpu_mpi_early_data_handle_hashmap = NULL;
+static int _starpu_mpi_early_data_handle_hashmap_count = 0;
+
+void _starpu_mpi_early_data_init(int world_size)
+{
+	int k;
+
+	_starpu_mpi_early_data_handle_hashmap = malloc(world_size * sizeof(struct _starpu_mpi_early_data_handle_hash_list *));
+	for(k=0 ; k<world_size ; k++) _starpu_mpi_early_data_handle_hashmap[k] = NULL;
+}
+
+void _starpu_mpi_early_data_check_termination()
+{
+	STARPU_ASSERT_MSG(_starpu_mpi_early_data_handle_hashmap_count == 0, "Number of copy requests left is not zero");
+}
+
+void _starpu_mpi_early_data_free(int world_size)
+{
+	int n;
+	struct _starpu_mpi_early_data_handle_hashlist *hashlist;
+
+	for(n=0 ; n<world_size; n++)
+	{
+		for(hashlist=_starpu_mpi_early_data_handle_hashmap[n]; hashlist != NULL; hashlist=hashlist->hh.next)
+		{
+			_starpu_mpi_early_data_handle_list_delete(hashlist->list);
+		}
+		struct _starpu_mpi_early_data_handle_hashlist *current, *tmp;
+		HASH_ITER(hh, _starpu_mpi_early_data_handle_hashmap[n], current, tmp)
+		{
+			HASH_DEL(_starpu_mpi_early_data_handle_hashmap[n], current);
+			free(current);
+		}
+	}
+	free(_starpu_mpi_early_data_handle_hashmap);
+}
+
+#ifdef STARPU_VERBOSE
+static void _starpu_mpi_early_data_handle_display_hash(int source, int tag)
+{
+	struct _starpu_mpi_early_data_handle_hashlist *hashlist;
+	HASH_FIND_INT(_starpu_mpi_early_data_handle_hashmap[source], &tag, hashlist);
+
+	if (hashlist == NULL)
+	{
+		_STARPU_MPI_DEBUG(60, "Hashlist for source %d and tag %d does not exist\n", source, tag);
+	}
+	else if (_starpu_mpi_early_data_handle_list_empty(hashlist->list))
+	{
+		_STARPU_MPI_DEBUG(60, "Hashlist for source %d and tag %d is empty\n", source, tag);
+	}
+	else
+	{
+		struct _starpu_mpi_early_data_handle *cur;
+		for (cur = _starpu_mpi_early_data_handle_list_begin(hashlist->list) ;
+		     cur != _starpu_mpi_early_data_handle_list_end(hashlist->list);
+		     cur = _starpu_mpi_early_data_handle_list_next(cur))
+		{
+			_STARPU_MPI_DEBUG(60, "Element for source %d and tag %d: %p\n", source, tag, cur);
+		}
+	}
+}
+#endif
+
+static
+struct _starpu_mpi_early_data_handle *_starpu_mpi_early_data_pop(int mpi_tag, int source, int delete)
+{
+	struct _starpu_mpi_early_data_handle_hashlist *hashlist;
+	struct _starpu_mpi_early_data_handle *early_data_handle;
+
+	_STARPU_MPI_DEBUG(60, "Looking for early_data_handle with tag %d in the hashmap[%d]\n", mpi_tag, source);
+	HASH_FIND_INT(_starpu_mpi_early_data_handle_hashmap[source], &mpi_tag, hashlist);
+	if (hashlist == NULL)
+	{
+		early_data_handle = NULL;
+	}
+	else
+	{
+		if (_starpu_mpi_early_data_handle_list_empty(hashlist->list))
+		{
+			early_data_handle = NULL;
+		}
+		else
+		{
+			if (delete == 1)
+			{
+				early_data_handle = _starpu_mpi_early_data_handle_list_pop_front(hashlist->list);
+			}
+			else
+			{
+				early_data_handle = _starpu_mpi_early_data_handle_list_front(hashlist->list);
+			}
+		}
+	}
+	_STARPU_MPI_DEBUG(60, "Found early_data_handle %p with tag %d in the hashmap[%d]\n", early_data_handle, mpi_tag, source);
+	return early_data_handle;
+}
+
+struct _starpu_mpi_early_data_handle *_starpu_mpi_early_data_find(int mpi_tag, int source)
+{
+	return _starpu_mpi_early_data_pop(mpi_tag, source, 0);
+}
+
+void _starpu_mpi_early_data_add(struct _starpu_mpi_early_data_handle *early_data_handle)
+{
+	_STARPU_MPI_DEBUG(60, "Trying to add early_data_handle %p with tag %d in the hashmap[%d]\n", early_data_handle, early_data_handle->mpi_tag, early_data_handle->source);
+
+	struct _starpu_mpi_early_data_handle_hashlist *hashlist;
+	HASH_FIND_INT(_starpu_mpi_early_data_handle_hashmap[early_data_handle->source], &early_data_handle->mpi_tag, hashlist);
+	if (hashlist == NULL)
+	{
+		hashlist = malloc(sizeof(struct _starpu_mpi_early_data_handle_hashlist));
+		hashlist->list = _starpu_mpi_early_data_handle_list_new();
+		hashlist->mpi_tag = early_data_handle->mpi_tag;
+		HASH_ADD_INT(_starpu_mpi_early_data_handle_hashmap[early_data_handle->source], mpi_tag, hashlist);
+	}
+	_starpu_mpi_early_data_handle_list_push_back(hashlist->list, early_data_handle);
+	_starpu_mpi_early_data_handle_hashmap_count ++;
+#ifdef STARPU_VERBOSE
+	_starpu_mpi_early_data_handle_display_hash(early_data_handle->source, early_data_handle->mpi_tag);
+#endif
+}
+
+void _starpu_mpi_early_data_delete(struct _starpu_mpi_early_data_handle *early_data_handle)
+{
+	_STARPU_MPI_DEBUG(60, "Trying to delete early_data_handle %p with tag %d in the hashmap[%d]\n", early_data_handle, early_data_handle->mpi_tag, early_data_handle->source);
+	struct _starpu_mpi_early_data_handle *found = _starpu_mpi_early_data_pop(early_data_handle->mpi_tag, early_data_handle->source, 1);
+
+	STARPU_ASSERT_MSG(found == early_data_handle,
+			  "[_starpu_mpi_early_data_delete][error] early_data_handle %p with tag %d is NOT in the hashmap[%d]\n", early_data_handle, early_data_handle->mpi_tag, early_data_handle->source);
+
+	_starpu_mpi_early_data_handle_hashmap_count --;
+#ifdef STARPU_VERBOSE
+	_starpu_mpi_early_data_handle_display_hash(early_data_handle->source, early_data_handle->mpi_tag);
+#endif
+}
+

+ 55 - 0
mpi/src/starpu_mpi_early_data.h

@@ -0,0 +1,55 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_EARLY_DATA_H__
+#define __STARPU_MPI_EARLY_DATA_H__
+
+#include <starpu.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include <common/config.h>
+#include <common/list.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+LIST_TYPE(_starpu_mpi_early_data_handle,
+	  starpu_data_handle_t handle;
+	  struct _starpu_mpi_envelope *env;
+	  struct _starpu_mpi_req *req;
+	  void *buffer;
+	  int mpi_tag;
+	  int source;
+	  int req_ready;
+	  starpu_pthread_mutex_t req_mutex;
+	  starpu_pthread_cond_t req_cond;
+);
+
+void _starpu_mpi_early_data_init(int world_size);
+void _starpu_mpi_early_data_check_termination();
+void _starpu_mpi_early_data_free(int world_size);
+
+struct _starpu_mpi_early_data_handle *_starpu_mpi_early_data_find(int mpi_tag, int source);
+void _starpu_mpi_early_data_add(struct _starpu_mpi_early_data_handle *early_data_handle);
+void _starpu_mpi_early_data_delete(struct _starpu_mpi_early_data_handle *early_data_handle);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_MPI_EARLY_DATA_H__ */

+ 104 - 0
mpi/src/starpu_mpi_early_request.c

@@ -0,0 +1,104 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <starpu_mpi.h>
+#include <starpu_mpi_private.h>
+#include <starpu_mpi_early_request.h>
+#include <common/uthash.h>
+
+/** stores application requests for which data have not been received yet */
+static struct _starpu_mpi_req **_starpu_mpi_app_req_hashmap = NULL;
+static int _starpu_mpi_app_req_hashmap_count = 0;
+
+void _starpu_mpi_early_request_init(int world_size)
+{
+	int k;
+
+	_starpu_mpi_app_req_hashmap = malloc(world_size * sizeof(struct _starpu_mpi_req *));
+	for(k=0 ; k<world_size ; k++) _starpu_mpi_app_req_hashmap[k] = NULL;
+}
+
+void _starpu_mpi_early_request_free()
+{
+	free(_starpu_mpi_app_req_hashmap);
+}
+
+int _starpu_mpi_early_request_count()
+{
+	return _starpu_mpi_app_req_hashmap_count;
+}
+
+void _starpu_mpi_early_request_check_termination()
+{
+	STARPU_ASSERT_MSG(_starpu_mpi_early_request_count() == 0, "Number of receive requests left is not zero");
+}
+
+struct _starpu_mpi_req* _starpu_mpi_early_request_find(int mpi_tag, int source)
+{
+	struct _starpu_mpi_req* req;
+
+	HASH_FIND_INT(_starpu_mpi_app_req_hashmap[source], &mpi_tag, req);
+
+	return req;
+}
+
+void _starpu_mpi_early_request_add(struct _starpu_mpi_req *req)
+{
+	struct _starpu_mpi_req *test_req;
+
+	test_req = _starpu_mpi_early_request_find(req->mpi_tag, req->srcdst);
+
+	if (test_req == NULL)
+	{
+		HASH_ADD_INT(_starpu_mpi_app_req_hashmap[req->srcdst], mpi_tag, req);
+		_starpu_mpi_app_req_hashmap_count ++;
+		_STARPU_MPI_DEBUG(3, "Adding request %p with tag %d in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
+	}
+	else
+	{
+		_STARPU_MPI_DEBUG(3, "[Error] request %p with tag %d already in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
+		int seq_const = starpu_data_get_sequential_consistency_flag(req->data_handle);
+		if (seq_const &&  req->sequential_consistency)
+		{
+			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap[%d], while another request %p with the same tag is already in it. \n Sequential consistency is activated : this is not supported by StarPU.", req, req->mpi_tag, req->srcdst, test_req);
+		}
+		else
+		{
+			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap[%d], while another request %p with the same tag is already in it. \n Sequential consistency isn't activated for this handle : you should want to add dependencies between requests for which the sequential consistency is deactivated.", req, req->mpi_tag, req->srcdst, test_req);
+		}
+	}
+}
+
+void _starpu_mpi_early_request_delete(struct _starpu_mpi_req *req)
+{
+	struct _starpu_mpi_req *test_req;
+
+	test_req = _starpu_mpi_early_request_find(req->mpi_tag, req->srcdst);
+
+	if (test_req != NULL)
+	{
+		HASH_DEL(_starpu_mpi_app_req_hashmap[req->srcdst], req);
+		_starpu_mpi_app_req_hashmap_count --;
+		_STARPU_MPI_DEBUG(3, "Deleting application request %p with tag %d from the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
+	}
+	else
+	{
+		_STARPU_MPI_DEBUG(3, "[Warning] request %p with tag %d is NOT in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
+	}
+}
+

+ 44 - 0
mpi/src/starpu_mpi_early_request.h

@@ -0,0 +1,44 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_EARLY_REQUEST_H__
+#define __STARPU_MPI_EARLY_REQUEST_H__
+
+#include <starpu.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include <common/config.h>
+#include <common/list.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void _starpu_mpi_early_request_init(int world_size);
+void _starpu_mpi_early_request_free();
+int _starpu_mpi_early_request_count();
+void _starpu_mpi_early_request_check_termination();
+
+void _starpu_mpi_early_request_add(struct _starpu_mpi_req *req);
+struct _starpu_mpi_req* _starpu_mpi_early_request_find(int mpi_tag, int source);
+void _starpu_mpi_early_request_delete(struct _starpu_mpi_req *req);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_MPI_EARLY_REQUEST_H__ */

+ 26 - 17
mpi/tests/mpi_earlyrecv.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,9 +21,9 @@
 
 int main(int argc, char **argv)
 {
-	int ret, rank, size, i, nb_requests;
-	starpu_data_handle_t tab_handle[3];
-	starpu_mpi_req request[3];
+	int ret, rank, size, i;
+	starpu_data_handle_t tab_handle[4];
+	starpu_mpi_req request[2] = {NULL, NULL};
 
 	MPI_Init(NULL, NULL);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
@@ -43,11 +43,14 @@ int main(int argc, char **argv)
 	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
-	for(i=0 ; i<3 ; i++)
+	for(i=0 ; i<4 ; i++)
 	{
-		starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&rank, sizeof(int));
-		starpu_data_set_tag(tab_handle[i], i);
-		request[i] = NULL;
+		if (i<3 || rank%2)
+		{
+			// all data are registered on all nodes, bu the 4th data which is not registered on the receiving node
+			starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&rank, sizeof(int));
+			starpu_mpi_data_register(tab_handle[i], i, rank);
+		}
 	}
 
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
@@ -56,23 +59,30 @@ int main(int argc, char **argv)
 
 	if (rank%2)
 	{
+		// this data will be received as an early registered data
 		starpu_mpi_isend(tab_handle[0], &request[0], other_rank, 0, MPI_COMM_WORLD);
+		// this data will be received as an early UNregistered data
+		starpu_mpi_isend(tab_handle[3], &request[1], other_rank, 3, MPI_COMM_WORLD);
+
+		starpu_mpi_send(tab_handle[1], other_rank, 1, MPI_COMM_WORLD);
 		starpu_mpi_recv(tab_handle[2], other_rank, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-		starpu_mpi_isend(tab_handle[1], &request[1], other_rank, 1, MPI_COMM_WORLD);
-		nb_requests = 2;
 	}
 	else
 	{
+		starpu_mpi_recv(tab_handle[1], other_rank, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+		starpu_mpi_send(tab_handle[2], other_rank, 2, MPI_COMM_WORLD);
+
+		// we register the data
+		starpu_variable_data_register(&tab_handle[3], STARPU_MAIN_RAM, (uintptr_t)&rank, sizeof(int));
+		starpu_mpi_data_register(tab_handle[3], 3, rank);
+		starpu_mpi_irecv(tab_handle[3], &request[1], other_rank, 3, MPI_COMM_WORLD);
 		starpu_mpi_irecv(tab_handle[0], &request[0], other_rank, 0, MPI_COMM_WORLD);
-		starpu_mpi_irecv(tab_handle[1], &request[1], other_rank, 1, MPI_COMM_WORLD);
-		starpu_mpi_isend(tab_handle[2], &request[2], other_rank, 2, MPI_COMM_WORLD);
-		nb_requests = 3;
 	}
 
 	int finished=0;
 	while (!finished)
 	{
-		for(i=0 ; i<nb_requests ; i++)
+		for(i=0 ; i<2 ; i++)
 		{
 			if (request[i])
 			{
@@ -83,11 +93,10 @@ int main(int argc, char **argv)
 					FPRINTF_MPI("request[%d] = %d %p\n", i, flag, request[i]);
 			}
 		}
-		finished = request[0] == NULL;
-		for(i=1 ; i<nb_requests ; i++) finished = finished && request[i] == NULL;
+		finished = request[0] == NULL && request[1] == NULL;
 	}
 
-	for(i=0 ; i<3 ; i++)
+	for(i=0 ; i<4 ; i++)
 		starpu_data_unregister(tab_handle[i]);
 
 	starpu_mpi_shutdown();

+ 3 - 3
mpi/tests/mpi_earlyrecv2.c

@@ -131,7 +131,7 @@ int exchange_variable(int rank, int detached)
 	{
 		value[i]=i*rank;
 		starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&value[i], sizeof(int));
-		starpu_data_set_tag(tab_handle[i], i);
+		starpu_mpi_data_register(tab_handle[i], i, rank);
 	}
 	ret = exchange(rank, tab_handle, check_variable, detached);
 	for(i=0 ; i<NB ; i++)
@@ -154,7 +154,7 @@ int exchange_void(int rank, int detached)
 	for(i=0 ; i<NB ; i++)
 	{
 		starpu_void_data_register(&tab_handle[i]);
-		starpu_data_set_tag(tab_handle[i], i);
+		starpu_mpi_data_register(tab_handle[i], i, rank);
 	}
 	ret = exchange(rank, tab_handle, check_void, detached);
 	for(i=0 ; i<NB ; i++)
@@ -191,7 +191,7 @@ int exchange_complex(int rank, int detached)
 		real[i] = (i*rank)+12;
 		imaginary[i] = (i*rank)+45;
 		starpu_complex_data_register(&handle[i], STARPU_MAIN_RAM, &real[i], &imaginary[i], 1);
-		starpu_data_set_tag(handle[i], i);
+		starpu_mpi_data_register(handle[i], i, rank);
 	}
 	ret = exchange(rank, handle, check_complex, detached);
 	for(i=0 ; i<NB ; i++)

+ 1 - 1
sc_hypervisor/examples/cholesky/cholesky_grain_tag.c

@@ -406,7 +406,7 @@ int main(int argc, char **argv)
 	float *test_mat = malloc(size*size*sizeof(float));
 	STARPU_ASSERT(test_mat);
 
-	SSYRK("L", "N", size, size, 1.0f,
+	STARPU_SSYRK("L", "N", size, size, 1.0f,
 				mat, size, 0.0f, test_mat, size);
 
 	FPRINTF(stderr, "comparing results ...\n");

+ 1 - 1
sc_hypervisor/examples/cholesky/cholesky_implicit.c

@@ -290,7 +290,7 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 		float *test_mat = malloc(size*size*sizeof(float));
 		STARPU_ASSERT(test_mat);
 
-		SSYRK("L", "N", size, size, 1.0f,
+		STARPU_SSYRK("L", "N", size, size, 1.0f,
 					mat, size, 0.0f, test_mat, size);
 
 		FPRINTF(stderr, "comparing results ...\n");

+ 5 - 5
sc_hypervisor/examples/cholesky/cholesky_kernels.c

@@ -52,7 +52,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 		if (worker_size == 1)
 		{
 			/* Sequential CPU kernel */
-			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
+			STARPU_SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
 				right, ld12, 1.0f, center, ld22);
 		}
 		else
@@ -66,7 +66,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 			float *new_left = &left[block_size*rank];
 			float *new_center = &center[block_size*rank];
 
-			SGEMM("N", "T", dy, new_dx, dz, -1.0f, new_left, ld21, 
+			STARPU_SGEMM("N", "T", dy, new_dx, dz, -1.0f, new_left, ld21, 
 				right, ld12, 1.0f, new_center, ld22);
 		}
 	}
@@ -117,7 +117,7 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 	switch (s)
 	{
 		case 0:
-			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			STARPU_STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
@@ -177,9 +177,9 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 
 				STARPU_ASSERT(lambda11 != 0.0f);
 		
-				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+				STARPU_SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
 		
-				SSYR("L", nx - z - 1, -1.0f, 
+				STARPU_SSYR("L", nx - z - 1, -1.0f, 
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}

+ 1 - 1
sc_hypervisor/examples/cholesky/cholesky_tag.c

@@ -391,7 +391,7 @@ int main(int argc, char **argv)
 	float *test_mat = malloc(size*size*sizeof(float));
 	STARPU_ASSERT(test_mat);
 
-	SSYRK("L", "N", size, size, 1.0f,
+	STARPU_SSYRK("L", "N", size, size, 1.0f,
 				mat, size, 0.0f, test_mat, size);
 
 	FPRINTF(stderr, "comparing results ...\n");

+ 17 - 2
src/common/fxt.h

@@ -107,8 +107,6 @@
 
 #define _STARPU_FUT_EVENT	0x513c
 
-#define _STARPU_FUT_WORKER_SCHEDULING_START	0x513e
-
 #define _STARPU_FUT_LOCKING_MUTEX	0x5140	
 #define _STARPU_FUT_MUTEX_LOCKED	0x5141	
 
@@ -156,6 +154,11 @@
 #define _STARPU_FUT_BARRIER_WAIT_BEGIN		0x5162
 #define _STARPU_FUT_BARRIER_WAIT_END		0x5163
 
+#define _STARPU_FUT_WORKER_SCHEDULING_START	0x5164
+#define _STARPU_FUT_WORKER_SCHEDULING_END	0x5165
+#define _STARPU_FUT_WORKER_SCHEDULING_PUSH	0x5166
+#define _STARPU_FUT_WORKER_SCHEDULING_POP	0x5167
+
 #ifdef STARPU_USE_FXT
 #include <fxt/fxt.h>
 #include <fxt/fut.h>
@@ -487,6 +490,15 @@ do {										\
 #define _STARPU_TRACE_WORKER_SCHEDULING_START	\
 	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SCHEDULING_START, _starpu_gettid());
 
+#define _STARPU_TRACE_WORKER_SCHEDULING_END	\
+	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SCHEDULING_END, _starpu_gettid());
+
+#define _STARPU_TRACE_WORKER_SCHEDULING_PUSH	\
+	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SCHEDULING_PUSH, _starpu_gettid());
+
+#define _STARPU_TRACE_WORKER_SCHEDULING_POP	\
+	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SCHEDULING_POP, _starpu_gettid());
+
 #define _STARPU_TRACE_WORKER_SLEEP_START	\
 	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SLEEP_START, _starpu_gettid());
 
@@ -760,6 +772,9 @@ do {										\
 #define _STARPU_TRACE_WORKER_DEINIT_START	do {} while(0)
 #define _STARPU_TRACE_WORKER_DEINIT_END(a)	do {} while(0)
 #define _STARPU_TRACE_WORKER_SCHEDULING_START		do {} while(0)
+#define _STARPU_TRACE_WORKER_SCHEDULING_END		do {} while(0)
+#define _STARPU_TRACE_WORKER_SCHEDULING_PUSH		do {} while(0)
+#define _STARPU_TRACE_WORKER_SCHEDULING_POP		do {} while(0)
 #define _STARPU_TRACE_WORKER_SLEEP_START		do {} while(0)
 #define _STARPU_TRACE_WORKER_SLEEP_END		do {} while(0)
 #define _STARPU_TRACE_USER_DEFINED_START		do {} while(0)

+ 4 - 4
src/common/utils.h

@@ -82,10 +82,10 @@
 #  define _STARPU_DEBUG(fmt, ...) do { } while (0)
 #endif
 
-#ifdef STARPU_VERBOSE0
-#  define _STARPU_LOG_IN()             do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s] -->\n", pthread_self(), __starpu_func__ ); }} while(0)
-#  define _STARPU_LOG_OUT()            do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s] <--\n", pthread_self(), __starpu_func__ ); }} while(0)
-#  define _STARPU_LOG_OUT_TAG(outtag)  do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s] <-- (%s)\n", pthread_self(), __starpu_func__, outtag); }} while(0)
+#ifdef STARPU_EXTRA_VERBOSE
+#  define _STARPU_LOG_IN()             do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] -->\n", pthread_self(), __starpu_func__,__FILE__,  __LINE__); }} while(0)
+#  define _STARPU_LOG_OUT()            do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <--\n", pthread_self(), __starpu_func__, __FILE__,  __LINE__); }} while(0)
+#  define _STARPU_LOG_OUT_TAG(outtag)  do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <-- (%s)\n", pthread_self(), __starpu_func__, __FILE__, __LINE__, outtag); }} while(0)
 #else
 #  define _STARPU_LOG_IN()
 #  define _STARPU_LOG_OUT()

+ 16 - 0
src/core/sched_ctx.c

@@ -1157,6 +1157,22 @@ struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsig
 	return sched_ctx->workers;
 }
 
+void starpu_sched_ctx_display_workers(unsigned sched_ctx_id, FILE *f)
+{
+	int *workerids = NULL;
+	unsigned nworkers;
+	unsigned i;
+
+	nworkers = starpu_sched_ctx_get_workers_list(sched_ctx_id, &workerids);
+	fprintf(f, "[sched_ctx %d]: %d worker%s\n", sched_ctx_id, nworkers, nworkers>1?"s":"");
+	for (i = 0; i < nworkers; i++)
+	{
+		char name[256];
+		starpu_worker_get_name(workerids[i], name, 256);
+		fprintf(f, "\t\t%s\n", name);
+	}
+}
+
 unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);

+ 8 - 1
src/core/sched_policy.c

@@ -455,7 +455,14 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 			starpu_pthread_rwlock_t *changing_ctx_mutex = _starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx->id);
 			STARPU_PTHREAD_RWLOCK_RDLOCK(changing_ctx_mutex);
 			nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);
-			ret = nworkers == 0 ? -1 : sched_ctx->sched_policy->push_task(task);
+			if (nworkers == 0)
+				ret = -1;
+			else
+			{
+				_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
+				ret = sched_ctx->sched_policy->push_task(task);
+				_STARPU_TRACE_WORKER_SCHEDULING_POP;
+			}
 			STARPU_PTHREAD_RWLOCK_UNLOCK(changing_ctx_mutex);
 		}
 

+ 26 - 10
src/core/workers.c

@@ -95,12 +95,17 @@ static uint32_t _starpu_worker_exists_and_can_execute(struct starpu_task *task,
 						      enum starpu_worker_archtype arch)
 {
 	int i;
-	int nworkers = starpu_worker_get_count();
-
 	_starpu_codelet_check_deprecated_fields(task->cl);
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
+	struct starpu_worker_collection *workers = sched_ctx->workers;
 
-	for (i = 0; i < nworkers; i++)
-	{
+        struct starpu_sched_ctx_iterator it;
+        if(workers->init_iterator)
+                workers->init_iterator(workers, &it);
+
+        while(workers->has_next(workers, &it))
+        {
+                i = workers->get_next(workers, &it);
 		if (starpu_worker_get_type(i) != arch)
 			continue;
 
@@ -141,7 +146,10 @@ static uint32_t _starpu_worker_exists_and_can_execute(struct starpu_task *task,
 			if (!test_implementation)
 				break;
 
-			if (task->cl->can_execute(i, task, impl))
+			if (task->cl->can_execute)
+				return task->cl->can_execute(i, task, impl);
+
+			if(test_implementation)
 				return 1;
 		}
 	}
@@ -155,11 +163,18 @@ uint32_t _starpu_worker_exists(struct starpu_task *task)
 {
 	_starpu_codelet_check_deprecated_fields(task->cl);
 
-	if (!(task->cl->where & config.worker_mask))
-		return 0;
-
-	if (!task->cl->can_execute)
-		return 1;
+	/* if the task belongs to the init context we can
+	   check out all the worker mask of the machine
+	   if not we should iterate on the workers of the ctx
+	   and verify if it exists a worker able to exec the task */
+	if(task->sched_ctx == 0)
+	{
+		if (!(task->cl->where & config.worker_mask))
+			return 0;
+		
+		if (!task->cl->can_execute)
+			return 1;
+	}
 
 #if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
 	if ((task->cl->where & STARPU_CPU) &&
@@ -186,6 +201,7 @@ uint32_t _starpu_worker_exists(struct starpu_task *task)
 	    _starpu_worker_exists_and_can_execute(task, STARPU_SCC_WORKER))
 		return 1;
 #endif
+
 	return 0;
 }
 

+ 2 - 2
src/datawizard/malloc.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2010, 2012-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -217,7 +217,7 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 end:
 	if (ret == 0)
 	{
-		STARPU_ASSERT(*A);
+		STARPU_ASSERT_MSG(*A, "Failed to allocated memory of size %ld b\n", dim);
 	}
 
 	return ret;

+ 64 - 0
src/debug/traces/starpu_fxt.c

@@ -275,6 +275,28 @@ static void worker_set_state(double time, const char *prefix, long unsigned int
 #endif
 }
 
+static void worker_push_state(double time, const char *prefix, long unsigned int workerid, const char *name)
+{
+#ifdef STARPU_HAVE_POTI
+	char container[STARPU_POTI_STR_LEN];
+	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
+	poti_PushState(time, container, "S", name);
+#else
+	fprintf(out_paje_file, "11	%.9f	%st%lu	S	%s\n", time, prefix, workerid, name);
+#endif
+}
+
+static void worker_pop_state(double time, const char *prefix, long unsigned int workerid)
+{
+#ifdef STARPU_HAVE_POTI
+	char container[STARPU_POTI_STR_LEN];
+	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
+	poti_PopState(time, container, "S");
+#else
+	fprintf(out_paje_file, "12	%.9f	%st%lu	S\n", time, prefix, workerid);
+#endif
+}
+
 static void mpicommthread_set_state(double time, const char *prefix, const char *name)
 {
 #ifdef STARPU_HAVE_POTI
@@ -790,6 +812,36 @@ static void handle_start_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "Sc");
 }
 
+static void handle_end_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	int worker;
+	worker = find_worker_id(ev->param[0]);
+	if (worker < 0) return;
+
+	if (out_paje_file)
+		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "B");
+}
+
+static void handle_push_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	int worker;
+	worker = find_worker_id(ev->param[0]);
+	if (worker < 0) return;
+
+	if (out_paje_file)
+		worker_push_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "Sc");
+}
+
+static void handle_pop_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	int worker;
+	worker = find_worker_id(ev->param[0]);
+	if (worker < 0) return;
+
+	if (out_paje_file)
+		worker_pop_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0]);
+}
+
 static void handle_start_sleep(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 	int worker;
@@ -1506,6 +1558,18 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				handle_start_scheduling(&ev, options);
 				break;
 
+			case _STARPU_FUT_WORKER_SCHEDULING_END:
+				handle_end_scheduling(&ev, options);
+				break;
+
+			case _STARPU_FUT_WORKER_SCHEDULING_PUSH:
+				handle_push_scheduling(&ev, options);
+				break;
+
+			case _STARPU_FUT_WORKER_SCHEDULING_POP:
+				handle_pop_scheduling(&ev, options);
+				break;
+
 			case _STARPU_FUT_WORKER_SLEEP_START:
 				handle_start_sleep(&ev, options);
 				break;

+ 2 - 2
src/debug/traces/starpu_paje.c

@@ -87,14 +87,14 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	fprintf(file, "%%EndEventDef\n");
 	fprintf(file, "%%EventDef	PajePushState	11\n");
 	fprintf(file, "%%	Time	date\n");
-	fprintf(file, "%%	Type	string\n");
 	fprintf(file, "%%	Container	string\n");
+	fprintf(file, "%%	Type	string\n");
 	fprintf(file, "%%	Value	string\n");
 	fprintf(file, "%%EndEventDef\n");
 	fprintf(file, "%%EventDef	PajePopState	12\n");
 	fprintf(file, "%%	Time	date\n");
-	fprintf(file, "%%	Type	string\n");
 	fprintf(file, "%%	Container	string\n");
+	fprintf(file, "%%	Type	string\n");
 	fprintf(file, "%%EndEventDef\n");
 	fprintf(file, "%%EventDef	PajeSetVariable	13\n");
 	fprintf(file, "%%	Time	date\n");

+ 10 - 0
src/drivers/driver_common/driver_common.c

@@ -220,7 +220,15 @@ static void _starpu_worker_set_status_scheduling(int workerid)
 		_STARPU_TRACE_WORKER_SCHEDULING_START;
 		_starpu_worker_set_status(workerid, STATUS_SCHEDULING);
 	}
+}
 
+static void _starpu_worker_set_status_scheduling_done(int workerid)
+{
+	if (_starpu_worker_get_status(workerid) == STATUS_SCHEDULING)
+	{
+		_STARPU_TRACE_WORKER_SCHEDULING_END;
+		_starpu_worker_set_status(workerid, STATUS_UNKNOWN);
+	}
 }
 
 static void _starpu_worker_set_status_sleeping(int workerid)
@@ -355,6 +363,8 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
 
+	_starpu_worker_set_status_scheduling_done(workerid);
+
 	_starpu_worker_set_status_wakeup(workerid);
 	args->spinning_backoff = BACKOFF_MIN;
 

+ 0 - 4
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -547,10 +547,6 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 	{
 		worker = workers->get_next(workers, &it);
 
-		if (worker >= nworkers)
-			/* This is a just-added worker, discard it */
-			continue;
-
 		struct _starpu_fifo_taskq *fifo = dt->queue_array[worker];
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
 		unsigned memory_node = starpu_worker_get_memory_node(worker);

+ 10 - 0
tests/main/starpu_worker_exists.c

@@ -67,21 +67,31 @@ main(int argc, char **argv)
 	task = starpu_task_create();
 	task->cl = &cl;
 	task->destroy = 0;
+	task->sched_ctx = 0;
 
 	cl.can_execute = NULL;
 	ret = _starpu_worker_exists(task);
 	if (!ret)
+	{
+		FPRINTF(stderr, "failure with can_execute=NULL\n");
 		return EXIT_FAILURE;
+	}
 
 	cl.can_execute = can_always_execute;
 	ret = _starpu_worker_exists(task);
 	if (!ret)
+	{
+		FPRINTF(stderr, "failure with can_always_execute\n");
 		return EXIT_FAILURE;
+	}
 
 	cl.can_execute = can_never_execute;
 	ret = _starpu_worker_exists(task);
 	if (ret)
+	{
+		FPRINTF(stderr, "failure with can_never_execute\n");
 		return EXIT_FAILURE;
+	}
 
 	starpu_task_destroy(task);
 	starpu_shutdown();

+ 1 - 0
tests/perfmodels/regression_based.c

@@ -80,6 +80,7 @@ static struct starpu_codelet nl_memset_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {memset_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {memset_opencl, NULL},