Olivier Aumage лет назад: 12
Родитель
Сommit
721ce5f920
57 измененных файлов с 1059 добавлено и 643 удалено
  1. 6 0
      ChangeLog
  2. 5 1
      configure.ac
  3. 1 1
      doc/doxygen/chapters/41configure_options.doxy
  4. 4 0
      doc/doxygen/chapters/api/scheduling_contexts.doxy
  5. 1 1
      examples/axpy/axpy.c
  6. 10 10
      examples/cg/cg.h
  7. 1 1
      examples/cholesky/cholesky_grain_tag.c
  8. 1 1
      examples/cholesky/cholesky_implicit.c
  9. 5 6
      examples/cholesky/cholesky_kernels.c
  10. 1 1
      examples/cholesky/cholesky_tag.c
  11. 79 79
      examples/common/blas.c
  12. 27 27
      examples/common/blas.h
  13. 1 1
      examples/heat/dw_factolu.h
  14. 5 5
      examples/heat/dw_factolu_kernels.c
  15. 7 7
      examples/heat/dw_sparse_cg_kernels.c
  16. 7 7
      examples/heat/heat.c
  17. 9 9
      examples/lu/lu-double.h
  18. 10 10
      examples/lu/lu-float.h
  19. 1 1
      examples/lu/xlu.h
  20. 3 3
      examples/mult/double.h
  21. 3 3
      examples/mult/simple.h
  22. 2 2
      examples/pipeline/pipeline.c
  23. 58 24
      examples/sched_ctx/sched_ctx.c
  24. 8 4
      examples/sched_ctx/sched_ctx_without_sched_policy.c
  25. 6 1
      examples/worker_collections/worker_tree_example.c
  26. 1 1
      gcc-plugin/examples/cholesky/cholesky.c
  27. 4 4
      gcc-plugin/examples/cholesky/cholesky_kernels.c
  28. 2 0
      include/starpu_sched_ctx.h
  29. 1 1
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
  30. 4 4
      mpi/examples/matrix_decomposition/mpi_cholesky_kernels.c
  31. 10 10
      mpi/examples/mpi_lu/mpi_lu-double.h
  32. 10 10
      mpi/examples/mpi_lu/mpi_lu-float.h
  33. 6 2
      mpi/src/Makefile.am
  34. 138 348
      mpi/src/starpu_mpi.c
  35. 54 5
      mpi/src/starpu_mpi_cache.c
  36. 168 0
      mpi/src/starpu_mpi_early_data.c
  37. 55 0
      mpi/src/starpu_mpi_early_data.h
  38. 104 0
      mpi/src/starpu_mpi_early_request.c
  39. 44 0
      mpi/src/starpu_mpi_early_request.h
  40. 26 17
      mpi/tests/mpi_earlyrecv.c
  41. 3 3
      mpi/tests/mpi_earlyrecv2.c
  42. 1 1
      sc_hypervisor/examples/cholesky/cholesky_grain_tag.c
  43. 1 1
      sc_hypervisor/examples/cholesky/cholesky_implicit.c
  44. 5 5
      sc_hypervisor/examples/cholesky/cholesky_kernels.c
  45. 1 1
      sc_hypervisor/examples/cholesky/cholesky_tag.c
  46. 17 2
      src/common/fxt.h
  47. 4 4
      src/common/utils.h
  48. 16 0
      src/core/sched_ctx.c
  49. 8 1
      src/core/sched_policy.c
  50. 26 10
      src/core/workers.c
  51. 2 2
      src/datawizard/malloc.c
  52. 64 0
      src/debug/traces/starpu_fxt.c
  53. 2 2
      src/debug/traces/starpu_paje.c
  54. 10 0
      src/drivers/driver_common/driver_common.c
  55. 0 4
      src/sched_policies/deque_modeling_policy_data_aware.c
  56. 10 0
      tests/main/starpu_worker_exists.c
  57. 1 0
      tests/perfmodels/regression_based.c

+ 6 - 0
ChangeLog

@@ -17,6 +17,12 @@
 StarPU 1.2.0 (svn revision xxxx)
 StarPU 1.2.0 (svn revision xxxx)
 ==============================================
 ==============================================
 
 
+Small features:
+  * New function starpu_sched_ctx_display_workers() to display worker
+    information belonging to a given scheduler context
+  * The option --enable-verbose can be called with
+    --enable-verbose=extra to increase the verbosity
+
 StarPU 1.1.2 (svn revision xxxx)
 StarPU 1.1.2 (svn revision xxxx)
 ==============================================
 ==============================================
 The scheduling context release
 The scheduling context release

+ 5 - 1
configure.ac

@@ -1393,12 +1393,16 @@ fi
 
 
 AC_MSG_CHECKING(whether debug messages should be displayed)
 AC_MSG_CHECKING(whether debug messages should be displayed)
 AC_ARG_ENABLE(verbose, [AS_HELP_STRING([--enable-verbose],
 AC_ARG_ENABLE(verbose, [AS_HELP_STRING([--enable-verbose],
-			[display verbose debug messages])],
+			[display verbose debug messages (--enable-verbose=extra increase the verbosity)])],
 			enable_verbose=$enableval, enable_verbose=no)
 			enable_verbose=$enableval, enable_verbose=no)
 AC_MSG_RESULT($enable_verbose)
 AC_MSG_RESULT($enable_verbose)
 if test x$enable_verbose = xyes; then
 if test x$enable_verbose = xyes; then
 	AC_DEFINE(STARPU_VERBOSE, [1], [display verbose debug messages])
 	AC_DEFINE(STARPU_VERBOSE, [1], [display verbose debug messages])
 fi
 fi
+if test x$enable_verbose = xextra; then
+	AC_DEFINE(STARPU_VERBOSE, [1], [display verbose debug messages])
+	AC_DEFINE(STARPU_EXTRA_VERBOSE, [1], [display verbose debug messages])
+fi
 
 
 AC_MSG_CHECKING(whether coverage testing should be enabled)
 AC_MSG_CHECKING(whether coverage testing should be enabled)
 AC_ARG_ENABLE(coverage, [AS_HELP_STRING([--enable-coverage],
 AC_ARG_ENABLE(coverage, [AS_HELP_STRING([--enable-coverage],

+ 1 - 1
doc/doxygen/chapters/41configure_options.doxy

@@ -42,7 +42,7 @@ Disable assertion checks, which saves computation time.
 \addindex __configure__--enable-verbose
 \addindex __configure__--enable-verbose
 Increase the verbosity of the debugging messages.  This can be disabled
 Increase the verbosity of the debugging messages.  This can be disabled
 at runtime by setting the environment variable \ref STARPU_SILENT to
 at runtime by setting the environment variable \ref STARPU_SILENT to
-any value.
+any value. <c>--enable-verbose=extra</c> increase even more the verbosity.
 
 
 \verbatim
 \verbatim
 $ STARPU_SILENT=1 ./vector_scal
 $ STARPU_SILENT=1 ./vector_scal

+ 4 - 0
doc/doxygen/chapters/api/scheduling_contexts.doxy

@@ -108,6 +108,10 @@ This function removes the workers in \p workerids_ctx from the context
 \p sched_ctx_id. The last argument cannot be greater than
 \p sched_ctx_id. The last argument cannot be greater than
 STARPU_NMAX_SCHED_CTXS.
 STARPU_NMAX_SCHED_CTXS.
 
 
+\fn void starpu_sched_ctx_display_workers(unsigned sched_ctx_id, FILE *f)
+\ingroup API_Scheduling_Contexts
+This function prints on the file \p f the worker names belonging to the context \p sched_ctx_id
+
 \fn void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 \fn void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
 \ingroup API_Scheduling_Contexts
 Delete scheduling context \p sched_ctx_id and transfer remaining
 Delete scheduling context \p sched_ctx_id and transfer remaining

+ 1 - 1
examples/axpy/axpy.c

@@ -31,7 +31,7 @@
 
 
 #include "axpy.h"
 #include "axpy.h"
 
 
-#define AXPY	SAXPY
+#define AXPY	STARPU_SAXPY
 #define CUBLASAXPY	cublasSaxpy
 #define CUBLASAXPY	cublasSaxpy
 
 
 #define N	(16*1024*1024)
 #define N	(16*1024*1024)

+ 10 - 10
examples/cg/cg.h

@@ -30,11 +30,11 @@
 
 
 #ifdef DOUBLE
 #ifdef DOUBLE
 #define TYPE	double
 #define TYPE	double
-#define GEMV	DGEMV
-#define DOT	DDOT
-#define GEMV	DGEMV
-#define AXPY	DAXPY
-#define SCAL	DSCAL
+#define GEMV	STARPU_DGEMV
+#define DOT	STARPU_DDOT
+#define GEMV	STARPU_DGEMV
+#define AXPY	STARPU_DAXPY
+#define SCAL	STARPU_DSCAL
 #define cublasdot	cublasDdot
 #define cublasdot	cublasDdot
 #define cublasscal	cublasDscal
 #define cublasscal	cublasDscal
 #define cublasaxpy	cublasDaxpy
 #define cublasaxpy	cublasDaxpy
@@ -42,11 +42,11 @@
 #define cublasscal	cublasDscal
 #define cublasscal	cublasDscal
 #else
 #else
 #define TYPE	float
 #define TYPE	float
-#define GEMV	SGEMV
-#define DOT	SDOT
-#define GEMV	SGEMV
-#define AXPY	SAXPY
-#define SCAL	SSCAL
+#define GEMV	STARPU_SGEMV
+#define DOT	STARPU_SDOT
+#define GEMV	STARPU_SGEMV
+#define AXPY	STARPU_SAXPY
+#define SCAL	STARPU_SSCAL
 #define cublasdot	cublasSdot
 #define cublasdot	cublasSdot
 #define cublasscal	cublasSscal
 #define cublasscal	cublasSscal
 #define cublasaxpy	cublasSaxpy
 #define cublasaxpy	cublasSaxpy

+ 1 - 1
examples/cholesky/cholesky_grain_tag.c

@@ -392,7 +392,7 @@ int main(int argc, char **argv)
 	float *test_mat = malloc(size*size*sizeof(float));
 	float *test_mat = malloc(size*size*sizeof(float));
 	STARPU_ASSERT(test_mat);
 	STARPU_ASSERT(test_mat);
 
 
-	SSYRK("L", "N", size, size, 1.0f,
+	STARPU_SSYRK("L", "N", size, size, 1.0f,
 				mat, size, 0.0f, test_mat, size);
 				mat, size, 0.0f, test_mat, size);
 
 
 	FPRINTF(stderr, "comparing results ...\n");
 	FPRINTF(stderr, "comparing results ...\n");

+ 1 - 1
examples/cholesky/cholesky_implicit.c

@@ -243,7 +243,7 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 		float *test_mat = malloc(size*size*sizeof(float));
 		float *test_mat = malloc(size*size*sizeof(float));
 		STARPU_ASSERT(test_mat);
 		STARPU_ASSERT(test_mat);
 
 
-		SSYRK("L", "N", size, size, 1.0f,
+		STARPU_SSYRK("L", "N", size, size, 1.0f,
 					mat, size, 0.0f, test_mat, size);
 					mat, size, 0.0f, test_mat, size);
 
 
 		FPRINTF(stderr, "comparing results ...\n");
 		FPRINTF(stderr, "comparing results ...\n");

+ 5 - 6
examples/cholesky/cholesky_kernels.c

@@ -49,7 +49,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 		if (worker_size == 1)
 		if (worker_size == 1)
 		{
 		{
 			/* Sequential CPU kernel */
 			/* Sequential CPU kernel */
-			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
+			STARPU_SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
 				right, ld12, 1.0f, center, ld22);
 				right, ld12, 1.0f, center, ld22);
 		}
 		}
 		else
 		else
@@ -63,7 +63,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 			float *new_left = &left[block_size*rank];
 			float *new_left = &left[block_size*rank];
 			float *new_center = &center[block_size*rank];
 			float *new_center = &center[block_size*rank];
 
 
-			SGEMM("N", "T", dy, new_dx, dz, -1.0f, new_left, ld21, 
+			STARPU_SGEMM("N", "T", dy, new_dx, dz, -1.0f, new_left, ld21, 
 				right, ld12, 1.0f, new_center, ld22);
 				right, ld12, 1.0f, new_center, ld22);
 		}
 		}
 	}
 	}
@@ -113,7 +113,7 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 	switch (s)
 	switch (s)
 	{
 	{
 		case 0:
 		case 0:
-			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			STARPU_STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
@@ -172,9 +172,9 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 
 
 				STARPU_ASSERT(lambda11 != 0.0f);
 				STARPU_ASSERT(lambda11 != 0.0f);
 		
 		
-				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+				STARPU_SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
 		
 		
-				SSYR("L", nx - z - 1, -1.0f, 
+				STARPU_SSYR("L", nx - z - 1, -1.0f, 
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+(z+1)*ld], ld);
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}
 			}
@@ -191,7 +191,6 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 				fprintf(stderr, "Error in Magma: %d\n", ret);
 				fprintf(stderr, "Error in Magma: %d\n", ret);
 				STARPU_ABORT();
 				STARPU_ABORT();
 			}
 			}
-			STARPU_ASSERT(!cures);
 			}
 			}
 #else
 #else
 			{
 			{

+ 1 - 1
examples/cholesky/cholesky_tag.c

@@ -368,7 +368,7 @@ int main(int argc, char **argv)
 	float *test_mat = malloc(size*size*sizeof(float));
 	float *test_mat = malloc(size*size*sizeof(float));
 	STARPU_ASSERT(test_mat);
 	STARPU_ASSERT(test_mat);
 
 
-	SSYRK("L", "N", size, size, 1.0f,
+	STARPU_SSYRK("L", "N", size, size, 1.0f,
 				mat, size, 0.0f, test_mat, size);
 				mat, size, 0.0f, test_mat, size);
 
 
 	FPRINTF(stderr, "comparing results ...\n");
 	FPRINTF(stderr, "comparing results ...\n");

+ 79 - 79
examples/common/blas.c

@@ -23,13 +23,13 @@
 
 
 /*
 /*
     This files contains BLAS wrappers for the different BLAS implementations
     This files contains BLAS wrappers for the different BLAS implementations
-  (eg. REFBLAS, STARPU_ATLAS, GOTOBLAS ...). We assume a Fortran orientation as most
+  (eg. REFBLAS, ATLAS, GOTOBLAS ...). We assume a Fortran orientation as most
   libraries do not supply C-based ordering.
   libraries do not supply C-based ordering.
  */
  */
 
 
 #ifdef STARPU_ATLAS
 #ifdef STARPU_ATLAS
 
 
-inline void SGEMM(char *transa, char *transb, int M, int N, int K, 
+inline void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, 
 			float alpha, const float *A, int lda, const float *B, int ldb, 
 			float alpha, const float *A, int lda, const float *B, int ldb, 
 			float beta, float *C, int ldc)
 			float beta, float *C, int ldc)
 {
 {
@@ -40,7 +40,7 @@ inline void SGEMM(char *transa, char *transb, int M, int N, int K,
 			M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);				
 			M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);				
 }
 }
 
 
-inline void DGEMM(char *transa, char *transb, int M, int N, int K, 
+inline void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
 			double alpha, double *A, int lda, double *B, int ldb, 
 			double alpha, double *A, int lda, double *B, int ldb, 
 			double beta, double *C, int ldc)
 			double beta, double *C, int ldc)
 {
 {
@@ -51,7 +51,7 @@ inline void DGEMM(char *transa, char *transb, int M, int N, int K,
 			M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);				
 			M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);				
 }
 }
 
 
-inline void SGEMV(char *transa, int M, int N, float alpha, float *A, int lda, float *X, int incX, float beta, float *Y, int incY)
+inline void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda, float *X, int incX, float beta, float *Y, int incY)
 {
 {
 	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
 	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
 
 
@@ -59,7 +59,7 @@ inline void SGEMV(char *transa, int M, int N, float alpha, float *A, int lda, fl
 					X, incX, beta, Y, incY);
 					X, incX, beta, Y, incY);
 }
 }
 
 
-inline void DGEMV(char *transa, int M, int N, double alpha, double *A, int lda, double *X, int incX, double beta, double *Y, int incY)
+inline void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda, double *X, int incX, double beta, double *Y, int incY)
 {
 {
 	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
 	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
 
 
@@ -67,27 +67,27 @@ inline void DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
 					X, incX, beta, Y, incY);
 					X, incX, beta, Y, incY);
 }
 }
 
 
-inline float SASUM(int N, float *X, int incX)
+inline float STARPU_SASUM(int N, float *X, int incX)
 {
 {
 	return cblas_sasum(N, X, incX);
 	return cblas_sasum(N, X, incX);
 }
 }
 
 
-inline double DASUM(int N, double *X, int incX)
+inline double STARPU_DASUM(int N, double *X, int incX)
 {
 {
 	return cblas_dasum(N, X, incX);
 	return cblas_dasum(N, X, incX);
 }
 }
 
 
-void SSCAL(int N, float alpha, float *X, int incX)
+void STARPU_SSCAL(int N, float alpha, float *X, int incX)
 {
 {
 	cblas_sscal(N, alpha, X, incX);
 	cblas_sscal(N, alpha, X, incX);
 }
 }
 
 
-void DSCAL(int N, double alpha, double *X, int incX)
+void STARPU_DSCAL(int N, double alpha, double *X, int incX)
 {
 {
 	cblas_dscal(N, alpha, X, incX);
 	cblas_dscal(N, alpha, X, incX);
 }
 }
 
 
-void STRSM (const char *side, const char *uplo, const char *transa,
+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int m, const int n,
                    const char *diag, const int m, const int n,
                    const float alpha, const float *A, const int lda,
                    const float alpha, const float *A, const int lda,
                    float *B, const int ldb)
                    float *B, const int ldb)
@@ -100,7 +100,7 @@ void STRSM (const char *side, const char *uplo, const char *transa,
 	cblas_strsm(CblasColMajor, side_, uplo_, transa_, diag_, m, n, alpha, A, lda, B, ldb);
 	cblas_strsm(CblasColMajor, side_, uplo_, transa_, diag_, m, n, alpha, A, lda, B, ldb);
 }
 }
 
 
-void DTRSM (const char *side, const char *uplo, const char *transa,
+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int m, const int n,
                    const char *diag, const int m, const int n,
                    const double alpha, const double *A, const int lda,
                    const double alpha, const double *A, const int lda,
                    double *B, const int ldb)
                    double *B, const int ldb)
@@ -113,7 +113,7 @@ void DTRSM (const char *side, const char *uplo, const char *transa,
 	cblas_dtrsm(CblasColMajor, side_, uplo_, transa_, diag_, m, n, alpha, A, lda, B, ldb);
 	cblas_dtrsm(CblasColMajor, side_, uplo_, transa_, diag_, m, n, alpha, A, lda, B, ldb);
 }
 }
 
 
-void SSYR (const char *uplo, const int n, const float alpha,
+void STARPU_SSYR (const char *uplo, const int n, const float alpha,
                   const float *x, const int incx, float *A, const int lda)
                   const float *x, const int incx, float *A, const int lda)
 {
 {
 	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
 	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
@@ -121,7 +121,7 @@ void SSYR (const char *uplo, const int n, const float alpha,
 	cblas_ssyr(CblasColMajor, uplo_, n, alpha, x, incx, A, lda); 
 	cblas_ssyr(CblasColMajor, uplo_, n, alpha, x, incx, A, lda); 
 }
 }
 
 
-void SSYRK (const char *uplo, const char *trans, const int n,
+void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
                    const int k, const float alpha, const float *A,
                    const int k, const float alpha, const float *A,
                    const int lda, const float beta, float *C,
                    const int lda, const float beta, float *C,
                    const int ldc)
                    const int ldc)
@@ -132,21 +132,21 @@ void SSYRK (const char *uplo, const char *trans, const int n,
 	cblas_ssyrk(CblasColMajor, uplo_, trans_, n, k, alpha, A, lda, beta, C, ldc); 
 	cblas_ssyrk(CblasColMajor, uplo_, trans_, n, k, alpha, A, lda, beta, C, ldc); 
 }
 }
 
 
-void SGER(const int m, const int n, const float alpha,
+void STARPU_SGER(const int m, const int n, const float alpha,
                   const float *x, const int incx, const float *y,
                   const float *x, const int incx, const float *y,
                   const int incy, float *A, const int lda)
                   const int incy, float *A, const int lda)
 {
 {
 	cblas_sger(CblasColMajor, m, n, alpha, x, incx, y, incy, A, lda);
 	cblas_sger(CblasColMajor, m, n, alpha, x, incx, y, incy, A, lda);
 }
 }
 
 
-void DGER(const int m, const int n, const double alpha,
+void STARPU_DGER(const int m, const int n, const double alpha,
                   const double *x, const int incx, const double *y,
                   const double *x, const int incx, const double *y,
                   const int incy, double *A, const int lda)
                   const int incy, double *A, const int lda)
 {
 {
 	cblas_dger(CblasColMajor, m, n, alpha, x, incx, y, incy, A, lda);
 	cblas_dger(CblasColMajor, m, n, alpha, x, incx, y, incy, A, lda);
 }
 }
 
 
-void STRSV (const char *uplo, const char *trans, const char *diag, 
+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
                    const int n, const float *A, const int lda, float *x, 
                    const int n, const float *A, const int lda, float *x, 
                    const int incx)
                    const int incx)
 {
 {
@@ -157,7 +157,7 @@ void STRSV (const char *uplo, const char *trans, const char *diag,
 	cblas_strsv(CblasColMajor, uplo_, trans_, diag_, n, A, lda, x, incx);
 	cblas_strsv(CblasColMajor, uplo_, trans_, diag_, n, A, lda, x, incx);
 }
 }
 
 
-void STRMM(const char *side, const char *uplo, const char *transA,
+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
                  const char *diag, const int m, const int n,
                  const char *diag, const int m, const int n,
                  const float alpha, const float *A, const int lda,
                  const float alpha, const float *A, const int lda,
                  float *B, const int ldb)
                  float *B, const int ldb)
@@ -170,7 +170,7 @@ void STRMM(const char *side, const char *uplo, const char *transA,
 	cblas_strmm(CblasColMajor, side_, uplo_, transA_, diag_, m, n, alpha, A, lda, B, ldb);
 	cblas_strmm(CblasColMajor, side_, uplo_, transA_, diag_, m, n, alpha, A, lda, B, ldb);
 }
 }
 
 
-void DTRMM(const char *side, const char *uplo, const char *transA,
+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
                  const char *diag, const int m, const int n,
                  const char *diag, const int m, const int n,
                  const double alpha, const double *A, const int lda,
                  const double alpha, const double *A, const int lda,
                  double *B, const int ldb)
                  double *B, const int ldb)
@@ -183,7 +183,7 @@ void DTRMM(const char *side, const char *uplo, const char *transA,
 	cblas_dtrmm(CblasColMajor, side_, uplo_, transA_, diag_, m, n, alpha, A, lda, B, ldb);
 	cblas_dtrmm(CblasColMajor, side_, uplo_, transA_, diag_, m, n, alpha, A, lda, B, ldb);
 }
 }
 
 
-void STRMV(const char *uplo, const char *transA, const char *diag,
+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
                  const int n, const float *A, const int lda, float *X,
                  const int n, const float *A, const int lda, float *X,
                  const int incX)
                  const int incX)
 {
 {
@@ -194,53 +194,53 @@ void STRMV(const char *uplo, const char *transA, const char *diag,
 	cblas_strmv(CblasColMajor, uplo_, transA_, diag_, n, A, lda, X, incX);
 	cblas_strmv(CblasColMajor, uplo_, transA_, diag_, n, A, lda, X, incX);
 }
 }
 
 
-void SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
+void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
 {
 {
 	cblas_saxpy(n, alpha, X, incX, Y, incY);
 	cblas_saxpy(n, alpha, X, incX, Y, incY);
 }
 }
 
 
-void DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
+void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
 {
 {
 	cblas_daxpy(n, alpha, X, incX, Y, incY);
 	cblas_daxpy(n, alpha, X, incX, Y, incY);
 }
 }
 
 
-int ISAMAX (const int n, float *X, const int incX)
+int STARPU_ISAMAX (const int n, float *X, const int incX)
 {
 {
     int retVal;
     int retVal;
     retVal = cblas_isamax(n, X, incX);
     retVal = cblas_isamax(n, X, incX);
     return retVal;
     return retVal;
 }
 }
 
 
-int IDAMAX (const int n, double *X, const int incX)
+int STARPU_IDAMAX (const int n, double *X, const int incX)
 {
 {
     int retVal;
     int retVal;
     retVal = cblas_idamax(n, X, incX);
     retVal = cblas_idamax(n, X, incX);
     return retVal;
     return retVal;
 }
 }
 
 
-float SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
+float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
 {
 {
 	return cblas_sdot(n, x, incx, y, incy);
 	return cblas_sdot(n, x, incx, y, incy);
 }
 }
 
 
-double DDOT(const int n, const double *x, const int incx, const double *y, const int incy)
+double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy)
 {
 {
 	return cblas_ddot(n, x, incx, y, incy);
 	return cblas_ddot(n, x, incx, y, incy);
 }
 }
 
 
-void SSWAP(const int n, float *x, const int incx, float *y, const int incy)
+void STARPU_SSWAP(const int n, float *x, const int incx, float *y, const int incy)
 {
 {
 	cblas_sswap(n, x, incx, y, incy);
 	cblas_sswap(n, x, incx, y, incy);
 }
 }
 
 
-void DSWAP(const int n, double *x, const int incx, double *y, const int incy)
+void STARPU_DSWAP(const int n, double *x, const int incx, double *y, const int incy)
 {
 {
 	cblas_dswap(n, x, incx, y, incy);
 	cblas_dswap(n, x, incx, y, incy);
 }
 }
 
 
 #elif defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS) || defined(STARPU_MKL)
 #elif defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS) || defined(STARPU_MKL)
 
 
-inline void SGEMM(char *transa, char *transb, int M, int N, int K, 
+inline void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, 
 			float alpha, const float *A, int lda, const float *B, int ldb, 
 			float alpha, const float *A, int lda, const float *B, int ldb, 
 			float beta, float *C, int ldc)
 			float beta, float *C, int ldc)
 {
 {
@@ -249,7 +249,7 @@ inline void SGEMM(char *transa, char *transb, int M, int N, int K,
 			 &beta, C, &ldc);	
 			 &beta, C, &ldc);	
 }
 }
 
 
-inline void DGEMM(char *transa, char *transb, int M, int N, int K, 
+inline void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
 			double alpha, double *A, int lda, double *B, int ldb, 
 			double alpha, double *A, int lda, double *B, int ldb, 
 			double beta, double *C, int ldc)
 			double beta, double *C, int ldc)
 {
 {
@@ -259,39 +259,39 @@ inline void DGEMM(char *transa, char *transb, int M, int N, int K,
 }
 }
 
 
 
 
-inline void SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
+inline void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
 		float *X, int incX, float beta, float *Y, int incY)
 		float *X, int incX, float beta, float *Y, int incY)
 {
 {
 	sgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
 	sgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
 }
 }
 
 
-inline void DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
+inline void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
 		double *X, int incX, double beta, double *Y, int incY)
 		double *X, int incX, double beta, double *Y, int incY)
 {
 {
 	dgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
 	dgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
 }
 }
 
 
-inline float SASUM(int N, float *X, int incX)
+inline float STARPU_SASUM(int N, float *X, int incX)
 {
 {
 	return sasum_(&N, X, &incX);
 	return sasum_(&N, X, &incX);
 }
 }
 
 
-inline double DASUM(int N, double *X, int incX)
+inline double STARPU_DASUM(int N, double *X, int incX)
 {
 {
 	return dasum_(&N, X, &incX);
 	return dasum_(&N, X, &incX);
 }
 }
 
 
-void SSCAL(int N, float alpha, float *X, int incX)
+void STARPU_SSCAL(int N, float alpha, float *X, int incX)
 {
 {
 	sscal_(&N, &alpha, X, &incX);
 	sscal_(&N, &alpha, X, &incX);
 }
 }
 
 
-void DSCAL(int N, double alpha, double *X, int incX)
+void STARPU_DSCAL(int N, double alpha, double *X, int incX)
 {
 {
 	dscal_(&N, &alpha, X, &incX);
 	dscal_(&N, &alpha, X, &incX);
 }
 }
 
 
-void STRSM (const char *side, const char *uplo, const char *transa,
+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int m, const int n,
                    const char *diag, const int m, const int n,
                    const float alpha, const float *A, const int lda,
                    const float alpha, const float *A, const int lda,
                    float *B, const int ldb)
                    float *B, const int ldb)
@@ -299,7 +299,7 @@ void STRSM (const char *side, const char *uplo, const char *transa,
 	strsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 	strsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 }
 }
 
 
-void DTRSM (const char *side, const char *uplo, const char *transa,
+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int m, const int n,
                    const char *diag, const int m, const int n,
                    const double alpha, const double *A, const int lda,
                    const double alpha, const double *A, const int lda,
                    double *B, const int ldb)
                    double *B, const int ldb)
@@ -307,13 +307,13 @@ void DTRSM (const char *side, const char *uplo, const char *transa,
 	dtrsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 	dtrsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 }
 }
 
 
-void SSYR (const char *uplo, const int n, const float alpha,
+void STARPU_SSYR (const char *uplo, const int n, const float alpha,
                   const float *x, const int incx, float *A, const int lda)
                   const float *x, const int incx, float *A, const int lda)
 {
 {
 	ssyr_(uplo, &n, &alpha, x, &incx, A, &lda); 
 	ssyr_(uplo, &n, &alpha, x, &incx, A, &lda); 
 }
 }
 
 
-void SSYRK (const char *uplo, const char *trans, const int n,
+void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
                    const int k, const float alpha, const float *A,
                    const int k, const float alpha, const float *A,
                    const int lda, const float beta, float *C,
                    const int lda, const float beta, float *C,
                    const int ldc)
                    const int ldc)
@@ -321,28 +321,28 @@ void SSYRK (const char *uplo, const char *trans, const int n,
 	ssyrk_(uplo, trans, &n, &k, &alpha, A, &lda, &beta, C, &ldc); 
 	ssyrk_(uplo, trans, &n, &k, &alpha, A, &lda, &beta, C, &ldc); 
 }
 }
 
 
-void SGER(const int m, const int n, const float alpha,
+void STARPU_SGER(const int m, const int n, const float alpha,
                   const float *x, const int incx, const float *y,
                   const float *x, const int incx, const float *y,
                   const int incy, float *A, const int lda)
                   const int incy, float *A, const int lda)
 {
 {
 	sger_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
 	sger_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
 }
 }
 
 
-void DGER(const int m, const int n, const double alpha,
+void STARPU_DGER(const int m, const int n, const double alpha,
                   const double *x, const int incx, const double *y,
                   const double *x, const int incx, const double *y,
                   const int incy, double *A, const int lda)
                   const int incy, double *A, const int lda)
 {
 {
 	dger_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
 	dger_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
 }
 }
 
 
-void STRSV (const char *uplo, const char *trans, const char *diag, 
+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
                    const int n, const float *A, const int lda, float *x, 
                    const int n, const float *A, const int lda, float *x, 
                    const int incx)
                    const int incx)
 {
 {
 	strsv_(uplo, trans, diag, &n, A, &lda, x, &incx);
 	strsv_(uplo, trans, diag, &n, A, &lda, x, &incx);
 }
 }
 
 
-void STRMM(const char *side, const char *uplo, const char *transA,
+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
                  const char *diag, const int m, const int n,
                  const char *diag, const int m, const int n,
                  const float alpha, const float *A, const int lda,
                  const float alpha, const float *A, const int lda,
                  float *B, const int ldb)
                  float *B, const int ldb)
@@ -350,7 +350,7 @@ void STRMM(const char *side, const char *uplo, const char *transA,
 	strmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 	strmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 }
 }
 
 
-void DTRMM(const char *side, const char *uplo, const char *transA,
+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
                  const char *diag, const int m, const int n,
                  const char *diag, const int m, const int n,
                  const double alpha, const double *A, const int lda,
                  const double alpha, const double *A, const int lda,
                  double *B, const int ldb)
                  double *B, const int ldb)
@@ -358,38 +358,38 @@ void DTRMM(const char *side, const char *uplo, const char *transA,
 	dtrmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 	dtrmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 }
 }
 
 
-void STRMV(const char *uplo, const char *transA, const char *diag,
+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
                  const int n, const float *A, const int lda, float *X,
                  const int n, const float *A, const int lda, float *X,
                  const int incX)
                  const int incX)
 {
 {
 	strmv_(uplo, transA, diag, &n, A, &lda, X, &incX);
 	strmv_(uplo, transA, diag, &n, A, &lda, X, &incX);
 }
 }
 
 
-void SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
+void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
 {
 {
 	saxpy_(&n, &alpha, X, &incX, Y, &incY);
 	saxpy_(&n, &alpha, X, &incX, Y, &incY);
 }
 }
 
 
-void DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
+void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
 {
 {
 	daxpy_(&n, &alpha, X, &incX, Y, &incY);
 	daxpy_(&n, &alpha, X, &incX, Y, &incY);
 }
 }
 
 
-int ISAMAX (const int n, float *X, const int incX)
+int STARPU_ISAMAX (const int n, float *X, const int incX)
 {
 {
     int retVal;
     int retVal;
     retVal = isamax_ (&n, X, &incX);
     retVal = isamax_ (&n, X, &incX);
     return retVal;
     return retVal;
 }
 }
 
 
-int IDAMAX (const int n, double *X, const int incX)
+int STARPU_IDAMAX (const int n, double *X, const int incX)
 {
 {
     int retVal;
     int retVal;
     retVal = idamax_ (&n, X, &incX);
     retVal = idamax_ (&n, X, &incX);
     return retVal;
     return retVal;
 }
 }
 
 
-float SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
+float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
 {
 {
 	float retVal = 0;
 	float retVal = 0;
 
 
@@ -399,104 +399,104 @@ float SDOT(const int n, const float *x, const int incx, const float *y, const in
 	return retVal;
 	return retVal;
 }
 }
 
 
-double DDOT(const int n, const double *x, const int incx, const double *y, const int incy)
+double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy)
 {
 {
 	return ddot_(&n, x, &incx, y, &incy);
 	return ddot_(&n, x, &incx, y, &incy);
 }
 }
 
 
-void SSWAP(const int n, float *X, const int incX, float *Y, const int incY)
+void STARPU_SSWAP(const int n, float *X, const int incX, float *Y, const int incY)
 {
 {
 	sswap_(&n, X, &incX, Y, &incY);
 	sswap_(&n, X, &incX, Y, &incY);
 }
 }
 
 
-void DSWAP(const int n, double *X, const int incX, double *Y, const int incY)
+void STARPU_DSWAP(const int n, double *X, const int incX, double *Y, const int incY)
 {
 {
 	dswap_(&n, X, &incX, Y, &incY);
 	dswap_(&n, X, &incX, Y, &incY);
 }
 }
 
 
 
 
 #elif defined(STARPU_SIMGRID)
 #elif defined(STARPU_SIMGRID)
-inline void SGEMM(char *transa, char *transb, int M, int N, int K, 
+inline void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, 
 			float alpha, const float *A, int lda, const float *B, int ldb, 
 			float alpha, const float *A, int lda, const float *B, int ldb, 
 			float beta, float *C, int ldc) { }
 			float beta, float *C, int ldc) { }
 
 
-inline void DGEMM(char *transa, char *transb, int M, int N, int K, 
+inline void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
 			double alpha, double *A, int lda, double *B, int ldb, 
 			double alpha, double *A, int lda, double *B, int ldb, 
 			double beta, double *C, int ldc) { }
 			double beta, double *C, int ldc) { }
 
 
-inline void SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
+inline void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
 		float *X, int incX, float beta, float *Y, int incY) { }
 		float *X, int incX, float beta, float *Y, int incY) { }
 
 
-inline void DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
+inline void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
 		double *X, int incX, double beta, double *Y, int incY) { }
 		double *X, int incX, double beta, double *Y, int incY) { }
 
 
-inline float SASUM(int N, float *X, int incX) { }
+inline float STARPU_SASUM(int N, float *X, int incX) { }
 
 
-inline double DASUM(int N, double *X, int incX) { }
+inline double STARPU_DASUM(int N, double *X, int incX) { }
 
 
-void SSCAL(int N, float alpha, float *X, int incX) { }
+void STARPU_SSCAL(int N, float alpha, float *X, int incX) { }
 
 
-void DSCAL(int N, double alpha, double *X, int incX) { }
+void STARPU_DSCAL(int N, double alpha, double *X, int incX) { }
 
 
-void STRSM (const char *side, const char *uplo, const char *transa,
+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int m, const int n,
                    const char *diag, const int m, const int n,
                    const float alpha, const float *A, const int lda,
                    const float alpha, const float *A, const int lda,
                    float *B, const int ldb) { }
                    float *B, const int ldb) { }
 
 
-void DTRSM (const char *side, const char *uplo, const char *transa,
+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int m, const int n,
                    const char *diag, const int m, const int n,
                    const double alpha, const double *A, const int lda,
                    const double alpha, const double *A, const int lda,
                    double *B, const int ldb) { }
                    double *B, const int ldb) { }
 
 
-void SSYR (const char *uplo, const int n, const float alpha,
+void STARPU_SSYR (const char *uplo, const int n, const float alpha,
                   const float *x, const int incx, float *A, const int lda) { }
                   const float *x, const int incx, float *A, const int lda) { }
 
 
-void SSYRK (const char *uplo, const char *trans, const int n,
+void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
                    const int k, const float alpha, const float *A,
                    const int k, const float alpha, const float *A,
                    const int lda, const float beta, float *C,
                    const int lda, const float beta, float *C,
                    const int ldc) { }
                    const int ldc) { }
 
 
-void SGER(const int m, const int n, const float alpha,
+void STARPU_SGER(const int m, const int n, const float alpha,
                   const float *x, const int incx, const float *y,
                   const float *x, const int incx, const float *y,
                   const int incy, float *A, const int lda) { }
                   const int incy, float *A, const int lda) { }
 
 
-void DGER(const int m, const int n, const double alpha,
+void STARPU_DGER(const int m, const int n, const double alpha,
                   const double *x, const int incx, const double *y,
                   const double *x, const int incx, const double *y,
                   const int incy, double *A, const int lda) { }
                   const int incy, double *A, const int lda) { }
 
 
-void STRSV (const char *uplo, const char *trans, const char *diag, 
+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
                    const int n, const float *A, const int lda, float *x, 
                    const int n, const float *A, const int lda, float *x, 
                    const int incx) { }
                    const int incx) { }
 
 
-void STRMM(const char *side, const char *uplo, const char *transA,
+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
                  const char *diag, const int m, const int n,
                  const char *diag, const int m, const int n,
                  const float alpha, const float *A, const int lda,
                  const float alpha, const float *A, const int lda,
                  float *B, const int ldb) { }
                  float *B, const int ldb) { }
 
 
-void DTRMM(const char *side, const char *uplo, const char *transA,
+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
                  const char *diag, const int m, const int n,
                  const char *diag, const int m, const int n,
                  const double alpha, const double *A, const int lda,
                  const double alpha, const double *A, const int lda,
                  double *B, const int ldb) { }
                  double *B, const int ldb) { }
 
 
-void STRMV(const char *uplo, const char *transA, const char *diag,
+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
                  const int n, const float *A, const int lda, float *X,
                  const int n, const float *A, const int lda, float *X,
                  const int incX) { }
                  const int incX) { }
 
 
-void SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY) { }
+void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY) { }
 
 
-void DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY) { }
+void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY) { }
 
 
-int ISAMAX (const int n, float *X, const int incX) { }
+int STARPU_ISAMAX (const int n, float *X, const int incX) { }
 
 
-int IDAMAX (const int n, double *X, const int incX) { }
+int STARPU_IDAMAX (const int n, double *X, const int incX) { }
 
 
-float SDOT(const int n, const float *x, const int incx, const float *y, const int incy) { }
+float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy) { }
 
 
-double DDOT(const int n, const double *x, const int incx, const double *y, const int incy) { }
+double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy) { }
 
 
-void SSWAP(const int n, float *X, const int incX, float *Y, const int incY) { }
+void STARPU_SSWAP(const int n, float *X, const int incX, float *Y, const int incY) { }
 
 
-void DSWAP(const int n, double *X, const int incX, double *Y, const int incY) { }
+void STARPU_DSWAP(const int n, double *X, const int incX, double *Y, const int incY) { }
 
 
 
 
 #else
 #else

+ 27 - 27
examples/common/blas.h

@@ -24,63 +24,63 @@
 #include <cblas.h>
 #include <cblas.h>
 #endif
 #endif
 
 
-void SGEMM(char *transa, char *transb, int M, int N, int K, float alpha, const float *A, int lda, 
+void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, float alpha, const float *A, int lda, 
 		const float *B, int ldb, float beta, float *C, int ldc);
 		const float *B, int ldb, float beta, float *C, int ldc);
-void DGEMM(char *transa, char *transb, int M, int N, int K, double alpha, double *A, int lda, 
+void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, double alpha, double *A, int lda, 
 		double *B, int ldb, double beta, double *C, int ldc);
 		double *B, int ldb, double beta, double *C, int ldc);
-void SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
+void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
 		float *X, int incX, float beta, float *Y, int incY);
 		float *X, int incX, float beta, float *Y, int incY);
-void DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
+void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
 		double *X, int incX, double beta, double *Y, int incY);
 		double *X, int incX, double beta, double *Y, int incY);
-float SASUM(int N, float *X, int incX);
-double DASUM(int N, double *X, int incX);
-void SSCAL(int N, float alpha, float *X, int incX);
-void DSCAL(int N, double alpha, double *X, int incX);
-void STRSM (const char *side, const char *uplo, const char *transa,
+float STARPU_SASUM(int N, float *X, int incX);
+double STARPU_DASUM(int N, double *X, int incX);
+void STARPU_SSCAL(int N, float alpha, float *X, int incX);
+void STARPU_DSCAL(int N, double alpha, double *X, int incX);
+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int m, const int n,
                    const char *diag, const int m, const int n,
                    const float alpha, const float *A, const int lda,
                    const float alpha, const float *A, const int lda,
                    float *B, const int ldb);
                    float *B, const int ldb);
-void DTRSM (const char *side, const char *uplo, const char *transa,
+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int m, const int n,
                    const char *diag, const int m, const int n,
                    const double alpha, const double *A, const int lda,
                    const double alpha, const double *A, const int lda,
                    double *B, const int ldb);
                    double *B, const int ldb);
-void DGEMM(char *transa, char *transb, int M, int N, int K, 
+void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
 			double alpha, double *A, int lda, double *B, int ldb, 
 			double alpha, double *A, int lda, double *B, int ldb, 
 			double beta, double *C, int ldc);
 			double beta, double *C, int ldc);
-void SSYR (const char *uplo, const int n, const float alpha,
+void STARPU_SSYR (const char *uplo, const int n, const float alpha,
                   const float *x, const int incx, float *A, const int lda);
                   const float *x, const int incx, float *A, const int lda);
-void SSYRK (const char *uplo, const char *trans, const int n,
+void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
                    const int k, const float alpha, const float *A,
                    const int k, const float alpha, const float *A,
                    const int lda, const float beta, float *C,
                    const int lda, const float beta, float *C,
                    const int ldc);
                    const int ldc);
-void SGER (const int m, const int n, const float alpha,
+void STARPU_SGER (const int m, const int n, const float alpha,
                   const float *x, const int incx, const float *y,
                   const float *x, const int incx, const float *y,
                   const int incy, float *A, const int lda);
                   const int incy, float *A, const int lda);
-void DGER(const int m, const int n, const double alpha,
+void STARPU_DGER(const int m, const int n, const double alpha,
                   const double *x, const int incx, const double *y,
                   const double *x, const int incx, const double *y,
                   const int incy, double *A, const int lda);
                   const int incy, double *A, const int lda);
-void STRSV (const char *uplo, const char *trans, const char *diag, 
+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
                    const int n, const float *A, const int lda, float *x, 
                    const int n, const float *A, const int lda, float *x, 
                    const int incx);
                    const int incx);
-void STRMM(const char *side, const char *uplo, const char *transA,
+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
                  const char *diag, const int m, const int n,
                  const char *diag, const int m, const int n,
                  const float alpha, const float *A, const int lda,
                  const float alpha, const float *A, const int lda,
                  float *B, const int ldb);
                  float *B, const int ldb);
-void DTRMM(const char *side, const char *uplo, const char *transA,
+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
                  const char *diag, const int m, const int n,
                  const char *diag, const int m, const int n,
                  const double alpha, const double *A, const int lda,
                  const double alpha, const double *A, const int lda,
                  double *B, const int ldb);
                  double *B, const int ldb);
-void STRMV(const char *uplo, const char *transA, const char *diag,
+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
                  const int n, const float *A, const int lda, float *X,
                  const int n, const float *A, const int lda, float *X,
                  const int incX);
                  const int incX);
-void SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incy);
-void DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY);
-int ISAMAX (const int n, float *X, const int incX);
-int IDAMAX (const int n, double *X, const int incX);
-float SDOT(const int n, const float *x, const int incx, const float *y, const int incy);
-double DDOT(const int n, const double *x, const int incx, const double *y, const int incy);
-void SSWAP(const int n, float *x, const int incx, float *y, const int incy);
-void DSWAP(const int n, double *x, const int incx, double *y, const int incy);
+void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incy);
+void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY);
+int STARPU_ISAMAX (const int n, float *X, const int incX);
+int STARPU_IDAMAX (const int n, double *X, const int incX);
+float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy);
+double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy);
+void STARPU_SSWAP(const int n, float *x, const int incx, float *y, const int incy);
+void STARPU_DSWAP(const int n, double *x, const int incx, double *y, const int incy);
 
 
 #if defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS) || defined(STARPU_MKL)
 #if defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS) || defined(STARPU_MKL)
 
 

+ 1 - 1
examples/heat/dw_factolu.h

@@ -141,7 +141,7 @@ static void STARPU_ATTRIBUTE_UNUSED compare_A_LU(float *A, float *LU,
 
 
 
 
         /* now A_err = L, compute L*U */
         /* now A_err = L, compute L*U */
-	STRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
+	STARPU_STRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
 
 
 	float max_err = 0.0f;
 	float max_err = 0.0f;
 	for (i = 0; i < size ; i++)
 	for (i = 0; i < size ; i++)

+ 5 - 5
examples/heat/dw_factolu_kernels.c

@@ -124,7 +124,7 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU
 	switch (s)
 	switch (s)
 	{
 	{
 		case 0:
 		case 0:
-			SGEMM("N", "N",	dy, dx, dz, 
+			STARPU_SGEMM("N", "N",	dy, dx, dz, 
 				-1.0f, left, ld21, right, ld12,
 				-1.0f, left, ld21, right, ld12,
 					     1.0f, center, ld22);
 					     1.0f, center, ld22);
 			break;
 			break;
@@ -189,7 +189,7 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, STARPU_ATT
 	switch (s)
 	switch (s)
 	{
 	{
 		case 0:
 		case 0:
-			STRSM("L", "L", "N", "N",
+			STARPU_STRSM("L", "L", "N", "N",
 					 nx12, ny12, 1.0f, sub11, ld11, sub12, ld12);
 					 nx12, ny12, 1.0f, sub11, ld11, sub12, ld12);
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -251,7 +251,7 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, STARPU_ATT
 	switch (s)
 	switch (s)
 	{
 	{
 		case 0:
 		case 0:
-			STRSM("R", "U", "N", "U", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			STARPU_STRSM("R", "U", "N", "U", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
@@ -325,9 +325,9 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATT
 				pivot = sub11[z+z*ld];
 				pivot = sub11[z+z*ld];
 				STARPU_ASSERT(pivot != 0.0f);
 				STARPU_ASSERT(pivot != 0.0f);
 		
 		
-				SSCAL(nx - z - 1, (1.0f/pivot), &sub11[z+(z+1)*ld], ld);
+				STARPU_SSCAL(nx - z - 1, (1.0f/pivot), &sub11[z+(z+1)*ld], ld);
 		
 		
-				SGER(nx - z - 1, nx - z - 1, -1.0f,
+				STARPU_SGER(nx - z - 1, nx - z - 1, -1.0f,
 						&sub11[z+(z+1)*ld], ld,
 						&sub11[z+(z+1)*ld], ld,
 						&sub11[(z+1)+z*ld], 1,
 						&sub11[(z+1)+z*ld], 1,
 						&sub11[(z+1) + (z+1)*ld],ld);
 						&sub11[(z+1) + (z+1)*ld],ld);

+ 7 - 7
examples/heat/dw_sparse_cg_kernels.c

@@ -126,7 +126,7 @@ void cpu_codelet_func_3(void *descr[], void *arg)
 	vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = (int)STARPU_VECTOR_GET_NX(descr[0]);
 	size = (int)STARPU_VECTOR_GET_NX(descr[0]);
 
 
-	dot = SDOT(size, vec, 1, vec, 1);
+	dot = STARPU_SDOT(size, vec, 1, vec, 1);
 
 
 	fprintf(stderr, "func 3 : DOT = %f\n", dot);
 	fprintf(stderr, "func 3 : DOT = %f\n", dot);
 
 
@@ -218,7 +218,7 @@ void cpu_codelet_func_5(void *descr[], void *arg)
 	STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
 	STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
 
-	dot = SDOT(size, vecd, 1, vecq, 1);
+	dot = STARPU_SDOT(size, vecd, 1, vecq, 1);
 
 
 	pb->alpha = pb->delta_new / dot;
 	pb->alpha = pb->delta_new / dot;
 }
 }
@@ -265,7 +265,7 @@ void cpu_codelet_func_6(void *descr[], void *arg)
 
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
 
-	SAXPY(size, pb->alpha, vecd, 1, vecx, 1);
+	STARPU_SAXPY(size, pb->alpha, vecd, 1, vecx, 1);
 }
 }
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -304,7 +304,7 @@ void cpu_codelet_func_7(void *descr[], void *arg)
 
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
 
-	SAXPY(size, -pb->alpha, vecq, 1, vecr, 1);
+	STARPU_SAXPY(size, -pb->alpha, vecq, 1, vecr, 1);
 }
 }
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -344,7 +344,7 @@ void cpu_codelet_func_8(void *descr[], void *arg)
 	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
 
-	dot = SDOT(size, vecr, 1, vecr, 1);
+	dot = STARPU_SDOT(size, vecr, 1, vecr, 1);
 
 
 	pb->delta_old = pb->delta_new;
 	pb->delta_old = pb->delta_new;
 	pb->delta_new = dot;
 	pb->delta_new = dot;
@@ -392,10 +392,10 @@ void cpu_codelet_func_9(void *descr[], void *arg)
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
 
 	/* d = beta d */
 	/* d = beta d */
-	SSCAL(size, pb->beta, vecd, 1);
+	STARPU_SSCAL(size, pb->beta, vecd, 1);
 
 
 	/* d = r + d */
 	/* d = r + d */
-	SAXPY (size, 1.0f, vecr, 1, vecd, 1);
+	STARPU_SAXPY (size, 1.0f, vecr, 1, vecd, 1);
 }
 }
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA

+ 7 - 7
examples/heat/heat.c

@@ -362,10 +362,10 @@ static void solve_system(unsigned size, unsigned subsize, float *result, int *Re
 	}
 	}
 
 
 		/* L */
 		/* L */
-		STRSV("L", "N", "N", subsize, A, subsize, B, 1);
+		STARPU_STRSV("L", "N", "N", subsize, A, subsize, B, 1);
 	
 	
 		/* U */
 		/* U */
-	        STRSV("U", "N", "U", subsize, A, subsize, B, 1);
+	        STARPU_STRSV("U", "N", "U", subsize, A, subsize, B, 1);
 	
 	
 		STARPU_ASSERT(DIM == size);
 		STARPU_ASSERT(DIM == size);
 	
 	
@@ -378,19 +378,19 @@ static void solve_system(unsigned size, unsigned subsize, float *result, int *Re
 	
 	
 	
 	
 		/* LUB = U * LUB */
 		/* LUB = U * LUB */
-		STRMV("U", "N", "U", subsize, A, subsize, LUB, 1);
+		STARPU_STRMV("U", "N", "U", subsize, A, subsize, LUB, 1);
 		
 		
 		/* LUB = L * LUB */
 		/* LUB = L * LUB */
-		STRMV("L", "N", "N", subsize, A, subsize, LUB, 1);
+		STARPU_STRMV("L", "N", "N", subsize, A, subsize, LUB, 1);
 	
 	
 		/* LUB -= B */
 		/* LUB -= B */
-		SAXPY(subsize, -1.0f, savedB, 1, LUB, 1);
+		STARPU_SAXPY(subsize, -1.0f, savedB, 1, LUB, 1);
 	
 	
 		/* check if LUB is close to the 0 vector */
 		/* check if LUB is close to the 0 vector */
-		int maxind = ISAMAX(subsize, LUB, 1);
+		int maxind = STARPU_ISAMAX(subsize, LUB, 1);
 		FPRINTF(stderr, "max error (LUX - B) = %e\n",LUB[maxind - 1]);
 		FPRINTF(stderr, "max error (LUX - B) = %e\n",LUB[maxind - 1]);
 
 
-		float sum = SASUM(subsize, LUB, 1);
+		float sum = STARPU_SASUM(subsize, LUB, 1);
 		FPRINTF(stderr,"avg. error %e\n", sum/subsize);
 		FPRINTF(stderr,"avg. error %e\n", sum/subsize);
 	
 	
 		free(LUB);
 		free(LUB);

+ 9 - 9
examples/lu/lu-double.h

@@ -33,16 +33,16 @@
 #define CUBLAS_SWAP	cublasDswap
 #define CUBLAS_SWAP	cublasDswap
 #define CUBLAS_IAMAX	cublasIdamax
 #define CUBLAS_IAMAX	cublasIdamax
 
 
-#define CPU_GEMM	DGEMM
-#define CPU_TRSM	DTRSM
-#define CPU_SCAL	DSCAL
-#define CPU_GER		DGER
-#define CPU_SWAP	DSWAP
+#define CPU_GEMM	STARPU_DGEMM
+#define CPU_TRSM	STARPU_DTRSM
+#define CPU_SCAL	STARPU_DSCAL
+#define CPU_GER		STARPU_DGER
+#define CPU_SWAP	STARPU_DSWAP
 
 
-#define CPU_TRMM	DTRMM
-#define CPU_AXPY	DAXPY
-#define CPU_ASUM	DASUM
-#define CPU_IAMAX	IDAMAX
+#define CPU_TRMM	STARPU_DTRMM
+#define CPU_AXPY	STARPU_DAXPY
+#define CPU_ASUM	STARPU_DASUM
+#define CPU_IAMAX	STARPU_IDAMAX
 
 
 #define PIVOT_THRESHHOLD	10e-10
 #define PIVOT_THRESHHOLD	10e-10
 
 

+ 10 - 10
examples/lu/lu-float.h

@@ -35,16 +35,16 @@
 #define CUBLAS_SWAP	cublasSswap
 #define CUBLAS_SWAP	cublasSswap
 #define CUBLAS_IAMAX	cublasIsamax
 #define CUBLAS_IAMAX	cublasIsamax
 
 
-#define CPU_GEMM	SGEMM
-#define CPU_TRSM	STRSM
-#define CPU_SCAL	SSCAL
-#define CPU_GER		SGER
-#define CPU_SWAP	SSWAP
-
-#define CPU_TRMM	STRMM
-#define CPU_AXPY	SAXPY
-#define CPU_ASUM	SASUM
-#define CPU_IAMAX	ISAMAX
+#define CPU_GEMM	STARPU_SGEMM
+#define CPU_TRSM	STARPU_STRSM
+#define CPU_SCAL	STARPU_SSCAL
+#define CPU_GER		STARPU_SGER
+#define CPU_SWAP	STARPU_SSWAP
+
+#define CPU_TRMM	STARPU_STRMM
+#define CPU_AXPY	STARPU_SAXPY
+#define CPU_ASUM	STARPU_SASUM
+#define CPU_IAMAX	STARPU_ISAMAX
 
 
 #define PIVOT_THRESHHOLD	10e-5
 #define PIVOT_THRESHHOLD	10e-5
 
 

+ 1 - 1
examples/lu/xlu.h

@@ -60,7 +60,7 @@ static void STARPU_ATTRIBUTE_UNUSED compare_A_LU(float *A, float *LU,
 	}
 	}
 
 
         /* now A_err = L, compute L*U */
         /* now A_err = L, compute L*U */
-	STRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
+	STARPU_STRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
 
 
 	float max_err = 0.0f;
 	float max_err = 0.0f;
 	for (i = 0; i < size ; i++)
 	for (i = 0; i < size ; i++)

+ 3 - 3
examples/mult/double.h

@@ -17,9 +17,9 @@
 #define TYPE	double
 #define TYPE	double
 
 
 #define CUBLAS_GEMM cublasDgemm
 #define CUBLAS_GEMM cublasDgemm
-#define CPU_GEMM	DGEMM
-#define CPU_ASUM	DASUM
-#define CPU_IAMAX	IDAMAX
+#define CPU_GEMM	STARPU_DGEMM
+#define CPU_ASUM	STARPU_DASUM
+#define CPU_IAMAX	STARPU_IDAMAX
 #define STARPU_GEMM(name)	starpu_dgemm_##name
 #define STARPU_GEMM(name)	starpu_dgemm_##name
 
 
 #define str(s) #s
 #define str(s) #s

+ 3 - 3
examples/mult/simple.h

@@ -17,9 +17,9 @@
 #define TYPE	float
 #define TYPE	float
 
 
 #define CUBLAS_GEMM cublasSgemm
 #define CUBLAS_GEMM cublasSgemm
-#define CPU_GEMM	SGEMM
-#define CPU_ASUM	SASUM
-#define CPU_IAMAX	ISAMAX
+#define CPU_GEMM	STARPU_SGEMM
+#define CPU_ASUM	STARPU_SASUM
+#define CPU_IAMAX	STARPU_ISAMAX
 #define STARPU_GEMM(name)	starpu_sgemm_##name
 #define STARPU_GEMM(name)	starpu_sgemm_##name
 
 
 #define str(s) #s
 #define str(s) #s

+ 2 - 2
examples/pipeline/pipeline.c

@@ -90,7 +90,7 @@ void pipeline_cpu_axpy(void *descr[], void *arg)
 	float *y = (float *) STARPU_VECTOR_GET_PTR(descr[1]);
 	float *y = (float *) STARPU_VECTOR_GET_PTR(descr[1]);
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
 
 
-	SAXPY(n, 1., x, 1, y, 1);
+	STARPU_SAXPY(n, 1., x, 1, y, 1);
 }
 }
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -129,7 +129,7 @@ void pipeline_cpu_sum(void *descr[], void *_args)
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
 	float y;
 	float y;
 
 
-	y = SASUM(n, x, 1);
+	y = STARPU_SASUM(n, x, 1);
 
 
 	FPRINTF(stderr,"CPU finished with %f\n", y);
 	FPRINTF(stderr,"CPU finished with %f\n", y);
 }
 }

+ 58 - 24
examples/sched_ctx/sched_ctx.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010-2013  Université de Bordeaux 1
- * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  *
  *
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
+OB */
 
 
 #include <starpu.h>
 #include <starpu.h>
 
 
@@ -26,16 +26,20 @@
 int tasks_executed = 0;
 int tasks_executed = 0;
 starpu_pthread_mutex_t mut;
 starpu_pthread_mutex_t mut;
 
 
-static void sched_ctx_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg STARPU_ATTRIBUTE_UNUSED)
+static void sched_ctx_cpu_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg STARPU_ATTRIBUTE_UNUSED)
 {
 {
 	starpu_pthread_mutex_lock(&mut);
 	starpu_pthread_mutex_lock(&mut);
 	tasks_executed++;
 	tasks_executed++;
 	starpu_pthread_mutex_unlock(&mut);
 	starpu_pthread_mutex_unlock(&mut);
 }
 }
 
 
-static struct starpu_codelet sched_ctx_codelet =
+static void sched_ctx_cuda_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg STARPU_ATTRIBUTE_UNUSED)
 {
 {
-	.cpu_funcs = {sched_ctx_func, NULL},
+}
+
+static struct starpu_codelet sched_ctx_codelet1 =
+{
+	.cpu_funcs = {sched_ctx_cpu_func, NULL},
 	.cuda_funcs = {NULL},
 	.cuda_funcs = {NULL},
 	.opencl_funcs = {NULL},
 	.opencl_funcs = {NULL},
 	.model = NULL,
 	.model = NULL,
@@ -43,11 +47,25 @@ static struct starpu_codelet sched_ctx_codelet =
 	.name = "sched_ctx"
 	.name = "sched_ctx"
 };
 };
 
 
+static struct starpu_codelet sched_ctx_codelet2 =
+{
+	.cpu_funcs = {sched_ctx_cpu_func, NULL},
+	.cuda_funcs = {sched_ctx_cuda_func, NULL},
+	.opencl_funcs = {NULL},
+	.model = NULL,
+	.nbuffers = 0,
+	.name = "sched_ctx"
+};
+
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
 	int ntasks = NTASKS;
 	int ntasks = NTASKS;
 	int ret;
 	int ret;
+	unsigned ncuda = 0;
+	int nprocs1 = 0;
+	int nprocs2 = 0;
+	int procs1[STARPU_NMAXWORKERS], procs2[STARPU_NMAXWORKERS];
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV)
 	if (ret == -ENODEV)
@@ -55,25 +73,23 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
 	starpu_pthread_mutex_init(&mut, NULL);
 	starpu_pthread_mutex_init(&mut, NULL);
-	int nprocs1 = 1;
-	int nprocs2 = 1;
-	int procs1[STARPU_NMAXWORKERS], procs2[STARPU_NMAXWORKERS];
-	procs1[0] = 0;
-	procs2[0] = 0;
 
 
 #ifdef STARPU_USE_CPU
 #ifdef STARPU_USE_CPU
-	unsigned ncpus =  starpu_cpu_worker_get_count();
-	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
-
-	nprocs1 = ncpus;
+	nprocs1 = starpu_cpu_worker_get_count();
+	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, nprocs1);
 #endif
 #endif
+	// if there is no cpu, skip
+	if (nprocs1 == 0) goto enodev;
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	unsigned ncuda = starpu_cuda_worker_get_count();
-	starpu_worker_get_ids_by_type(STARPU_CUDA_WORKER, procs2, ncuda);
-
-	nprocs2 = ncuda == 0 ? 1 : ncuda;
+	ncuda = nprocs2 = starpu_cuda_worker_get_count();
+	starpu_worker_get_ids_by_type(STARPU_CUDA_WORKER, procs2, nprocs2);
 #endif
 #endif
+	if (nprocs2 == 0)
+	{
+	     nprocs2 = 1;
+	     procs2[0] = procs1[0];
+	}
 
 
 	/*create contexts however you want*/
 	/*create contexts however you want*/
 	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "eager", 0);
 	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "eager", 0);
@@ -82,17 +98,18 @@ int main(int argc, char **argv)
 	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
 	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
 	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
 	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
 
 
+	starpu_sched_ctx_display_workers(sched_ctx2, stderr);
+
 	int i;
 	int i;
 	for (i = 0; i < ntasks/2; i++)
 	for (i = 0; i < ntasks/2; i++)
 	{
 	{
 		struct starpu_task *task = starpu_task_create();
 		struct starpu_task *task = starpu_task_create();
 
 
-		task->cl = &sched_ctx_codelet;
+		task->cl = &sched_ctx_codelet1;
 		task->cl_arg = NULL;
 		task->cl_arg = NULL;
 
 
 		/*submit tasks to context*/
 		/*submit tasks to context*/
 		ret = starpu_task_submit_to_ctx(task,sched_ctx1);
 		ret = starpu_task_submit_to_ctx(task,sched_ctx1);
-
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 	}
 
 
@@ -102,11 +119,27 @@ int main(int argc, char **argv)
 
 
 	starpu_sched_ctx_finished_submit(sched_ctx1);
 	starpu_sched_ctx_finished_submit(sched_ctx1);
 
 
+	/* task with no cuda impl submitted to a ctx with gpus only */
+	struct starpu_task *task2 = starpu_task_create();
+	task2->cl = &sched_ctx_codelet1;
+	task2->cl_arg = NULL;
+
+	/*submit tasks to context*/
+	ret = starpu_task_submit_to_ctx(task2,sched_ctx2);
+	if (ncuda == 0)
+	{
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	else
+	{
+		STARPU_ASSERT_MSG(ret == -ENODEV, "submit task should ret enodev when the ctx does not have the PUs needed by the task");
+	}
+
 	for (i = 0; i < ntasks/2; i++)
 	for (i = 0; i < ntasks/2; i++)
 	{
 	{
 		struct starpu_task *task = starpu_task_create();
 		struct starpu_task *task = starpu_task_create();
 
 
-		task->cl = &sched_ctx_codelet;
+		task->cl = &sched_ctx_codelet2;
 		task->cl_arg = NULL;
 		task->cl_arg = NULL;
 
 
 		ret = starpu_task_submit_to_ctx(task,sched_ctx2);
 		ret = starpu_task_submit_to_ctx(task,sched_ctx2);
@@ -121,8 +154,9 @@ int main(int argc, char **argv)
 
 
 	starpu_sched_ctx_delete(sched_ctx1);
 	starpu_sched_ctx_delete(sched_ctx1);
 	starpu_sched_ctx_delete(sched_ctx2);
 	starpu_sched_ctx_delete(sched_ctx2);
-	printf("tasks executed %d out of %d\n", tasks_executed, ntasks);
-	starpu_shutdown();
+	printf("tasks executed %d out of %d\n", tasks_executed, ntasks/2);
 
 
-	return 0;
+enodev:
+	starpu_shutdown();
+	return nprocs1 == 0 ? 77 : 0;
 }
 }

+ 8 - 4
examples/sched_ctx/sched_ctx_without_sched_policy.c

@@ -73,6 +73,7 @@ int main(int argc, char **argv)
 	tasks_executed[1] = 0;
 	tasks_executed[1] = 0;
 	int ntasks = NTASKS;
 	int ntasks = NTASKS;
 	int ret, j, k;
 	int ret, j, k;
+	unsigned ncpus = 0;
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV)
 	if (ret == -ENODEV)
@@ -85,7 +86,7 @@ int main(int argc, char **argv)
 	int *procs1, *procs2;
 	int *procs1, *procs2;
 
 
 #ifdef STARPU_USE_CPU
 #ifdef STARPU_USE_CPU
-	unsigned ncpus =  starpu_cpu_worker_get_count();
+	ncpus = starpu_cpu_worker_get_count();
 	procs1 = (int*)malloc(ncpus*sizeof(int));
 	procs1 = (int*)malloc(ncpus*sizeof(int));
 	procs2 = (int*)malloc(ncpus*sizeof(int));
 	procs2 = (int*)malloc(ncpus*sizeof(int));
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
@@ -96,7 +97,7 @@ int main(int argc, char **argv)
 		nprocs2 =  ncpus-nprocs1;
 		nprocs2 =  ncpus-nprocs1;
 		k = 0;
 		k = 0;
 		for(j = nprocs1; j < nprocs1+nprocs2; j++)
 		for(j = nprocs1; j < nprocs1+nprocs2; j++)
-			procs2[k++] = j;
+			procs2[k++] = procs1[j];
 	}
 	}
 	else
 	else
 	{
 	{
@@ -113,6 +114,8 @@ int main(int argc, char **argv)
 	procs2[0] = 0;
 	procs2[0] = 0;
 #endif
 #endif
 
 
+	if (ncpus == 0) goto enodev;
+
 	/*create contexts however you want*/
 	/*create contexts however you want*/
 	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", 0);
 	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", 0);
 	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", 0);
 	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", 0);
@@ -158,7 +161,8 @@ int main(int argc, char **argv)
 	starpu_sched_ctx_delete(sched_ctx2);
 	starpu_sched_ctx_delete(sched_ctx2);
 	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx1, tasks_executed[0], NTASKS*NTASKS);
 	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx1, tasks_executed[0], NTASKS*NTASKS);
 	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS*NTASKS);
 	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS*NTASKS);
-	starpu_shutdown();
 
 
-	return 0;
+enodev:
+	starpu_shutdown();
+	return ncpus == 0 ? 77 : 0;
 }
 }

+ 6 - 1
examples/worker_collections/worker_tree_example.c

@@ -30,7 +30,12 @@ int main(int argc, char **argv)
 
 
 int main()
 int main()
 {
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
 	int procs[STARPU_NMAXWORKERS];
 	int procs[STARPU_NMAXWORKERS];
 	unsigned ncpus =  starpu_cpu_worker_get_count();
 	unsigned ncpus =  starpu_cpu_worker_get_count();

+ 1 - 1
gcc-plugin/examples/cholesky/cholesky.c

@@ -203,7 +203,7 @@ int main(int argc, char **argv)
 		}
 		}
 	}
 	}
 	float test_mat[size * size] __heap;
 	float test_mat[size * size] __heap;
-	SSYRK("L", "N", size, size, 1.0f,
+	STARPU_SSYRK("L", "N", size, size, 1.0f,
 	      rmat, size, 0.0f, test_mat, size);
 	      rmat, size, 0.0f, test_mat, size);
 
 
 	fprintf(stderr, "comparing results ...\n");
 	fprintf(stderr, "comparing results ...\n");

+ 4 - 4
gcc-plugin/examples/cholesky/cholesky_kernels.c

@@ -42,7 +42,7 @@ static inline void chol_common_cpu_codelet_update_u22(const float *left, const f
 
 
 	switch (s) {
 	switch (s) {
 		case 0:
 		case 0:
-			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
+			STARPU_SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
 				right, ld12, 1.0f, center, ld22);
 				right, ld12, 1.0f, center, ld22);
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -95,7 +95,7 @@ static inline void chol_common_codelet_update_u21(const float *sub11, float *sub
 {
 {
 	switch (s) {
 	switch (s) {
 		case 0:
 		case 0:
-			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			STARPU_STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
@@ -153,9 +153,9 @@ static inline void chol_common_codelet_update_u11(float *sub11, unsigned nx, uns
 
 
 				STARPU_ASSERT(lambda11 != 0.0f);
 				STARPU_ASSERT(lambda11 != 0.0f);
 
 
-				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+				STARPU_SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
 
 
-				SSYR("L", nx - z - 1, -1.0f,
+				STARPU_SSYR("L", nx - z - 1, -1.0f,
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+(z+1)*ld], ld);
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}
 			}

+ 2 - 0
include/starpu_sched_ctx.h

@@ -40,6 +40,8 @@ void starpu_sched_ctx_add_workers(int *workerids_ctx, int nworkers_ctx, unsigned
 
 
 void starpu_sched_ctx_remove_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id);
 void starpu_sched_ctx_remove_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id);
 
 
+void starpu_sched_ctx_display_workers(unsigned sched_ctx_id, FILE *f);
+
 void starpu_sched_ctx_delete(unsigned sched_ctx_id);
 void starpu_sched_ctx_delete(unsigned sched_ctx_id);
 
 
 void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor);
 void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor);

+ 1 - 1
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -203,7 +203,7 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 	float *test_mat = malloc(size*size*sizeof(float));
 	float *test_mat = malloc(size*size*sizeof(float));
 	STARPU_ASSERT(test_mat);
 	STARPU_ASSERT(test_mat);
 
 
-	SSYRK("L", "N", size, size, 1.0f,
+	STARPU_SSYRK("L", "N", size, size, 1.0f,
 			rmat, size, 0.0f, test_mat, size);
 			rmat, size, 0.0f, test_mat, size);
 
 
 	FPRINTF(stderr, "[%d] comparing results ...\n", rank);
 	FPRINTF(stderr, "[%d] comparing results ...\n", rank);

+ 4 - 4
mpi/examples/matrix_decomposition/mpi_cholesky_kernels.c

@@ -55,7 +55,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 	switch (s)
 	switch (s)
 	{
 	{
 		case 0:
 		case 0:
-			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
+			STARPU_SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
 				right, ld12, 1.0f, center, ld22);
 				right, ld12, 1.0f, center, ld22);
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -111,7 +111,7 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 	switch (s)
 	switch (s)
 	{
 	{
 		case 0:
 		case 0:
-			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			STARPU_STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
@@ -171,9 +171,9 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 
 
 				STARPU_ASSERT(lambda11 != 0.0f);
 				STARPU_ASSERT(lambda11 != 0.0f);
 
 
-				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+				STARPU_SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
 
 
-				SSYR("L", nx - z - 1, -1.0f,
+				STARPU_SSYR("L", nx - z - 1, -1.0f,
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+(z+1)*ld], ld);
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}
 			}

+ 10 - 10
mpi/examples/mpi_lu/mpi_lu-double.h

@@ -27,16 +27,16 @@
 #define CUBLAS_SWAP	cublasDswap
 #define CUBLAS_SWAP	cublasDswap
 #define CUBLAS_IAMAX	cublasIdamax
 #define CUBLAS_IAMAX	cublasIdamax
 
 
-#define CPU_GEMM	DGEMM
-#define CPU_GEMV	DGEMV
-#define CPU_TRSM	DTRSM
-#define CPU_SCAL	DSCAL
-#define CPU_GER		DGER
-#define CPU_SWAP	DSWAP
+#define CPU_GEMM	STARPU_DGEMM
+#define CPU_GEMV	STARPU_DGEMV
+#define CPU_TRSM	STARPU_DTRSM
+#define CPU_SCAL	STARPU_DSCAL
+#define CPU_GER		STARPU_DGER
+#define CPU_SWAP	STARPU_DSWAP
 
 
-#define CPU_TRMM	DTRMM
-#define CPU_AXPY	DAXPY
-#define CPU_ASUM	DASUM
-#define CPU_IAMAX	IDAMAX
+#define CPU_TRMM	STARPU_DTRMM
+#define CPU_AXPY	STARPU_DAXPY
+#define CPU_ASUM	STARPU_DASUM
+#define CPU_IAMAX	STARPU_IDAMAX
 
 
 #define PIVOT_THRESHHOLD	10e-10
 #define PIVOT_THRESHHOLD	10e-10

+ 10 - 10
mpi/examples/mpi_lu/mpi_lu-float.h

@@ -27,16 +27,16 @@
 #define CUBLAS_SWAP	cublasSswap
 #define CUBLAS_SWAP	cublasSswap
 #define CUBLAS_IAMAX	cublasIsamax
 #define CUBLAS_IAMAX	cublasIsamax
 
 
-#define CPU_GEMM	SGEMM
-#define CPU_GEMV	SGEMV
-#define CPU_TRSM	STRSM
-#define CPU_SCAL	SSCAL
-#define CPU_GER		SGER
-#define CPU_SWAP	SSWAP
+#define CPU_GEMM	STARPU_SGEMM
+#define CPU_GEMV	STARPU_SGEMV
+#define CPU_TRSM	STARPU_STRSM
+#define CPU_SCAL	STARPU_SSCAL
+#define CPU_GER		STARPU_SGER
+#define CPU_SWAP	STARPU_SSWAP
 
 
-#define CPU_TRMM	STRMM
-#define CPU_AXPY	SAXPY
-#define CPU_ASUM	SASUM
-#define CPU_IAMAX	ISAMAX
+#define CPU_TRMM	STARPU_STRMM
+#define CPU_AXPY	STARPU_SAXPY
+#define CPU_ASUM	STARPU_SASUM
+#define CPU_IAMAX	STARPU_ISAMAX
 
 
 #define PIVOT_THRESHHOLD	10e-5
 #define PIVOT_THRESHHOLD	10e-5

+ 6 - 2
mpi/src/Makefile.am

@@ -39,7 +39,9 @@ noinst_HEADERS =					\
 	starpu_mpi_task_insert.h			\
 	starpu_mpi_task_insert.h			\
 	starpu_mpi_datatype.h				\
 	starpu_mpi_datatype.h				\
 	starpu_mpi_cache.h				\
 	starpu_mpi_cache.h				\
-	starpu_mpi_cache_stats.h
+	starpu_mpi_cache_stats.h			\
+	starpu_mpi_early_data.h				\
+	starpu_mpi_early_request.h
 
 
 libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
 libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
 	starpu_mpi.c					\
 	starpu_mpi.c					\
@@ -50,7 +52,9 @@ libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
 	starpu_mpi_stats.c				\
 	starpu_mpi_stats.c				\
 	starpu_mpi_private.c				\
 	starpu_mpi_private.c				\
 	starpu_mpi_cache.c				\
 	starpu_mpi_cache.c				\
-	starpu_mpi_cache_stats.c
+	starpu_mpi_cache_stats.c			\
+	starpu_mpi_early_data.c				\
+	starpu_mpi_early_request.c
 
 
 showcheck:
 showcheck:
 	-cat /dev/null
 	-cat /dev/null

+ 138 - 348
mpi/src/starpu_mpi.c

@@ -22,6 +22,8 @@
 #include <starpu_profiling.h>
 #include <starpu_profiling.h>
 #include <starpu_mpi_stats.h>
 #include <starpu_mpi_stats.h>
 #include <starpu_mpi_task_insert.h>
 #include <starpu_mpi_task_insert.h>
+#include <starpu_mpi_early_data.h>
+#include <starpu_mpi_early_request.h>
 #include <common/config.h>
 #include <common/config.h>
 #include <common/thread.h>
 #include <common/thread.h>
 #include <datawizard/interfaces/data_interface.h>
 #include <datawizard/interfaces/data_interface.h>
@@ -65,234 +67,52 @@ static int posted_requests = 0, newer_requests, barrier_running = 0;
 
 
 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
 
 
-LIST_TYPE(_starpu_mpi_copy_handle,
-	  starpu_data_handle_t handle;
-	  struct _starpu_mpi_envelope *env;
-	  struct _starpu_mpi_req *req;
-	  void *buffer;
-	  int mpi_tag;
-	  int source;
-	  int req_ready;
-	  starpu_pthread_mutex_t req_mutex;
-	  starpu_pthread_cond_t req_cond;
-);
-
-struct _starpu_mpi_copy_handle_hashlist
+static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 {
 {
-	struct _starpu_mpi_copy_handle_list *list;
-	UT_hash_handle hh;
-	int mpi_tag;
-};
-
-/********************************************************/
-/*                                                      */
-/*  Hashmap's requests functionalities                  */
-/*                                                      */
-/********************************************************/
-
-/** stores application requests for which data have not been received yet */
-static struct _starpu_mpi_req **_starpu_mpi_app_req_hashmap = NULL;
-static int _starpu_mpi_app_req_hashmap_count = 0;
-/** stores data which have been received by MPI but have not been requested by the application */
-static struct _starpu_mpi_copy_handle_hashlist **_starpu_mpi_copy_handle_hashmap = NULL;
-static int _starpu_mpi_copy_handle_hashmap_count = 0;
-
-static struct _starpu_mpi_req* find_app_req(int mpi_tag, int source)
-{
-	struct _starpu_mpi_req* req;
-
-	HASH_FIND_INT(_starpu_mpi_app_req_hashmap[source], &mpi_tag, req);
-
-	return req;
-}
-
-static void add_app_req(struct _starpu_mpi_req *req)
-{
-	struct _starpu_mpi_req *test_req;
-
-	test_req = find_app_req(req->mpi_tag, req->srcdst);
-
-	if (test_req == NULL)
-	{
-		HASH_ADD_INT(_starpu_mpi_app_req_hashmap[req->srcdst], mpi_tag, req);
-		_starpu_mpi_app_req_hashmap_count ++;
-		_STARPU_MPI_DEBUG(3, "Adding request %p with tag %d in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
-	}
-	else
-	{
-		_STARPU_MPI_DEBUG(3, "[Error] request %p with tag %d already in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
-		int seq_const = starpu_data_get_sequential_consistency_flag(req->data_handle);
-		if (seq_const &&  req->sequential_consistency)
-		{
-			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap[%d], while another request %p with the same tag is already in it. \n Sequential consistency is activated : this is not supported by StarPU.", req, req->mpi_tag, req->srcdst, test_req);
-		}
-		else
-		{
-			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap[%d], while another request %p with the same tag is already in it. \n Sequential consistency isn't activated for this handle : you should want to add dependencies between requests for which the sequential consistency is deactivated.", req, req->mpi_tag, req->srcdst, test_req);
-		}
-	}
-}
-
-static void delete_app_req(struct _starpu_mpi_req *req)
-{
-	struct _starpu_mpi_req *test_req;
+	*req = malloc(sizeof(struct _starpu_mpi_req));
+	STARPU_ASSERT_MSG(*req, "Invalid request");
 
 
-	test_req = find_app_req(req->mpi_tag, req->srcdst);
-
-	if (test_req != NULL)
-	{
-		HASH_DEL(_starpu_mpi_app_req_hashmap[req->srcdst], req);
-		_starpu_mpi_app_req_hashmap_count --;
-		_STARPU_MPI_DEBUG(3, "Deleting application request %p with tag %d from the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
-	}
-	else
-	{
-		_STARPU_MPI_DEBUG(3, "[Warning] request %p with tag %d is NOT in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
-	}
-}
-
-#ifdef STARPU_VERBOSE
-static void _starpu_mpi_copy_handle_display_hash(int source, int tag)
-{
-	struct _starpu_mpi_copy_handle_hashlist *hashlist;
-	HASH_FIND_INT(_starpu_mpi_copy_handle_hashmap[source], &tag, hashlist);
-
-	if (hashlist == NULL)
-	{
-		_STARPU_MPI_DEBUG(60, "Hashlist for source %d and tag %d does not exist\n", source, tag);
-	}
-	else if (_starpu_mpi_copy_handle_list_empty(hashlist->list))
-	{
-		_STARPU_MPI_DEBUG(60, "Hashlist for source %d and tag %d is empty\n", source, tag);
-	}
-	else
-	{
-		struct _starpu_mpi_copy_handle *cur;
-		for (cur = _starpu_mpi_copy_handle_list_begin(hashlist->list) ;
-		     cur != _starpu_mpi_copy_handle_list_end(hashlist->list);
-		     cur = _starpu_mpi_copy_handle_list_next(cur))
-		{
-			_STARPU_MPI_DEBUG(60, "Element for source %d and tag %d: %p\n", source, tag, cur);
-		}
-	}
-}
-#endif
-
-static struct _starpu_mpi_copy_handle *pop_chandle(int mpi_tag, int source, int delete)
-{
-	struct _starpu_mpi_copy_handle_hashlist *hashlist;
-	struct _starpu_mpi_copy_handle *chandle;
-
-	_STARPU_MPI_DEBUG(60, "Looking for chandle with tag %d in the hashmap[%d]\n", mpi_tag, source);
-	HASH_FIND_INT(_starpu_mpi_copy_handle_hashmap[source], &mpi_tag, hashlist);
-	if (hashlist == NULL)
-	{
-		chandle = NULL;
-	}
-	else
-	{
-		if (_starpu_mpi_copy_handle_list_empty(hashlist->list))
-		{
-			chandle = NULL;
-		}
-		else
-		{
-			if (delete == 1)
-			{
-				chandle = _starpu_mpi_copy_handle_list_pop_front(hashlist->list);
-			}
-			else
-			{
-				chandle = _starpu_mpi_copy_handle_list_front(hashlist->list);
-			}
-		}
-	}
-	_STARPU_MPI_DEBUG(60, "Found chandle %p with tag %d in the hashmap[%d]\n", chandle, mpi_tag, source);
-	return chandle;
-}
-
-static struct _starpu_mpi_copy_handle *find_chandle(int mpi_tag, int source)
-{
-	return pop_chandle(mpi_tag, source, 0);
-}
-
-static void add_chandle(struct _starpu_mpi_copy_handle *chandle)
-{
-	_STARPU_MPI_DEBUG(60, "Trying to add chandle %p with tag %d in the hashmap[%d]\n", chandle, chandle->mpi_tag, chandle->source);
-
-	struct _starpu_mpi_copy_handle_hashlist *hashlist;
-	HASH_FIND_INT(_starpu_mpi_copy_handle_hashmap[chandle->source], &chandle->mpi_tag, hashlist);
-	if (hashlist == NULL)
-	{
-		hashlist = malloc(sizeof(struct _starpu_mpi_copy_handle_hashlist));
-		hashlist->list = _starpu_mpi_copy_handle_list_new();
-		hashlist->mpi_tag = chandle->mpi_tag;
-		HASH_ADD_INT(_starpu_mpi_copy_handle_hashmap[chandle->source], mpi_tag, hashlist);
-	}
-	_starpu_mpi_copy_handle_list_push_back(hashlist->list, chandle);
-	_starpu_mpi_copy_handle_hashmap_count ++;
-#ifdef STARPU_VERBOSE
-	_starpu_mpi_copy_handle_display_hash(chandle->source, chandle->mpi_tag);
-#endif
-}
-
-static void delete_chandle(struct _starpu_mpi_copy_handle *chandle)
-{
-	_STARPU_MPI_DEBUG(60, "Trying to delete chandle %p with tag %d in the hashmap[%d]\n", chandle, chandle->mpi_tag, chandle->source);
-	struct _starpu_mpi_copy_handle *found = pop_chandle(chandle->mpi_tag, chandle->source, 1);
-
-	STARPU_ASSERT_MSG(found == chandle,
-			  "Error delete_chandle : chandle %p with tag %d is NOT in the hashmap[%d]\n", chandle, chandle->mpi_tag, chandle->source);
-
-	_starpu_mpi_copy_handle_hashmap_count --;
-#ifdef STARPU_VERBOSE
-	_starpu_mpi_copy_handle_display_hash(chandle->source, chandle->mpi_tag);
-#endif
-}
-
-static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
-{
 	/* Initialize the request structure */
 	/* Initialize the request structure */
-	req->data_handle = NULL;
+	(*req)->data_handle = NULL;
 
 
-	req->datatype = 0;
-	req->ptr = NULL;
-	req->count = -1;
-	req->user_datatype = -1;
+	(*req)->datatype = 0;
+	(*req)->ptr = NULL;
+	(*req)->count = -1;
+	(*req)->user_datatype = -1;
 
 
-	req->srcdst = -1;
-	req->mpi_tag = -1;
-	req->comm = 0;
+	(*req)->srcdst = -1;
+	(*req)->mpi_tag = -1;
+	(*req)->comm = 0;
 
 
-	req->func = NULL;
+	(*req)->func = NULL;
 
 
-	req->status = NULL;
-	req->request = 0;
-	req->flag = NULL;
+	(*req)->status = NULL;
+	(*req)->request = 0;
+	(*req)->flag = NULL;
 
 
-	req->ret = -1;
-	STARPU_PTHREAD_MUTEX_INIT(&req->req_mutex, NULL);
-	STARPU_PTHREAD_COND_INIT(&req->req_cond, NULL);
-	STARPU_PTHREAD_MUTEX_INIT(&req->posted_mutex, NULL);
-	STARPU_PTHREAD_COND_INIT(&req->posted_cond, NULL);
+	(*req)->ret = -1;
+	STARPU_PTHREAD_MUTEX_INIT(&((*req)->req_mutex), NULL);
+	STARPU_PTHREAD_COND_INIT(&((*req)->req_cond), NULL);
+	STARPU_PTHREAD_MUTEX_INIT(&((*req)->posted_mutex), NULL);
+	STARPU_PTHREAD_COND_INIT(&((*req)->posted_cond), NULL);
 
 
-	req->request_type = UNKNOWN_REQ;
+	(*req)->request_type = UNKNOWN_REQ;
 
 
-	req->submitted = 0;
-	req->completed = 0;
-	req->posted = 0;
+	(*req)->submitted = 0;
+	(*req)->completed = 0;
+	(*req)->posted = 0;
 
 
-	req->other_request = NULL;
+	(*req)->other_request = NULL;
 
 
-	req->detached = -1;
-	req->callback = NULL;
-	req->callback_arg = NULL;
+	(*req)->detached = -1;
+	(*req)->callback = NULL;
+	(*req)->callback_arg = NULL;
 
 
-	req->size_req = 0;
-	req->internal_req = NULL;
-	req->is_internal_req = 0;
-	req->envelope = NULL;
-	req->sequential_consistency = 1;
+	(*req)->size_req = 0;
+	(*req)->internal_req = NULL;
+	(*req)->is_internal_req = 0;
+	(*req)->envelope = NULL;
+	(*req)->sequential_consistency = 1;
 }
 }
 
 
  /********************************************************/
  /********************************************************/
@@ -310,35 +130,33 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 							       int is_internal_req,
 							       int is_internal_req,
 							       ssize_t count)
 							       ssize_t count)
 {
 {
+	struct _starpu_mpi_req *req;
 
 
-	 _STARPU_MPI_LOG_IN();
-	 struct _starpu_mpi_req *req = malloc(sizeof(struct _starpu_mpi_req));
-	 STARPU_ASSERT_MSG(req, "Invalid request");
-
-	 _STARPU_MPI_INC_POSTED_REQUESTS(1);
-
-	 /* Initialize the request structure */
-	 _starpu_mpi_request_init(req);
-	 req->request_type = request_type;
-	 req->data_handle = data_handle;
-	 req->srcdst = srcdst;
-	 req->mpi_tag = mpi_tag;
-	 req->comm = comm;
-	 req->detached = detached;
-	 req->callback = callback;
-	 req->callback_arg = arg;
-	 req->func = func;
-	 req->sequential_consistency = sequential_consistency;
-	 req->is_internal_req = is_internal_req;
-	 req->count = count;
-
-	 /* Asynchronously request StarPU to fetch the data in main memory: when
-	  * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
-	  * the request is actually submitted */
-	 starpu_data_acquire_cb_sequential_consistency(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req, sequential_consistency);
+	_STARPU_MPI_LOG_IN();
+	_STARPU_MPI_INC_POSTED_REQUESTS(1);
 
 
-	 _STARPU_MPI_LOG_OUT();
-	 return req;
+	/* Initialize the request structure */
+	_starpu_mpi_request_init(&req);
+	req->request_type = request_type;
+	req->data_handle = data_handle;
+	req->srcdst = srcdst;
+	req->mpi_tag = mpi_tag;
+	req->comm = comm;
+	req->detached = detached;
+	req->callback = callback;
+	req->callback_arg = arg;
+	req->func = func;
+	req->sequential_consistency = sequential_consistency;
+	req->is_internal_req = is_internal_req;
+	req->count = count;
+
+	/* Asynchronously request StarPU to fetch the data in main memory: when
+	 * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
+	 * the request is actually submitted */
+	starpu_data_acquire_cb_sequential_consistency(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req, sequential_consistency);
+
+	_STARPU_MPI_LOG_OUT();
+	return req;
  }
  }
 
 
  /********************************************************/
  /********************************************************/
@@ -608,15 +426,11 @@ static void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 
 
 int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 {
 {
-	_STARPU_MPI_LOG_IN();
 	int ret;
 	int ret;
-
-	struct _starpu_mpi_req *waiting_req = malloc(sizeof(struct _starpu_mpi_req));
-	_starpu_mpi_request_init(waiting_req);
-	STARPU_ASSERT_MSG(waiting_req, "Allocation failed");
-
 	struct _starpu_mpi_req *req = *public_req;
 	struct _starpu_mpi_req *req = *public_req;
+	struct _starpu_mpi_req *waiting_req;
 
 
+	_STARPU_MPI_LOG_IN();
 	_STARPU_MPI_INC_POSTED_REQUESTS(1);
 	_STARPU_MPI_INC_POSTED_REQUESTS(1);
 
 
 	/* We cannot try to complete a MPI request that was not actually posted
 	/* We cannot try to complete a MPI request that was not actually posted
@@ -627,7 +441,7 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&(req->req_mutex));
 	STARPU_PTHREAD_MUTEX_UNLOCK(&(req->req_mutex));
 
 
 	/* Initialize the request structure */
 	/* Initialize the request structure */
-	 _starpu_mpi_request_init(waiting_req);
+	 _starpu_mpi_request_init(&waiting_req);
 	waiting_req->status = status;
 	waiting_req->status = status;
 	waiting_req->other_request = req;
 	waiting_req->other_request = req;
 	waiting_req->func = _starpu_mpi_wait_func;
 	waiting_req->func = _starpu_mpi_wait_func;
@@ -704,9 +518,8 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 
 
 	if (submitted)
 	if (submitted)
 	{
 	{
-		struct _starpu_mpi_req *testing_req = malloc(sizeof(struct _starpu_mpi_req));
-		STARPU_ASSERT_MSG(testing_req, "allocation failed");
-		_starpu_mpi_request_init(testing_req);
+		struct _starpu_mpi_req *testing_req;
+		_starpu_mpi_request_init(&testing_req);
 
 
 		/* Initialize the request structure */
 		/* Initialize the request structure */
 		STARPU_PTHREAD_MUTEX_INIT(&(testing_req->req_mutex), NULL);
 		STARPU_PTHREAD_MUTEX_INIT(&(testing_req->req_mutex), NULL);
@@ -768,11 +581,11 @@ static void _starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
 
 
 int starpu_mpi_barrier(MPI_Comm comm)
 int starpu_mpi_barrier(MPI_Comm comm)
 {
 {
-	_STARPU_MPI_LOG_IN();
 	int ret;
 	int ret;
-	struct _starpu_mpi_req *barrier_req = malloc(sizeof(struct _starpu_mpi_req));
-	STARPU_ASSERT_MSG(barrier_req, "allocation failed");
-	_starpu_mpi_request_init(barrier_req);
+	struct _starpu_mpi_req *barrier_req;
+
+	_STARPU_MPI_LOG_IN();
+	_starpu_mpi_request_init(&barrier_req);
 
 
 	/* First wait for *both* all tasks and MPI requests to finish, in case
 	/* First wait for *both* all tasks and MPI requests to finish, in case
 	 * some tasks generate MPI requests, MPI requests generate tasks, etc.
 	 * some tasks generate MPI requests, MPI requests generate tasks, etc.
@@ -855,11 +668,11 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 
 
 	if (req->internal_req)
 	if (req->internal_req)
 	{
 	{
-		struct _starpu_mpi_copy_handle *chandle = find_chandle(req->mpi_tag, req->srcdst);
-		STARPU_ASSERT_MSG(chandle, "Could not find a copy data handle with the tag %d and the node %d\n", req->mpi_tag, req->srcdst);
-		_STARPU_MPI_DEBUG(3, "Handling deleting of copy_handle structure from the hashmap..\n");
-		delete_chandle(chandle);
-		free(chandle);
+		struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(req->mpi_tag, req->srcdst);
+		STARPU_ASSERT_MSG(early_data_handle, "Could not find a copy data handle with the tag %d and the node %d\n", req->mpi_tag, req->srcdst);
+		_STARPU_MPI_DEBUG(3, "Handling deleting of early_data structure from the hashmap..\n");
+		_starpu_mpi_early_data_delete(early_data_handle);
+		free(early_data_handle);
 	}
 	}
 	else
 	else
 	{
 	{
@@ -911,17 +724,17 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 	_STARPU_MPI_LOG_OUT();
 	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
-struct _starpu_mpi_copy_cb_args
+struct _starpu_mpi_early_data_cb_args
 {
 {
 	starpu_data_handle_t data_handle;
 	starpu_data_handle_t data_handle;
-	starpu_data_handle_t copy_handle;
+	starpu_data_handle_t early_handle;
 	struct _starpu_mpi_req *req;
 	struct _starpu_mpi_req *req;
 	void *buffer;
 	void *buffer;
 };
 };
 
 
-static void _starpu_mpi_copy_cb(void* arg)
+static void _starpu_mpi_early_data_cb(void* arg)
 {
 {
-	struct _starpu_mpi_copy_cb_args *args = arg;
+	struct _starpu_mpi_early_data_cb_args *args = arg;
 
 
 	// We store in the application request the internal MPI
 	// We store in the application request the internal MPI
 	// request so that it can be used by starpu_mpi_wait
 	// request so that it can be used by starpu_mpi_wait
@@ -931,16 +744,16 @@ static void _starpu_mpi_copy_cb(void* arg)
 	if (args->buffer)
 	if (args->buffer)
 	{
 	{
 		/* Data has been received as a raw memory, it has to be unpacked */
 		/* Data has been received as a raw memory, it has to be unpacked */
-		struct starpu_data_interface_ops *itf_src = starpu_data_get_interface_ops(args->copy_handle);
+		struct starpu_data_interface_ops *itf_src = starpu_data_get_interface_ops(args->early_handle);
 		struct starpu_data_interface_ops *itf_dst = starpu_data_get_interface_ops(args->data_handle);
 		struct starpu_data_interface_ops *itf_dst = starpu_data_get_interface_ops(args->data_handle);
 		STARPU_ASSERT_MSG(itf_dst->unpack_data, "The data interface does not define an unpack function\n");
 		STARPU_ASSERT_MSG(itf_dst->unpack_data, "The data interface does not define an unpack function\n");
-		itf_dst->unpack_data(args->data_handle, STARPU_MAIN_RAM, args->buffer, itf_src->get_size(args->copy_handle));
+		itf_dst->unpack_data(args->data_handle, STARPU_MAIN_RAM, args->buffer, itf_src->get_size(args->early_handle));
 		free(args->buffer);
 		free(args->buffer);
 	}
 	}
 	else
 	else
 	{
 	{
-		struct starpu_data_interface_ops *itf = starpu_data_get_interface_ops(args->copy_handle);
-		void* itf_src = starpu_data_get_interface_on_node(args->copy_handle, STARPU_MAIN_RAM);
+		struct starpu_data_interface_ops *itf = starpu_data_get_interface_ops(args->early_handle);
+		void* itf_src = starpu_data_get_interface_on_node(args->early_handle, STARPU_MAIN_RAM);
 		void* itf_dst = starpu_data_get_interface_on_node(args->data_handle, STARPU_MAIN_RAM);
 		void* itf_dst = starpu_data_get_interface_on_node(args->data_handle, STARPU_MAIN_RAM);
 
 
 		if (!itf->copy_methods->ram_to_ram)
 		if (!itf->copy_methods->ram_to_ram)
@@ -955,11 +768,11 @@ static void _starpu_mpi_copy_cb(void* arg)
 		}
 		}
 	}
 	}
 
 
-	_STARPU_MPI_DEBUG(3, "Done, handling release of copy_handle..\n");
-	starpu_data_release(args->copy_handle);
+	_STARPU_MPI_DEBUG(3, "Done, handling release of early_handle..\n");
+	starpu_data_release(args->early_handle);
 
 
-	_STARPU_MPI_DEBUG(3, "Done, handling unregister of copy_handle..\n");
-	starpu_data_unregister_submit(args->copy_handle);
+	_STARPU_MPI_DEBUG(3, "Done, handling unregister of early_handle..\n");
+	starpu_data_unregister_submit(args->early_handle);
 
 
 	_STARPU_MPI_DEBUG(3, "Done, handling request %p termination of the already received request\n",args->req);
 	_STARPU_MPI_DEBUG(3, "Done, handling request %p termination of the already received request\n",args->req);
 	// If the request is detached, we need to call _starpu_mpi_handle_request_termination
 	// If the request is detached, we need to call _starpu_mpi_handle_request_termination
@@ -1019,41 +832,41 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 		else
 		else
 		{
 		{
 			/* test whether the receive request has already been submitted internally by StarPU-MPI*/
 			/* test whether the receive request has already been submitted internally by StarPU-MPI*/
-			struct _starpu_mpi_copy_handle *chandle = find_chandle(req->mpi_tag, req->srcdst);
+			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(req->mpi_tag, req->srcdst);
 
 
 			/* Case : the request has already been submitted internally by StarPU.
 			/* Case : the request has already been submitted internally by StarPU.
 			 * We'll asynchronously ask a Read permission over the temporary handle, so as when
 			 * We'll asynchronously ask a Read permission over the temporary handle, so as when
-			 * the internal receive will be over, the _starpu_mpi_copy_cb function will be called to
+			 * the internal receive will be over, the _starpu_mpi_early_data_cb function will be called to
 			 * bring the data back to the original data handle associated to the request.*/
 			 * bring the data back to the original data handle associated to the request.*/
-			if (chandle)
+			if (early_data_handle)
 			{
 			{
 				STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 				STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-				STARPU_PTHREAD_MUTEX_LOCK(&(chandle->req_mutex));
-				while (!(chandle->req_ready))
-					STARPU_PTHREAD_COND_WAIT(&(chandle->req_cond), &(chandle->req_mutex));
-				STARPU_PTHREAD_MUTEX_UNLOCK(&(chandle->req_mutex));
+				STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req_mutex));
+				while (!(early_data_handle->req_ready))
+					STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req_cond), &(early_data_handle->req_mutex));
+				STARPU_PTHREAD_MUTEX_UNLOCK(&(early_data_handle->req_mutex));
 				STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 				STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 
 				_STARPU_MPI_DEBUG(3, "The RECV request %p with tag %d has already been received, copying previously received data into handle's pointer..\n", req, req->mpi_tag);
 				_STARPU_MPI_DEBUG(3, "The RECV request %p with tag %d has already been received, copying previously received data into handle's pointer..\n", req, req->mpi_tag);
-				STARPU_ASSERT(req->data_handle != chandle->handle);
+				STARPU_ASSERT(req->data_handle != early_data_handle->handle);
 
 
-				req->internal_req = chandle->req;
+				req->internal_req = early_data_handle->req;
 
 
-				struct _starpu_mpi_copy_cb_args *cb_args = malloc(sizeof(struct _starpu_mpi_copy_cb_args));
+				struct _starpu_mpi_early_data_cb_args *cb_args = malloc(sizeof(struct _starpu_mpi_early_data_cb_args));
 				cb_args->data_handle = req->data_handle;
 				cb_args->data_handle = req->data_handle;
-				cb_args->copy_handle = chandle->handle;
-				cb_args->buffer = chandle->buffer;
+				cb_args->early_handle = early_data_handle->handle;
+				cb_args->buffer = early_data_handle->buffer;
 				cb_args->req = req;
 				cb_args->req = req;
 
 
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
-				starpu_data_acquire_cb(chandle->handle,STARPU_R,_starpu_mpi_copy_cb,(void*) cb_args);
+				starpu_data_acquire_cb(early_data_handle->handle,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
 			}
 			}
 			/* Case : a classic receive request with no send received earlier than expected.
 			/* Case : a classic receive request with no send received earlier than expected.
 			 * We just add the pending receive request to the requests' hashmap. */
 			 * We just add the pending receive request to the requests' hashmap. */
 			else
 			else
 			{
 			{
 				_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %d) into the request hashmap\n", req, req->srcdst, req->mpi_tag);
 				_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %d) into the request hashmap\n", req, req->srcdst, req->mpi_tag);
-				add_app_req(req);
+				_starpu_mpi_early_request_add(req);
 			}
 			}
 		}
 		}
 	}
 	}
@@ -1252,14 +1065,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
 	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
 	_starpu_mpi_cache_init(MPI_COMM_WORLD);
 	_starpu_mpi_cache_init(MPI_COMM_WORLD);
 
 
-	{
-		int nb_nodes, k;
-		MPI_Comm_size(MPI_COMM_WORLD, &nb_nodes);
-		_starpu_mpi_app_req_hashmap = malloc(nb_nodes * sizeof(struct _starpu_mpi_req *));
-		for(k=0 ; k<nb_nodes ; k++) _starpu_mpi_app_req_hashmap[k] = NULL;
-		_starpu_mpi_copy_handle_hashmap = malloc(nb_nodes * sizeof(struct _starpu_mpi_copy_handle_hash_list *));
-		for(k=0 ; k<nb_nodes ; k++) _starpu_mpi_copy_handle_hashmap[k] = NULL;
-	}
+	_starpu_mpi_early_request_init(worldsize);
+	_starpu_mpi_early_data_init(worldsize);
 
 
 	/* notify the main thread that the progression thread is ready */
 	/* notify the main thread that the progression thread is ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
@@ -1276,7 +1083,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
 	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
 	{
 	{
 		/* shall we block ? */
 		/* shall we block ? */
-		unsigned block = _starpu_mpi_req_list_empty(new_requests) && (_starpu_mpi_app_req_hashmap_count == 0);
+		unsigned block = _starpu_mpi_req_list_empty(new_requests) && _starpu_mpi_early_request_count() == 0;
 
 
 #ifndef STARPU_MPI_ACTIVITY
 #ifndef STARPU_MPI_ACTIVITY
 		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
 		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
@@ -1316,7 +1123,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		/* If there is no currently submitted header_req submitted to catch envelopes from senders, and there is some pending receive
 		/* If there is no currently submitted header_req submitted to catch envelopes from senders, and there is some pending receive
 		 * requests in our side, we resubmit a header request. */
 		 * requests in our side, we resubmit a header request. */
 		MPI_Request header_req;
 		MPI_Request header_req;
-		if ((_starpu_mpi_app_req_hashmap_count > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_copy_handle_hashmap) == 0))
+		if ((_starpu_mpi_early_request_count() > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_early_data_handle_hashmap) == 0))
 		{
 		{
 			_STARPU_MPI_DEBUG(3, "Posting a receive to get a data envelop\n");
 			_STARPU_MPI_DEBUG(3, "Posting a receive to get a data envelop\n");
 			MPI_Irecv(recv_env, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, MPI_ANY_SOURCE, _starpu_mpi_tag, MPI_COMM_WORLD, &header_req);
 			MPI_Irecv(recv_env, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, MPI_ANY_SOURCE, _starpu_mpi_tag, MPI_COMM_WORLD, &header_req);
@@ -1342,14 +1149,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 			{
 			{
 				_STARPU_MPI_DEBUG(3, "Searching for application request with tag %d and source %d (size %ld)\n", recv_env->mpi_tag, status.MPI_SOURCE, recv_env->size);
 				_STARPU_MPI_DEBUG(3, "Searching for application request with tag %d and source %d (size %ld)\n", recv_env->mpi_tag, status.MPI_SOURCE, recv_env->size);
 
 
-				struct _starpu_mpi_req *found_req = find_app_req(recv_env->mpi_tag, status.MPI_SOURCE);
+				struct _starpu_mpi_req *found_req = _starpu_mpi_early_request_find(recv_env->mpi_tag, status.MPI_SOURCE);
 
 
 				/* Case : a data will arrive before the matching receive has been submitted in our side of the application.
 				/* Case : a data will arrive before the matching receive has been submitted in our side of the application.
 				 * We will allow a temporary handle to store the incoming data, by submitting a starpu_mpi_irecv_detached
 				 * We will allow a temporary handle to store the incoming data, by submitting a starpu_mpi_irecv_detached
 				 * on this handle, and register this so as the StarPU-MPI layer can remember it.*/
 				 * on this handle, and register this so as the StarPU-MPI layer can remember it.*/
 				if (!found_req)
 				if (!found_req)
 				{
 				{
-					_STARPU_MPI_DEBUG(3, "Request with tag %d and source %d not found, creating a copy_handle to receive incoming data..\n", recv_env->mpi_tag, status.MPI_SOURCE);
+					_STARPU_MPI_DEBUG(3, "Request with tag %d and source %d not found, creating a early_handle to receive incoming data..\n", recv_env->mpi_tag, status.MPI_SOURCE);
 
 
 					starpu_data_handle_t data_handle = NULL;
 					starpu_data_handle_t data_handle = NULL;
 
 
@@ -1357,19 +1164,19 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					data_handle = _starpu_data_get_data_handle_from_tag(recv_env->mpi_tag);
 					data_handle = _starpu_data_get_data_handle_from_tag(recv_env->mpi_tag);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 
-					struct _starpu_mpi_copy_handle* chandle = calloc(1, sizeof(struct _starpu_mpi_copy_handle));
-					STARPU_ASSERT(chandle);
-					STARPU_PTHREAD_MUTEX_INIT(&chandle->req_mutex, NULL);
-					STARPU_PTHREAD_COND_INIT(&chandle->req_cond, NULL);
-					chandle->mpi_tag = recv_env->mpi_tag;
-					chandle->env = recv_env;
-					chandle->source = status.MPI_SOURCE;
+					struct _starpu_mpi_early_data_handle* early_data_handle = calloc(1, sizeof(struct _starpu_mpi_early_data_handle));
+					STARPU_ASSERT(early_data_handle);
+					STARPU_PTHREAD_MUTEX_INIT(&early_data_handle->req_mutex, NULL);
+					STARPU_PTHREAD_COND_INIT(&early_data_handle->req_cond, NULL);
+					early_data_handle->mpi_tag = recv_env->mpi_tag;
+					early_data_handle->env = recv_env;
+					early_data_handle->source = status.MPI_SOURCE;
 
 
 					if (data_handle)
 					if (data_handle)
 					{
 					{
-						chandle->buffer = NULL;
-						starpu_data_register_same(&chandle->handle, data_handle);
-						add_chandle(chandle);
+						early_data_handle->buffer = NULL;
+						starpu_data_register_same(&early_data_handle->handle, data_handle);
+						_starpu_mpi_early_data_add(early_data_handle);
 					}
 					}
 					else
 					else
 					{
 					{
@@ -1377,15 +1184,17 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 						 * we are going to receive the data as a raw memory, and give it
 						 * we are going to receive the data as a raw memory, and give it
 						 * to the application when it post a receive for this tag
 						 * to the application when it post a receive for this tag
 						 */
 						 */
-						_STARPU_MPI_DEBUG(20, "Posting a receive for a data of size %d which has not yet been registered\n", (int)chandle->env->size);
-						chandle->buffer = malloc(chandle->env->size);
-						starpu_vector_data_register(&chandle->handle, STARPU_MAIN_RAM, (uintptr_t) chandle->buffer, chandle->env->size, 1);
-						add_chandle(chandle);
+						_STARPU_MPI_DEBUG(20, "Posting a receive for a data of size %d which has not yet been registered\n", (int)early_data_handle->env->size);
+						early_data_handle->buffer = malloc(early_data_handle->env->size);
+						starpu_vector_data_register(&early_data_handle->handle, STARPU_MAIN_RAM, (uintptr_t) early_data_handle->buffer, early_data_handle->env->size, 1);
+						_starpu_mpi_early_data_add(early_data_handle);
 					}
 					}
 
 
-					_STARPU_MPI_DEBUG(20, "Posting internal detached irecv on copy_handle with tag %d from src %d ..\n", chandle->mpi_tag, status.MPI_SOURCE);
+					_STARPU_MPI_DEBUG(20, "Posting internal detached irecv on early_handle with tag %d from src %d ..\n", early_data_handle->mpi_tag, status.MPI_SOURCE);
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-					chandle->req = _starpu_mpi_irecv_common(chandle->handle, status.MPI_SOURCE, chandle->mpi_tag, MPI_COMM_WORLD, 1, NULL, NULL, 1, 1, recv_env->size);
+					early_data_handle->req = _starpu_mpi_irecv_common(early_data_handle->handle, status.MPI_SOURCE,
+											  early_data_handle->mpi_tag, MPI_COMM_WORLD, 1,
+											  NULL, NULL, 1, 1, recv_env->size);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 
 					// We wait until the request is pushed in the
 					// We wait until the request is pushed in the
@@ -1394,15 +1203,15 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					// on the request and post the corresponding mpi_irecv,
 					// on the request and post the corresponding mpi_irecv,
 					// otherwise, it may lead to read data as envelop
 					// otherwise, it may lead to read data as envelop
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-					STARPU_PTHREAD_MUTEX_LOCK(&(chandle->req->posted_mutex));
-					while (!(chandle->req->posted))
-					     STARPU_PTHREAD_COND_WAIT(&(chandle->req->posted_cond), &(chandle->req->posted_mutex));
-					STARPU_PTHREAD_MUTEX_UNLOCK(&(chandle->req->posted_mutex));
-
-					STARPU_PTHREAD_MUTEX_LOCK(&chandle->req_mutex);
-					chandle->req_ready = 1;
-					STARPU_PTHREAD_COND_BROADCAST(&chandle->req_cond);
-					STARPU_PTHREAD_MUTEX_UNLOCK(&chandle->req_mutex);
+					STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req->posted_mutex));
+					while (!(early_data_handle->req->posted))
+					     STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req->posted_cond), &(early_data_handle->req->posted_mutex));
+					STARPU_PTHREAD_MUTEX_UNLOCK(&(early_data_handle->req->posted_mutex));
+
+					STARPU_PTHREAD_MUTEX_LOCK(&early_data_handle->req_mutex);
+					early_data_handle->req_ready = 1;
+					STARPU_PTHREAD_COND_BROADCAST(&early_data_handle->req_cond);
+					STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_handle->req_mutex);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 				}
 				}
 				/* Case : a matching receive has been found for the incoming data, we handle the correct allocation of the pointer associated to
 				/* Case : a matching receive has been found for the incoming data, we handle the correct allocation of the pointer associated to
@@ -1411,7 +1220,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 				{
 				{
 					_STARPU_MPI_DEBUG(3, "A matching receive has been found for the incoming data with tag %d\n", recv_env->mpi_tag);
 					_STARPU_MPI_DEBUG(3, "A matching receive has been found for the incoming data with tag %d\n", recv_env->mpi_tag);
 
 
-					delete_app_req(found_req);
+					_starpu_mpi_early_request_delete(found_req);
 
 
 					_starpu_mpi_handle_allocate_datatype(found_req->data_handle, &found_req->datatype, &found_req->user_datatype);
 					_starpu_mpi_handle_allocate_datatype(found_req->data_handle, &found_req->datatype, &found_req->user_datatype);
 					if (found_req->user_datatype == 0)
 					if (found_req->user_datatype == 0)
@@ -1448,8 +1257,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(detached_requests), "List of detached requests not empty");
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(detached_requests), "List of detached requests not empty");
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(new_requests), "List of new requests not empty");
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(new_requests), "List of new requests not empty");
 	STARPU_ASSERT_MSG(posted_requests == 0, "Number of posted request is not zero");
 	STARPU_ASSERT_MSG(posted_requests == 0, "Number of posted request is not zero");
-	STARPU_ASSERT_MSG(_starpu_mpi_app_req_hashmap_count == 0, "Number of receive requests left is not zero");
-	STARPU_ASSERT_MSG(_starpu_mpi_copy_handle_hashmap_count == 0, "Number of copy requests left is not zero");
+	_starpu_mpi_early_request_check_termination();
+	_starpu_mpi_early_data_check_termination();
 
 
 	if (argc_argv->initialize_mpi)
 	if (argc_argv->initialize_mpi)
 	{
 	{
@@ -1459,27 +1268,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
 
-	{
-		int n;
-		struct _starpu_mpi_copy_handle_hashlist *hashlist;
-
-		for(n=0 ; n<worldsize; n++)
-		{
-			for(hashlist=_starpu_mpi_copy_handle_hashmap[n]; hashlist != NULL; hashlist=hashlist->hh.next)
-			{
-				_starpu_mpi_copy_handle_list_delete(hashlist->list);
-			}
-			struct _starpu_mpi_copy_handle_hashlist *current, *tmp;
-			HASH_ITER(hh, _starpu_mpi_copy_handle_hashmap[n], current, tmp)
-			{
-				HASH_DEL(_starpu_mpi_copy_handle_hashmap[n], current);
-				free(current);
-			}
-		}
-	}
-
-	free(_starpu_mpi_app_req_hashmap);
-	free(_starpu_mpi_copy_handle_hashmap);
+	_starpu_mpi_early_data_free(worldsize);
+	_starpu_mpi_early_request_free();
 	free(argc_argv);
 	free(argc_argv);
 	free(recv_env);
 	free(recv_env);
 
 

+ 54 - 5
mpi/src/starpu_mpi_cache.c

@@ -30,6 +30,8 @@ struct _starpu_data_entry
 	starpu_data_handle_t data;
 	starpu_data_handle_t data;
 };
 };
 
 
+static starpu_pthread_mutex_t *_cache_sent_mutex;
+static starpu_pthread_mutex_t *_cache_received_mutex;
 static struct _starpu_data_entry **_cache_sent_data = NULL;
 static struct _starpu_data_entry **_cache_sent_data = NULL;
 static struct _starpu_data_entry **_cache_received_data = NULL;
 static struct _starpu_data_entry **_cache_received_data = NULL;
 int _cache_enabled=1;
 int _cache_enabled=1;
@@ -53,11 +55,19 @@ void _starpu_mpi_cache_init(MPI_Comm comm)
 
 
 	MPI_Comm_size(comm, &nb_nodes);
 	MPI_Comm_size(comm, &nb_nodes);
 	_STARPU_MPI_DEBUG(2, "Initialising htable for cache\n");
 	_STARPU_MPI_DEBUG(2, "Initialising htable for cache\n");
+
 	_cache_sent_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
 	_cache_sent_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
-	for(i=0 ; i<nb_nodes ; i++) _cache_sent_data[i] = NULL;
 	_cache_received_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
 	_cache_received_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
-	for(i=0 ; i<nb_nodes ; i++) _cache_received_data[i] = NULL;
-	_starpu_mpi_cache_stats_init(comm);
+	_cache_sent_mutex = malloc(nb_nodes * sizeof(starpu_pthread_mutex_t));
+	_cache_received_mutex = malloc(nb_nodes * sizeof(starpu_pthread_mutex_t));
+
+	for(i=0 ; i<nb_nodes ; i++)
+	{
+		_cache_sent_data[i] = NULL;
+		_cache_received_data[i] = NULL;
+		STARPU_PTHREAD_MUTEX_INIT(&_cache_sent_mutex[i], NULL);
+		STARPU_PTHREAD_MUTEX_INIT(&_cache_received_mutex[i], NULL);
+	}
 }
 }
 
 
 static
 static
@@ -72,27 +82,44 @@ void _starpu_mpi_cache_empty_tables(int world_size)
 	for(i=0 ; i<world_size ; i++)
 	for(i=0 ; i<world_size ; i++)
 	{
 	{
 		struct _starpu_data_entry *entry, *tmp;
 		struct _starpu_data_entry *entry, *tmp;
+
+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[i]);
 		HASH_ITER(hh, _cache_sent_data[i], entry, tmp)
 		HASH_ITER(hh, _cache_sent_data[i], entry, tmp)
 		{
 		{
 			HASH_DEL(_cache_sent_data[i], entry);
 			HASH_DEL(_cache_sent_data[i], entry);
 			free(entry);
 			free(entry);
 		}
 		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[i]);
+
+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[i]);
 		HASH_ITER(hh, _cache_received_data[i], entry, tmp)
 		HASH_ITER(hh, _cache_received_data[i], entry, tmp)
 		{
 		{
 			HASH_DEL(_cache_received_data[i], entry);
 			HASH_DEL(_cache_received_data[i], entry);
 			_starpu_mpi_cache_stats_dec(-1, i, entry->data);
 			_starpu_mpi_cache_stats_dec(-1, i, entry->data);
 			free(entry);
 			free(entry);
 		}
 		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[i]);
 	}
 	}
 }
 }
 
 
 void _starpu_mpi_cache_free(int world_size)
 void _starpu_mpi_cache_free(int world_size)
 {
 {
+	int i;
+
 	if (_cache_enabled == 0) return;
 	if (_cache_enabled == 0) return;
 
 
 	_starpu_mpi_cache_empty_tables(world_size);
 	_starpu_mpi_cache_empty_tables(world_size);
 	free(_cache_sent_data);
 	free(_cache_sent_data);
 	free(_cache_received_data);
 	free(_cache_received_data);
+
+	for(i=0 ; i<world_size ; i++)
+	{
+		STARPU_PTHREAD_MUTEX_DESTROY(&_cache_sent_mutex[i]);
+		STARPU_PTHREAD_MUTEX_DESTROY(&_cache_received_mutex[i]);
+	}
+	free(_cache_sent_mutex);
+	free(_cache_received_mutex);
+
 	_starpu_mpi_cache_stats_free();
 	_starpu_mpi_cache_stats_free();
 }
 }
 
 
@@ -104,6 +131,8 @@ void _starpu_mpi_cache_flush_sent(MPI_Comm comm, starpu_data_handle_t data)
 	for(n=0 ; n<size ; n++)
 	for(n=0 ; n<size ; n++)
 	{
 	{
 		struct _starpu_data_entry *already_sent;
 		struct _starpu_data_entry *already_sent;
+
+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[n]);
 		HASH_FIND_PTR(_cache_sent_data[n], &data, already_sent);
 		HASH_FIND_PTR(_cache_sent_data[n], &data, already_sent);
 		if (already_sent)
 		if (already_sent)
 		{
 		{
@@ -111,6 +140,7 @@ void _starpu_mpi_cache_flush_sent(MPI_Comm comm, starpu_data_handle_t data)
 			HASH_DEL(_cache_sent_data[n], already_sent);
 			HASH_DEL(_cache_sent_data[n], already_sent);
 			free(already_sent);
 			free(already_sent);
 		}
 		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[n]);
 	}
 	}
 }
 }
 
 
@@ -119,6 +149,7 @@ void _starpu_mpi_cache_flush_recv(starpu_data_handle_t data, int me)
 	int mpi_rank = starpu_data_get_rank(data);
 	int mpi_rank = starpu_data_get_rank(data);
 	struct _starpu_data_entry *already_received;
 	struct _starpu_data_entry *already_received;
 
 
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[mpi_rank]);
 	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
 	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
 	if (already_received)
 	if (already_received)
 	{
 	{
@@ -131,6 +162,7 @@ void _starpu_mpi_cache_flush_recv(starpu_data_handle_t data, int me)
 		free(already_received);
 		free(already_received);
 		starpu_data_invalidate_submit(data);
 		starpu_data_invalidate_submit(data);
 	}
 	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[mpi_rank]);
 }
 }
 
 
 void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
 void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
@@ -146,6 +178,8 @@ void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
 	for(i=0 ; i<nb_nodes ; i++)
 	for(i=0 ; i<nb_nodes ; i++)
 	{
 	{
 		struct _starpu_data_entry *entry, *tmp;
 		struct _starpu_data_entry *entry, *tmp;
+
+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[i]);
 		HASH_ITER(hh, _cache_sent_data[i], entry, tmp)
 		HASH_ITER(hh, _cache_sent_data[i], entry, tmp)
 		{
 		{
 			mpi_rank = starpu_data_get_rank(entry->data);
 			mpi_rank = starpu_data_get_rank(entry->data);
@@ -154,6 +188,9 @@ void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
 			HASH_DEL(_cache_sent_data[i], entry);
 			HASH_DEL(_cache_sent_data[i], entry);
 			free(entry);
 			free(entry);
 		}
 		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[i]);
+
+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[i]);
 		HASH_ITER(hh, _cache_received_data[i], entry, tmp)
 		HASH_ITER(hh, _cache_received_data[i], entry, tmp)
 		{
 		{
 			mpi_rank = starpu_data_get_rank(entry->data);
 			mpi_rank = starpu_data_get_rank(entry->data);
@@ -163,6 +200,7 @@ void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
 			_starpu_mpi_cache_stats_dec(my_rank, i, entry->data);
 			_starpu_mpi_cache_stats_dec(my_rank, i, entry->data);
 			free(entry);
 			free(entry);
 		}
 		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[i]);
 	}
 	}
 }
 }
 
 
@@ -180,6 +218,7 @@ void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 
 
 	for(i=0 ; i<nb_nodes ; i++)
 	for(i=0 ; i<nb_nodes ; i++)
 	{
 	{
+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[i]);
 		HASH_FIND_PTR(_cache_sent_data[i], &data_handle, avail);
 		HASH_FIND_PTR(_cache_sent_data[i], &data_handle, avail);
 		if (avail)
 		if (avail)
 		{
 		{
@@ -187,6 +226,9 @@ void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 			HASH_DEL(_cache_sent_data[i], avail);
 			HASH_DEL(_cache_sent_data[i], avail);
 			free(avail);
 			free(avail);
 		}
 		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[i]);
+
+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[i]);
 		HASH_FIND_PTR(_cache_received_data[i], &data_handle, avail);
 		HASH_FIND_PTR(_cache_received_data[i], &data_handle, avail);
 		if (avail)
 		if (avail)
 		{
 		{
@@ -195,6 +237,7 @@ void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 			_starpu_mpi_cache_stats_dec(my_rank, i, data_handle);
 			_starpu_mpi_cache_stats_dec(my_rank, i, data_handle);
 			free(avail);
 			free(avail);
 		}
 		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[i]);
 	}
 	}
 
 
 	if (mpi_rank != my_rank && mpi_rank != -1)
 	if (mpi_rank != my_rank && mpi_rank != -1)
@@ -203,9 +246,11 @@ void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 
 
 void *_starpu_mpi_already_received(int src, starpu_data_handle_t data, int mpi_rank)
 void *_starpu_mpi_already_received(int src, starpu_data_handle_t data, int mpi_rank)
 {
 {
+	struct _starpu_data_entry *already_received;
+
 	if (_cache_enabled == 0) return NULL;
 	if (_cache_enabled == 0) return NULL;
 
 
-	struct _starpu_data_entry *already_received;
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[mpi_rank]);
 	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
 	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
 	if (already_received == NULL)
 	if (already_received == NULL)
 	{
 	{
@@ -218,14 +263,17 @@ void *_starpu_mpi_already_received(int src, starpu_data_handle_t data, int mpi_r
 	{
 	{
 		_STARPU_MPI_DEBUG(2, "Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
 		_STARPU_MPI_DEBUG(2, "Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
 	}
 	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[mpi_rank]);
 	return already_received;
 	return already_received;
 }
 }
 
 
 void *_starpu_mpi_already_sent(starpu_data_handle_t data, int dest)
 void *_starpu_mpi_already_sent(starpu_data_handle_t data, int dest)
 {
 {
+	struct _starpu_data_entry *already_sent;
+
 	if (_cache_enabled == 0) return NULL;
 	if (_cache_enabled == 0) return NULL;
 
 
-	struct _starpu_data_entry *already_sent;
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[dest]);
 	HASH_FIND_PTR(_cache_sent_data[dest], &data, already_sent);
 	HASH_FIND_PTR(_cache_sent_data[dest], &data, already_sent);
 	if (already_sent == NULL)
 	if (already_sent == NULL)
 	{
 	{
@@ -238,6 +286,7 @@ void *_starpu_mpi_already_sent(starpu_data_handle_t data, int dest)
 	{
 	{
 		_STARPU_MPI_DEBUG(2, "Do not send data %p to node %d as it has already been sent\n", data, dest);
 		_STARPU_MPI_DEBUG(2, "Do not send data %p to node %d as it has already been sent\n", data, dest);
 	}
 	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[dest]);
 	return already_sent;
 	return already_sent;
 }
 }
 
 

+ 168 - 0
mpi/src/starpu_mpi_early_data.c

@@ -0,0 +1,168 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <starpu_mpi.h>
+#include <starpu_mpi_early_data.h>
+#include <starpu_mpi_private.h>
+#include <common/uthash.h>
+
+struct _starpu_mpi_early_data_handle_hashlist
+{
+	struct _starpu_mpi_early_data_handle_list *list;
+	UT_hash_handle hh;
+	int mpi_tag;
+};
+
+/** stores data which have been received by MPI but have not been requested by the application */
+static struct _starpu_mpi_early_data_handle_hashlist **_starpu_mpi_early_data_handle_hashmap = NULL;
+static int _starpu_mpi_early_data_handle_hashmap_count = 0;
+
+void _starpu_mpi_early_data_init(int world_size)
+{
+	int k;
+
+	_starpu_mpi_early_data_handle_hashmap = malloc(world_size * sizeof(struct _starpu_mpi_early_data_handle_hash_list *));
+	for(k=0 ; k<world_size ; k++) _starpu_mpi_early_data_handle_hashmap[k] = NULL;
+}
+
+void _starpu_mpi_early_data_check_termination()
+{
+	STARPU_ASSERT_MSG(_starpu_mpi_early_data_handle_hashmap_count == 0, "Number of copy requests left is not zero");
+}
+
+void _starpu_mpi_early_data_free(int world_size)
+{
+	int n;
+	struct _starpu_mpi_early_data_handle_hashlist *hashlist;
+
+	for(n=0 ; n<world_size; n++)
+	{
+		for(hashlist=_starpu_mpi_early_data_handle_hashmap[n]; hashlist != NULL; hashlist=hashlist->hh.next)
+		{
+			_starpu_mpi_early_data_handle_list_delete(hashlist->list);
+		}
+		struct _starpu_mpi_early_data_handle_hashlist *current, *tmp;
+		HASH_ITER(hh, _starpu_mpi_early_data_handle_hashmap[n], current, tmp)
+		{
+			HASH_DEL(_starpu_mpi_early_data_handle_hashmap[n], current);
+			free(current);
+		}
+	}
+	free(_starpu_mpi_early_data_handle_hashmap);
+}
+
+#ifdef STARPU_VERBOSE
+static void _starpu_mpi_early_data_handle_display_hash(int source, int tag)
+{
+	struct _starpu_mpi_early_data_handle_hashlist *hashlist;
+	HASH_FIND_INT(_starpu_mpi_early_data_handle_hashmap[source], &tag, hashlist);
+
+	if (hashlist == NULL)
+	{
+		_STARPU_MPI_DEBUG(60, "Hashlist for source %d and tag %d does not exist\n", source, tag);
+	}
+	else if (_starpu_mpi_early_data_handle_list_empty(hashlist->list))
+	{
+		_STARPU_MPI_DEBUG(60, "Hashlist for source %d and tag %d is empty\n", source, tag);
+	}
+	else
+	{
+		struct _starpu_mpi_early_data_handle *cur;
+		for (cur = _starpu_mpi_early_data_handle_list_begin(hashlist->list) ;
+		     cur != _starpu_mpi_early_data_handle_list_end(hashlist->list);
+		     cur = _starpu_mpi_early_data_handle_list_next(cur))
+		{
+			_STARPU_MPI_DEBUG(60, "Element for source %d and tag %d: %p\n", source, tag, cur);
+		}
+	}
+}
+#endif
+
+static
+struct _starpu_mpi_early_data_handle *_starpu_mpi_early_data_pop(int mpi_tag, int source, int delete)
+{
+	struct _starpu_mpi_early_data_handle_hashlist *hashlist;
+	struct _starpu_mpi_early_data_handle *early_data_handle;
+
+	_STARPU_MPI_DEBUG(60, "Looking for early_data_handle with tag %d in the hashmap[%d]\n", mpi_tag, source);
+	HASH_FIND_INT(_starpu_mpi_early_data_handle_hashmap[source], &mpi_tag, hashlist);
+	if (hashlist == NULL)
+	{
+		early_data_handle = NULL;
+	}
+	else
+	{
+		if (_starpu_mpi_early_data_handle_list_empty(hashlist->list))
+		{
+			early_data_handle = NULL;
+		}
+		else
+		{
+			if (delete == 1)
+			{
+				early_data_handle = _starpu_mpi_early_data_handle_list_pop_front(hashlist->list);
+			}
+			else
+			{
+				early_data_handle = _starpu_mpi_early_data_handle_list_front(hashlist->list);
+			}
+		}
+	}
+	_STARPU_MPI_DEBUG(60, "Found early_data_handle %p with tag %d in the hashmap[%d]\n", early_data_handle, mpi_tag, source);
+	return early_data_handle;
+}
+
+struct _starpu_mpi_early_data_handle *_starpu_mpi_early_data_find(int mpi_tag, int source)
+{
+	return _starpu_mpi_early_data_pop(mpi_tag, source, 0);
+}
+
+void _starpu_mpi_early_data_add(struct _starpu_mpi_early_data_handle *early_data_handle)
+{
+	_STARPU_MPI_DEBUG(60, "Trying to add early_data_handle %p with tag %d in the hashmap[%d]\n", early_data_handle, early_data_handle->mpi_tag, early_data_handle->source);
+
+	struct _starpu_mpi_early_data_handle_hashlist *hashlist;
+	HASH_FIND_INT(_starpu_mpi_early_data_handle_hashmap[early_data_handle->source], &early_data_handle->mpi_tag, hashlist);
+	if (hashlist == NULL)
+	{
+		hashlist = malloc(sizeof(struct _starpu_mpi_early_data_handle_hashlist));
+		hashlist->list = _starpu_mpi_early_data_handle_list_new();
+		hashlist->mpi_tag = early_data_handle->mpi_tag;
+		HASH_ADD_INT(_starpu_mpi_early_data_handle_hashmap[early_data_handle->source], mpi_tag, hashlist);
+	}
+	_starpu_mpi_early_data_handle_list_push_back(hashlist->list, early_data_handle);
+	_starpu_mpi_early_data_handle_hashmap_count ++;
+#ifdef STARPU_VERBOSE
+	_starpu_mpi_early_data_handle_display_hash(early_data_handle->source, early_data_handle->mpi_tag);
+#endif
+}
+
+void _starpu_mpi_early_data_delete(struct _starpu_mpi_early_data_handle *early_data_handle)
+{
+	_STARPU_MPI_DEBUG(60, "Trying to delete early_data_handle %p with tag %d in the hashmap[%d]\n", early_data_handle, early_data_handle->mpi_tag, early_data_handle->source);
+	struct _starpu_mpi_early_data_handle *found = _starpu_mpi_early_data_pop(early_data_handle->mpi_tag, early_data_handle->source, 1);
+
+	STARPU_ASSERT_MSG(found == early_data_handle,
+			  "[_starpu_mpi_early_data_delete][error] early_data_handle %p with tag %d is NOT in the hashmap[%d]\n", early_data_handle, early_data_handle->mpi_tag, early_data_handle->source);
+
+	_starpu_mpi_early_data_handle_hashmap_count --;
+#ifdef STARPU_VERBOSE
+	_starpu_mpi_early_data_handle_display_hash(early_data_handle->source, early_data_handle->mpi_tag);
+#endif
+}
+

+ 55 - 0
mpi/src/starpu_mpi_early_data.h

@@ -0,0 +1,55 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_EARLY_DATA_H__
+#define __STARPU_MPI_EARLY_DATA_H__
+
+#include <starpu.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include <common/config.h>
+#include <common/list.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+LIST_TYPE(_starpu_mpi_early_data_handle,
+	  starpu_data_handle_t handle;
+	  struct _starpu_mpi_envelope *env;
+	  struct _starpu_mpi_req *req;
+	  void *buffer;
+	  int mpi_tag;
+	  int source;
+	  int req_ready;
+	  starpu_pthread_mutex_t req_mutex;
+	  starpu_pthread_cond_t req_cond;
+);
+
+void _starpu_mpi_early_data_init(int world_size);
+void _starpu_mpi_early_data_check_termination();
+void _starpu_mpi_early_data_free(int world_size);
+
+struct _starpu_mpi_early_data_handle *_starpu_mpi_early_data_find(int mpi_tag, int source);
+void _starpu_mpi_early_data_add(struct _starpu_mpi_early_data_handle *early_data_handle);
+void _starpu_mpi_early_data_delete(struct _starpu_mpi_early_data_handle *early_data_handle);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_MPI_EARLY_DATA_H__ */

+ 104 - 0
mpi/src/starpu_mpi_early_request.c

@@ -0,0 +1,104 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <starpu_mpi.h>
+#include <starpu_mpi_private.h>
+#include <starpu_mpi_early_request.h>
+#include <common/uthash.h>
+
+/** stores application requests for which data have not been received yet */
+static struct _starpu_mpi_req **_starpu_mpi_app_req_hashmap = NULL;
+static int _starpu_mpi_app_req_hashmap_count = 0;
+
+void _starpu_mpi_early_request_init(int world_size)
+{
+	int k;
+
+	_starpu_mpi_app_req_hashmap = malloc(world_size * sizeof(struct _starpu_mpi_req *));
+	for(k=0 ; k<world_size ; k++) _starpu_mpi_app_req_hashmap[k] = NULL;
+}
+
+void _starpu_mpi_early_request_free()
+{
+	free(_starpu_mpi_app_req_hashmap);
+}
+
+int _starpu_mpi_early_request_count()
+{
+	return _starpu_mpi_app_req_hashmap_count;
+}
+
+void _starpu_mpi_early_request_check_termination()
+{
+	STARPU_ASSERT_MSG(_starpu_mpi_early_request_count() == 0, "Number of receive requests left is not zero");
+}
+
+struct _starpu_mpi_req* _starpu_mpi_early_request_find(int mpi_tag, int source)
+{
+	struct _starpu_mpi_req* req;
+
+	HASH_FIND_INT(_starpu_mpi_app_req_hashmap[source], &mpi_tag, req);
+
+	return req;
+}
+
+void _starpu_mpi_early_request_add(struct _starpu_mpi_req *req)
+{
+	struct _starpu_mpi_req *test_req;
+
+	test_req = _starpu_mpi_early_request_find(req->mpi_tag, req->srcdst);
+
+	if (test_req == NULL)
+	{
+		HASH_ADD_INT(_starpu_mpi_app_req_hashmap[req->srcdst], mpi_tag, req);
+		_starpu_mpi_app_req_hashmap_count ++;
+		_STARPU_MPI_DEBUG(3, "Adding request %p with tag %d in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
+	}
+	else
+	{
+		_STARPU_MPI_DEBUG(3, "[Error] request %p with tag %d already in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
+		int seq_const = starpu_data_get_sequential_consistency_flag(req->data_handle);
+		if (seq_const &&  req->sequential_consistency)
+		{
+			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap[%d], while another request %p with the same tag is already in it. \n Sequential consistency is activated : this is not supported by StarPU.", req, req->mpi_tag, req->srcdst, test_req);
+		}
+		else
+		{
+			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap[%d], while another request %p with the same tag is already in it. \n Sequential consistency isn't activated for this handle : you should want to add dependencies between requests for which the sequential consistency is deactivated.", req, req->mpi_tag, req->srcdst, test_req);
+		}
+	}
+}
+
+void _starpu_mpi_early_request_delete(struct _starpu_mpi_req *req)
+{
+	struct _starpu_mpi_req *test_req;
+
+	test_req = _starpu_mpi_early_request_find(req->mpi_tag, req->srcdst);
+
+	if (test_req != NULL)
+	{
+		HASH_DEL(_starpu_mpi_app_req_hashmap[req->srcdst], req);
+		_starpu_mpi_app_req_hashmap_count --;
+		_STARPU_MPI_DEBUG(3, "Deleting application request %p with tag %d from the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
+	}
+	else
+	{
+		_STARPU_MPI_DEBUG(3, "[Warning] request %p with tag %d is NOT in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
+	}
+}
+

+ 44 - 0
mpi/src/starpu_mpi_early_request.h

@@ -0,0 +1,44 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_EARLY_REQUEST_H__
+#define __STARPU_MPI_EARLY_REQUEST_H__
+
+#include <starpu.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include <common/config.h>
+#include <common/list.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void _starpu_mpi_early_request_init(int world_size);
+void _starpu_mpi_early_request_free();
+int _starpu_mpi_early_request_count();
+void _starpu_mpi_early_request_check_termination();
+
+void _starpu_mpi_early_request_add(struct _starpu_mpi_req *req);
+struct _starpu_mpi_req* _starpu_mpi_early_request_find(int mpi_tag, int source);
+void _starpu_mpi_early_request_delete(struct _starpu_mpi_req *req);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_MPI_EARLY_REQUEST_H__ */

+ 26 - 17
mpi/tests/mpi_earlyrecv.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,9 +21,9 @@
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-	int ret, rank, size, i, nb_requests;
-	starpu_data_handle_t tab_handle[3];
-	starpu_mpi_req request[3];
+	int ret, rank, size, i;
+	starpu_data_handle_t tab_handle[4];
+	starpu_mpi_req request[2] = {NULL, NULL};
 
 
 	MPI_Init(NULL, NULL);
 	MPI_Init(NULL, NULL);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
@@ -43,11 +43,14 @@ int main(int argc, char **argv)
 	ret = starpu_mpi_init(NULL, NULL, 0);
 	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
-	for(i=0 ; i<3 ; i++)
+	for(i=0 ; i<4 ; i++)
 	{
 	{
-		starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&rank, sizeof(int));
-		starpu_data_set_tag(tab_handle[i], i);
-		request[i] = NULL;
+		if (i<3 || rank%2)
+		{
+			// all data are registered on all nodes, bu the 4th data which is not registered on the receiving node
+			starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&rank, sizeof(int));
+			starpu_mpi_data_register(tab_handle[i], i, rank);
+		}
 	}
 	}
 
 
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
@@ -56,23 +59,30 @@ int main(int argc, char **argv)
 
 
 	if (rank%2)
 	if (rank%2)
 	{
 	{
+		// this data will be received as an early registered data
 		starpu_mpi_isend(tab_handle[0], &request[0], other_rank, 0, MPI_COMM_WORLD);
 		starpu_mpi_isend(tab_handle[0], &request[0], other_rank, 0, MPI_COMM_WORLD);
+		// this data will be received as an early UNregistered data
+		starpu_mpi_isend(tab_handle[3], &request[1], other_rank, 3, MPI_COMM_WORLD);
+
+		starpu_mpi_send(tab_handle[1], other_rank, 1, MPI_COMM_WORLD);
 		starpu_mpi_recv(tab_handle[2], other_rank, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
 		starpu_mpi_recv(tab_handle[2], other_rank, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-		starpu_mpi_isend(tab_handle[1], &request[1], other_rank, 1, MPI_COMM_WORLD);
-		nb_requests = 2;
 	}
 	}
 	else
 	else
 	{
 	{
+		starpu_mpi_recv(tab_handle[1], other_rank, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+		starpu_mpi_send(tab_handle[2], other_rank, 2, MPI_COMM_WORLD);
+
+		// we register the data
+		starpu_variable_data_register(&tab_handle[3], STARPU_MAIN_RAM, (uintptr_t)&rank, sizeof(int));
+		starpu_mpi_data_register(tab_handle[3], 3, rank);
+		starpu_mpi_irecv(tab_handle[3], &request[1], other_rank, 3, MPI_COMM_WORLD);
 		starpu_mpi_irecv(tab_handle[0], &request[0], other_rank, 0, MPI_COMM_WORLD);
 		starpu_mpi_irecv(tab_handle[0], &request[0], other_rank, 0, MPI_COMM_WORLD);
-		starpu_mpi_irecv(tab_handle[1], &request[1], other_rank, 1, MPI_COMM_WORLD);
-		starpu_mpi_isend(tab_handle[2], &request[2], other_rank, 2, MPI_COMM_WORLD);
-		nb_requests = 3;
 	}
 	}
 
 
 	int finished=0;
 	int finished=0;
 	while (!finished)
 	while (!finished)
 	{
 	{
-		for(i=0 ; i<nb_requests ; i++)
+		for(i=0 ; i<2 ; i++)
 		{
 		{
 			if (request[i])
 			if (request[i])
 			{
 			{
@@ -83,11 +93,10 @@ int main(int argc, char **argv)
 					FPRINTF_MPI("request[%d] = %d %p\n", i, flag, request[i]);
 					FPRINTF_MPI("request[%d] = %d %p\n", i, flag, request[i]);
 			}
 			}
 		}
 		}
-		finished = request[0] == NULL;
-		for(i=1 ; i<nb_requests ; i++) finished = finished && request[i] == NULL;
+		finished = request[0] == NULL && request[1] == NULL;
 	}
 	}
 
 
-	for(i=0 ; i<3 ; i++)
+	for(i=0 ; i<4 ; i++)
 		starpu_data_unregister(tab_handle[i]);
 		starpu_data_unregister(tab_handle[i]);
 
 
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();

+ 3 - 3
mpi/tests/mpi_earlyrecv2.c

@@ -131,7 +131,7 @@ int exchange_variable(int rank, int detached)
 	{
 	{
 		value[i]=i*rank;
 		value[i]=i*rank;
 		starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&value[i], sizeof(int));
 		starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&value[i], sizeof(int));
-		starpu_data_set_tag(tab_handle[i], i);
+		starpu_mpi_data_register(tab_handle[i], i, rank);
 	}
 	}
 	ret = exchange(rank, tab_handle, check_variable, detached);
 	ret = exchange(rank, tab_handle, check_variable, detached);
 	for(i=0 ; i<NB ; i++)
 	for(i=0 ; i<NB ; i++)
@@ -154,7 +154,7 @@ int exchange_void(int rank, int detached)
 	for(i=0 ; i<NB ; i++)
 	for(i=0 ; i<NB ; i++)
 	{
 	{
 		starpu_void_data_register(&tab_handle[i]);
 		starpu_void_data_register(&tab_handle[i]);
-		starpu_data_set_tag(tab_handle[i], i);
+		starpu_mpi_data_register(tab_handle[i], i, rank);
 	}
 	}
 	ret = exchange(rank, tab_handle, check_void, detached);
 	ret = exchange(rank, tab_handle, check_void, detached);
 	for(i=0 ; i<NB ; i++)
 	for(i=0 ; i<NB ; i++)
@@ -191,7 +191,7 @@ int exchange_complex(int rank, int detached)
 		real[i] = (i*rank)+12;
 		real[i] = (i*rank)+12;
 		imaginary[i] = (i*rank)+45;
 		imaginary[i] = (i*rank)+45;
 		starpu_complex_data_register(&handle[i], STARPU_MAIN_RAM, &real[i], &imaginary[i], 1);
 		starpu_complex_data_register(&handle[i], STARPU_MAIN_RAM, &real[i], &imaginary[i], 1);
-		starpu_data_set_tag(handle[i], i);
+		starpu_mpi_data_register(handle[i], i, rank);
 	}
 	}
 	ret = exchange(rank, handle, check_complex, detached);
 	ret = exchange(rank, handle, check_complex, detached);
 	for(i=0 ; i<NB ; i++)
 	for(i=0 ; i<NB ; i++)

+ 1 - 1
sc_hypervisor/examples/cholesky/cholesky_grain_tag.c

@@ -406,7 +406,7 @@ int main(int argc, char **argv)
 	float *test_mat = malloc(size*size*sizeof(float));
 	float *test_mat = malloc(size*size*sizeof(float));
 	STARPU_ASSERT(test_mat);
 	STARPU_ASSERT(test_mat);
 
 
-	SSYRK("L", "N", size, size, 1.0f,
+	STARPU_SSYRK("L", "N", size, size, 1.0f,
 				mat, size, 0.0f, test_mat, size);
 				mat, size, 0.0f, test_mat, size);
 
 
 	FPRINTF(stderr, "comparing results ...\n");
 	FPRINTF(stderr, "comparing results ...\n");

+ 1 - 1
sc_hypervisor/examples/cholesky/cholesky_implicit.c

@@ -290,7 +290,7 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 		float *test_mat = malloc(size*size*sizeof(float));
 		float *test_mat = malloc(size*size*sizeof(float));
 		STARPU_ASSERT(test_mat);
 		STARPU_ASSERT(test_mat);
 
 
-		SSYRK("L", "N", size, size, 1.0f,
+		STARPU_SSYRK("L", "N", size, size, 1.0f,
 					mat, size, 0.0f, test_mat, size);
 					mat, size, 0.0f, test_mat, size);
 
 
 		FPRINTF(stderr, "comparing results ...\n");
 		FPRINTF(stderr, "comparing results ...\n");

+ 5 - 5
sc_hypervisor/examples/cholesky/cholesky_kernels.c

@@ -52,7 +52,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 		if (worker_size == 1)
 		if (worker_size == 1)
 		{
 		{
 			/* Sequential CPU kernel */
 			/* Sequential CPU kernel */
-			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
+			STARPU_SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
 				right, ld12, 1.0f, center, ld22);
 				right, ld12, 1.0f, center, ld22);
 		}
 		}
 		else
 		else
@@ -66,7 +66,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 			float *new_left = &left[block_size*rank];
 			float *new_left = &left[block_size*rank];
 			float *new_center = &center[block_size*rank];
 			float *new_center = &center[block_size*rank];
 
 
-			SGEMM("N", "T", dy, new_dx, dz, -1.0f, new_left, ld21, 
+			STARPU_SGEMM("N", "T", dy, new_dx, dz, -1.0f, new_left, ld21, 
 				right, ld12, 1.0f, new_center, ld22);
 				right, ld12, 1.0f, new_center, ld22);
 		}
 		}
 	}
 	}
@@ -117,7 +117,7 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 	switch (s)
 	switch (s)
 	{
 	{
 		case 0:
 		case 0:
-			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			STARPU_STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
@@ -177,9 +177,9 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 
 
 				STARPU_ASSERT(lambda11 != 0.0f);
 				STARPU_ASSERT(lambda11 != 0.0f);
 		
 		
-				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+				STARPU_SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
 		
 		
-				SSYR("L", nx - z - 1, -1.0f, 
+				STARPU_SSYR("L", nx - z - 1, -1.0f, 
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+(z+1)*ld], ld);
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}
 			}

+ 1 - 1
sc_hypervisor/examples/cholesky/cholesky_tag.c

@@ -391,7 +391,7 @@ int main(int argc, char **argv)
 	float *test_mat = malloc(size*size*sizeof(float));
 	float *test_mat = malloc(size*size*sizeof(float));
 	STARPU_ASSERT(test_mat);
 	STARPU_ASSERT(test_mat);
 
 
-	SSYRK("L", "N", size, size, 1.0f,
+	STARPU_SSYRK("L", "N", size, size, 1.0f,
 				mat, size, 0.0f, test_mat, size);
 				mat, size, 0.0f, test_mat, size);
 
 
 	FPRINTF(stderr, "comparing results ...\n");
 	FPRINTF(stderr, "comparing results ...\n");

+ 17 - 2
src/common/fxt.h

@@ -107,8 +107,6 @@
 
 
 #define _STARPU_FUT_EVENT	0x513c
 #define _STARPU_FUT_EVENT	0x513c
 
 
-#define _STARPU_FUT_WORKER_SCHEDULING_START	0x513e
-
 #define _STARPU_FUT_LOCKING_MUTEX	0x5140	
 #define _STARPU_FUT_LOCKING_MUTEX	0x5140	
 #define _STARPU_FUT_MUTEX_LOCKED	0x5141	
 #define _STARPU_FUT_MUTEX_LOCKED	0x5141	
 
 
@@ -156,6 +154,11 @@
 #define _STARPU_FUT_BARRIER_WAIT_BEGIN		0x5162
 #define _STARPU_FUT_BARRIER_WAIT_BEGIN		0x5162
 #define _STARPU_FUT_BARRIER_WAIT_END		0x5163
 #define _STARPU_FUT_BARRIER_WAIT_END		0x5163
 
 
+#define _STARPU_FUT_WORKER_SCHEDULING_START	0x5164
+#define _STARPU_FUT_WORKER_SCHEDULING_END	0x5165
+#define _STARPU_FUT_WORKER_SCHEDULING_PUSH	0x5166
+#define _STARPU_FUT_WORKER_SCHEDULING_POP	0x5167
+
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
 #include <fxt/fxt.h>
 #include <fxt/fxt.h>
 #include <fxt/fut.h>
 #include <fxt/fut.h>
@@ -487,6 +490,15 @@ do {										\
 #define _STARPU_TRACE_WORKER_SCHEDULING_START	\
 #define _STARPU_TRACE_WORKER_SCHEDULING_START	\
 	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SCHEDULING_START, _starpu_gettid());
 	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SCHEDULING_START, _starpu_gettid());
 
 
+#define _STARPU_TRACE_WORKER_SCHEDULING_END	\
+	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SCHEDULING_END, _starpu_gettid());
+
+#define _STARPU_TRACE_WORKER_SCHEDULING_PUSH	\
+	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SCHEDULING_PUSH, _starpu_gettid());
+
+#define _STARPU_TRACE_WORKER_SCHEDULING_POP	\
+	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SCHEDULING_POP, _starpu_gettid());
+
 #define _STARPU_TRACE_WORKER_SLEEP_START	\
 #define _STARPU_TRACE_WORKER_SLEEP_START	\
 	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SLEEP_START, _starpu_gettid());
 	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SLEEP_START, _starpu_gettid());
 
 
@@ -760,6 +772,9 @@ do {										\
 #define _STARPU_TRACE_WORKER_DEINIT_START	do {} while(0)
 #define _STARPU_TRACE_WORKER_DEINIT_START	do {} while(0)
 #define _STARPU_TRACE_WORKER_DEINIT_END(a)	do {} while(0)
 #define _STARPU_TRACE_WORKER_DEINIT_END(a)	do {} while(0)
 #define _STARPU_TRACE_WORKER_SCHEDULING_START		do {} while(0)
 #define _STARPU_TRACE_WORKER_SCHEDULING_START		do {} while(0)
+#define _STARPU_TRACE_WORKER_SCHEDULING_END		do {} while(0)
+#define _STARPU_TRACE_WORKER_SCHEDULING_PUSH		do {} while(0)
+#define _STARPU_TRACE_WORKER_SCHEDULING_POP		do {} while(0)
 #define _STARPU_TRACE_WORKER_SLEEP_START		do {} while(0)
 #define _STARPU_TRACE_WORKER_SLEEP_START		do {} while(0)
 #define _STARPU_TRACE_WORKER_SLEEP_END		do {} while(0)
 #define _STARPU_TRACE_WORKER_SLEEP_END		do {} while(0)
 #define _STARPU_TRACE_USER_DEFINED_START		do {} while(0)
 #define _STARPU_TRACE_USER_DEFINED_START		do {} while(0)

+ 4 - 4
src/common/utils.h

@@ -82,10 +82,10 @@
 #  define _STARPU_DEBUG(fmt, ...) do { } while (0)
 #  define _STARPU_DEBUG(fmt, ...) do { } while (0)
 #endif
 #endif
 
 
-#ifdef STARPU_VERBOSE0
-#  define _STARPU_LOG_IN()             do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s] -->\n", pthread_self(), __starpu_func__ ); }} while(0)
-#  define _STARPU_LOG_OUT()            do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s] <--\n", pthread_self(), __starpu_func__ ); }} while(0)
-#  define _STARPU_LOG_OUT_TAG(outtag)  do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s] <-- (%s)\n", pthread_self(), __starpu_func__, outtag); }} while(0)
+#ifdef STARPU_EXTRA_VERBOSE
+#  define _STARPU_LOG_IN()             do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] -->\n", pthread_self(), __starpu_func__,__FILE__,  __LINE__); }} while(0)
+#  define _STARPU_LOG_OUT()            do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <--\n", pthread_self(), __starpu_func__, __FILE__,  __LINE__); }} while(0)
+#  define _STARPU_LOG_OUT_TAG(outtag)  do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <-- (%s)\n", pthread_self(), __starpu_func__, __FILE__, __LINE__, outtag); }} while(0)
 #else
 #else
 #  define _STARPU_LOG_IN()
 #  define _STARPU_LOG_IN()
 #  define _STARPU_LOG_OUT()
 #  define _STARPU_LOG_OUT()

+ 16 - 0
src/core/sched_ctx.c

@@ -1157,6 +1157,22 @@ struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsig
 	return sched_ctx->workers;
 	return sched_ctx->workers;
 }
 }
 
 
+void starpu_sched_ctx_display_workers(unsigned sched_ctx_id, FILE *f)
+{
+	int *workerids = NULL;
+	unsigned nworkers;
+	unsigned i;
+
+	nworkers = starpu_sched_ctx_get_workers_list(sched_ctx_id, &workerids);
+	fprintf(f, "[sched_ctx %d]: %d worker%s\n", sched_ctx_id, nworkers, nworkers>1?"s":"");
+	for (i = 0; i < nworkers; i++)
+	{
+		char name[256];
+		starpu_worker_get_name(workerids[i], name, 256);
+		fprintf(f, "\t\t%s\n", name);
+	}
+}
+
 unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
 unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
 {
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);

+ 8 - 1
src/core/sched_policy.c

@@ -455,7 +455,14 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 			starpu_pthread_rwlock_t *changing_ctx_mutex = _starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx->id);
 			starpu_pthread_rwlock_t *changing_ctx_mutex = _starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx->id);
 			STARPU_PTHREAD_RWLOCK_RDLOCK(changing_ctx_mutex);
 			STARPU_PTHREAD_RWLOCK_RDLOCK(changing_ctx_mutex);
 			nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);
 			nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);
-			ret = nworkers == 0 ? -1 : sched_ctx->sched_policy->push_task(task);
+			if (nworkers == 0)
+				ret = -1;
+			else
+			{
+				_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
+				ret = sched_ctx->sched_policy->push_task(task);
+				_STARPU_TRACE_WORKER_SCHEDULING_POP;
+			}
 			STARPU_PTHREAD_RWLOCK_UNLOCK(changing_ctx_mutex);
 			STARPU_PTHREAD_RWLOCK_UNLOCK(changing_ctx_mutex);
 		}
 		}
 
 

+ 26 - 10
src/core/workers.c

@@ -95,12 +95,17 @@ static uint32_t _starpu_worker_exists_and_can_execute(struct starpu_task *task,
 						      enum starpu_worker_archtype arch)
 						      enum starpu_worker_archtype arch)
 {
 {
 	int i;
 	int i;
-	int nworkers = starpu_worker_get_count();
-
 	_starpu_codelet_check_deprecated_fields(task->cl);
 	_starpu_codelet_check_deprecated_fields(task->cl);
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
+	struct starpu_worker_collection *workers = sched_ctx->workers;
 
 
-	for (i = 0; i < nworkers; i++)
-	{
+        struct starpu_sched_ctx_iterator it;
+        if(workers->init_iterator)
+                workers->init_iterator(workers, &it);
+
+        while(workers->has_next(workers, &it))
+        {
+                i = workers->get_next(workers, &it);
 		if (starpu_worker_get_type(i) != arch)
 		if (starpu_worker_get_type(i) != arch)
 			continue;
 			continue;
 
 
@@ -141,7 +146,10 @@ static uint32_t _starpu_worker_exists_and_can_execute(struct starpu_task *task,
 			if (!test_implementation)
 			if (!test_implementation)
 				break;
 				break;
 
 
-			if (task->cl->can_execute(i, task, impl))
+			if (task->cl->can_execute)
+				return task->cl->can_execute(i, task, impl);
+
+			if(test_implementation)
 				return 1;
 				return 1;
 		}
 		}
 	}
 	}
@@ -155,11 +163,18 @@ uint32_t _starpu_worker_exists(struct starpu_task *task)
 {
 {
 	_starpu_codelet_check_deprecated_fields(task->cl);
 	_starpu_codelet_check_deprecated_fields(task->cl);
 
 
-	if (!(task->cl->where & config.worker_mask))
-		return 0;
-
-	if (!task->cl->can_execute)
-		return 1;
+	/* if the task belongs to the init context we can
+	   check out all the worker mask of the machine
+	   if not we should iterate on the workers of the ctx
+	   and verify if it exists a worker able to exec the task */
+	if(task->sched_ctx == 0)
+	{
+		if (!(task->cl->where & config.worker_mask))
+			return 0;
+		
+		if (!task->cl->can_execute)
+			return 1;
+	}
 
 
 #if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
 	if ((task->cl->where & STARPU_CPU) &&
 	if ((task->cl->where & STARPU_CPU) &&
@@ -186,6 +201,7 @@ uint32_t _starpu_worker_exists(struct starpu_task *task)
 	    _starpu_worker_exists_and_can_execute(task, STARPU_SCC_WORKER))
 	    _starpu_worker_exists_and_can_execute(task, STARPU_SCC_WORKER))
 		return 1;
 		return 1;
 #endif
 #endif
+
 	return 0;
 	return 0;
 }
 }
 
 

+ 2 - 2
src/datawizard/malloc.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009-2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2009-2010, 2012-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -217,7 +217,7 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 end:
 end:
 	if (ret == 0)
 	if (ret == 0)
 	{
 	{
-		STARPU_ASSERT(*A);
+		STARPU_ASSERT_MSG(*A, "Failed to allocated memory of size %ld b\n", dim);
 	}
 	}
 
 
 	return ret;
 	return ret;

+ 64 - 0
src/debug/traces/starpu_fxt.c

@@ -275,6 +275,28 @@ static void worker_set_state(double time, const char *prefix, long unsigned int
 #endif
 #endif
 }
 }
 
 
+static void worker_push_state(double time, const char *prefix, long unsigned int workerid, const char *name)
+{
+#ifdef STARPU_HAVE_POTI
+	char container[STARPU_POTI_STR_LEN];
+	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
+	poti_PushState(time, container, "S", name);
+#else
+	fprintf(out_paje_file, "11	%.9f	%st%lu	S	%s\n", time, prefix, workerid, name);
+#endif
+}
+
+static void worker_pop_state(double time, const char *prefix, long unsigned int workerid)
+{
+#ifdef STARPU_HAVE_POTI
+	char container[STARPU_POTI_STR_LEN];
+	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
+	poti_PopState(time, container, "S");
+#else
+	fprintf(out_paje_file, "12	%.9f	%st%lu	S\n", time, prefix, workerid);
+#endif
+}
+
 static void mpicommthread_set_state(double time, const char *prefix, const char *name)
 static void mpicommthread_set_state(double time, const char *prefix, const char *name)
 {
 {
 #ifdef STARPU_HAVE_POTI
 #ifdef STARPU_HAVE_POTI
@@ -790,6 +812,36 @@ static void handle_start_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "Sc");
 		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "Sc");
 }
 }
 
 
+static void handle_end_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	int worker;
+	worker = find_worker_id(ev->param[0]);
+	if (worker < 0) return;
+
+	if (out_paje_file)
+		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "B");
+}
+
+static void handle_push_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	int worker;
+	worker = find_worker_id(ev->param[0]);
+	if (worker < 0) return;
+
+	if (out_paje_file)
+		worker_push_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "Sc");
+}
+
+static void handle_pop_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	int worker;
+	worker = find_worker_id(ev->param[0]);
+	if (worker < 0) return;
+
+	if (out_paje_file)
+		worker_pop_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0]);
+}
+
 static void handle_start_sleep(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 static void handle_start_sleep(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 {
 	int worker;
 	int worker;
@@ -1506,6 +1558,18 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				handle_start_scheduling(&ev, options);
 				handle_start_scheduling(&ev, options);
 				break;
 				break;
 
 
+			case _STARPU_FUT_WORKER_SCHEDULING_END:
+				handle_end_scheduling(&ev, options);
+				break;
+
+			case _STARPU_FUT_WORKER_SCHEDULING_PUSH:
+				handle_push_scheduling(&ev, options);
+				break;
+
+			case _STARPU_FUT_WORKER_SCHEDULING_POP:
+				handle_pop_scheduling(&ev, options);
+				break;
+
 			case _STARPU_FUT_WORKER_SLEEP_START:
 			case _STARPU_FUT_WORKER_SLEEP_START:
 				handle_start_sleep(&ev, options);
 				handle_start_sleep(&ev, options);
 				break;
 				break;

+ 2 - 2
src/debug/traces/starpu_paje.c

@@ -87,14 +87,14 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	fprintf(file, "%%EndEventDef\n");
 	fprintf(file, "%%EndEventDef\n");
 	fprintf(file, "%%EventDef	PajePushState	11\n");
 	fprintf(file, "%%EventDef	PajePushState	11\n");
 	fprintf(file, "%%	Time	date\n");
 	fprintf(file, "%%	Time	date\n");
-	fprintf(file, "%%	Type	string\n");
 	fprintf(file, "%%	Container	string\n");
 	fprintf(file, "%%	Container	string\n");
+	fprintf(file, "%%	Type	string\n");
 	fprintf(file, "%%	Value	string\n");
 	fprintf(file, "%%	Value	string\n");
 	fprintf(file, "%%EndEventDef\n");
 	fprintf(file, "%%EndEventDef\n");
 	fprintf(file, "%%EventDef	PajePopState	12\n");
 	fprintf(file, "%%EventDef	PajePopState	12\n");
 	fprintf(file, "%%	Time	date\n");
 	fprintf(file, "%%	Time	date\n");
-	fprintf(file, "%%	Type	string\n");
 	fprintf(file, "%%	Container	string\n");
 	fprintf(file, "%%	Container	string\n");
+	fprintf(file, "%%	Type	string\n");
 	fprintf(file, "%%EndEventDef\n");
 	fprintf(file, "%%EndEventDef\n");
 	fprintf(file, "%%EventDef	PajeSetVariable	13\n");
 	fprintf(file, "%%EventDef	PajeSetVariable	13\n");
 	fprintf(file, "%%	Time	date\n");
 	fprintf(file, "%%	Time	date\n");

+ 10 - 0
src/drivers/driver_common/driver_common.c

@@ -220,7 +220,15 @@ static void _starpu_worker_set_status_scheduling(int workerid)
 		_STARPU_TRACE_WORKER_SCHEDULING_START;
 		_STARPU_TRACE_WORKER_SCHEDULING_START;
 		_starpu_worker_set_status(workerid, STATUS_SCHEDULING);
 		_starpu_worker_set_status(workerid, STATUS_SCHEDULING);
 	}
 	}
+}
 
 
+static void _starpu_worker_set_status_scheduling_done(int workerid)
+{
+	if (_starpu_worker_get_status(workerid) == STATUS_SCHEDULING)
+	{
+		_STARPU_TRACE_WORKER_SCHEDULING_END;
+		_starpu_worker_set_status(workerid, STATUS_UNKNOWN);
+	}
 }
 }
 
 
 static void _starpu_worker_set_status_sleeping(int workerid)
 static void _starpu_worker_set_status_sleeping(int workerid)
@@ -355,6 +363,8 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
 
 
+	_starpu_worker_set_status_scheduling_done(workerid);
+
 	_starpu_worker_set_status_wakeup(workerid);
 	_starpu_worker_set_status_wakeup(workerid);
 	args->spinning_backoff = BACKOFF_MIN;
 	args->spinning_backoff = BACKOFF_MIN;
 
 

+ 0 - 4
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -547,10 +547,6 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 	{
 	{
 		worker = workers->get_next(workers, &it);
 		worker = workers->get_next(workers, &it);
 
 
-		if (worker >= nworkers)
-			/* This is a just-added worker, discard it */
-			continue;
-
 		struct _starpu_fifo_taskq *fifo = dt->queue_array[worker];
 		struct _starpu_fifo_taskq *fifo = dt->queue_array[worker];
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 		unsigned memory_node = starpu_worker_get_memory_node(worker);

+ 10 - 0
tests/main/starpu_worker_exists.c

@@ -67,21 +67,31 @@ main(int argc, char **argv)
 	task = starpu_task_create();
 	task = starpu_task_create();
 	task->cl = &cl;
 	task->cl = &cl;
 	task->destroy = 0;
 	task->destroy = 0;
+	task->sched_ctx = 0;
 
 
 	cl.can_execute = NULL;
 	cl.can_execute = NULL;
 	ret = _starpu_worker_exists(task);
 	ret = _starpu_worker_exists(task);
 	if (!ret)
 	if (!ret)
+	{
+		FPRINTF(stderr, "failure with can_execute=NULL\n");
 		return EXIT_FAILURE;
 		return EXIT_FAILURE;
+	}
 
 
 	cl.can_execute = can_always_execute;
 	cl.can_execute = can_always_execute;
 	ret = _starpu_worker_exists(task);
 	ret = _starpu_worker_exists(task);
 	if (!ret)
 	if (!ret)
+	{
+		FPRINTF(stderr, "failure with can_always_execute\n");
 		return EXIT_FAILURE;
 		return EXIT_FAILURE;
+	}
 
 
 	cl.can_execute = can_never_execute;
 	cl.can_execute = can_never_execute;
 	ret = _starpu_worker_exists(task);
 	ret = _starpu_worker_exists(task);
 	if (ret)
 	if (ret)
+	{
+		FPRINTF(stderr, "failure with can_never_execute\n");
 		return EXIT_FAILURE;
 		return EXIT_FAILURE;
+	}
 
 
 	starpu_task_destroy(task);
 	starpu_task_destroy(task);
 	starpu_shutdown();
 	starpu_shutdown();

+ 1 - 0
tests/perfmodels/regression_based.c

@@ -80,6 +80,7 @@ static struct starpu_codelet nl_memset_cl =
 {
 {
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {memset_cuda, NULL},
 	.cuda_funcs = {memset_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {memset_opencl, NULL},
 	.opencl_funcs = {memset_opencl, NULL},