11 년 전 · 721ce5f920
--- a/ChangeLog
+++ b/ChangeLog
@@ -17,6 +17,12 @@
 
				 StarPU 1.2.0 (svn revision xxxx)
			
 
				 ==============================================
			
 
				 
			
 
				+Small features:
			
 
				+  * New function starpu_sched_ctx_display_workers() to display worker
			
 
				+    information belonging to a given scheduler context
			
 
				+  * The option --enable-verbose can be called with
			
 
				+    --enable-verbose=extra to increase the verbosity
			
 
				+
			
 
				 StarPU 1.1.2 (svn revision xxxx)
			
 
				 ==============================================
			
 
				 The scheduling context release
			
--- a/configure.ac
+++ b/configure.ac
@@ -1393,12 +1393,16 @@ fi
 
				 
			
 
				 AC_MSG_CHECKING(whether debug messages should be displayed)
			
 
				 AC_ARG_ENABLE(verbose, [AS_HELP_STRING([--enable-verbose],
			
 
				-			[display verbose debug messages])],
			
 
				+			[display verbose debug messages (--enable-verbose=extra increase the verbosity)])],
			
 
				 			enable_verbose=$enableval, enable_verbose=no)
			
 
				 AC_MSG_RESULT($enable_verbose)
			
 
				 if test x$enable_verbose = xyes; then
			
 
				 	AC_DEFINE(STARPU_VERBOSE, [1], [display verbose debug messages])
			
 
				 fi
			
 
				+if test x$enable_verbose = xextra; then
			
 
				+	AC_DEFINE(STARPU_VERBOSE, [1], [display verbose debug messages])
			
 
				+	AC_DEFINE(STARPU_EXTRA_VERBOSE, [1], [display verbose debug messages])
			
 
				+fi
			
 
				 
			
 
				 AC_MSG_CHECKING(whether coverage testing should be enabled)
			
 
				 AC_ARG_ENABLE(coverage, [AS_HELP_STRING([--enable-coverage],
			
--- a/doc/doxygen/chapters/41configure_options.doxy
+++ b/doc/doxygen/chapters/41configure_options.doxy
@@ -42,7 +42,7 @@ Disable assertion checks, which saves computation time.
 
				 \addindex __configure__--enable-verbose
			
 
				 Increase the verbosity of the debugging messages.  This can be disabled
			
 
				 at runtime by setting the environment variable \ref STARPU_SILENT to
			
 
				-any value.
			
 
				+any value. <c>--enable-verbose=extra</c> increase even more the verbosity.
			
 
				 
			
 
				 \verbatim
			
 
				 $ STARPU_SILENT=1 ./vector_scal
			
--- a/doc/doxygen/chapters/api/scheduling_contexts.doxy
+++ b/doc/doxygen/chapters/api/scheduling_contexts.doxy
@@ -108,6 +108,10 @@ This function removes the workers in \p workerids_ctx from the context
 
				 \p sched_ctx_id. The last argument cannot be greater than
			
 
				 STARPU_NMAX_SCHED_CTXS.
			
 
				 
			
 
				+\fn void starpu_sched_ctx_display_workers(unsigned sched_ctx_id, FILE *f)
			
 
				+\ingroup API_Scheduling_Contexts
			
 
				+This function prints on the file \p f the worker names belonging to the context \p sched_ctx_id
			
 
				+
			
 
				 \fn void starpu_sched_ctx_delete(unsigned sched_ctx_id)
			
 
				 \ingroup API_Scheduling_Contexts
			
 
				 Delete scheduling context \p sched_ctx_id and transfer remaining
			
--- a/examples/axpy/axpy.c
+++ b/examples/axpy/axpy.c
@@ -31,7 +31,7 @@
 
				 
			
 
				 #include "axpy.h"
			
 
				 
			
 
				-#define AXPY	SAXPY
			
 
				+#define AXPY	STARPU_SAXPY
			
 
				 #define CUBLASAXPY	cublasSaxpy
			
 
				 
			
 
				 #define N	(16*1024*1024)
			
--- a/examples/cg/cg.h
+++ b/examples/cg/cg.h
@@ -30,11 +30,11 @@
 
				 
			
 
				 #ifdef DOUBLE
			
 
				 #define TYPE	double
			
 
				-#define GEMV	DGEMV
			
 
				-#define DOT	DDOT
			
 
				-#define GEMV	DGEMV
			
 
				-#define AXPY	DAXPY
			
 
				-#define SCAL	DSCAL
			
 
				+#define GEMV	STARPU_DGEMV
			
 
				+#define DOT	STARPU_DDOT
			
 
				+#define GEMV	STARPU_DGEMV
			
 
				+#define AXPY	STARPU_DAXPY
			
 
				+#define SCAL	STARPU_DSCAL
			
 
				 #define cublasdot	cublasDdot
			
 
				 #define cublasscal	cublasDscal
			
 
				 #define cublasaxpy	cublasDaxpy
			
@@ -42,11 +42,11 @@
 
				 #define cublasscal	cublasDscal
			
 
				 #else
			
 
				 #define TYPE	float
			
 
				-#define GEMV	SGEMV
			
 
				-#define DOT	SDOT
			
 
				-#define GEMV	SGEMV
			
 
				-#define AXPY	SAXPY
			
 
				-#define SCAL	SSCAL
			
 
				+#define GEMV	STARPU_SGEMV
			
 
				+#define DOT	STARPU_SDOT
			
 
				+#define GEMV	STARPU_SGEMV
			
 
				+#define AXPY	STARPU_SAXPY
			
 
				+#define SCAL	STARPU_SSCAL
			
 
				 #define cublasdot	cublasSdot
			
 
				 #define cublasscal	cublasSscal
			
 
				 #define cublasaxpy	cublasSaxpy
			
--- a/examples/cholesky/cholesky_grain_tag.c
+++ b/examples/cholesky/cholesky_grain_tag.c
@@ -392,7 +392,7 @@ int main(int argc, char **argv)
 
				 	float *test_mat = malloc(size*size*sizeof(float));
			
 
				 	STARPU_ASSERT(test_mat);
			
 
				 
			
 
				-	SSYRK("L", "N", size, size, 1.0f,
			
 
				+	STARPU_SSYRK("L", "N", size, size, 1.0f,
			
 
				 				mat, size, 0.0f, test_mat, size);
			
 
				 
			
 
				 	FPRINTF(stderr, "comparing results ...\n");
			
--- a/examples/cholesky/cholesky_implicit.c
+++ b/examples/cholesky/cholesky_implicit.c
@@ -243,7 +243,7 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 
				 		float *test_mat = malloc(size*size*sizeof(float));
			
 
				 		STARPU_ASSERT(test_mat);
			
 
				 
			
 
				-		SSYRK("L", "N", size, size, 1.0f,
			
 
				+		STARPU_SSYRK("L", "N", size, size, 1.0f,
			
 
				 					mat, size, 0.0f, test_mat, size);
			
 
				 
			
 
				 		FPRINTF(stderr, "comparing results ...\n");
			
--- a/examples/cholesky/cholesky_kernels.c
+++ b/examples/cholesky/cholesky_kernels.c
@@ -49,7 +49,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 
				 		if (worker_size == 1)
			
 
				 		{
			
 
				 			/* Sequential CPU kernel */
			
 
				-			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
			
 
				+			STARPU_SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
			
 
				 				right, ld12, 1.0f, center, ld22);
			
 
				 		}
			
 
				 		else
			
@@ -63,7 +63,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 
				 			float *new_left = &left[block_size*rank];
			
 
				 			float *new_center = &center[block_size*rank];
			
 
				 
			
 
				-			SGEMM("N", "T", dy, new_dx, dz, -1.0f, new_left, ld21, 
			
 
				+			STARPU_SGEMM("N", "T", dy, new_dx, dz, -1.0f, new_left, ld21, 
			
 
				 				right, ld12, 1.0f, new_center, ld22);
			
 
				 		}
			
 
				 	}
			
@@ -113,7 +113,7 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 
				 	switch (s)
			
 
				 	{
			
 
				 		case 0:
			
 
				-			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
			
 
				+			STARPU_STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
			
 
				 			break;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		case 1:
			
@@ -172,9 +172,9 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 
				 
			
 
				 				STARPU_ASSERT(lambda11 != 0.0f);
			
 
				 		
			
 
				-				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
			
 
				+				STARPU_SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
			
 
				 		
			
 
				-				SSYR("L", nx - z - 1, -1.0f, 
			
 
				+				STARPU_SSYR("L", nx - z - 1, -1.0f, 
			
 
				 							&sub11[(z+1)+z*ld], 1,
			
 
				 							&sub11[(z+1)+(z+1)*ld], ld);
			
 
				 			}
			
@@ -191,7 +191,6 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 
				 				fprintf(stderr, "Error in Magma: %d\n", ret);
			
 
				 				STARPU_ABORT();
			
 
				 			}
			
 
				-			STARPU_ASSERT(!cures);
			
 
				 			}
			
 
				 #else
			
 
				 			{
			
--- a/examples/cholesky/cholesky_tag.c
+++ b/examples/cholesky/cholesky_tag.c
@@ -368,7 +368,7 @@ int main(int argc, char **argv)
 
				 	float *test_mat = malloc(size*size*sizeof(float));
			
 
				 	STARPU_ASSERT(test_mat);
			
 
				 
			
 
				-	SSYRK("L", "N", size, size, 1.0f,
			
 
				+	STARPU_SSYRK("L", "N", size, size, 1.0f,
			
 
				 				mat, size, 0.0f, test_mat, size);
			
 
				 
			
 
				 	FPRINTF(stderr, "comparing results ...\n");
			
--- a/examples/common/blas.c
+++ b/examples/common/blas.c
@@ -23,13 +23,13 @@
 
				 
			
 
				 /*
			
 
				     This files contains BLAS wrappers for the different BLAS implementations
			
 
				-  (eg. REFBLAS, STARPU_ATLAS, GOTOBLAS ...). We assume a Fortran orientation as most
			
 
				+  (eg. REFBLAS, ATLAS, GOTOBLAS ...). We assume a Fortran orientation as most
			
 
				   libraries do not supply C-based ordering.
			
 
				  */
			
 
				 
			
 
				 #ifdef STARPU_ATLAS
			
 
				 
			
 
				-inline void SGEMM(char *transa, char *transb, int M, int N, int K, 
			
 
				+inline void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, 
			
 
				 			float alpha, const float *A, int lda, const float *B, int ldb, 
			
 
				 			float beta, float *C, int ldc)
			
 
				 {
			
@@ -40,7 +40,7 @@ inline void SGEMM(char *transa, char *transb, int M, int N, int K,
 
				 			M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);				
			
 
				 }
			
 
				 
			
 
				-inline void DGEMM(char *transa, char *transb, int M, int N, int K, 
			
 
				+inline void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
			
 
				 			double alpha, double *A, int lda, double *B, int ldb, 
			
 
				 			double beta, double *C, int ldc)
			
 
				 {
			
@@ -51,7 +51,7 @@ inline void DGEMM(char *transa, char *transb, int M, int N, int K,
 
				 			M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);				
			
 
				 }
			
 
				 
			
 
				-inline void SGEMV(char *transa, int M, int N, float alpha, float *A, int lda, float *X, int incX, float beta, float *Y, int incY)
			
 
				+inline void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda, float *X, int incX, float beta, float *Y, int incY)
			
 
				 {
			
 
				 	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
			
 
				 
			
@@ -59,7 +59,7 @@ inline void SGEMV(char *transa, int M, int N, float alpha, float *A, int lda, fl
 
				 					X, incX, beta, Y, incY);
			
 
				 }
			
 
				 
			
 
				-inline void DGEMV(char *transa, int M, int N, double alpha, double *A, int lda, double *X, int incX, double beta, double *Y, int incY)
			
 
				+inline void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda, double *X, int incX, double beta, double *Y, int incY)
			
 
				 {
			
 
				 	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
			
 
				 
			
@@ -67,27 +67,27 @@ inline void DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
 
				 					X, incX, beta, Y, incY);
			
 
				 }
			
 
				 
			
 
				-inline float SASUM(int N, float *X, int incX)
			
 
				+inline float STARPU_SASUM(int N, float *X, int incX)
			
 
				 {
			
 
				 	return cblas_sasum(N, X, incX);
			
 
				 }
			
 
				 
			
 
				-inline double DASUM(int N, double *X, int incX)
			
 
				+inline double STARPU_DASUM(int N, double *X, int incX)
			
 
				 {
			
 
				 	return cblas_dasum(N, X, incX);
			
 
				 }
			
 
				 
			
 
				-void SSCAL(int N, float alpha, float *X, int incX)
			
 
				+void STARPU_SSCAL(int N, float alpha, float *X, int incX)
			
 
				 {
			
 
				 	cblas_sscal(N, alpha, X, incX);
			
 
				 }
			
 
				 
			
 
				-void DSCAL(int N, double alpha, double *X, int incX)
			
 
				+void STARPU_DSCAL(int N, double alpha, double *X, int incX)
			
 
				 {
			
 
				 	cblas_dscal(N, alpha, X, incX);
			
 
				 }
			
 
				 
			
 
				-void STRSM (const char *side, const char *uplo, const char *transa,
			
 
				+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
			
 
				                    const char *diag, const int m, const int n,
			
 
				                    const float alpha, const float *A, const int lda,
			
 
				                    float *B, const int ldb)
			
@@ -100,7 +100,7 @@ void STRSM (const char *side, const char *uplo, const char *transa,
 
				 	cblas_strsm(CblasColMajor, side_, uplo_, transa_, diag_, m, n, alpha, A, lda, B, ldb);
			
 
				 }
			
 
				 
			
 
				-void DTRSM (const char *side, const char *uplo, const char *transa,
			
 
				+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
			
 
				                    const char *diag, const int m, const int n,
			
 
				                    const double alpha, const double *A, const int lda,
			
 
				                    double *B, const int ldb)
			
@@ -113,7 +113,7 @@ void DTRSM (const char *side, const char *uplo, const char *transa,
 
				 	cblas_dtrsm(CblasColMajor, side_, uplo_, transa_, diag_, m, n, alpha, A, lda, B, ldb);
			
 
				 }
			
 
				 
			
 
				-void SSYR (const char *uplo, const int n, const float alpha,
			
 
				+void STARPU_SSYR (const char *uplo, const int n, const float alpha,
			
 
				                   const float *x, const int incx, float *A, const int lda)
			
 
				 {
			
 
				 	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
			
@@ -121,7 +121,7 @@ void SSYR (const char *uplo, const int n, const float alpha,
 
				 	cblas_ssyr(CblasColMajor, uplo_, n, alpha, x, incx, A, lda); 
			
 
				 }
			
 
				 
			
 
				-void SSYRK (const char *uplo, const char *trans, const int n,
			
 
				+void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
			
 
				                    const int k, const float alpha, const float *A,
			
 
				                    const int lda, const float beta, float *C,
			
 
				                    const int ldc)
			
@@ -132,21 +132,21 @@ void SSYRK (const char *uplo, const char *trans, const int n,
 
				 	cblas_ssyrk(CblasColMajor, uplo_, trans_, n, k, alpha, A, lda, beta, C, ldc); 
			
 
				 }
			
 
				 
			
 
				-void SGER(const int m, const int n, const float alpha,
			
 
				+void STARPU_SGER(const int m, const int n, const float alpha,
			
 
				                   const float *x, const int incx, const float *y,
			
 
				                   const int incy, float *A, const int lda)
			
 
				 {
			
 
				 	cblas_sger(CblasColMajor, m, n, alpha, x, incx, y, incy, A, lda);
			
 
				 }
			
 
				 
			
 
				-void DGER(const int m, const int n, const double alpha,
			
 
				+void STARPU_DGER(const int m, const int n, const double alpha,
			
 
				                   const double *x, const int incx, const double *y,
			
 
				                   const int incy, double *A, const int lda)
			
 
				 {
			
 
				 	cblas_dger(CblasColMajor, m, n, alpha, x, incx, y, incy, A, lda);
			
 
				 }
			
 
				 
			
 
				-void STRSV (const char *uplo, const char *trans, const char *diag, 
			
 
				+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
			
 
				                    const int n, const float *A, const int lda, float *x, 
			
 
				                    const int incx)
			
 
				 {
			
@@ -157,7 +157,7 @@ void STRSV (const char *uplo, const char *trans, const char *diag,
 
				 	cblas_strsv(CblasColMajor, uplo_, trans_, diag_, n, A, lda, x, incx);
			
 
				 }
			
 
				 
			
 
				-void STRMM(const char *side, const char *uplo, const char *transA,
			
 
				+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
			
 
				                  const char *diag, const int m, const int n,
			
 
				                  const float alpha, const float *A, const int lda,
			
 
				                  float *B, const int ldb)
			
@@ -170,7 +170,7 @@ void STRMM(const char *side, const char *uplo, const char *transA,
 
				 	cblas_strmm(CblasColMajor, side_, uplo_, transA_, diag_, m, n, alpha, A, lda, B, ldb);
			
 
				 }
			
 
				 
			
 
				-void DTRMM(const char *side, const char *uplo, const char *transA,
			
 
				+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
			
 
				                  const char *diag, const int m, const int n,
			
 
				                  const double alpha, const double *A, const int lda,
			
 
				                  double *B, const int ldb)
			
@@ -183,7 +183,7 @@ void DTRMM(const char *side, const char *uplo, const char *transA,
 
				 	cblas_dtrmm(CblasColMajor, side_, uplo_, transA_, diag_, m, n, alpha, A, lda, B, ldb);
			
 
				 }
			
 
				 
			
 
				-void STRMV(const char *uplo, const char *transA, const char *diag,
			
 
				+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
			
 
				                  const int n, const float *A, const int lda, float *X,
			
 
				                  const int incX)
			
 
				 {
			
@@ -194,53 +194,53 @@ void STRMV(const char *uplo, const char *transA, const char *diag,
 
				 	cblas_strmv(CblasColMajor, uplo_, transA_, diag_, n, A, lda, X, incX);
			
 
				 }
			
 
				 
			
 
				-void SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
			
 
				+void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
			
 
				 {
			
 
				 	cblas_saxpy(n, alpha, X, incX, Y, incY);
			
 
				 }
			
 
				 
			
 
				-void DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
			
 
				+void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
			
 
				 {
			
 
				 	cblas_daxpy(n, alpha, X, incX, Y, incY);
			
 
				 }
			
 
				 
			
 
				-int ISAMAX (const int n, float *X, const int incX)
			
 
				+int STARPU_ISAMAX (const int n, float *X, const int incX)
			
 
				 {
			
 
				     int retVal;
			
 
				     retVal = cblas_isamax(n, X, incX);
			
 
				     return retVal;
			
 
				 }
			
 
				 
			
 
				-int IDAMAX (const int n, double *X, const int incX)
			
 
				+int STARPU_IDAMAX (const int n, double *X, const int incX)
			
 
				 {
			
 
				     int retVal;
			
 
				     retVal = cblas_idamax(n, X, incX);
			
 
				     return retVal;
			
 
				 }
			
 
				 
			
 
				-float SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
			
 
				+float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
			
 
				 {
			
 
				 	return cblas_sdot(n, x, incx, y, incy);
			
 
				 }
			
 
				 
			
 
				-double DDOT(const int n, const double *x, const int incx, const double *y, const int incy)
			
 
				+double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy)
			
 
				 {
			
 
				 	return cblas_ddot(n, x, incx, y, incy);
			
 
				 }
			
 
				 
			
 
				-void SSWAP(const int n, float *x, const int incx, float *y, const int incy)
			
 
				+void STARPU_SSWAP(const int n, float *x, const int incx, float *y, const int incy)
			
 
				 {
			
 
				 	cblas_sswap(n, x, incx, y, incy);
			
 
				 }
			
 
				 
			
 
				-void DSWAP(const int n, double *x, const int incx, double *y, const int incy)
			
 
				+void STARPU_DSWAP(const int n, double *x, const int incx, double *y, const int incy)
			
 
				 {
			
 
				 	cblas_dswap(n, x, incx, y, incy);
			
 
				 }
			
 
				 
			
 
				 #elif defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS) || defined(STARPU_MKL)
			
 
				 
			
 
				-inline void SGEMM(char *transa, char *transb, int M, int N, int K, 
			
 
				+inline void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, 
			
 
				 			float alpha, const float *A, int lda, const float *B, int ldb, 
			
 
				 			float beta, float *C, int ldc)
			
 
				 {
			
@@ -249,7 +249,7 @@ inline void SGEMM(char *transa, char *transb, int M, int N, int K,
 
				 			 &beta, C, &ldc);	
			
 
				 }
			
 
				 
			
 
				-inline void DGEMM(char *transa, char *transb, int M, int N, int K, 
			
 
				+inline void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
			
 
				 			double alpha, double *A, int lda, double *B, int ldb, 
			
 
				 			double beta, double *C, int ldc)
			
 
				 {
			
@@ -259,39 +259,39 @@ inline void DGEMM(char *transa, char *transb, int M, int N, int K,
 
				 }
			
 
				 
			
 
				 
			
 
				-inline void SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
			
 
				+inline void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
			
 
				 		float *X, int incX, float beta, float *Y, int incY)
			
 
				 {
			
 
				 	sgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
			
 
				 }
			
 
				 
			
 
				-inline void DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
			
 
				+inline void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
			
 
				 		double *X, int incX, double beta, double *Y, int incY)
			
 
				 {
			
 
				 	dgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
			
 
				 }
			
 
				 
			
 
				-inline float SASUM(int N, float *X, int incX)
			
 
				+inline float STARPU_SASUM(int N, float *X, int incX)
			
 
				 {
			
 
				 	return sasum_(&N, X, &incX);
			
 
				 }
			
 
				 
			
 
				-inline double DASUM(int N, double *X, int incX)
			
 
				+inline double STARPU_DASUM(int N, double *X, int incX)
			
 
				 {
			
 
				 	return dasum_(&N, X, &incX);
			
 
				 }
			
 
				 
			
 
				-void SSCAL(int N, float alpha, float *X, int incX)
			
 
				+void STARPU_SSCAL(int N, float alpha, float *X, int incX)
			
 
				 {
			
 
				 	sscal_(&N, &alpha, X, &incX);
			
 
				 }
			
 
				 
			
 
				-void DSCAL(int N, double alpha, double *X, int incX)
			
 
				+void STARPU_DSCAL(int N, double alpha, double *X, int incX)
			
 
				 {
			
 
				 	dscal_(&N, &alpha, X, &incX);
			
 
				 }
			
 
				 
			
 
				-void STRSM (const char *side, const char *uplo, const char *transa,
			
 
				+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
			
 
				                    const char *diag, const int m, const int n,
			
 
				                    const float alpha, const float *A, const int lda,
			
 
				                    float *B, const int ldb)
			
@@ -299,7 +299,7 @@ void STRSM (const char *side, const char *uplo, const char *transa,
 
				 	strsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
			
 
				 }
			
 
				 
			
 
				-void DTRSM (const char *side, const char *uplo, const char *transa,
			
 
				+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
			
 
				                    const char *diag, const int m, const int n,
			
 
				                    const double alpha, const double *A, const int lda,
			
 
				                    double *B, const int ldb)
			
@@ -307,13 +307,13 @@ void DTRSM (const char *side, const char *uplo, const char *transa,
 
				 	dtrsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
			
 
				 }
			
 
				 
			
 
				-void SSYR (const char *uplo, const int n, const float alpha,
			
 
				+void STARPU_SSYR (const char *uplo, const int n, const float alpha,
			
 
				                   const float *x, const int incx, float *A, const int lda)
			
 
				 {
			
 
				 	ssyr_(uplo, &n, &alpha, x, &incx, A, &lda); 
			
 
				 }
			
 
				 
			
 
				-void SSYRK (const char *uplo, const char *trans, const int n,
			
 
				+void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
			
 
				                    const int k, const float alpha, const float *A,
			
 
				                    const int lda, const float beta, float *C,
			
 
				                    const int ldc)
			
@@ -321,28 +321,28 @@ void SSYRK (const char *uplo, const char *trans, const int n,
 
				 	ssyrk_(uplo, trans, &n, &k, &alpha, A, &lda, &beta, C, &ldc); 
			
 
				 }
			
 
				 
			
 
				-void SGER(const int m, const int n, const float alpha,
			
 
				+void STARPU_SGER(const int m, const int n, const float alpha,
			
 
				                   const float *x, const int incx, const float *y,
			
 
				                   const int incy, float *A, const int lda)
			
 
				 {
			
 
				 	sger_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
			
 
				 }
			
 
				 
			
 
				-void DGER(const int m, const int n, const double alpha,
			
 
				+void STARPU_DGER(const int m, const int n, const double alpha,
			
 
				                   const double *x, const int incx, const double *y,
			
 
				                   const int incy, double *A, const int lda)
			
 
				 {
			
 
				 	dger_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
			
 
				 }
			
 
				 
			
 
				-void STRSV (const char *uplo, const char *trans, const char *diag, 
			
 
				+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
			
 
				                    const int n, const float *A, const int lda, float *x, 
			
 
				                    const int incx)
			
 
				 {
			
 
				 	strsv_(uplo, trans, diag, &n, A, &lda, x, &incx);
			
 
				 }
			
 
				 
			
 
				-void STRMM(const char *side, const char *uplo, const char *transA,
			
 
				+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
			
 
				                  const char *diag, const int m, const int n,
			
 
				                  const float alpha, const float *A, const int lda,
			
 
				                  float *B, const int ldb)
			
@@ -350,7 +350,7 @@ void STRMM(const char *side, const char *uplo, const char *transA,
 
				 	strmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
			
 
				 }
			
 
				 
			
 
				-void DTRMM(const char *side, const char *uplo, const char *transA,
			
 
				+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
			
 
				                  const char *diag, const int m, const int n,
			
 
				                  const double alpha, const double *A, const int lda,
			
 
				                  double *B, const int ldb)
			
@@ -358,38 +358,38 @@ void DTRMM(const char *side, const char *uplo, const char *transA,
 
				 	dtrmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
			
 
				 }
			
 
				 
			
 
				-void STRMV(const char *uplo, const char *transA, const char *diag,
			
 
				+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
			
 
				                  const int n, const float *A, const int lda, float *X,
			
 
				                  const int incX)
			
 
				 {
			
 
				 	strmv_(uplo, transA, diag, &n, A, &lda, X, &incX);
			
 
				 }
			
 
				 
			
 
				-void SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
			
 
				+void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
			
 
				 {
			
 
				 	saxpy_(&n, &alpha, X, &incX, Y, &incY);
			
 
				 }
			
 
				 
			
 
				-void DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
			
 
				+void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
			
 
				 {
			
 
				 	daxpy_(&n, &alpha, X, &incX, Y, &incY);
			
 
				 }
			
 
				 
			
 
				-int ISAMAX (const int n, float *X, const int incX)
			
 
				+int STARPU_ISAMAX (const int n, float *X, const int incX)
			
 
				 {
			
 
				     int retVal;
			
 
				     retVal = isamax_ (&n, X, &incX);
			
 
				     return retVal;
			
 
				 }
			
 
				 
			
 
				-int IDAMAX (const int n, double *X, const int incX)
			
 
				+int STARPU_IDAMAX (const int n, double *X, const int incX)
			
 
				 {
			
 
				     int retVal;
			
 
				     retVal = idamax_ (&n, X, &incX);
			
 
				     return retVal;
			
 
				 }
			
 
				 
			
 
				-float SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
			
 
				+float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
			
 
				 {
			
 
				 	float retVal = 0;
			
 
				 
			
@@ -399,104 +399,104 @@ float SDOT(const int n, const float *x, const int incx, const float *y, const in
 
				 	return retVal;
			
 
				 }
			
 
				 
			
 
				-double DDOT(const int n, const double *x, const int incx, const double *y, const int incy)
			
 
				+double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy)
			
 
				 {
			
 
				 	return ddot_(&n, x, &incx, y, &incy);
			
 
				 }
			
 
				 
			
 
				-void SSWAP(const int n, float *X, const int incX, float *Y, const int incY)
			
 
				+void STARPU_SSWAP(const int n, float *X, const int incX, float *Y, const int incY)
			
 
				 {
			
 
				 	sswap_(&n, X, &incX, Y, &incY);
			
 
				 }
			
 
				 
			
 
				-void DSWAP(const int n, double *X, const int incX, double *Y, const int incY)
			
 
				+void STARPU_DSWAP(const int n, double *X, const int incX, double *Y, const int incY)
			
 
				 {
			
 
				 	dswap_(&n, X, &incX, Y, &incY);
			
 
				 }
			
 
				 
			
 
				 
			
 
				 #elif defined(STARPU_SIMGRID)
			
 
				-inline void SGEMM(char *transa, char *transb, int M, int N, int K, 
			
 
				+inline void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, 
			
 
				 			float alpha, const float *A, int lda, const float *B, int ldb, 
			
 
				 			float beta, float *C, int ldc) { }
			
 
				 
			
 
				-inline void DGEMM(char *transa, char *transb, int M, int N, int K, 
			
 
				+inline void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
			
 
				 			double alpha, double *A, int lda, double *B, int ldb, 
			
 
				 			double beta, double *C, int ldc) { }
			
 
				 
			
 
				-inline void SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
			
 
				+inline void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
			
 
				 		float *X, int incX, float beta, float *Y, int incY) { }
			
 
				 
			
 
				-inline void DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
			
 
				+inline void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
			
 
				 		double *X, int incX, double beta, double *Y, int incY) { }
			
 
				 
			
 
				-inline float SASUM(int N, float *X, int incX) { }
			
 
				+inline float STARPU_SASUM(int N, float *X, int incX) { }
			
 
				 
			
 
				-inline double DASUM(int N, double *X, int incX) { }
			
 
				+inline double STARPU_DASUM(int N, double *X, int incX) { }
			
 
				 
			
 
				-void SSCAL(int N, float alpha, float *X, int incX) { }
			
 
				+void STARPU_SSCAL(int N, float alpha, float *X, int incX) { }
			
 
				 
			
 
				-void DSCAL(int N, double alpha, double *X, int incX) { }
			
 
				+void STARPU_DSCAL(int N, double alpha, double *X, int incX) { }
			
 
				 
			
 
				-void STRSM (const char *side, const char *uplo, const char *transa,
			
 
				+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
			
 
				                    const char *diag, const int m, const int n,
			
 
				                    const float alpha, const float *A, const int lda,
			
 
				                    float *B, const int ldb) { }
			
 
				 
			
 
				-void DTRSM (const char *side, const char *uplo, const char *transa,
			
 
				+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
			
 
				                    const char *diag, const int m, const int n,
			
 
				                    const double alpha, const double *A, const int lda,
			
 
				                    double *B, const int ldb) { }
			
 
				 
			
 
				-void SSYR (const char *uplo, const int n, const float alpha,
			
 
				+void STARPU_SSYR (const char *uplo, const int n, const float alpha,
			
 
				                   const float *x, const int incx, float *A, const int lda) { }
			
 
				 
			
 
				-void SSYRK (const char *uplo, const char *trans, const int n,
			
 
				+void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
			
 
				                    const int k, const float alpha, const float *A,
			
 
				                    const int lda, const float beta, float *C,
			
 
				                    const int ldc) { }
			
 
				 
			
 
				-void SGER(const int m, const int n, const float alpha,
			
 
				+void STARPU_SGER(const int m, const int n, const float alpha,
			
 
				                   const float *x, const int incx, const float *y,
			
 
				                   const int incy, float *A, const int lda) { }
			
 
				 
			
 
				-void DGER(const int m, const int n, const double alpha,
			
 
				+void STARPU_DGER(const int m, const int n, const double alpha,
			
 
				                   const double *x, const int incx, const double *y,
			
 
				                   const int incy, double *A, const int lda) { }
			
 
				 
			
 
				-void STRSV (const char *uplo, const char *trans, const char *diag, 
			
 
				+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
			
 
				                    const int n, const float *A, const int lda, float *x, 
			
 
				                    const int incx) { }
			
 
				 
			
 
				-void STRMM(const char *side, const char *uplo, const char *transA,
			
 
				+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
			
 
				                  const char *diag, const int m, const int n,
			
 
				                  const float alpha, const float *A, const int lda,
			
 
				                  float *B, const int ldb) { }
			
 
				 
			
 
				-void DTRMM(const char *side, const char *uplo, const char *transA,
			
 
				+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
			
 
				                  const char *diag, const int m, const int n,
			
 
				                  const double alpha, const double *A, const int lda,
			
 
				                  double *B, const int ldb) { }
			
 
				 
			
 
				-void STRMV(const char *uplo, const char *transA, const char *diag,
			
 
				+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
			
 
				                  const int n, const float *A, const int lda, float *X,
			
 
				                  const int incX) { }
			
 
				 
			
 
				-void SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY) { }
			
 
				+void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY) { }
			
 
				 
			
 
				-void DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY) { }
			
 
				+void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY) { }
			
 
				 
			
 
				-int ISAMAX (const int n, float *X, const int incX) { }
			
 
				+int STARPU_ISAMAX (const int n, float *X, const int incX) { }
			
 
				 
			
 
				-int IDAMAX (const int n, double *X, const int incX) { }
			
 
				+int STARPU_IDAMAX (const int n, double *X, const int incX) { }
			
 
				 
			
 
				-float SDOT(const int n, const float *x, const int incx, const float *y, const int incy) { }
			
 
				+float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy) { }
			
 
				 
			
 
				-double DDOT(const int n, const double *x, const int incx, const double *y, const int incy) { }
			
 
				+double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy) { }
			
 
				 
			
 
				-void SSWAP(const int n, float *X, const int incX, float *Y, const int incY) { }
			
 
				+void STARPU_SSWAP(const int n, float *X, const int incX, float *Y, const int incY) { }
			
 
				 
			
 
				-void DSWAP(const int n, double *X, const int incX, double *Y, const int incY) { }
			
 
				+void STARPU_DSWAP(const int n, double *X, const int incX, double *Y, const int incY) { }
			
 
				 
			
 
				 
			
 
				 #else
			
--- a/examples/common/blas.h
+++ b/examples/common/blas.h
@@ -24,63 +24,63 @@
 
				 #include <cblas.h>
			
 
				 #endif
			
 
				 
			
 
				-void SGEMM(char *transa, char *transb, int M, int N, int K, float alpha, const float *A, int lda, 
			
 
				+void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, float alpha, const float *A, int lda, 
			
 
				 		const float *B, int ldb, float beta, float *C, int ldc);
			
 
				-void DGEMM(char *transa, char *transb, int M, int N, int K, double alpha, double *A, int lda, 
			
 
				+void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, double alpha, double *A, int lda, 
			
 
				 		double *B, int ldb, double beta, double *C, int ldc);
			
 
				-void SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
			
 
				+void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
			
 
				 		float *X, int incX, float beta, float *Y, int incY);
			
 
				-void DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
			
 
				+void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
			
 
				 		double *X, int incX, double beta, double *Y, int incY);
			
 
				-float SASUM(int N, float *X, int incX);
			
 
				-double DASUM(int N, double *X, int incX);
			
 
				-void SSCAL(int N, float alpha, float *X, int incX);
			
 
				-void DSCAL(int N, double alpha, double *X, int incX);
			
 
				-void STRSM (const char *side, const char *uplo, const char *transa,
			
 
				+float STARPU_SASUM(int N, float *X, int incX);
			
 
				+double STARPU_DASUM(int N, double *X, int incX);
			
 
				+void STARPU_SSCAL(int N, float alpha, float *X, int incX);
			
 
				+void STARPU_DSCAL(int N, double alpha, double *X, int incX);
			
 
				+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
			
 
				                    const char *diag, const int m, const int n,
			
 
				                    const float alpha, const float *A, const int lda,
			
 
				                    float *B, const int ldb);
			
 
				-void DTRSM (const char *side, const char *uplo, const char *transa,
			
 
				+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
			
 
				                    const char *diag, const int m, const int n,
			
 
				                    const double alpha, const double *A, const int lda,
			
 
				                    double *B, const int ldb);
			
 
				-void DGEMM(char *transa, char *transb, int M, int N, int K, 
			
 
				+void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
			
 
				 			double alpha, double *A, int lda, double *B, int ldb, 
			
 
				 			double beta, double *C, int ldc);
			
 
				-void SSYR (const char *uplo, const int n, const float alpha,
			
 
				+void STARPU_SSYR (const char *uplo, const int n, const float alpha,
			
 
				                   const float *x, const int incx, float *A, const int lda);
			
 
				-void SSYRK (const char *uplo, const char *trans, const int n,
			
 
				+void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
			
 
				                    const int k, const float alpha, const float *A,
			
 
				                    const int lda, const float beta, float *C,
			
 
				                    const int ldc);
			
 
				-void SGER (const int m, const int n, const float alpha,
			
 
				+void STARPU_SGER (const int m, const int n, const float alpha,
			
 
				                   const float *x, const int incx, const float *y,
			
 
				                   const int incy, float *A, const int lda);
			
 
				-void DGER(const int m, const int n, const double alpha,
			
 
				+void STARPU_DGER(const int m, const int n, const double alpha,
			
 
				                   const double *x, const int incx, const double *y,
			
 
				                   const int incy, double *A, const int lda);
			
 
				-void STRSV (const char *uplo, const char *trans, const char *diag, 
			
 
				+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
			
 
				                    const int n, const float *A, const int lda, float *x, 
			
 
				                    const int incx);
			
 
				-void STRMM(const char *side, const char *uplo, const char *transA,
			
 
				+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
			
 
				                  const char *diag, const int m, const int n,
			
 
				                  const float alpha, const float *A, const int lda,
			
 
				                  float *B, const int ldb);
			
 
				-void DTRMM(const char *side, const char *uplo, const char *transA,
			
 
				+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
			
 
				                  const char *diag, const int m, const int n,
			
 
				                  const double alpha, const double *A, const int lda,
			
 
				                  double *B, const int ldb);
			
 
				-void STRMV(const char *uplo, const char *transA, const char *diag,
			
 
				+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
			
 
				                  const int n, const float *A, const int lda, float *X,
			
 
				                  const int incX);
			
 
				-void SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incy);
			
 
				-void DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY);
			
 
				-int ISAMAX (const int n, float *X, const int incX);
			
 
				-int IDAMAX (const int n, double *X, const int incX);
			
 
				-float SDOT(const int n, const float *x, const int incx, const float *y, const int incy);
			
 
				-double DDOT(const int n, const double *x, const int incx, const double *y, const int incy);
			
 
				-void SSWAP(const int n, float *x, const int incx, float *y, const int incy);
			
 
				-void DSWAP(const int n, double *x, const int incx, double *y, const int incy);
			
 
				+void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incy);
			
 
				+void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY);
			
 
				+int STARPU_ISAMAX (const int n, float *X, const int incX);
			
 
				+int STARPU_IDAMAX (const int n, double *X, const int incX);
			
 
				+float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy);
			
 
				+double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy);
			
 
				+void STARPU_SSWAP(const int n, float *x, const int incx, float *y, const int incy);
			
 
				+void STARPU_DSWAP(const int n, double *x, const int incx, double *y, const int incy);
			
 
				 
			
 
				 #if defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS) || defined(STARPU_MKL)
			
 
				 
			
--- a/examples/heat/dw_factolu.h
+++ b/examples/heat/dw_factolu.h
@@ -141,7 +141,7 @@ static void STARPU_ATTRIBUTE_UNUSED compare_A_LU(float *A, float *LU,
 
				 
			
 
				 
			
 
				         /* now A_err = L, compute L*U */
			
 
				-	STRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
			
 
				+	STARPU_STRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
			
 
				 
			
 
				 	float max_err = 0.0f;
			
 
				 	for (i = 0; i < size ; i++)
			
--- a/examples/heat/dw_factolu_kernels.c
+++ b/examples/heat/dw_factolu_kernels.c
@@ -124,7 +124,7 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU
 
				 	switch (s)
			
 
				 	{
			
 
				 		case 0:
			
 
				-			SGEMM("N", "N",	dy, dx, dz, 
			
 
				+			STARPU_SGEMM("N", "N",	dy, dx, dz, 
			
 
				 				-1.0f, left, ld21, right, ld12,
			
 
				 					     1.0f, center, ld22);
			
 
				 			break;
			
@@ -189,7 +189,7 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, STARPU_ATT
 
				 	switch (s)
			
 
				 	{
			
 
				 		case 0:
			
 
				-			STRSM("L", "L", "N", "N",
			
 
				+			STARPU_STRSM("L", "L", "N", "N",
			
 
				 					 nx12, ny12, 1.0f, sub11, ld11, sub12, ld12);
			
 
				 			break;
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -251,7 +251,7 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, STARPU_ATT
 
				 	switch (s)
			
 
				 	{
			
 
				 		case 0:
			
 
				-			STRSM("R", "U", "N", "U", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
			
 
				+			STARPU_STRSM("R", "U", "N", "U", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
			
 
				 			break;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		case 1:
			
@@ -325,9 +325,9 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATT
 
				 				pivot = sub11[z+z*ld];
			
 
				 				STARPU_ASSERT(pivot != 0.0f);
			
 
				 		
			
 
				-				SSCAL(nx - z - 1, (1.0f/pivot), &sub11[z+(z+1)*ld], ld);
			
 
				+				STARPU_SSCAL(nx - z - 1, (1.0f/pivot), &sub11[z+(z+1)*ld], ld);
			
 
				 		
			
 
				-				SGER(nx - z - 1, nx - z - 1, -1.0f,
			
 
				+				STARPU_SGER(nx - z - 1, nx - z - 1, -1.0f,
			
 
				 						&sub11[z+(z+1)*ld], ld,
			
 
				 						&sub11[(z+1)+z*ld], 1,
			
 
				 						&sub11[(z+1) + (z+1)*ld],ld);
			
--- a/examples/heat/dw_sparse_cg_kernels.c
+++ b/examples/heat/dw_sparse_cg_kernels.c
@@ -126,7 +126,7 @@ void cpu_codelet_func_3(void *descr[], void *arg)
 
				 	vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 	size = (int)STARPU_VECTOR_GET_NX(descr[0]);
			
 
				 
			
 
				-	dot = SDOT(size, vec, 1, vec, 1);
			
 
				+	dot = STARPU_SDOT(size, vec, 1, vec, 1);
			
 
				 
			
 
				 	fprintf(stderr, "func 3 : DOT = %f\n", dot);
			
 
				 
			
@@ -218,7 +218,7 @@ void cpu_codelet_func_5(void *descr[], void *arg)
 
				 	STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
			
 
				 	size = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				 
			
 
				-	dot = SDOT(size, vecd, 1, vecq, 1);
			
 
				+	dot = STARPU_SDOT(size, vecd, 1, vecq, 1);
			
 
				 
			
 
				 	pb->alpha = pb->delta_new / dot;
			
 
				 }
			
@@ -265,7 +265,7 @@ void cpu_codelet_func_6(void *descr[], void *arg)
 
				 
			
 
				 	size = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				 
			
 
				-	SAXPY(size, pb->alpha, vecd, 1, vecx, 1);
			
 
				+	STARPU_SAXPY(size, pb->alpha, vecd, 1, vecx, 1);
			
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -304,7 +304,7 @@ void cpu_codelet_func_7(void *descr[], void *arg)
 
				 
			
 
				 	size = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				 
			
 
				-	SAXPY(size, -pb->alpha, vecq, 1, vecr, 1);
			
 
				+	STARPU_SAXPY(size, -pb->alpha, vecq, 1, vecr, 1);
			
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -344,7 +344,7 @@ void cpu_codelet_func_8(void *descr[], void *arg)
 
				 	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 	size = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				 
			
 
				-	dot = SDOT(size, vecr, 1, vecr, 1);
			
 
				+	dot = STARPU_SDOT(size, vecr, 1, vecr, 1);
			
 
				 
			
 
				 	pb->delta_old = pb->delta_new;
			
 
				 	pb->delta_new = dot;
			
@@ -392,10 +392,10 @@ void cpu_codelet_func_9(void *descr[], void *arg)
 
				 	size = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				 
			
 
				 	/* d = beta d */
			
 
				-	SSCAL(size, pb->beta, vecd, 1);
			
 
				+	STARPU_SSCAL(size, pb->beta, vecd, 1);
			
 
				 
			
 
				 	/* d = r + d */
			
 
				-	SAXPY (size, 1.0f, vecr, 1, vecd, 1);
			
 
				+	STARPU_SAXPY (size, 1.0f, vecr, 1, vecd, 1);
			
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
--- a/examples/heat/heat.c
+++ b/examples/heat/heat.c
@@ -362,10 +362,10 @@ static void solve_system(unsigned size, unsigned subsize, float *result, int *Re
 
				 	}
			
 
				 
			
 
				 		/* L */
			
 
				-		STRSV("L", "N", "N", subsize, A, subsize, B, 1);
			
 
				+		STARPU_STRSV("L", "N", "N", subsize, A, subsize, B, 1);
			
 
				 	
			
 
				 		/* U */
			
 
				-	        STRSV("U", "N", "U", subsize, A, subsize, B, 1);
			
 
				+	        STARPU_STRSV("U", "N", "U", subsize, A, subsize, B, 1);
			
 
				 	
			
 
				 		STARPU_ASSERT(DIM == size);
			
 
				 	
			
@@ -378,19 +378,19 @@ static void solve_system(unsigned size, unsigned subsize, float *result, int *Re
 
				 	
			
 
				 	
			
 
				 		/* LUB = U * LUB */
			
 
				-		STRMV("U", "N", "U", subsize, A, subsize, LUB, 1);
			
 
				+		STARPU_STRMV("U", "N", "U", subsize, A, subsize, LUB, 1);
			
 
				 		
			
 
				 		/* LUB = L * LUB */
			
 
				-		STRMV("L", "N", "N", subsize, A, subsize, LUB, 1);
			
 
				+		STARPU_STRMV("L", "N", "N", subsize, A, subsize, LUB, 1);
			
 
				 	
			
 
				 		/* LUB -= B */
			
 
				-		SAXPY(subsize, -1.0f, savedB, 1, LUB, 1);
			
 
				+		STARPU_SAXPY(subsize, -1.0f, savedB, 1, LUB, 1);
			
 
				 	
			
 
				 		/* check if LUB is close to the 0 vector */
			
 
				-		int maxind = ISAMAX(subsize, LUB, 1);
			
 
				+		int maxind = STARPU_ISAMAX(subsize, LUB, 1);
			
 
				 		FPRINTF(stderr, "max error (LUX - B) = %e\n",LUB[maxind - 1]);
			
 
				 
			
 
				-		float sum = SASUM(subsize, LUB, 1);
			
 
				+		float sum = STARPU_SASUM(subsize, LUB, 1);
			
 
				 		FPRINTF(stderr,"avg. error %e\n", sum/subsize);
			
 
				 	
			
 
				 		free(LUB);
			
--- a/examples/lu/lu-double.h
+++ b/examples/lu/lu-double.h
@@ -33,16 +33,16 @@
 
				 #define CUBLAS_SWAP	cublasDswap
			
 
				 #define CUBLAS_IAMAX	cublasIdamax
			
 
				 
			
 
				-#define CPU_GEMM	DGEMM
			
 
				-#define CPU_TRSM	DTRSM
			
 
				-#define CPU_SCAL	DSCAL
			
 
				-#define CPU_GER		DGER
			
 
				-#define CPU_SWAP	DSWAP
			
 
				+#define CPU_GEMM	STARPU_DGEMM
			
 
				+#define CPU_TRSM	STARPU_DTRSM
			
 
				+#define CPU_SCAL	STARPU_DSCAL
			
 
				+#define CPU_GER		STARPU_DGER
			
 
				+#define CPU_SWAP	STARPU_DSWAP
			
 
				 
			
 
				-#define CPU_TRMM	DTRMM
			
 
				-#define CPU_AXPY	DAXPY
			
 
				-#define CPU_ASUM	DASUM
			
 
				-#define CPU_IAMAX	IDAMAX
			
 
				+#define CPU_TRMM	STARPU_DTRMM
			
 
				+#define CPU_AXPY	STARPU_DAXPY
			
 
				+#define CPU_ASUM	STARPU_DASUM
			
 
				+#define CPU_IAMAX	STARPU_IDAMAX
			
 
				 
			
 
				 #define PIVOT_THRESHHOLD	10e-10
			
 
				 
			
--- a/examples/lu/lu-float.h
+++ b/examples/lu/lu-float.h
@@ -35,16 +35,16 @@
 
				 #define CUBLAS_SWAP	cublasSswap
			
 
				 #define CUBLAS_IAMAX	cublasIsamax
			
 
				 
			
 
				-#define CPU_GEMM	SGEMM
			
 
				-#define CPU_TRSM	STRSM
			
 
				-#define CPU_SCAL	SSCAL
			
 
				-#define CPU_GER		SGER
			
 
				-#define CPU_SWAP	SSWAP
			
 
				-
			
 
				-#define CPU_TRMM	STRMM
			
 
				-#define CPU_AXPY	SAXPY
			
 
				-#define CPU_ASUM	SASUM
			
 
				-#define CPU_IAMAX	ISAMAX
			
 
				+#define CPU_GEMM	STARPU_SGEMM
			
 
				+#define CPU_TRSM	STARPU_STRSM
			
 
				+#define CPU_SCAL	STARPU_SSCAL
			
 
				+#define CPU_GER		STARPU_SGER
			
 
				+#define CPU_SWAP	STARPU_SSWAP
			
 
				+
			
 
				+#define CPU_TRMM	STARPU_STRMM
			
 
				+#define CPU_AXPY	STARPU_SAXPY
			
 
				+#define CPU_ASUM	STARPU_SASUM
			
 
				+#define CPU_IAMAX	STARPU_ISAMAX
			
 
				 
			
 
				 #define PIVOT_THRESHHOLD	10e-5
			
 
				 
			
--- a/examples/lu/xlu.h
+++ b/examples/lu/xlu.h
@@ -60,7 +60,7 @@ static void STARPU_ATTRIBUTE_UNUSED compare_A_LU(float *A, float *LU,
 
				 	}
			
 
				 
			
 
				         /* now A_err = L, compute L*U */
			
 
				-	STRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
			
 
				+	STARPU_STRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
			
 
				 
			
 
				 	float max_err = 0.0f;
			
 
				 	for (i = 0; i < size ; i++)
			
--- a/examples/mult/double.h
+++ b/examples/mult/double.h
@@ -17,9 +17,9 @@
 
				 #define TYPE	double
			
 
				 
			
 
				 #define CUBLAS_GEMM cublasDgemm
			
 
				-#define CPU_GEMM	DGEMM
			
 
				-#define CPU_ASUM	DASUM
			
 
				-#define CPU_IAMAX	IDAMAX
			
 
				+#define CPU_GEMM	STARPU_DGEMM
			
 
				+#define CPU_ASUM	STARPU_DASUM
			
 
				+#define CPU_IAMAX	STARPU_IDAMAX
			
 
				 #define STARPU_GEMM(name)	starpu_dgemm_##name
			
 
				 
			
 
				 #define str(s) #s
			
--- a/examples/mult/simple.h
+++ b/examples/mult/simple.h
@@ -17,9 +17,9 @@
 
				 #define TYPE	float
			
 
				 
			
 
				 #define CUBLAS_GEMM cublasSgemm
			
 
				-#define CPU_GEMM	SGEMM
			
 
				-#define CPU_ASUM	SASUM
			
 
				-#define CPU_IAMAX	ISAMAX
			
 
				+#define CPU_GEMM	STARPU_SGEMM
			
 
				+#define CPU_ASUM	STARPU_SASUM
			
 
				+#define CPU_IAMAX	STARPU_ISAMAX
			
 
				 #define STARPU_GEMM(name)	starpu_sgemm_##name
			
 
				 
			
 
				 #define str(s) #s
			
--- a/examples/pipeline/pipeline.c
+++ b/examples/pipeline/pipeline.c
@@ -90,7 +90,7 @@ void pipeline_cpu_axpy(void *descr[], void *arg)
 
				 	float *y = (float *) STARPU_VECTOR_GET_PTR(descr[1]);
			
 
				 	int n = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				 
			
 
				-	SAXPY(n, 1., x, 1, y, 1);
			
 
				+	STARPU_SAXPY(n, 1., x, 1, y, 1);
			
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -129,7 +129,7 @@ void pipeline_cpu_sum(void *descr[], void *_args)
 
				 	int n = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				 	float y;
			
 
				 
			
 
				-	y = SASUM(n, x, 1);
			
 
				+	y = STARPU_SASUM(n, x, 1);
			
 
				 
			
 
				 	FPRINTF(stderr,"CPU finished with %f\n", y);
			
 
				 }
			
--- a/examples/sched_ctx/sched_ctx.c
+++ b/examples/sched_ctx/sched_ctx.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010-2013  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -13,7 +13,7 @@
 
				  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				  *
			
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				+OB */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				 
			
@@ -26,16 +26,20 @@
 
				 int tasks_executed = 0;
			
 
				 starpu_pthread_mutex_t mut;
			
 
				 
			
 
				-static void sched_ctx_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg STARPU_ATTRIBUTE_UNUSED)
			
 
				+static void sched_ctx_cpu_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	starpu_pthread_mutex_lock(&mut);
			
 
				 	tasks_executed++;
			
 
				 	starpu_pthread_mutex_unlock(&mut);
			
 
				 }
			
 
				 
			
 
				-static struct starpu_codelet sched_ctx_codelet =
			
 
				+static void sched_ctx_cuda_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				-	.cpu_funcs = {sched_ctx_func, NULL},
			
 
				+}
			
 
				+
			
 
				+static struct starpu_codelet sched_ctx_codelet1 =
			
 
				+{
			
 
				+	.cpu_funcs = {sched_ctx_cpu_func, NULL},
			
 
				 	.cuda_funcs = {NULL},
			
 
				 	.opencl_funcs = {NULL},
			
 
				 	.model = NULL,
			
@@ -43,11 +47,25 @@ static struct starpu_codelet sched_ctx_codelet =
 
				 	.name = "sched_ctx"
			
 
				 };
			
 
				 
			
 
				+static struct starpu_codelet sched_ctx_codelet2 =
			
 
				+{
			
 
				+	.cpu_funcs = {sched_ctx_cpu_func, NULL},
			
 
				+	.cuda_funcs = {sched_ctx_cuda_func, NULL},
			
 
				+	.opencl_funcs = {NULL},
			
 
				+	.model = NULL,
			
 
				+	.nbuffers = 0,
			
 
				+	.name = "sched_ctx"
			
 
				+};
			
 
				+
			
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int ntasks = NTASKS;
			
 
				 	int ret;
			
 
				+	unsigned ncuda = 0;
			
 
				+	int nprocs1 = 0;
			
 
				+	int nprocs2 = 0;
			
 
				+	int procs1[STARPU_NMAXWORKERS], procs2[STARPU_NMAXWORKERS];
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	if (ret == -ENODEV)
			
@@ -55,25 +73,23 @@ int main(int argc, char **argv)
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				 	starpu_pthread_mutex_init(&mut, NULL);
			
 
				-	int nprocs1 = 1;
			
 
				-	int nprocs2 = 1;
			
 
				-	int procs1[STARPU_NMAXWORKERS], procs2[STARPU_NMAXWORKERS];
			
 
				-	procs1[0] = 0;
			
 
				-	procs2[0] = 0;
			
 
				 
			
 
				 #ifdef STARPU_USE_CPU
			
 
				-	unsigned ncpus =  starpu_cpu_worker_get_count();
			
 
				-	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
			
 
				-
			
 
				-	nprocs1 = ncpus;
			
 
				+	nprocs1 = starpu_cpu_worker_get_count();
			
 
				+	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, nprocs1);
			
 
				 #endif
			
 
				+	// if there is no cpu, skip
			
 
				+	if (nprocs1 == 0) goto enodev;
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	unsigned ncuda = starpu_cuda_worker_get_count();
			
 
				-	starpu_worker_get_ids_by_type(STARPU_CUDA_WORKER, procs2, ncuda);
			
 
				-
			
 
				-	nprocs2 = ncuda == 0 ? 1 : ncuda;
			
 
				+	ncuda = nprocs2 = starpu_cuda_worker_get_count();
			
 
				+	starpu_worker_get_ids_by_type(STARPU_CUDA_WORKER, procs2, nprocs2);
			
 
				 #endif
			
 
				+	if (nprocs2 == 0)
			
 
				+	{
			
 
				+	     nprocs2 = 1;
			
 
				+	     procs2[0] = procs1[0];
			
 
				+	}
			
 
				 
			
 
				 	/*create contexts however you want*/
			
 
				 	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "eager", 0);
			
@@ -82,17 +98,18 @@ int main(int argc, char **argv)
 
				 	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
			
 
				 	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
			
 
				 
			
 
				+	starpu_sched_ctx_display_workers(sched_ctx2, stderr);
			
 
				+
			
 
				 	int i;
			
 
				 	for (i = 0; i < ntasks/2; i++)
			
 
				 	{
			
 
				 		struct starpu_task *task = starpu_task_create();
			
 
				 
			
 
				-		task->cl = &sched_ctx_codelet;
			
 
				+		task->cl = &sched_ctx_codelet1;
			
 
				 		task->cl_arg = NULL;
			
 
				 
			
 
				 		/*submit tasks to context*/
			
 
				 		ret = starpu_task_submit_to_ctx(task,sched_ctx1);
			
 
				-
			
 
				 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 	}
			
 
				 
			
@@ -102,11 +119,27 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_sched_ctx_finished_submit(sched_ctx1);
			
 
				 
			
 
				+	/* task with no cuda impl submitted to a ctx with gpus only */
			
 
				+	struct starpu_task *task2 = starpu_task_create();
			
 
				+	task2->cl = &sched_ctx_codelet1;
			
 
				+	task2->cl_arg = NULL;
			
 
				+
			
 
				+	/*submit tasks to context*/
			
 
				+	ret = starpu_task_submit_to_ctx(task2,sched_ctx2);
			
 
				+	if (ncuda == 0)
			
 
				+	{
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		STARPU_ASSERT_MSG(ret == -ENODEV, "submit task should ret enodev when the ctx does not have the PUs needed by the task");
			
 
				+	}
			
 
				+
			
 
				 	for (i = 0; i < ntasks/2; i++)
			
 
				 	{
			
 
				 		struct starpu_task *task = starpu_task_create();
			
 
				 
			
 
				-		task->cl = &sched_ctx_codelet;
			
 
				+		task->cl = &sched_ctx_codelet2;
			
 
				 		task->cl_arg = NULL;
			
 
				 
			
 
				 		ret = starpu_task_submit_to_ctx(task,sched_ctx2);
			
@@ -121,8 +154,9 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_sched_ctx_delete(sched_ctx1);
			
 
				 	starpu_sched_ctx_delete(sched_ctx2);
			
 
				-	printf("tasks executed %d out of %d\n", tasks_executed, ntasks);
			
 
				-	starpu_shutdown();
			
 
				+	printf("tasks executed %d out of %d\n", tasks_executed, ntasks/2);
			
 
				 
			
 
				-	return 0;
			
 
				+enodev:
			
 
				+	starpu_shutdown();
			
 
				+	return nprocs1 == 0 ? 77 : 0;
			
 
				 }
			
--- a/examples/sched_ctx/sched_ctx_without_sched_policy.c
+++ b/examples/sched_ctx/sched_ctx_without_sched_policy.c
@@ -73,6 +73,7 @@ int main(int argc, char **argv)
 
				 	tasks_executed[1] = 0;
			
 
				 	int ntasks = NTASKS;
			
 
				 	int ret, j, k;
			
 
				+	unsigned ncpus = 0;
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	if (ret == -ENODEV)
			
@@ -85,7 +86,7 @@ int main(int argc, char **argv)
 
				 	int *procs1, *procs2;
			
 
				 
			
 
				 #ifdef STARPU_USE_CPU
			
 
				-	unsigned ncpus =  starpu_cpu_worker_get_count();
			
 
				+	ncpus = starpu_cpu_worker_get_count();
			
 
				 	procs1 = (int*)malloc(ncpus*sizeof(int));
			
 
				 	procs2 = (int*)malloc(ncpus*sizeof(int));
			
 
				 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
			
@@ -96,7 +97,7 @@ int main(int argc, char **argv)
 
				 		nprocs2 =  ncpus-nprocs1;
			
 
				 		k = 0;
			
 
				 		for(j = nprocs1; j < nprocs1+nprocs2; j++)
			
 
				-			procs2[k++] = j;
			
 
				+			procs2[k++] = procs1[j];
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
@@ -113,6 +114,8 @@ int main(int argc, char **argv)
 
				 	procs2[0] = 0;
			
 
				 #endif
			
 
				 
			
 
				+	if (ncpus == 0) goto enodev;
			
 
				+
			
 
				 	/*create contexts however you want*/
			
 
				 	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", 0);
			
 
				 	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", 0);
			
@@ -158,7 +161,8 @@ int main(int argc, char **argv)
 
				 	starpu_sched_ctx_delete(sched_ctx2);
			
 
				 	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx1, tasks_executed[0], NTASKS*NTASKS);
			
 
				 	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS*NTASKS);
			
 
				-	starpu_shutdown();
			
 
				 
			
 
				-	return 0;
			
 
				+enodev:
			
 
				+	starpu_shutdown();
			
 
				+	return ncpus == 0 ? 77 : 0;
			
 
				 }
			
--- a/examples/worker_collections/worker_tree_example.c
+++ b/examples/worker_collections/worker_tree_example.c
@@ -30,7 +30,12 @@ int main(int argc, char **argv)
 
				 
			
 
				 int main()
			
 
				 {
			
 
				-	starpu_init(NULL);
			
 
				+	int ret;
			
 
				+
			
 
				+	ret = starpu_init(NULL);
			
 
				+	if (ret == -ENODEV)
			
 
				+		return 77;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				 	int procs[STARPU_NMAXWORKERS];
			
 
				 	unsigned ncpus =  starpu_cpu_worker_get_count();
			
--- a/gcc-plugin/examples/cholesky/cholesky.c
+++ b/gcc-plugin/examples/cholesky/cholesky.c
@@ -203,7 +203,7 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 	}
			
 
				 	float test_mat[size * size] __heap;
			
 
				-	SSYRK("L", "N", size, size, 1.0f,
			
 
				+	STARPU_SSYRK("L", "N", size, size, 1.0f,
			
 
				 	      rmat, size, 0.0f, test_mat, size);
			
 
				 
			
 
				 	fprintf(stderr, "comparing results ...\n");
			
--- a/gcc-plugin/examples/cholesky/cholesky_kernels.c
+++ b/gcc-plugin/examples/cholesky/cholesky_kernels.c
@@ -42,7 +42,7 @@ static inline void chol_common_cpu_codelet_update_u22(const float *left, const f
 
				 
			
 
				 	switch (s) {
			
 
				 		case 0:
			
 
				-			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
			
 
				+			STARPU_SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
			
 
				 				right, ld12, 1.0f, center, ld22);
			
 
				 			break;
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -95,7 +95,7 @@ static inline void chol_common_codelet_update_u21(const float *sub11, float *sub
 
				 {
			
 
				 	switch (s) {
			
 
				 		case 0:
			
 
				-			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
			
 
				+			STARPU_STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
			
 
				 			break;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		case 1:
			
@@ -153,9 +153,9 @@ static inline void chol_common_codelet_update_u11(float *sub11, unsigned nx, uns
 
				 
			
 
				 				STARPU_ASSERT(lambda11 != 0.0f);
			
 
				 
			
 
				-				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
			
 
				+				STARPU_SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
			
 
				 
			
 
				-				SSYR("L", nx - z - 1, -1.0f,
			
 
				+				STARPU_SSYR("L", nx - z - 1, -1.0f,
			
 
				 							&sub11[(z+1)+z*ld], 1,
			
 
				 							&sub11[(z+1)+(z+1)*ld], ld);
			
 
				 			}
			
--- a/include/starpu_sched_ctx.h
+++ b/include/starpu_sched_ctx.h
@@ -40,6 +40,8 @@ void starpu_sched_ctx_add_workers(int *workerids_ctx, int nworkers_ctx, unsigned
 
				 
			
 
				 void starpu_sched_ctx_remove_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id);
			
 
				 
			
 
				+void starpu_sched_ctx_display_workers(unsigned sched_ctx_id, FILE *f);
			
 
				+
			
 
				 void starpu_sched_ctx_delete(unsigned sched_ctx_id);
			
 
				 
			
 
				 void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor);
			
--- a/mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
+++ b/mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
@@ -203,7 +203,7 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 
				 	float *test_mat = malloc(size*size*sizeof(float));
			
 
				 	STARPU_ASSERT(test_mat);
			
 
				 
			
 
				-	SSYRK("L", "N", size, size, 1.0f,
			
 
				+	STARPU_SSYRK("L", "N", size, size, 1.0f,
			
 
				 			rmat, size, 0.0f, test_mat, size);
			
 
				 
			
 
				 	FPRINTF(stderr, "[%d] comparing results ...\n", rank);
			
--- a/mpi/examples/matrix_decomposition/mpi_cholesky_kernels.c
+++ b/mpi/examples/matrix_decomposition/mpi_cholesky_kernels.c
@@ -55,7 +55,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 
				 	switch (s)
			
 
				 	{
			
 
				 		case 0:
			
 
				-			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
			
 
				+			STARPU_SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
			
 
				 				right, ld12, 1.0f, center, ld22);
			
 
				 			break;
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -111,7 +111,7 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 
				 	switch (s)
			
 
				 	{
			
 
				 		case 0:
			
 
				-			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
			
 
				+			STARPU_STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
			
 
				 			break;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		case 1:
			
@@ -171,9 +171,9 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 
				 
			
 
				 				STARPU_ASSERT(lambda11 != 0.0f);
			
 
				 
			
 
				-				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
			
 
				+				STARPU_SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
			
 
				 
			
 
				-				SSYR("L", nx - z - 1, -1.0f,
			
 
				+				STARPU_SSYR("L", nx - z - 1, -1.0f,
			
 
				 							&sub11[(z+1)+z*ld], 1,
			
 
				 							&sub11[(z+1)+(z+1)*ld], ld);
			
 
				 			}
			
--- a/mpi/examples/mpi_lu/mpi_lu-double.h
+++ b/mpi/examples/mpi_lu/mpi_lu-double.h
@@ -27,16 +27,16 @@
 
				 #define CUBLAS_SWAP	cublasDswap
			
 
				 #define CUBLAS_IAMAX	cublasIdamax
			
 
				 
			
 
				-#define CPU_GEMM	DGEMM
			
 
				-#define CPU_GEMV	DGEMV
			
 
				-#define CPU_TRSM	DTRSM
			
 
				-#define CPU_SCAL	DSCAL
			
 
				-#define CPU_GER		DGER
			
 
				-#define CPU_SWAP	DSWAP
			
 
				+#define CPU_GEMM	STARPU_DGEMM
			
 
				+#define CPU_GEMV	STARPU_DGEMV
			
 
				+#define CPU_TRSM	STARPU_DTRSM
			
 
				+#define CPU_SCAL	STARPU_DSCAL
			
 
				+#define CPU_GER		STARPU_DGER
			
 
				+#define CPU_SWAP	STARPU_DSWAP
			
 
				 
			
 
				-#define CPU_TRMM	DTRMM
			
 
				-#define CPU_AXPY	DAXPY
			
 
				-#define CPU_ASUM	DASUM
			
 
				-#define CPU_IAMAX	IDAMAX
			
 
				+#define CPU_TRMM	STARPU_DTRMM
			
 
				+#define CPU_AXPY	STARPU_DAXPY
			
 
				+#define CPU_ASUM	STARPU_DASUM
			
 
				+#define CPU_IAMAX	STARPU_IDAMAX
			
 
				 
			
 
				 #define PIVOT_THRESHHOLD	10e-10
			
--- a/mpi/examples/mpi_lu/mpi_lu-float.h
+++ b/mpi/examples/mpi_lu/mpi_lu-float.h
@@ -27,16 +27,16 @@
 
				 #define CUBLAS_SWAP	cublasSswap
			
 
				 #define CUBLAS_IAMAX	cublasIsamax
			
 
				 
			
 
				-#define CPU_GEMM	SGEMM
			
 
				-#define CPU_GEMV	SGEMV
			
 
				-#define CPU_TRSM	STRSM
			
 
				-#define CPU_SCAL	SSCAL
			
 
				-#define CPU_GER		SGER
			
 
				-#define CPU_SWAP	SSWAP
			
 
				+#define CPU_GEMM	STARPU_SGEMM
			
 
				+#define CPU_GEMV	STARPU_SGEMV
			
 
				+#define CPU_TRSM	STARPU_STRSM
			
 
				+#define CPU_SCAL	STARPU_SSCAL
			
 
				+#define CPU_GER		STARPU_SGER
			
 
				+#define CPU_SWAP	STARPU_SSWAP
			
 
				 
			
 
				-#define CPU_TRMM	STRMM
			
 
				-#define CPU_AXPY	SAXPY
			
 
				-#define CPU_ASUM	SASUM
			
 
				-#define CPU_IAMAX	ISAMAX
			
 
				+#define CPU_TRMM	STARPU_STRMM
			
 
				+#define CPU_AXPY	STARPU_SAXPY
			
 
				+#define CPU_ASUM	STARPU_SASUM
			
 
				+#define CPU_IAMAX	STARPU_ISAMAX
			
 
				 
			
 
				 #define PIVOT_THRESHHOLD	10e-5
			
--- a/mpi/src/Makefile.am
+++ b/mpi/src/Makefile.am
@@ -39,7 +39,9 @@ noinst_HEADERS =					\
 
				 	starpu_mpi_task_insert.h			\
			
 
				 	starpu_mpi_datatype.h				\
			
 
				 	starpu_mpi_cache.h				\
			
 
				-	starpu_mpi_cache_stats.h
			
 
				+	starpu_mpi_cache_stats.h			\
			
 
				+	starpu_mpi_early_data.h				\
			
 
				+	starpu_mpi_early_request.h
			
 
				 
			
 
				 libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
			
 
				 	starpu_mpi.c					\
			
@@ -50,7 +52,9 @@ libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
 
				 	starpu_mpi_stats.c				\
			
 
				 	starpu_mpi_private.c				\
			
 
				 	starpu_mpi_cache.c				\
			
 
				-	starpu_mpi_cache_stats.c
			
 
				+	starpu_mpi_cache_stats.c			\
			
 
				+	starpu_mpi_early_data.c				\
			
 
				+	starpu_mpi_early_request.c
			
 
				 
			
 
				 showcheck:
			
 
				 	-cat /dev/null
			
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -22,6 +22,8 @@
 
				 #include <starpu_profiling.h>
			
 
				 #include <starpu_mpi_stats.h>
			
 
				 #include <starpu_mpi_task_insert.h>
			
 
				+#include <starpu_mpi_early_data.h>
			
 
				+#include <starpu_mpi_early_request.h>
			
 
				 #include <common/config.h>
			
 
				 #include <common/thread.h>
			
 
				 #include <datawizard/interfaces/data_interface.h>
			
@@ -65,234 +67,52 @@ static int posted_requests = 0, newer_requests, barrier_running = 0;
 
				 
			
 
				 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
			
 
				 
			
 
				-LIST_TYPE(_starpu_mpi_copy_handle,
			
 
				-	  starpu_data_handle_t handle;
			
 
				-	  struct _starpu_mpi_envelope *env;
			
 
				-	  struct _starpu_mpi_req *req;
			
 
				-	  void *buffer;
			
 
				-	  int mpi_tag;
			
 
				-	  int source;
			
 
				-	  int req_ready;
			
 
				-	  starpu_pthread_mutex_t req_mutex;
			
 
				-	  starpu_pthread_cond_t req_cond;
			
 
				-);
			
 
				-
			
 
				-struct _starpu_mpi_copy_handle_hashlist
			
 
				+static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
			
 
				 {
			
 
				-	struct _starpu_mpi_copy_handle_list *list;
			
 
				-	UT_hash_handle hh;
			
 
				-	int mpi_tag;
			
 
				-};
			
 
				-
			
 
				-/********************************************************/
			
 
				-/*                                                      */
			
 
				-/*  Hashmap's requests functionalities                  */
			
 
				-/*                                                      */
			
 
				-/********************************************************/
			
 
				-
			
 
				-/** stores application requests for which data have not been received yet */
			
 
				-static struct _starpu_mpi_req **_starpu_mpi_app_req_hashmap = NULL;
			
 
				-static int _starpu_mpi_app_req_hashmap_count = 0;
			
 
				-/** stores data which have been received by MPI but have not been requested by the application */
			
 
				-static struct _starpu_mpi_copy_handle_hashlist **_starpu_mpi_copy_handle_hashmap = NULL;
			
 
				-static int _starpu_mpi_copy_handle_hashmap_count = 0;
			
 
				-
			
 
				-static struct _starpu_mpi_req* find_app_req(int mpi_tag, int source)
			
 
				-{
			
 
				-	struct _starpu_mpi_req* req;
			
 
				-
			
 
				-	HASH_FIND_INT(_starpu_mpi_app_req_hashmap[source], &mpi_tag, req);
			
 
				-
			
 
				-	return req;
			
 
				-}
			
 
				-
			
 
				-static void add_app_req(struct _starpu_mpi_req *req)
			
 
				-{
			
 
				-	struct _starpu_mpi_req *test_req;
			
 
				-
			
 
				-	test_req = find_app_req(req->mpi_tag, req->srcdst);
			
 
				-
			
 
				-	if (test_req == NULL)
			
 
				-	{
			
 
				-		HASH_ADD_INT(_starpu_mpi_app_req_hashmap[req->srcdst], mpi_tag, req);
			
 
				-		_starpu_mpi_app_req_hashmap_count ++;
			
 
				-		_STARPU_MPI_DEBUG(3, "Adding request %p with tag %d in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		_STARPU_MPI_DEBUG(3, "[Error] request %p with tag %d already in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
			
 
				-		int seq_const = starpu_data_get_sequential_consistency_flag(req->data_handle);
			
 
				-		if (seq_const &&  req->sequential_consistency)
			
 
				-		{
			
 
				-			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap[%d], while another request %p with the same tag is already in it. \n Sequential consistency is activated : this is not supported by StarPU.", req, req->mpi_tag, req->srcdst, test_req);
			
 
				-		}
			
 
				-		else
			
 
				-		{
			
 
				-			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap[%d], while another request %p with the same tag is already in it. \n Sequential consistency isn't activated for this handle : you should want to add dependencies between requests for which the sequential consistency is deactivated.", req, req->mpi_tag, req->srcdst, test_req);
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-static void delete_app_req(struct _starpu_mpi_req *req)
			
 
				-{
			
 
				-	struct _starpu_mpi_req *test_req;
			
 
				+	*req = malloc(sizeof(struct _starpu_mpi_req));
			
 
				+	STARPU_ASSERT_MSG(*req, "Invalid request");
			
 
				 
			
 
				-	test_req = find_app_req(req->mpi_tag, req->srcdst);
			
 
				-
			
 
				-	if (test_req != NULL)
			
 
				-	{
			
 
				-		HASH_DEL(_starpu_mpi_app_req_hashmap[req->srcdst], req);
			
 
				-		_starpu_mpi_app_req_hashmap_count --;
			
 
				-		_STARPU_MPI_DEBUG(3, "Deleting application request %p with tag %d from the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		_STARPU_MPI_DEBUG(3, "[Warning] request %p with tag %d is NOT in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-#ifdef STARPU_VERBOSE
			
 
				-static void _starpu_mpi_copy_handle_display_hash(int source, int tag)
			
 
				-{
			
 
				-	struct _starpu_mpi_copy_handle_hashlist *hashlist;
			
 
				-	HASH_FIND_INT(_starpu_mpi_copy_handle_hashmap[source], &tag, hashlist);
			
 
				-
			
 
				-	if (hashlist == NULL)
			
 
				-	{
			
 
				-		_STARPU_MPI_DEBUG(60, "Hashlist for source %d and tag %d does not exist\n", source, tag);
			
 
				-	}
			
 
				-	else if (_starpu_mpi_copy_handle_list_empty(hashlist->list))
			
 
				-	{
			
 
				-		_STARPU_MPI_DEBUG(60, "Hashlist for source %d and tag %d is empty\n", source, tag);
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		struct _starpu_mpi_copy_handle *cur;
			
 
				-		for (cur = _starpu_mpi_copy_handle_list_begin(hashlist->list) ;
			
 
				-		     cur != _starpu_mpi_copy_handle_list_end(hashlist->list);
			
 
				-		     cur = _starpu_mpi_copy_handle_list_next(cur))
			
 
				-		{
			
 
				-			_STARPU_MPI_DEBUG(60, "Element for source %d and tag %d: %p\n", source, tag, cur);
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-static struct _starpu_mpi_copy_handle *pop_chandle(int mpi_tag, int source, int delete)
			
 
				-{
			
 
				-	struct _starpu_mpi_copy_handle_hashlist *hashlist;
			
 
				-	struct _starpu_mpi_copy_handle *chandle;
			
 
				-
			
 
				-	_STARPU_MPI_DEBUG(60, "Looking for chandle with tag %d in the hashmap[%d]\n", mpi_tag, source);
			
 
				-	HASH_FIND_INT(_starpu_mpi_copy_handle_hashmap[source], &mpi_tag, hashlist);
			
 
				-	if (hashlist == NULL)
			
 
				-	{
			
 
				-		chandle = NULL;
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		if (_starpu_mpi_copy_handle_list_empty(hashlist->list))
			
 
				-		{
			
 
				-			chandle = NULL;
			
 
				-		}
			
 
				-		else
			
 
				-		{
			
 
				-			if (delete == 1)
			
 
				-			{
			
 
				-				chandle = _starpu_mpi_copy_handle_list_pop_front(hashlist->list);
			
 
				-			}
			
 
				-			else
			
 
				-			{
			
 
				-				chandle = _starpu_mpi_copy_handle_list_front(hashlist->list);
			
 
				-			}
			
 
				-		}
			
 
				-	}
			
 
				-	_STARPU_MPI_DEBUG(60, "Found chandle %p with tag %d in the hashmap[%d]\n", chandle, mpi_tag, source);
			
 
				-	return chandle;
			
 
				-}
			
 
				-
			
 
				-static struct _starpu_mpi_copy_handle *find_chandle(int mpi_tag, int source)
			
 
				-{
			
 
				-	return pop_chandle(mpi_tag, source, 0);
			
 
				-}
			
 
				-
			
 
				-static void add_chandle(struct _starpu_mpi_copy_handle *chandle)
			
 
				-{
			
 
				-	_STARPU_MPI_DEBUG(60, "Trying to add chandle %p with tag %d in the hashmap[%d]\n", chandle, chandle->mpi_tag, chandle->source);
			
 
				-
			
 
				-	struct _starpu_mpi_copy_handle_hashlist *hashlist;
			
 
				-	HASH_FIND_INT(_starpu_mpi_copy_handle_hashmap[chandle->source], &chandle->mpi_tag, hashlist);
			
 
				-	if (hashlist == NULL)
			
 
				-	{
			
 
				-		hashlist = malloc(sizeof(struct _starpu_mpi_copy_handle_hashlist));
			
 
				-		hashlist->list = _starpu_mpi_copy_handle_list_new();
			
 
				-		hashlist->mpi_tag = chandle->mpi_tag;
			
 
				-		HASH_ADD_INT(_starpu_mpi_copy_handle_hashmap[chandle->source], mpi_tag, hashlist);
			
 
				-	}
			
 
				-	_starpu_mpi_copy_handle_list_push_back(hashlist->list, chandle);
			
 
				-	_starpu_mpi_copy_handle_hashmap_count ++;
			
 
				-#ifdef STARPU_VERBOSE
			
 
				-	_starpu_mpi_copy_handle_display_hash(chandle->source, chandle->mpi_tag);
			
 
				-#endif
			
 
				-}
			
 
				-
			
 
				-static void delete_chandle(struct _starpu_mpi_copy_handle *chandle)
			
 
				-{
			
 
				-	_STARPU_MPI_DEBUG(60, "Trying to delete chandle %p with tag %d in the hashmap[%d]\n", chandle, chandle->mpi_tag, chandle->source);
			
 
				-	struct _starpu_mpi_copy_handle *found = pop_chandle(chandle->mpi_tag, chandle->source, 1);
			
 
				-
			
 
				-	STARPU_ASSERT_MSG(found == chandle,
			
 
				-			  "Error delete_chandle : chandle %p with tag %d is NOT in the hashmap[%d]\n", chandle, chandle->mpi_tag, chandle->source);
			
 
				-
			
 
				-	_starpu_mpi_copy_handle_hashmap_count --;
			
 
				-#ifdef STARPU_VERBOSE
			
 
				-	_starpu_mpi_copy_handle_display_hash(chandle->source, chandle->mpi_tag);
			
 
				-#endif
			
 
				-}
			
 
				-
			
 
				-static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
			
 
				-{
			
 
				 	/* Initialize the request structure */
			
 
				-	req->data_handle = NULL;
			
 
				+	(*req)->data_handle = NULL;
			
 
				 
			
 
				-	req->datatype = 0;
			
 
				-	req->ptr = NULL;
			
 
				-	req->count = -1;
			
 
				-	req->user_datatype = -1;
			
 
				+	(*req)->datatype = 0;
			
 
				+	(*req)->ptr = NULL;
			
 
				+	(*req)->count = -1;
			
 
				+	(*req)->user_datatype = -1;
			
 
				 
			
 
				-	req->srcdst = -1;
			
 
				-	req->mpi_tag = -1;
			
 
				-	req->comm = 0;
			
 
				+	(*req)->srcdst = -1;
			
 
				+	(*req)->mpi_tag = -1;
			
 
				+	(*req)->comm = 0;
			
 
				 
			
 
				-	req->func = NULL;
			
 
				+	(*req)->func = NULL;
			
 
				 
			
 
				-	req->status = NULL;
			
 
				-	req->request = 0;
			
 
				-	req->flag = NULL;
			
 
				+	(*req)->status = NULL;
			
 
				+	(*req)->request = 0;
			
 
				+	(*req)->flag = NULL;
			
 
				 
			
 
				-	req->ret = -1;
			
 
				-	STARPU_PTHREAD_MUTEX_INIT(&req->req_mutex, NULL);
			
 
				-	STARPU_PTHREAD_COND_INIT(&req->req_cond, NULL);
			
 
				-	STARPU_PTHREAD_MUTEX_INIT(&req->posted_mutex, NULL);
			
 
				-	STARPU_PTHREAD_COND_INIT(&req->posted_cond, NULL);
			
 
				+	(*req)->ret = -1;
			
 
				+	STARPU_PTHREAD_MUTEX_INIT(&((*req)->req_mutex), NULL);
			
 
				+	STARPU_PTHREAD_COND_INIT(&((*req)->req_cond), NULL);
			
 
				+	STARPU_PTHREAD_MUTEX_INIT(&((*req)->posted_mutex), NULL);
			
 
				+	STARPU_PTHREAD_COND_INIT(&((*req)->posted_cond), NULL);
			
 
				 
			
 
				-	req->request_type = UNKNOWN_REQ;
			
 
				+	(*req)->request_type = UNKNOWN_REQ;
			
 
				 
			
 
				-	req->submitted = 0;
			
 
				-	req->completed = 0;
			
 
				-	req->posted = 0;
			
 
				+	(*req)->submitted = 0;
			
 
				+	(*req)->completed = 0;
			
 
				+	(*req)->posted = 0;
			
 
				 
			
 
				-	req->other_request = NULL;
			
 
				+	(*req)->other_request = NULL;
			
 
				 
			
 
				-	req->detached = -1;
			
 
				-	req->callback = NULL;
			
 
				-	req->callback_arg = NULL;
			
 
				+	(*req)->detached = -1;
			
 
				+	(*req)->callback = NULL;
			
 
				+	(*req)->callback_arg = NULL;
			
 
				 
			
 
				-	req->size_req = 0;
			
 
				-	req->internal_req = NULL;
			
 
				-	req->is_internal_req = 0;
			
 
				-	req->envelope = NULL;
			
 
				-	req->sequential_consistency = 1;
			
 
				+	(*req)->size_req = 0;
			
 
				+	(*req)->internal_req = NULL;
			
 
				+	(*req)->is_internal_req = 0;
			
 
				+	(*req)->envelope = NULL;
			
 
				+	(*req)->sequential_consistency = 1;
			
 
				 }
			
 
				 
			
 
				  /********************************************************/
			
@@ -310,35 +130,33 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 
				 							       int is_internal_req,
			
 
				 							       ssize_t count)
			
 
				 {
			
 
				+	struct _starpu_mpi_req *req;
			
 
				 
			
 
				-	 _STARPU_MPI_LOG_IN();
			
 
				-	 struct _starpu_mpi_req *req = malloc(sizeof(struct _starpu_mpi_req));
			
 
				-	 STARPU_ASSERT_MSG(req, "Invalid request");
			
 
				-
			
 
				-	 _STARPU_MPI_INC_POSTED_REQUESTS(1);
			
 
				-
			
 
				-	 /* Initialize the request structure */
			
 
				-	 _starpu_mpi_request_init(req);
			
 
				-	 req->request_type = request_type;
			
 
				-	 req->data_handle = data_handle;
			
 
				-	 req->srcdst = srcdst;
			
 
				-	 req->mpi_tag = mpi_tag;
			
 
				-	 req->comm = comm;
			
 
				-	 req->detached = detached;
			
 
				-	 req->callback = callback;
			
 
				-	 req->callback_arg = arg;
			
 
				-	 req->func = func;
			
 
				-	 req->sequential_consistency = sequential_consistency;
			
 
				-	 req->is_internal_req = is_internal_req;
			
 
				-	 req->count = count;
			
 
				-
			
 
				-	 /* Asynchronously request StarPU to fetch the data in main memory: when
			
 
				-	  * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
			
 
				-	  * the request is actually submitted */
			
 
				-	 starpu_data_acquire_cb_sequential_consistency(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req, sequential_consistency);
			
 
				+	_STARPU_MPI_LOG_IN();
			
 
				+	_STARPU_MPI_INC_POSTED_REQUESTS(1);
			
 
				 
			
 
				-	 _STARPU_MPI_LOG_OUT();
			
 
				-	 return req;
			
 
				+	/* Initialize the request structure */
			
 
				+	_starpu_mpi_request_init(&req);
			
 
				+	req->request_type = request_type;
			
 
				+	req->data_handle = data_handle;
			
 
				+	req->srcdst = srcdst;
			
 
				+	req->mpi_tag = mpi_tag;
			
 
				+	req->comm = comm;
			
 
				+	req->detached = detached;
			
 
				+	req->callback = callback;
			
 
				+	req->callback_arg = arg;
			
 
				+	req->func = func;
			
 
				+	req->sequential_consistency = sequential_consistency;
			
 
				+	req->is_internal_req = is_internal_req;
			
 
				+	req->count = count;
			
 
				+
			
 
				+	/* Asynchronously request StarPU to fetch the data in main memory: when
			
 
				+	 * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
			
 
				+	 * the request is actually submitted */
			
 
				+	starpu_data_acquire_cb_sequential_consistency(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req, sequential_consistency);
			
 
				+
			
 
				+	_STARPU_MPI_LOG_OUT();
			
 
				+	return req;
			
 
				  }
			
 
				 
			
 
				  /********************************************************/
			
@@ -608,15 +426,11 @@ static void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 
				 
			
 
				 int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
			
 
				 {
			
 
				-	_STARPU_MPI_LOG_IN();
			
 
				 	int ret;
			
 
				-
			
 
				-	struct _starpu_mpi_req *waiting_req = malloc(sizeof(struct _starpu_mpi_req));
			
 
				-	_starpu_mpi_request_init(waiting_req);
			
 
				-	STARPU_ASSERT_MSG(waiting_req, "Allocation failed");
			
 
				-
			
 
				 	struct _starpu_mpi_req *req = *public_req;
			
 
				+	struct _starpu_mpi_req *waiting_req;
			
 
				 
			
 
				+	_STARPU_MPI_LOG_IN();
			
 
				 	_STARPU_MPI_INC_POSTED_REQUESTS(1);
			
 
				 
			
 
				 	/* We cannot try to complete a MPI request that was not actually posted
			
@@ -627,7 +441,7 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&(req->req_mutex));
			
 
				 
			
 
				 	/* Initialize the request structure */
			
 
				-	 _starpu_mpi_request_init(waiting_req);
			
 
				+	 _starpu_mpi_request_init(&waiting_req);
			
 
				 	waiting_req->status = status;
			
 
				 	waiting_req->other_request = req;
			
 
				 	waiting_req->func = _starpu_mpi_wait_func;
			
@@ -704,9 +518,8 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 
				 
			
 
				 	if (submitted)
			
 
				 	{
			
 
				-		struct _starpu_mpi_req *testing_req = malloc(sizeof(struct _starpu_mpi_req));
			
 
				-		STARPU_ASSERT_MSG(testing_req, "allocation failed");
			
 
				-		_starpu_mpi_request_init(testing_req);
			
 
				+		struct _starpu_mpi_req *testing_req;
			
 
				+		_starpu_mpi_request_init(&testing_req);
			
 
				 
			
 
				 		/* Initialize the request structure */
			
 
				 		STARPU_PTHREAD_MUTEX_INIT(&(testing_req->req_mutex), NULL);
			
@@ -768,11 +581,11 @@ static void _starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
 
				 
			
 
				 int starpu_mpi_barrier(MPI_Comm comm)
			
 
				 {
			
 
				-	_STARPU_MPI_LOG_IN();
			
 
				 	int ret;
			
 
				-	struct _starpu_mpi_req *barrier_req = malloc(sizeof(struct _starpu_mpi_req));
			
 
				-	STARPU_ASSERT_MSG(barrier_req, "allocation failed");
			
 
				-	_starpu_mpi_request_init(barrier_req);
			
 
				+	struct _starpu_mpi_req *barrier_req;
			
 
				+
			
 
				+	_STARPU_MPI_LOG_IN();
			
 
				+	_starpu_mpi_request_init(&barrier_req);
			
 
				 
			
 
				 	/* First wait for *both* all tasks and MPI requests to finish, in case
			
 
				 	 * some tasks generate MPI requests, MPI requests generate tasks, etc.
			
@@ -855,11 +668,11 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 
				 
			
 
				 	if (req->internal_req)
			
 
				 	{
			
 
				-		struct _starpu_mpi_copy_handle *chandle = find_chandle(req->mpi_tag, req->srcdst);
			
 
				-		STARPU_ASSERT_MSG(chandle, "Could not find a copy data handle with the tag %d and the node %d\n", req->mpi_tag, req->srcdst);
			
 
				-		_STARPU_MPI_DEBUG(3, "Handling deleting of copy_handle structure from the hashmap..\n");
			
 
				-		delete_chandle(chandle);
			
 
				-		free(chandle);
			
 
				+		struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(req->mpi_tag, req->srcdst);
			
 
				+		STARPU_ASSERT_MSG(early_data_handle, "Could not find a copy data handle with the tag %d and the node %d\n", req->mpi_tag, req->srcdst);
			
 
				+		_STARPU_MPI_DEBUG(3, "Handling deleting of early_data structure from the hashmap..\n");
			
 
				+		_starpu_mpi_early_data_delete(early_data_handle);
			
 
				+		free(early_data_handle);
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
@@ -911,17 +724,17 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 
				 	_STARPU_MPI_LOG_OUT();
			
 
				 }
			
 
				 
			
 
				-struct _starpu_mpi_copy_cb_args
			
 
				+struct _starpu_mpi_early_data_cb_args
			
 
				 {
			
 
				 	starpu_data_handle_t data_handle;
			
 
				-	starpu_data_handle_t copy_handle;
			
 
				+	starpu_data_handle_t early_handle;
			
 
				 	struct _starpu_mpi_req *req;
			
 
				 	void *buffer;
			
 
				 };
			
 
				 
			
 
				-static void _starpu_mpi_copy_cb(void* arg)
			
 
				+static void _starpu_mpi_early_data_cb(void* arg)
			
 
				 {
			
 
				-	struct _starpu_mpi_copy_cb_args *args = arg;
			
 
				+	struct _starpu_mpi_early_data_cb_args *args = arg;
			
 
				 
			
 
				 	// We store in the application request the internal MPI
			
 
				 	// request so that it can be used by starpu_mpi_wait
			
@@ -931,16 +744,16 @@ static void _starpu_mpi_copy_cb(void* arg)
 
				 	if (args->buffer)
			
 
				 	{
			
 
				 		/* Data has been received as a raw memory, it has to be unpacked */
			
 
				-		struct starpu_data_interface_ops *itf_src = starpu_data_get_interface_ops(args->copy_handle);
			
 
				+		struct starpu_data_interface_ops *itf_src = starpu_data_get_interface_ops(args->early_handle);
			
 
				 		struct starpu_data_interface_ops *itf_dst = starpu_data_get_interface_ops(args->data_handle);
			
 
				 		STARPU_ASSERT_MSG(itf_dst->unpack_data, "The data interface does not define an unpack function\n");
			
 
				-		itf_dst->unpack_data(args->data_handle, STARPU_MAIN_RAM, args->buffer, itf_src->get_size(args->copy_handle));
			
 
				+		itf_dst->unpack_data(args->data_handle, STARPU_MAIN_RAM, args->buffer, itf_src->get_size(args->early_handle));
			
 
				 		free(args->buffer);
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				-		struct starpu_data_interface_ops *itf = starpu_data_get_interface_ops(args->copy_handle);
			
 
				-		void* itf_src = starpu_data_get_interface_on_node(args->copy_handle, STARPU_MAIN_RAM);
			
 
				+		struct starpu_data_interface_ops *itf = starpu_data_get_interface_ops(args->early_handle);
			
 
				+		void* itf_src = starpu_data_get_interface_on_node(args->early_handle, STARPU_MAIN_RAM);
			
 
				 		void* itf_dst = starpu_data_get_interface_on_node(args->data_handle, STARPU_MAIN_RAM);
			
 
				 
			
 
				 		if (!itf->copy_methods->ram_to_ram)
			
@@ -955,11 +768,11 @@ static void _starpu_mpi_copy_cb(void* arg)
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	_STARPU_MPI_DEBUG(3, "Done, handling release of copy_handle..\n");
			
 
				-	starpu_data_release(args->copy_handle);
			
 
				+	_STARPU_MPI_DEBUG(3, "Done, handling release of early_handle..\n");
			
 
				+	starpu_data_release(args->early_handle);
			
 
				 
			
 
				-	_STARPU_MPI_DEBUG(3, "Done, handling unregister of copy_handle..\n");
			
 
				-	starpu_data_unregister_submit(args->copy_handle);
			
 
				+	_STARPU_MPI_DEBUG(3, "Done, handling unregister of early_handle..\n");
			
 
				+	starpu_data_unregister_submit(args->early_handle);
			
 
				 
			
 
				 	_STARPU_MPI_DEBUG(3, "Done, handling request %p termination of the already received request\n",args->req);
			
 
				 	// If the request is detached, we need to call _starpu_mpi_handle_request_termination
			
@@ -1019,41 +832,41 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 
				 		else
			
 
				 		{
			
 
				 			/* test whether the receive request has already been submitted internally by StarPU-MPI*/
			
 
				-			struct _starpu_mpi_copy_handle *chandle = find_chandle(req->mpi_tag, req->srcdst);
			
 
				+			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(req->mpi_tag, req->srcdst);
			
 
				 
			
 
				 			/* Case : the request has already been submitted internally by StarPU.
			
 
				 			 * We'll asynchronously ask a Read permission over the temporary handle, so as when
			
 
				-			 * the internal receive will be over, the _starpu_mpi_copy_cb function will be called to
			
 
				+			 * the internal receive will be over, the _starpu_mpi_early_data_cb function will be called to
			
 
				 			 * bring the data back to the original data handle associated to the request.*/
			
 
				-			if (chandle)
			
 
				+			if (early_data_handle)
			
 
				 			{
			
 
				 				STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
			
 
				-				STARPU_PTHREAD_MUTEX_LOCK(&(chandle->req_mutex));
			
 
				-				while (!(chandle->req_ready))
			
 
				-					STARPU_PTHREAD_COND_WAIT(&(chandle->req_cond), &(chandle->req_mutex));
			
 
				-				STARPU_PTHREAD_MUTEX_UNLOCK(&(chandle->req_mutex));
			
 
				+				STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req_mutex));
			
 
				+				while (!(early_data_handle->req_ready))
			
 
				+					STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req_cond), &(early_data_handle->req_mutex));
			
 
				+				STARPU_PTHREAD_MUTEX_UNLOCK(&(early_data_handle->req_mutex));
			
 
				 				STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
 
				 
			
 
				 				_STARPU_MPI_DEBUG(3, "The RECV request %p with tag %d has already been received, copying previously received data into handle's pointer..\n", req, req->mpi_tag);
			
 
				-				STARPU_ASSERT(req->data_handle != chandle->handle);
			
 
				+				STARPU_ASSERT(req->data_handle != early_data_handle->handle);
			
 
				 
			
 
				-				req->internal_req = chandle->req;
			
 
				+				req->internal_req = early_data_handle->req;
			
 
				 
			
 
				-				struct _starpu_mpi_copy_cb_args *cb_args = malloc(sizeof(struct _starpu_mpi_copy_cb_args));
			
 
				+				struct _starpu_mpi_early_data_cb_args *cb_args = malloc(sizeof(struct _starpu_mpi_early_data_cb_args));
			
 
				 				cb_args->data_handle = req->data_handle;
			
 
				-				cb_args->copy_handle = chandle->handle;
			
 
				-				cb_args->buffer = chandle->buffer;
			
 
				+				cb_args->early_handle = early_data_handle->handle;
			
 
				+				cb_args->buffer = early_data_handle->buffer;
			
 
				 				cb_args->req = req;
			
 
				 
			
 
				 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
			
 
				-				starpu_data_acquire_cb(chandle->handle,STARPU_R,_starpu_mpi_copy_cb,(void*) cb_args);
			
 
				+				starpu_data_acquire_cb(early_data_handle->handle,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
			
 
				 			}
			
 
				 			/* Case : a classic receive request with no send received earlier than expected.
			
 
				 			 * We just add the pending receive request to the requests' hashmap. */
			
 
				 			else
			
 
				 			{
			
 
				 				_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %d) into the request hashmap\n", req, req->srcdst, req->mpi_tag);
			
 
				-				add_app_req(req);
			
 
				+				_starpu_mpi_early_request_add(req);
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
@@ -1252,14 +1065,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
			
 
				 	_starpu_mpi_cache_init(MPI_COMM_WORLD);
			
 
				 
			
 
				-	{
			
 
				-		int nb_nodes, k;
			
 
				-		MPI_Comm_size(MPI_COMM_WORLD, &nb_nodes);
			
 
				-		_starpu_mpi_app_req_hashmap = malloc(nb_nodes * sizeof(struct _starpu_mpi_req *));
			
 
				-		for(k=0 ; k<nb_nodes ; k++) _starpu_mpi_app_req_hashmap[k] = NULL;
			
 
				-		_starpu_mpi_copy_handle_hashmap = malloc(nb_nodes * sizeof(struct _starpu_mpi_copy_handle_hash_list *));
			
 
				-		for(k=0 ; k<nb_nodes ; k++) _starpu_mpi_copy_handle_hashmap[k] = NULL;
			
 
				-	}
			
 
				+	_starpu_mpi_early_request_init(worldsize);
			
 
				+	_starpu_mpi_early_data_init(worldsize);
			
 
				 
			
 
				 	/* notify the main thread that the progression thread is ready */
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
@@ -1276,7 +1083,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
			
 
				 	{
			
 
				 		/* shall we block ? */
			
 
				-		unsigned block = _starpu_mpi_req_list_empty(new_requests) && (_starpu_mpi_app_req_hashmap_count == 0);
			
 
				+		unsigned block = _starpu_mpi_req_list_empty(new_requests) && _starpu_mpi_early_request_count() == 0;
			
 
				 
			
 
				 #ifndef STARPU_MPI_ACTIVITY
			
 
				 		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
			
@@ -1316,7 +1123,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 		/* If there is no currently submitted header_req submitted to catch envelopes from senders, and there is some pending receive
			
 
				 		 * requests in our side, we resubmit a header request. */
			
 
				 		MPI_Request header_req;
			
 
				-		if ((_starpu_mpi_app_req_hashmap_count > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_copy_handle_hashmap) == 0))
			
 
				+		if ((_starpu_mpi_early_request_count() > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_early_data_handle_hashmap) == 0))
			
 
				 		{
			
 
				 			_STARPU_MPI_DEBUG(3, "Posting a receive to get a data envelop\n");
			
 
				 			MPI_Irecv(recv_env, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, MPI_ANY_SOURCE, _starpu_mpi_tag, MPI_COMM_WORLD, &header_req);
			
@@ -1342,14 +1149,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 			{
			
 
				 				_STARPU_MPI_DEBUG(3, "Searching for application request with tag %d and source %d (size %ld)\n", recv_env->mpi_tag, status.MPI_SOURCE, recv_env->size);
			
 
				 
			
 
				-				struct _starpu_mpi_req *found_req = find_app_req(recv_env->mpi_tag, status.MPI_SOURCE);
			
 
				+				struct _starpu_mpi_req *found_req = _starpu_mpi_early_request_find(recv_env->mpi_tag, status.MPI_SOURCE);
			
 
				 
			
 
				 				/* Case : a data will arrive before the matching receive has been submitted in our side of the application.
			
 
				 				 * We will allow a temporary handle to store the incoming data, by submitting a starpu_mpi_irecv_detached
			
 
				 				 * on this handle, and register this so as the StarPU-MPI layer can remember it.*/
			
 
				 				if (!found_req)
			
 
				 				{
			
 
				-					_STARPU_MPI_DEBUG(3, "Request with tag %d and source %d not found, creating a copy_handle to receive incoming data..\n", recv_env->mpi_tag, status.MPI_SOURCE);
			
 
				+					_STARPU_MPI_DEBUG(3, "Request with tag %d and source %d not found, creating a early_handle to receive incoming data..\n", recv_env->mpi_tag, status.MPI_SOURCE);
			
 
				 
			
 
				 					starpu_data_handle_t data_handle = NULL;
			
 
				 
			
@@ -1357,19 +1164,19 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 					data_handle = _starpu_data_get_data_handle_from_tag(recv_env->mpi_tag);
			
 
				 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
 
				 
			
 
				-					struct _starpu_mpi_copy_handle* chandle = calloc(1, sizeof(struct _starpu_mpi_copy_handle));
			
 
				-					STARPU_ASSERT(chandle);
			
 
				-					STARPU_PTHREAD_MUTEX_INIT(&chandle->req_mutex, NULL);
			
 
				-					STARPU_PTHREAD_COND_INIT(&chandle->req_cond, NULL);
			
 
				-					chandle->mpi_tag = recv_env->mpi_tag;
			
 
				-					chandle->env = recv_env;
			
 
				-					chandle->source = status.MPI_SOURCE;
			
 
				+					struct _starpu_mpi_early_data_handle* early_data_handle = calloc(1, sizeof(struct _starpu_mpi_early_data_handle));
			
 
				+					STARPU_ASSERT(early_data_handle);
			
 
				+					STARPU_PTHREAD_MUTEX_INIT(&early_data_handle->req_mutex, NULL);
			
 
				+					STARPU_PTHREAD_COND_INIT(&early_data_handle->req_cond, NULL);
			
 
				+					early_data_handle->mpi_tag = recv_env->mpi_tag;
			
 
				+					early_data_handle->env = recv_env;
			
 
				+					early_data_handle->source = status.MPI_SOURCE;
			
 
				 
			
 
				 					if (data_handle)
			
 
				 					{
			
 
				-						chandle->buffer = NULL;
			
 
				-						starpu_data_register_same(&chandle->handle, data_handle);
			
 
				-						add_chandle(chandle);
			
 
				+						early_data_handle->buffer = NULL;
			
 
				+						starpu_data_register_same(&early_data_handle->handle, data_handle);
			
 
				+						_starpu_mpi_early_data_add(early_data_handle);
			
 
				 					}
			
 
				 					else
			
 
				 					{
			
@@ -1377,15 +1184,17 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 						 * we are going to receive the data as a raw memory, and give it
			
 
				 						 * to the application when it post a receive for this tag
			
 
				 						 */
			
 
				-						_STARPU_MPI_DEBUG(20, "Posting a receive for a data of size %d which has not yet been registered\n", (int)chandle->env->size);
			
 
				-						chandle->buffer = malloc(chandle->env->size);
			
 
				-						starpu_vector_data_register(&chandle->handle, STARPU_MAIN_RAM, (uintptr_t) chandle->buffer, chandle->env->size, 1);
			
 
				-						add_chandle(chandle);
			
 
				+						_STARPU_MPI_DEBUG(20, "Posting a receive for a data of size %d which has not yet been registered\n", (int)early_data_handle->env->size);
			
 
				+						early_data_handle->buffer = malloc(early_data_handle->env->size);
			
 
				+						starpu_vector_data_register(&early_data_handle->handle, STARPU_MAIN_RAM, (uintptr_t) early_data_handle->buffer, early_data_handle->env->size, 1);
			
 
				+						_starpu_mpi_early_data_add(early_data_handle);
			
 
				 					}
			
 
				 
			
 
				-					_STARPU_MPI_DEBUG(20, "Posting internal detached irecv on copy_handle with tag %d from src %d ..\n", chandle->mpi_tag, status.MPI_SOURCE);
			
 
				+					_STARPU_MPI_DEBUG(20, "Posting internal detached irecv on early_handle with tag %d from src %d ..\n", early_data_handle->mpi_tag, status.MPI_SOURCE);
			
 
				 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
			
 
				-					chandle->req = _starpu_mpi_irecv_common(chandle->handle, status.MPI_SOURCE, chandle->mpi_tag, MPI_COMM_WORLD, 1, NULL, NULL, 1, 1, recv_env->size);
			
 
				+					early_data_handle->req = _starpu_mpi_irecv_common(early_data_handle->handle, status.MPI_SOURCE,
			
 
				+											  early_data_handle->mpi_tag, MPI_COMM_WORLD, 1,
			
 
				+											  NULL, NULL, 1, 1, recv_env->size);
			
 
				 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
 
				 
			
 
				 					// We wait until the request is pushed in the
			
@@ -1394,15 +1203,15 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 					// on the request and post the corresponding mpi_irecv,
			
 
				 					// otherwise, it may lead to read data as envelop
			
 
				 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
			
 
				-					STARPU_PTHREAD_MUTEX_LOCK(&(chandle->req->posted_mutex));
			
 
				-					while (!(chandle->req->posted))
			
 
				-					     STARPU_PTHREAD_COND_WAIT(&(chandle->req->posted_cond), &(chandle->req->posted_mutex));
			
 
				-					STARPU_PTHREAD_MUTEX_UNLOCK(&(chandle->req->posted_mutex));
			
 
				-
			
 
				-					STARPU_PTHREAD_MUTEX_LOCK(&chandle->req_mutex);
			
 
				-					chandle->req_ready = 1;
			
 
				-					STARPU_PTHREAD_COND_BROADCAST(&chandle->req_cond);
			
 
				-					STARPU_PTHREAD_MUTEX_UNLOCK(&chandle->req_mutex);
			
 
				+					STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req->posted_mutex));
			
 
				+					while (!(early_data_handle->req->posted))
			
 
				+					     STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req->posted_cond), &(early_data_handle->req->posted_mutex));
			
 
				+					STARPU_PTHREAD_MUTEX_UNLOCK(&(early_data_handle->req->posted_mutex));
			
 
				+
			
 
				+					STARPU_PTHREAD_MUTEX_LOCK(&early_data_handle->req_mutex);
			
 
				+					early_data_handle->req_ready = 1;
			
 
				+					STARPU_PTHREAD_COND_BROADCAST(&early_data_handle->req_cond);
			
 
				+					STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_handle->req_mutex);
			
 
				 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
 
				 				}
			
 
				 				/* Case : a matching receive has been found for the incoming data, we handle the correct allocation of the pointer associated to
			
@@ -1411,7 +1220,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 				{
			
 
				 					_STARPU_MPI_DEBUG(3, "A matching receive has been found for the incoming data with tag %d\n", recv_env->mpi_tag);
			
 
				 
			
 
				-					delete_app_req(found_req);
			
 
				+					_starpu_mpi_early_request_delete(found_req);
			
 
				 
			
 
				 					_starpu_mpi_handle_allocate_datatype(found_req->data_handle, &found_req->datatype, &found_req->user_datatype);
			
 
				 					if (found_req->user_datatype == 0)
			
@@ -1448,8 +1257,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(detached_requests), "List of detached requests not empty");
			
 
				 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(new_requests), "List of new requests not empty");
			
 
				 	STARPU_ASSERT_MSG(posted_requests == 0, "Number of posted request is not zero");
			
 
				-	STARPU_ASSERT_MSG(_starpu_mpi_app_req_hashmap_count == 0, "Number of receive requests left is not zero");
			
 
				-	STARPU_ASSERT_MSG(_starpu_mpi_copy_handle_hashmap_count == 0, "Number of copy requests left is not zero");
			
 
				+	_starpu_mpi_early_request_check_termination();
			
 
				+	_starpu_mpi_early_data_check_termination();
			
 
				 
			
 
				 	if (argc_argv->initialize_mpi)
			
 
				 	{
			
@@ -1459,27 +1268,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
			
 
				 
			
 
				-	{
			
 
				-		int n;
			
 
				-		struct _starpu_mpi_copy_handle_hashlist *hashlist;
			
 
				-
			
 
				-		for(n=0 ; n<worldsize; n++)
			
 
				-		{
			
 
				-			for(hashlist=_starpu_mpi_copy_handle_hashmap[n]; hashlist != NULL; hashlist=hashlist->hh.next)
			
 
				-			{
			
 
				-				_starpu_mpi_copy_handle_list_delete(hashlist->list);
			
 
				-			}
			
 
				-			struct _starpu_mpi_copy_handle_hashlist *current, *tmp;
			
 
				-			HASH_ITER(hh, _starpu_mpi_copy_handle_hashmap[n], current, tmp)
			
 
				-			{
			
 
				-				HASH_DEL(_starpu_mpi_copy_handle_hashmap[n], current);
			
 
				-				free(current);
			
 
				-			}
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	free(_starpu_mpi_app_req_hashmap);
			
 
				-	free(_starpu_mpi_copy_handle_hashmap);
			
 
				+	_starpu_mpi_early_data_free(worldsize);
			
 
				+	_starpu_mpi_early_request_free();
			
 
				 	free(argc_argv);
			
 
				 	free(recv_env);
			
 
				 
			
--- a/mpi/src/starpu_mpi_cache.c
+++ b/mpi/src/starpu_mpi_cache.c
@@ -30,6 +30,8 @@ struct _starpu_data_entry
 
				 	starpu_data_handle_t data;
			
 
				 };
			
 
				 
			
 
				+static starpu_pthread_mutex_t *_cache_sent_mutex;
			
 
				+static starpu_pthread_mutex_t *_cache_received_mutex;
			
 
				 static struct _starpu_data_entry **_cache_sent_data = NULL;
			
 
				 static struct _starpu_data_entry **_cache_received_data = NULL;
			
 
				 int _cache_enabled=1;
			
@@ -53,11 +55,19 @@ void _starpu_mpi_cache_init(MPI_Comm comm)
 
				 
			
 
				 	MPI_Comm_size(comm, &nb_nodes);
			
 
				 	_STARPU_MPI_DEBUG(2, "Initialising htable for cache\n");
			
 
				+
			
 
				 	_cache_sent_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
			
 
				-	for(i=0 ; i<nb_nodes ; i++) _cache_sent_data[i] = NULL;
			
 
				 	_cache_received_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
			
 
				-	for(i=0 ; i<nb_nodes ; i++) _cache_received_data[i] = NULL;
			
 
				-	_starpu_mpi_cache_stats_init(comm);
			
 
				+	_cache_sent_mutex = malloc(nb_nodes * sizeof(starpu_pthread_mutex_t));
			
 
				+	_cache_received_mutex = malloc(nb_nodes * sizeof(starpu_pthread_mutex_t));
			
 
				+
			
 
				+	for(i=0 ; i<nb_nodes ; i++)
			
 
				+	{
			
 
				+		_cache_sent_data[i] = NULL;
			
 
				+		_cache_received_data[i] = NULL;
			
 
				+		STARPU_PTHREAD_MUTEX_INIT(&_cache_sent_mutex[i], NULL);
			
 
				+		STARPU_PTHREAD_MUTEX_INIT(&_cache_received_mutex[i], NULL);
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 static
			
@@ -72,27 +82,44 @@ void _starpu_mpi_cache_empty_tables(int world_size)
 
				 	for(i=0 ; i<world_size ; i++)
			
 
				 	{
			
 
				 		struct _starpu_data_entry *entry, *tmp;
			
 
				+
			
 
				+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[i]);
			
 
				 		HASH_ITER(hh, _cache_sent_data[i], entry, tmp)
			
 
				 		{
			
 
				 			HASH_DEL(_cache_sent_data[i], entry);
			
 
				 			free(entry);
			
 
				 		}
			
 
				+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[i]);
			
 
				+
			
 
				+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[i]);
			
 
				 		HASH_ITER(hh, _cache_received_data[i], entry, tmp)
			
 
				 		{
			
 
				 			HASH_DEL(_cache_received_data[i], entry);
			
 
				 			_starpu_mpi_cache_stats_dec(-1, i, entry->data);
			
 
				 			free(entry);
			
 
				 		}
			
 
				+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[i]);
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				 void _starpu_mpi_cache_free(int world_size)
			
 
				 {
			
 
				+	int i;
			
 
				+
			
 
				 	if (_cache_enabled == 0) return;
			
 
				 
			
 
				 	_starpu_mpi_cache_empty_tables(world_size);
			
 
				 	free(_cache_sent_data);
			
 
				 	free(_cache_received_data);
			
 
				+
			
 
				+	for(i=0 ; i<world_size ; i++)
			
 
				+	{
			
 
				+		STARPU_PTHREAD_MUTEX_DESTROY(&_cache_sent_mutex[i]);
			
 
				+		STARPU_PTHREAD_MUTEX_DESTROY(&_cache_received_mutex[i]);
			
 
				+	}
			
 
				+	free(_cache_sent_mutex);
			
 
				+	free(_cache_received_mutex);
			
 
				+
			
 
				 	_starpu_mpi_cache_stats_free();
			
 
				 }
			
 
				 
			
@@ -104,6 +131,8 @@ void _starpu_mpi_cache_flush_sent(MPI_Comm comm, starpu_data_handle_t data)
 
				 	for(n=0 ; n<size ; n++)
			
 
				 	{
			
 
				 		struct _starpu_data_entry *already_sent;
			
 
				+
			
 
				+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[n]);
			
 
				 		HASH_FIND_PTR(_cache_sent_data[n], &data, already_sent);
			
 
				 		if (already_sent)
			
 
				 		{
			
@@ -111,6 +140,7 @@ void _starpu_mpi_cache_flush_sent(MPI_Comm comm, starpu_data_handle_t data)
 
				 			HASH_DEL(_cache_sent_data[n], already_sent);
			
 
				 			free(already_sent);
			
 
				 		}
			
 
				+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[n]);
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -119,6 +149,7 @@ void _starpu_mpi_cache_flush_recv(starpu_data_handle_t data, int me)
 
				 	int mpi_rank = starpu_data_get_rank(data);
			
 
				 	struct _starpu_data_entry *already_received;
			
 
				 
			
 
				+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[mpi_rank]);
			
 
				 	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
			
 
				 	if (already_received)
			
 
				 	{
			
@@ -131,6 +162,7 @@ void _starpu_mpi_cache_flush_recv(starpu_data_handle_t data, int me)
 
				 		free(already_received);
			
 
				 		starpu_data_invalidate_submit(data);
			
 
				 	}
			
 
				+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[mpi_rank]);
			
 
				 }
			
 
				 
			
 
				 void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
			
@@ -146,6 +178,8 @@ void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
 
				 	for(i=0 ; i<nb_nodes ; i++)
			
 
				 	{
			
 
				 		struct _starpu_data_entry *entry, *tmp;
			
 
				+
			
 
				+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[i]);
			
 
				 		HASH_ITER(hh, _cache_sent_data[i], entry, tmp)
			
 
				 		{
			
 
				 			mpi_rank = starpu_data_get_rank(entry->data);
			
@@ -154,6 +188,9 @@ void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
 
				 			HASH_DEL(_cache_sent_data[i], entry);
			
 
				 			free(entry);
			
 
				 		}
			
 
				+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[i]);
			
 
				+
			
 
				+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[i]);
			
 
				 		HASH_ITER(hh, _cache_received_data[i], entry, tmp)
			
 
				 		{
			
 
				 			mpi_rank = starpu_data_get_rank(entry->data);
			
@@ -163,6 +200,7 @@ void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
 
				 			_starpu_mpi_cache_stats_dec(my_rank, i, entry->data);
			
 
				 			free(entry);
			
 
				 		}
			
 
				+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[i]);
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -180,6 +218,7 @@ void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 
				 
			
 
				 	for(i=0 ; i<nb_nodes ; i++)
			
 
				 	{
			
 
				+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[i]);
			
 
				 		HASH_FIND_PTR(_cache_sent_data[i], &data_handle, avail);
			
 
				 		if (avail)
			
 
				 		{
			
@@ -187,6 +226,9 @@ void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 
				 			HASH_DEL(_cache_sent_data[i], avail);
			
 
				 			free(avail);
			
 
				 		}
			
 
				+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[i]);
			
 
				+
			
 
				+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[i]);
			
 
				 		HASH_FIND_PTR(_cache_received_data[i], &data_handle, avail);
			
 
				 		if (avail)
			
 
				 		{
			
@@ -195,6 +237,7 @@ void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 
				 			_starpu_mpi_cache_stats_dec(my_rank, i, data_handle);
			
 
				 			free(avail);
			
 
				 		}
			
 
				+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[i]);
			
 
				 	}
			
 
				 
			
 
				 	if (mpi_rank != my_rank && mpi_rank != -1)
			
@@ -203,9 +246,11 @@ void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 
				 
			
 
				 void *_starpu_mpi_already_received(int src, starpu_data_handle_t data, int mpi_rank)
			
 
				 {
			
 
				+	struct _starpu_data_entry *already_received;
			
 
				+
			
 
				 	if (_cache_enabled == 0) return NULL;
			
 
				 
			
 
				-	struct _starpu_data_entry *already_received;
			
 
				+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[mpi_rank]);
			
 
				 	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
			
 
				 	if (already_received == NULL)
			
 
				 	{
			
@@ -218,14 +263,17 @@ void *_starpu_mpi_already_received(int src, starpu_data_handle_t data, int mpi_r
 
				 	{
			
 
				 		_STARPU_MPI_DEBUG(2, "Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
			
 
				 	}
			
 
				+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[mpi_rank]);
			
 
				 	return already_received;
			
 
				 }
			
 
				 
			
 
				 void *_starpu_mpi_already_sent(starpu_data_handle_t data, int dest)
			
 
				 {
			
 
				+	struct _starpu_data_entry *already_sent;
			
 
				+
			
 
				 	if (_cache_enabled == 0) return NULL;
			
 
				 
			
 
				-	struct _starpu_data_entry *already_sent;
			
 
				+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[dest]);
			
 
				 	HASH_FIND_PTR(_cache_sent_data[dest], &data, already_sent);
			
 
				 	if (already_sent == NULL)
			
 
				 	{
			
@@ -238,6 +286,7 @@ void *_starpu_mpi_already_sent(starpu_data_handle_t data, int dest)
 
				 	{
			
 
				 		_STARPU_MPI_DEBUG(2, "Do not send data %p to node %d as it has already been sent\n", data, dest);
			
 
				 	}
			
 
				+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[dest]);
			
 
				 	return already_sent;
			
 
				 }
			
 
				 
			
--- a/mpi/src/starpu_mpi_early_data.c
+++ b/mpi/src/starpu_mpi_early_data.c
@@ -0,0 +1,168 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010-2014  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <stdlib.h>
			
 
				+#include <starpu_mpi.h>
			
 
				+#include <starpu_mpi_early_data.h>
			
 
				+#include <starpu_mpi_private.h>
			
 
				+#include <common/uthash.h>
			
 
				+
			
 
				+struct _starpu_mpi_early_data_handle_hashlist
			
 
				+{
			
 
				+	struct _starpu_mpi_early_data_handle_list *list;
			
 
				+	UT_hash_handle hh;
			
 
				+	int mpi_tag;
			
 
				+};
			
 
				+
			
 
				+/** stores data which have been received by MPI but have not been requested by the application */
			
 
				+static struct _starpu_mpi_early_data_handle_hashlist **_starpu_mpi_early_data_handle_hashmap = NULL;
			
 
				+static int _starpu_mpi_early_data_handle_hashmap_count = 0;
			
 
				+
			
 
				+void _starpu_mpi_early_data_init(int world_size)
			
 
				+{
			
 
				+	int k;
			
 
				+
			
 
				+	_starpu_mpi_early_data_handle_hashmap = malloc(world_size * sizeof(struct _starpu_mpi_early_data_handle_hash_list *));
			
 
				+	for(k=0 ; k<world_size ; k++) _starpu_mpi_early_data_handle_hashmap[k] = NULL;
			
 
				+}
			
 
				+
			
 
				+void _starpu_mpi_early_data_check_termination()
			
 
				+{
			
 
				+	STARPU_ASSERT_MSG(_starpu_mpi_early_data_handle_hashmap_count == 0, "Number of copy requests left is not zero");
			
 
				+}
			
 
				+
			
 
				+void _starpu_mpi_early_data_free(int world_size)
			
 
				+{
			
 
				+	int n;
			
 
				+	struct _starpu_mpi_early_data_handle_hashlist *hashlist;
			
 
				+
			
 
				+	for(n=0 ; n<world_size; n++)
			
 
				+	{
			
 
				+		for(hashlist=_starpu_mpi_early_data_handle_hashmap[n]; hashlist != NULL; hashlist=hashlist->hh.next)
			
 
				+		{
			
 
				+			_starpu_mpi_early_data_handle_list_delete(hashlist->list);
			
 
				+		}
			
 
				+		struct _starpu_mpi_early_data_handle_hashlist *current, *tmp;
			
 
				+		HASH_ITER(hh, _starpu_mpi_early_data_handle_hashmap[n], current, tmp)
			
 
				+		{
			
 
				+			HASH_DEL(_starpu_mpi_early_data_handle_hashmap[n], current);
			
 
				+			free(current);
			
 
				+		}
			
 
				+	}
			
 
				+	free(_starpu_mpi_early_data_handle_hashmap);
			
 
				+}
			
 
				+
			
 
				+#ifdef STARPU_VERBOSE
			
 
				+static void _starpu_mpi_early_data_handle_display_hash(int source, int tag)
			
 
				+{
			
 
				+	struct _starpu_mpi_early_data_handle_hashlist *hashlist;
			
 
				+	HASH_FIND_INT(_starpu_mpi_early_data_handle_hashmap[source], &tag, hashlist);
			
 
				+
			
 
				+	if (hashlist == NULL)
			
 
				+	{
			
 
				+		_STARPU_MPI_DEBUG(60, "Hashlist for source %d and tag %d does not exist\n", source, tag);
			
 
				+	}
			
 
				+	else if (_starpu_mpi_early_data_handle_list_empty(hashlist->list))
			
 
				+	{
			
 
				+		_STARPU_MPI_DEBUG(60, "Hashlist for source %d and tag %d is empty\n", source, tag);
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		struct _starpu_mpi_early_data_handle *cur;
			
 
				+		for (cur = _starpu_mpi_early_data_handle_list_begin(hashlist->list) ;
			
 
				+		     cur != _starpu_mpi_early_data_handle_list_end(hashlist->list);
			
 
				+		     cur = _starpu_mpi_early_data_handle_list_next(cur))
			
 
				+		{
			
 
				+			_STARPU_MPI_DEBUG(60, "Element for source %d and tag %d: %p\n", source, tag, cur);
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+static
			
 
				+struct _starpu_mpi_early_data_handle *_starpu_mpi_early_data_pop(int mpi_tag, int source, int delete)
			
 
				+{
			
 
				+	struct _starpu_mpi_early_data_handle_hashlist *hashlist;
			
 
				+	struct _starpu_mpi_early_data_handle *early_data_handle;
			
 
				+
			
 
				+	_STARPU_MPI_DEBUG(60, "Looking for early_data_handle with tag %d in the hashmap[%d]\n", mpi_tag, source);
			
 
				+	HASH_FIND_INT(_starpu_mpi_early_data_handle_hashmap[source], &mpi_tag, hashlist);
			
 
				+	if (hashlist == NULL)
			
 
				+	{
			
 
				+		early_data_handle = NULL;
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		if (_starpu_mpi_early_data_handle_list_empty(hashlist->list))
			
 
				+		{
			
 
				+			early_data_handle = NULL;
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			if (delete == 1)
			
 
				+			{
			
 
				+				early_data_handle = _starpu_mpi_early_data_handle_list_pop_front(hashlist->list);
			
 
				+			}
			
 
				+			else
			
 
				+			{
			
 
				+				early_data_handle = _starpu_mpi_early_data_handle_list_front(hashlist->list);
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+	_STARPU_MPI_DEBUG(60, "Found early_data_handle %p with tag %d in the hashmap[%d]\n", early_data_handle, mpi_tag, source);
			
 
				+	return early_data_handle;
			
 
				+}
			
 
				+
			
 
				+struct _starpu_mpi_early_data_handle *_starpu_mpi_early_data_find(int mpi_tag, int source)
			
 
				+{
			
 
				+	return _starpu_mpi_early_data_pop(mpi_tag, source, 0);
			
 
				+}
			
 
				+
			
 
				+void _starpu_mpi_early_data_add(struct _starpu_mpi_early_data_handle *early_data_handle)
			
 
				+{
			
 
				+	_STARPU_MPI_DEBUG(60, "Trying to add early_data_handle %p with tag %d in the hashmap[%d]\n", early_data_handle, early_data_handle->mpi_tag, early_data_handle->source);
			
 
				+
			
 
				+	struct _starpu_mpi_early_data_handle_hashlist *hashlist;
			
 
				+	HASH_FIND_INT(_starpu_mpi_early_data_handle_hashmap[early_data_handle->source], &early_data_handle->mpi_tag, hashlist);
			
 
				+	if (hashlist == NULL)
			
 
				+	{
			
 
				+		hashlist = malloc(sizeof(struct _starpu_mpi_early_data_handle_hashlist));
			
 
				+		hashlist->list = _starpu_mpi_early_data_handle_list_new();
			
 
				+		hashlist->mpi_tag = early_data_handle->mpi_tag;
			
 
				+		HASH_ADD_INT(_starpu_mpi_early_data_handle_hashmap[early_data_handle->source], mpi_tag, hashlist);
			
 
				+	}
			
 
				+	_starpu_mpi_early_data_handle_list_push_back(hashlist->list, early_data_handle);
			
 
				+	_starpu_mpi_early_data_handle_hashmap_count ++;
			
 
				+#ifdef STARPU_VERBOSE
			
 
				+	_starpu_mpi_early_data_handle_display_hash(early_data_handle->source, early_data_handle->mpi_tag);
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				+void _starpu_mpi_early_data_delete(struct _starpu_mpi_early_data_handle *early_data_handle)
			
 
				+{
			
 
				+	_STARPU_MPI_DEBUG(60, "Trying to delete early_data_handle %p with tag %d in the hashmap[%d]\n", early_data_handle, early_data_handle->mpi_tag, early_data_handle->source);
			
 
				+	struct _starpu_mpi_early_data_handle *found = _starpu_mpi_early_data_pop(early_data_handle->mpi_tag, early_data_handle->source, 1);
			
 
				+
			
 
				+	STARPU_ASSERT_MSG(found == early_data_handle,
			
 
				+			  "[_starpu_mpi_early_data_delete][error] early_data_handle %p with tag %d is NOT in the hashmap[%d]\n", early_data_handle, early_data_handle->mpi_tag, early_data_handle->source);
			
 
				+
			
 
				+	_starpu_mpi_early_data_handle_hashmap_count --;
			
 
				+#ifdef STARPU_VERBOSE
			
 
				+	_starpu_mpi_early_data_handle_display_hash(early_data_handle->source, early_data_handle->mpi_tag);
			
 
				+#endif
			
 
				+}
			
 
				+
			
--- a/mpi/src/starpu_mpi_early_data.h
+++ b/mpi/src/starpu_mpi_early_data.h
@@ -0,0 +1,55 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010-2014  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __STARPU_MPI_EARLY_DATA_H__
			
 
				+#define __STARPU_MPI_EARLY_DATA_H__
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <stdlib.h>
			
 
				+#include <mpi.h>
			
 
				+#include <common/config.h>
			
 
				+#include <common/list.h>
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+extern "C" {
			
 
				+#endif
			
 
				+
			
 
				+LIST_TYPE(_starpu_mpi_early_data_handle,
			
 
				+	  starpu_data_handle_t handle;
			
 
				+	  struct _starpu_mpi_envelope *env;
			
 
				+	  struct _starpu_mpi_req *req;
			
 
				+	  void *buffer;
			
 
				+	  int mpi_tag;
			
 
				+	  int source;
			
 
				+	  int req_ready;
			
 
				+	  starpu_pthread_mutex_t req_mutex;
			
 
				+	  starpu_pthread_cond_t req_cond;
			
 
				+);
			
 
				+
			
 
				+void _starpu_mpi_early_data_init(int world_size);
			
 
				+void _starpu_mpi_early_data_check_termination();
			
 
				+void _starpu_mpi_early_data_free(int world_size);
			
 
				+
			
 
				+struct _starpu_mpi_early_data_handle *_starpu_mpi_early_data_find(int mpi_tag, int source);
			
 
				+void _starpu_mpi_early_data_add(struct _starpu_mpi_early_data_handle *early_data_handle);
			
 
				+void _starpu_mpi_early_data_delete(struct _starpu_mpi_early_data_handle *early_data_handle);
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+#endif /* __STARPU_MPI_EARLY_DATA_H__ */
			
--- a/mpi/src/starpu_mpi_early_request.c
+++ b/mpi/src/starpu_mpi_early_request.c
@@ -0,0 +1,104 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010-2014  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <stdlib.h>
			
 
				+#include <starpu_mpi.h>
			
 
				+#include <starpu_mpi_private.h>
			
 
				+#include <starpu_mpi_early_request.h>
			
 
				+#include <common/uthash.h>
			
 
				+
			
 
				+/** stores application requests for which data have not been received yet */
			
 
				+static struct _starpu_mpi_req **_starpu_mpi_app_req_hashmap = NULL;
			
 
				+static int _starpu_mpi_app_req_hashmap_count = 0;
			
 
				+
			
 
				+void _starpu_mpi_early_request_init(int world_size)
			
 
				+{
			
 
				+	int k;
			
 
				+
			
 
				+	_starpu_mpi_app_req_hashmap = malloc(world_size * sizeof(struct _starpu_mpi_req *));
			
 
				+	for(k=0 ; k<world_size ; k++) _starpu_mpi_app_req_hashmap[k] = NULL;
			
 
				+}
			
 
				+
			
 
				+void _starpu_mpi_early_request_free()
			
 
				+{
			
 
				+	free(_starpu_mpi_app_req_hashmap);
			
 
				+}
			
 
				+
			
 
				+int _starpu_mpi_early_request_count()
			
 
				+{
			
 
				+	return _starpu_mpi_app_req_hashmap_count;
			
 
				+}
			
 
				+
			
 
				+void _starpu_mpi_early_request_check_termination()
			
 
				+{
			
 
				+	STARPU_ASSERT_MSG(_starpu_mpi_early_request_count() == 0, "Number of receive requests left is not zero");
			
 
				+}
			
 
				+
			
 
				+struct _starpu_mpi_req* _starpu_mpi_early_request_find(int mpi_tag, int source)
			
 
				+{
			
 
				+	struct _starpu_mpi_req* req;
			
 
				+
			
 
				+	HASH_FIND_INT(_starpu_mpi_app_req_hashmap[source], &mpi_tag, req);
			
 
				+
			
 
				+	return req;
			
 
				+}
			
 
				+
			
 
				+void _starpu_mpi_early_request_add(struct _starpu_mpi_req *req)
			
 
				+{
			
 
				+	struct _starpu_mpi_req *test_req;
			
 
				+
			
 
				+	test_req = _starpu_mpi_early_request_find(req->mpi_tag, req->srcdst);
			
 
				+
			
 
				+	if (test_req == NULL)
			
 
				+	{
			
 
				+		HASH_ADD_INT(_starpu_mpi_app_req_hashmap[req->srcdst], mpi_tag, req);
			
 
				+		_starpu_mpi_app_req_hashmap_count ++;
			
 
				+		_STARPU_MPI_DEBUG(3, "Adding request %p with tag %d in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		_STARPU_MPI_DEBUG(3, "[Error] request %p with tag %d already in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
			
 
				+		int seq_const = starpu_data_get_sequential_consistency_flag(req->data_handle);
			
 
				+		if (seq_const &&  req->sequential_consistency)
			
 
				+		{
			
 
				+			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap[%d], while another request %p with the same tag is already in it. \n Sequential consistency is activated : this is not supported by StarPU.", req, req->mpi_tag, req->srcdst, test_req);
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap[%d], while another request %p with the same tag is already in it. \n Sequential consistency isn't activated for this handle : you should want to add dependencies between requests for which the sequential consistency is deactivated.", req, req->mpi_tag, req->srcdst, test_req);
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void _starpu_mpi_early_request_delete(struct _starpu_mpi_req *req)
			
 
				+{
			
 
				+	struct _starpu_mpi_req *test_req;
			
 
				+
			
 
				+	test_req = _starpu_mpi_early_request_find(req->mpi_tag, req->srcdst);
			
 
				+
			
 
				+	if (test_req != NULL)
			
 
				+	{
			
 
				+		HASH_DEL(_starpu_mpi_app_req_hashmap[req->srcdst], req);
			
 
				+		_starpu_mpi_app_req_hashmap_count --;
			
 
				+		_STARPU_MPI_DEBUG(3, "Deleting application request %p with tag %d from the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		_STARPU_MPI_DEBUG(3, "[Warning] request %p with tag %d is NOT in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
			
 
				+	}
			
 
				+}
			
 
				+
			
--- a/mpi/src/starpu_mpi_early_request.h
+++ b/mpi/src/starpu_mpi_early_request.h
@@ -0,0 +1,44 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010-2014  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __STARPU_MPI_EARLY_REQUEST_H__
			
 
				+#define __STARPU_MPI_EARLY_REQUEST_H__
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <stdlib.h>
			
 
				+#include <mpi.h>
			
 
				+#include <common/config.h>
			
 
				+#include <common/list.h>
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+extern "C" {
			
 
				+#endif
			
 
				+
			
 
				+void _starpu_mpi_early_request_init(int world_size);
			
 
				+void _starpu_mpi_early_request_free();
			
 
				+int _starpu_mpi_early_request_count();
			
 
				+void _starpu_mpi_early_request_check_termination();
			
 
				+
			
 
				+void _starpu_mpi_early_request_add(struct _starpu_mpi_req *req);
			
 
				+struct _starpu_mpi_req* _starpu_mpi_early_request_find(int mpi_tag, int source);
			
 
				+void _starpu_mpi_early_request_delete(struct _starpu_mpi_req *req);
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+#endif /* __STARPU_MPI_EARLY_REQUEST_H__ */
			
--- a/mpi/tests/mpi_earlyrecv.c
+++ b/mpi/tests/mpi_earlyrecv.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -21,9 +21,9 @@
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				-	int ret, rank, size, i, nb_requests;
			
 
				-	starpu_data_handle_t tab_handle[3];
			
 
				-	starpu_mpi_req request[3];
			
 
				+	int ret, rank, size, i;
			
 
				+	starpu_data_handle_t tab_handle[4];
			
 
				+	starpu_mpi_req request[2] = {NULL, NULL};
			
 
				 
			
 
				 	MPI_Init(NULL, NULL);
			
 
				 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
@@ -43,11 +43,14 @@ int main(int argc, char **argv)
 
				 	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				-	for(i=0 ; i<3 ; i++)
			
 
				+	for(i=0 ; i<4 ; i++)
			
 
				 	{
			
 
				-		starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&rank, sizeof(int));
			
 
				-		starpu_data_set_tag(tab_handle[i], i);
			
 
				-		request[i] = NULL;
			
 
				+		if (i<3 || rank%2)
			
 
				+		{
			
 
				+			// all data are registered on all nodes, bu the 4th data which is not registered on the receiving node
			
 
				+			starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&rank, sizeof(int));
			
 
				+			starpu_mpi_data_register(tab_handle[i], i, rank);
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
			
@@ -56,23 +59,30 @@ int main(int argc, char **argv)
 
				 
			
 
				 	if (rank%2)
			
 
				 	{
			
 
				+		// this data will be received as an early registered data
			
 
				 		starpu_mpi_isend(tab_handle[0], &request[0], other_rank, 0, MPI_COMM_WORLD);
			
 
				+		// this data will be received as an early UNregistered data
			
 
				+		starpu_mpi_isend(tab_handle[3], &request[1], other_rank, 3, MPI_COMM_WORLD);
			
 
				+
			
 
				+		starpu_mpi_send(tab_handle[1], other_rank, 1, MPI_COMM_WORLD);
			
 
				 		starpu_mpi_recv(tab_handle[2], other_rank, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
			
 
				-		starpu_mpi_isend(tab_handle[1], &request[1], other_rank, 1, MPI_COMM_WORLD);
			
 
				-		nb_requests = 2;
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				+		starpu_mpi_recv(tab_handle[1], other_rank, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
			
 
				+		starpu_mpi_send(tab_handle[2], other_rank, 2, MPI_COMM_WORLD);
			
 
				+
			
 
				+		// we register the data
			
 
				+		starpu_variable_data_register(&tab_handle[3], STARPU_MAIN_RAM, (uintptr_t)&rank, sizeof(int));
			
 
				+		starpu_mpi_data_register(tab_handle[3], 3, rank);
			
 
				+		starpu_mpi_irecv(tab_handle[3], &request[1], other_rank, 3, MPI_COMM_WORLD);
			
 
				 		starpu_mpi_irecv(tab_handle[0], &request[0], other_rank, 0, MPI_COMM_WORLD);
			
 
				-		starpu_mpi_irecv(tab_handle[1], &request[1], other_rank, 1, MPI_COMM_WORLD);
			
 
				-		starpu_mpi_isend(tab_handle[2], &request[2], other_rank, 2, MPI_COMM_WORLD);
			
 
				-		nb_requests = 3;
			
 
				 	}
			
 
				 
			
 
				 	int finished=0;
			
 
				 	while (!finished)
			
 
				 	{
			
 
				-		for(i=0 ; i<nb_requests ; i++)
			
 
				+		for(i=0 ; i<2 ; i++)
			
 
				 		{
			
 
				 			if (request[i])
			
 
				 			{
			
@@ -83,11 +93,10 @@ int main(int argc, char **argv)
 
				 					FPRINTF_MPI("request[%d] = %d %p\n", i, flag, request[i]);
			
 
				 			}
			
 
				 		}
			
 
				-		finished = request[0] == NULL;
			
 
				-		for(i=1 ; i<nb_requests ; i++) finished = finished && request[i] == NULL;
			
 
				+		finished = request[0] == NULL && request[1] == NULL;
			
 
				 	}
			
 
				 
			
 
				-	for(i=0 ; i<3 ; i++)
			
 
				+	for(i=0 ; i<4 ; i++)
			
 
				 		starpu_data_unregister(tab_handle[i]);
			
 
				 
			
 
				 	starpu_mpi_shutdown();
			
--- a/mpi/tests/mpi_earlyrecv2.c
+++ b/mpi/tests/mpi_earlyrecv2.c
@@ -131,7 +131,7 @@ int exchange_variable(int rank, int detached)
 
				 	{
			
 
				 		value[i]=i*rank;
			
 
				 		starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&value[i], sizeof(int));
			
 
				-		starpu_data_set_tag(tab_handle[i], i);
			
 
				+		starpu_mpi_data_register(tab_handle[i], i, rank);
			
 
				 	}
			
 
				 	ret = exchange(rank, tab_handle, check_variable, detached);
			
 
				 	for(i=0 ; i<NB ; i++)
			
@@ -154,7 +154,7 @@ int exchange_void(int rank, int detached)
 
				 	for(i=0 ; i<NB ; i++)
			
 
				 	{
			
 
				 		starpu_void_data_register(&tab_handle[i]);
			
 
				-		starpu_data_set_tag(tab_handle[i], i);
			
 
				+		starpu_mpi_data_register(tab_handle[i], i, rank);
			
 
				 	}
			
 
				 	ret = exchange(rank, tab_handle, check_void, detached);
			
 
				 	for(i=0 ; i<NB ; i++)
			
@@ -191,7 +191,7 @@ int exchange_complex(int rank, int detached)
 
				 		real[i] = (i*rank)+12;
			
 
				 		imaginary[i] = (i*rank)+45;
			
 
				 		starpu_complex_data_register(&handle[i], STARPU_MAIN_RAM, &real[i], &imaginary[i], 1);
			
 
				-		starpu_data_set_tag(handle[i], i);
			
 
				+		starpu_mpi_data_register(handle[i], i, rank);
			
 
				 	}
			
 
				 	ret = exchange(rank, handle, check_complex, detached);
			
 
				 	for(i=0 ; i<NB ; i++)
			
--- a/sc_hypervisor/examples/cholesky/cholesky_grain_tag.c
+++ b/sc_hypervisor/examples/cholesky/cholesky_grain_tag.c
@@ -406,7 +406,7 @@ int main(int argc, char **argv)
 
				 	float *test_mat = malloc(size*size*sizeof(float));
			
 
				 	STARPU_ASSERT(test_mat);
			
 
				 
			
 
				-	SSYRK("L", "N", size, size, 1.0f,
			
 
				+	STARPU_SSYRK("L", "N", size, size, 1.0f,
			
 
				 				mat, size, 0.0f, test_mat, size);
			
 
				 
			
 
				 	FPRINTF(stderr, "comparing results ...\n");
			
--- a/sc_hypervisor/examples/cholesky/cholesky_implicit.c
+++ b/sc_hypervisor/examples/cholesky/cholesky_implicit.c
@@ -290,7 +290,7 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 
				 		float *test_mat = malloc(size*size*sizeof(float));
			
 
				 		STARPU_ASSERT(test_mat);
			
 
				 
			
 
				-		SSYRK("L", "N", size, size, 1.0f,
			
 
				+		STARPU_SSYRK("L", "N", size, size, 1.0f,
			
 
				 					mat, size, 0.0f, test_mat, size);
			
 
				 
			
 
				 		FPRINTF(stderr, "comparing results ...\n");
			
--- a/sc_hypervisor/examples/cholesky/cholesky_kernels.c
+++ b/sc_hypervisor/examples/cholesky/cholesky_kernels.c
@@ -52,7 +52,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 
				 		if (worker_size == 1)
			
 
				 		{
			
 
				 			/* Sequential CPU kernel */
			
 
				-			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
			
 
				+			STARPU_SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
			
 
				 				right, ld12, 1.0f, center, ld22);
			
 
				 		}
			
 
				 		else
			
@@ -66,7 +66,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 
				 			float *new_left = &left[block_size*rank];
			
 
				 			float *new_center = &center[block_size*rank];
			
 
				 
			
 
				-			SGEMM("N", "T", dy, new_dx, dz, -1.0f, new_left, ld21, 
			
 
				+			STARPU_SGEMM("N", "T", dy, new_dx, dz, -1.0f, new_left, ld21, 
			
 
				 				right, ld12, 1.0f, new_center, ld22);
			
 
				 		}
			
 
				 	}
			
@@ -117,7 +117,7 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 
				 	switch (s)
			
 
				 	{
			
 
				 		case 0:
			
 
				-			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
			
 
				+			STARPU_STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
			
 
				 			break;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		case 1:
			
@@ -177,9 +177,9 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 
				 
			
 
				 				STARPU_ASSERT(lambda11 != 0.0f);
			
 
				 		
			
 
				-				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
			
 
				+				STARPU_SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
			
 
				 		
			
 
				-				SSYR("L", nx - z - 1, -1.0f, 
			
 
				+				STARPU_SSYR("L", nx - z - 1, -1.0f, 
			
 
				 							&sub11[(z+1)+z*ld], 1,
			
 
				 							&sub11[(z+1)+(z+1)*ld], ld);
			
 
				 			}
			
--- a/sc_hypervisor/examples/cholesky/cholesky_tag.c
+++ b/sc_hypervisor/examples/cholesky/cholesky_tag.c
@@ -391,7 +391,7 @@ int main(int argc, char **argv)
 
				 	float *test_mat = malloc(size*size*sizeof(float));
			
 
				 	STARPU_ASSERT(test_mat);
			
 
				 
			
 
				-	SSYRK("L", "N", size, size, 1.0f,
			
 
				+	STARPU_SSYRK("L", "N", size, size, 1.0f,
			
 
				 				mat, size, 0.0f, test_mat, size);
			
 
				 
			
 
				 	FPRINTF(stderr, "comparing results ...\n");
			
--- a/src/common/fxt.h
+++ b/src/common/fxt.h
@@ -107,8 +107,6 @@
 
				 
			
 
				 #define _STARPU_FUT_EVENT	0x513c
			
 
				 
			
 
				-#define _STARPU_FUT_WORKER_SCHEDULING_START	0x513e
			
 
				-
			
 
				 #define _STARPU_FUT_LOCKING_MUTEX	0x5140	
			
 
				 #define _STARPU_FUT_MUTEX_LOCKED	0x5141	
			
 
				 
			
@@ -156,6 +154,11 @@
 
				 #define _STARPU_FUT_BARRIER_WAIT_BEGIN		0x5162
			
 
				 #define _STARPU_FUT_BARRIER_WAIT_END		0x5163
			
 
				 
			
 
				+#define _STARPU_FUT_WORKER_SCHEDULING_START	0x5164
			
 
				+#define _STARPU_FUT_WORKER_SCHEDULING_END	0x5165
			
 
				+#define _STARPU_FUT_WORKER_SCHEDULING_PUSH	0x5166
			
 
				+#define _STARPU_FUT_WORKER_SCHEDULING_POP	0x5167
			
 
				+
			
 
				 #ifdef STARPU_USE_FXT
			
 
				 #include <fxt/fxt.h>
			
 
				 #include <fxt/fut.h>
			
@@ -487,6 +490,15 @@ do {										\
 
				 #define _STARPU_TRACE_WORKER_SCHEDULING_START	\
			
 
				 	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SCHEDULING_START, _starpu_gettid());
			
 
				 
			
 
				+#define _STARPU_TRACE_WORKER_SCHEDULING_END	\
			
 
				+	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SCHEDULING_END, _starpu_gettid());
			
 
				+
			
 
				+#define _STARPU_TRACE_WORKER_SCHEDULING_PUSH	\
			
 
				+	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SCHEDULING_PUSH, _starpu_gettid());
			
 
				+
			
 
				+#define _STARPU_TRACE_WORKER_SCHEDULING_POP	\
			
 
				+	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SCHEDULING_POP, _starpu_gettid());
			
 
				+
			
 
				 #define _STARPU_TRACE_WORKER_SLEEP_START	\
			
 
				 	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SLEEP_START, _starpu_gettid());
			
 
				 
			
@@ -760,6 +772,9 @@ do {										\
 
				 #define _STARPU_TRACE_WORKER_DEINIT_START	do {} while(0)
			
 
				 #define _STARPU_TRACE_WORKER_DEINIT_END(a)	do {} while(0)
			
 
				 #define _STARPU_TRACE_WORKER_SCHEDULING_START		do {} while(0)
			
 
				+#define _STARPU_TRACE_WORKER_SCHEDULING_END		do {} while(0)
			
 
				+#define _STARPU_TRACE_WORKER_SCHEDULING_PUSH		do {} while(0)
			
 
				+#define _STARPU_TRACE_WORKER_SCHEDULING_POP		do {} while(0)
			
 
				 #define _STARPU_TRACE_WORKER_SLEEP_START		do {} while(0)
			
 
				 #define _STARPU_TRACE_WORKER_SLEEP_END		do {} while(0)
			
 
				 #define _STARPU_TRACE_USER_DEFINED_START		do {} while(0)
			
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -82,10 +82,10 @@
 
				 #  define _STARPU_DEBUG(fmt, ...) do { } while (0)
			
 
				 #endif
			
 
				 
			
 
				-#ifdef STARPU_VERBOSE0
			
 
				-#  define _STARPU_LOG_IN()             do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s] -->\n", pthread_self(), __starpu_func__ ); }} while(0)
			
 
				-#  define _STARPU_LOG_OUT()            do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s] <--\n", pthread_self(), __starpu_func__ ); }} while(0)
			
 
				-#  define _STARPU_LOG_OUT_TAG(outtag)  do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s] <-- (%s)\n", pthread_self(), __starpu_func__, outtag); }} while(0)
			
 
				+#ifdef STARPU_EXTRA_VERBOSE
			
 
				+#  define _STARPU_LOG_IN()             do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] -->\n", pthread_self(), __starpu_func__,__FILE__,  __LINE__); }} while(0)
			
 
				+#  define _STARPU_LOG_OUT()            do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <--\n", pthread_self(), __starpu_func__, __FILE__,  __LINE__); }} while(0)
			
 
				+#  define _STARPU_LOG_OUT_TAG(outtag)  do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <-- (%s)\n", pthread_self(), __starpu_func__, __FILE__, __LINE__, outtag); }} while(0)
			
 
				 #else
			
 
				 #  define _STARPU_LOG_IN()
			
 
				 #  define _STARPU_LOG_OUT()
			
--- a/src/core/sched_ctx.c
+++ b/src/core/sched_ctx.c
@@ -1157,6 +1157,22 @@ struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsig
 
				 	return sched_ctx->workers;
			
 
				 }
			
 
				 
			
 
				+void starpu_sched_ctx_display_workers(unsigned sched_ctx_id, FILE *f)
			
 
				+{
			
 
				+	int *workerids = NULL;
			
 
				+	unsigned nworkers;
			
 
				+	unsigned i;
			
 
				+
			
 
				+	nworkers = starpu_sched_ctx_get_workers_list(sched_ctx_id, &workerids);
			
 
				+	fprintf(f, "[sched_ctx %d]: %d worker%s\n", sched_ctx_id, nworkers, nworkers>1?"s":"");
			
 
				+	for (i = 0; i < nworkers; i++)
			
 
				+	{
			
 
				+		char name[256];
			
 
				+		starpu_worker_get_name(workerids[i], name, 256);
			
 
				+		fprintf(f, "\t\t%s\n", name);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
			
 
				 {
			
 
				 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -455,7 +455,14 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 
				 			starpu_pthread_rwlock_t *changing_ctx_mutex = _starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx->id);
			
 
				 			STARPU_PTHREAD_RWLOCK_RDLOCK(changing_ctx_mutex);
			
 
				 			nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);
			
 
				-			ret = nworkers == 0 ? -1 : sched_ctx->sched_policy->push_task(task);
			
 
				+			if (nworkers == 0)
			
 
				+				ret = -1;
			
 
				+			else
			
 
				+			{
			
 
				+				_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
			
 
				+				ret = sched_ctx->sched_policy->push_task(task);
			
 
				+				_STARPU_TRACE_WORKER_SCHEDULING_POP;
			
 
				+			}
			
 
				 			STARPU_PTHREAD_RWLOCK_UNLOCK(changing_ctx_mutex);
			
 
				 		}
			
 
				 
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -95,12 +95,17 @@ static uint32_t _starpu_worker_exists_and_can_execute(struct starpu_task *task,
 
				 						      enum starpu_worker_archtype arch)
			
 
				 {
			
 
				 	int i;
			
 
				-	int nworkers = starpu_worker_get_count();
			
 
				-
			
 
				 	_starpu_codelet_check_deprecated_fields(task->cl);
			
 
				+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
			
 
				+	struct starpu_worker_collection *workers = sched_ctx->workers;
			
 
				 
			
 
				-	for (i = 0; i < nworkers; i++)
			
 
				-	{
			
 
				+        struct starpu_sched_ctx_iterator it;
			
 
				+        if(workers->init_iterator)
			
 
				+                workers->init_iterator(workers, &it);
			
 
				+
			
 
				+        while(workers->has_next(workers, &it))
			
 
				+        {
			
 
				+                i = workers->get_next(workers, &it);
			
 
				 		if (starpu_worker_get_type(i) != arch)
			
 
				 			continue;
			
 
				 
			
@@ -141,7 +146,10 @@ static uint32_t _starpu_worker_exists_and_can_execute(struct starpu_task *task,
 
				 			if (!test_implementation)
			
 
				 				break;
			
 
				 
			
 
				-			if (task->cl->can_execute(i, task, impl))
			
 
				+			if (task->cl->can_execute)
			
 
				+				return task->cl->can_execute(i, task, impl);
			
 
				+
			
 
				+			if(test_implementation)
			
 
				 				return 1;
			
 
				 		}
			
 
				 	}
			
@@ -155,11 +163,18 @@ uint32_t _starpu_worker_exists(struct starpu_task *task)
 
				 {
			
 
				 	_starpu_codelet_check_deprecated_fields(task->cl);
			
 
				 
			
 
				-	if (!(task->cl->where & config.worker_mask))
			
 
				-		return 0;
			
 
				-
			
 
				-	if (!task->cl->can_execute)
			
 
				-		return 1;
			
 
				+	/* if the task belongs to the init context we can
			
 
				+	   check out all the worker mask of the machine
			
 
				+	   if not we should iterate on the workers of the ctx
			
 
				+	   and verify if it exists a worker able to exec the task */
			
 
				+	if(task->sched_ctx == 0)
			
 
				+	{
			
 
				+		if (!(task->cl->where & config.worker_mask))
			
 
				+			return 0;
			
 
				+		
			
 
				+		if (!task->cl->can_execute)
			
 
				+			return 1;
			
 
				+	}
			
 
				 
			
 
				 #if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
			
 
				 	if ((task->cl->where & STARPU_CPU) &&
			
@@ -186,6 +201,7 @@ uint32_t _starpu_worker_exists(struct starpu_task *task)
 
				 	    _starpu_worker_exists_and_can_execute(task, STARPU_SCC_WORKER))
			
 
				 		return 1;
			
 
				 #endif
			
 
				+
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
--- a/src/datawizard/malloc.c
+++ b/src/datawizard/malloc.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009-2010, 2012-2014  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -217,7 +217,7 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 
				 end:
			
 
				 	if (ret == 0)
			
 
				 	{
			
 
				-		STARPU_ASSERT(*A);
			
 
				+		STARPU_ASSERT_MSG(*A, "Failed to allocated memory of size %ld b\n", dim);
			
 
				 	}
			
 
				 
			
 
				 	return ret;
			
--- a/src/debug/traces/starpu_fxt.c
+++ b/src/debug/traces/starpu_fxt.c
@@ -275,6 +275,28 @@ static void worker_set_state(double time, const char *prefix, long unsigned int
 
				 #endif
			
 
				 }
			
 
				 
			
 
				+static void worker_push_state(double time, const char *prefix, long unsigned int workerid, const char *name)
			
 
				+{
			
 
				+#ifdef STARPU_HAVE_POTI
			
 
				+	char container[STARPU_POTI_STR_LEN];
			
 
				+	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
			
 
				+	poti_PushState(time, container, "S", name);
			
 
				+#else
			
 
				+	fprintf(out_paje_file, "11	%.9f	%st%lu	S	%s\n", time, prefix, workerid, name);
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				+static void worker_pop_state(double time, const char *prefix, long unsigned int workerid)
			
 
				+{
			
 
				+#ifdef STARPU_HAVE_POTI
			
 
				+	char container[STARPU_POTI_STR_LEN];
			
 
				+	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
			
 
				+	poti_PopState(time, container, "S");
			
 
				+#else
			
 
				+	fprintf(out_paje_file, "12	%.9f	%st%lu	S\n", time, prefix, workerid);
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				 static void mpicommthread_set_state(double time, const char *prefix, const char *name)
			
 
				 {
			
 
				 #ifdef STARPU_HAVE_POTI
			
@@ -790,6 +812,36 @@ static void handle_start_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 
				 		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "Sc");
			
 
				 }
			
 
				 
			
 
				+static void handle_end_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
			
 
				+{
			
 
				+	int worker;
			
 
				+	worker = find_worker_id(ev->param[0]);
			
 
				+	if (worker < 0) return;
			
 
				+
			
 
				+	if (out_paje_file)
			
 
				+		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "B");
			
 
				+}
			
 
				+
			
 
				+static void handle_push_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
			
 
				+{
			
 
				+	int worker;
			
 
				+	worker = find_worker_id(ev->param[0]);
			
 
				+	if (worker < 0) return;
			
 
				+
			
 
				+	if (out_paje_file)
			
 
				+		worker_push_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "Sc");
			
 
				+}
			
 
				+
			
 
				+static void handle_pop_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
			
 
				+{
			
 
				+	int worker;
			
 
				+	worker = find_worker_id(ev->param[0]);
			
 
				+	if (worker < 0) return;
			
 
				+
			
 
				+	if (out_paje_file)
			
 
				+		worker_pop_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0]);
			
 
				+}
			
 
				+
			
 
				 static void handle_start_sleep(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
			
 
				 {
			
 
				 	int worker;
			
@@ -1506,6 +1558,18 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 
				 				handle_start_scheduling(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				+			case _STARPU_FUT_WORKER_SCHEDULING_END:
			
 
				+				handle_end_scheduling(&ev, options);
			
 
				+				break;
			
 
				+
			
 
				+			case _STARPU_FUT_WORKER_SCHEDULING_PUSH:
			
 
				+				handle_push_scheduling(&ev, options);
			
 
				+				break;
			
 
				+
			
 
				+			case _STARPU_FUT_WORKER_SCHEDULING_POP:
			
 
				+				handle_pop_scheduling(&ev, options);
			
 
				+				break;
			
 
				+
			
 
				 			case _STARPU_FUT_WORKER_SLEEP_START:
			
 
				 				handle_start_sleep(&ev, options);
			
 
				 				break;
			
--- a/src/debug/traces/starpu_paje.c
+++ b/src/debug/traces/starpu_paje.c
@@ -87,14 +87,14 @@ void _starpu_fxt_write_paje_header(FILE *file)
 
				 	fprintf(file, "%%EndEventDef\n");
			
 
				 	fprintf(file, "%%EventDef	PajePushState	11\n");
			
 
				 	fprintf(file, "%%	Time	date\n");
			
 
				-	fprintf(file, "%%	Type	string\n");
			
 
				 	fprintf(file, "%%	Container	string\n");
			
 
				+	fprintf(file, "%%	Type	string\n");
			
 
				 	fprintf(file, "%%	Value	string\n");
			
 
				 	fprintf(file, "%%EndEventDef\n");
			
 
				 	fprintf(file, "%%EventDef	PajePopState	12\n");
			
 
				 	fprintf(file, "%%	Time	date\n");
			
 
				-	fprintf(file, "%%	Type	string\n");
			
 
				 	fprintf(file, "%%	Container	string\n");
			
 
				+	fprintf(file, "%%	Type	string\n");
			
 
				 	fprintf(file, "%%EndEventDef\n");
			
 
				 	fprintf(file, "%%EventDef	PajeSetVariable	13\n");
			
 
				 	fprintf(file, "%%	Time	date\n");
			
--- a/src/drivers/driver_common/driver_common.c
+++ b/src/drivers/driver_common/driver_common.c
@@ -220,7 +220,15 @@ static void _starpu_worker_set_status_scheduling(int workerid)
 
				 		_STARPU_TRACE_WORKER_SCHEDULING_START;
			
 
				 		_starpu_worker_set_status(workerid, STATUS_SCHEDULING);
			
 
				 	}
			
 
				+}
			
 
				 
			
 
				+static void _starpu_worker_set_status_scheduling_done(int workerid)
			
 
				+{
			
 
				+	if (_starpu_worker_get_status(workerid) == STATUS_SCHEDULING)
			
 
				+	{
			
 
				+		_STARPU_TRACE_WORKER_SCHEDULING_END;
			
 
				+		_starpu_worker_set_status(workerid, STATUS_UNKNOWN);
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 static void _starpu_worker_set_status_sleeping(int workerid)
			
@@ -355,6 +363,8 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 
				 
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
			
 
				 
			
 
				+	_starpu_worker_set_status_scheduling_done(workerid);
			
 
				+
			
 
				 	_starpu_worker_set_status_wakeup(workerid);
			
 
				 	args->spinning_backoff = BACKOFF_MIN;
			
 
				 
			
--- a/src/sched_policies/deque_modeling_policy_data_aware.c
+++ b/src/sched_policies/deque_modeling_policy_data_aware.c
@@ -547,10 +547,6 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
				 	{
			
 
				 		worker = workers->get_next(workers, &it);
			
 
				 
			
 
				-		if (worker >= nworkers)
			
 
				-			/* This is a just-added worker, discard it */
			
 
				-			continue;
			
 
				-
			
 
				 		struct _starpu_fifo_taskq *fifo = dt->queue_array[worker];
			
 
				 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				 		unsigned memory_node = starpu_worker_get_memory_node(worker);
			
--- a/tests/main/starpu_worker_exists.c
+++ b/tests/main/starpu_worker_exists.c
@@ -67,21 +67,31 @@ main(int argc, char **argv)
 
				 	task = starpu_task_create();
			
 
				 	task->cl = &cl;
			
 
				 	task->destroy = 0;
			
 
				+	task->sched_ctx = 0;
			
 
				 
			
 
				 	cl.can_execute = NULL;
			
 
				 	ret = _starpu_worker_exists(task);
			
 
				 	if (!ret)
			
 
				+	{
			
 
				+		FPRINTF(stderr, "failure with can_execute=NULL\n");
			
 
				 		return EXIT_FAILURE;
			
 
				+	}
			
 
				 
			
 
				 	cl.can_execute = can_always_execute;
			
 
				 	ret = _starpu_worker_exists(task);
			
 
				 	if (!ret)
			
 
				+	{
			
 
				+		FPRINTF(stderr, "failure with can_always_execute\n");
			
 
				 		return EXIT_FAILURE;
			
 
				+	}
			
 
				 
			
 
				 	cl.can_execute = can_never_execute;
			
 
				 	ret = _starpu_worker_exists(task);
			
 
				 	if (ret)
			
 
				+	{
			
 
				+		FPRINTF(stderr, "failure with can_never_execute\n");
			
 
				 		return EXIT_FAILURE;
			
 
				+	}
			
 
				 
			
 
				 	starpu_task_destroy(task);
			
 
				 	starpu_shutdown();
			
--- a/tests/perfmodels/regression_based.c
+++ b/tests/perfmodels/regression_based.c
@@ -80,6 +80,7 @@ static struct starpu_codelet nl_memset_cl =
 
				 {
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {memset_cuda, NULL},
			
 
				+	.cuda_flags = {STARPU_CUDA_ASYNC},
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	.opencl_funcs = {memset_opencl, NULL},