
Merge branch 'master' into mpi_redux-tree

Antoine JEGO, 4 years ago
Parent
Commit
bbb022d1ba

+ 16 - 5
configure.ac

@@ -2257,13 +2257,16 @@ if test x$enable_perf_debug = xyes; then
 	IS_SUPPORTED_FLAG(-fno-pie)
 fi
 
+IS_SUPPORTED_FLAG(-Wextra)
+IS_SUPPORTED_FLAG(-Wunused)
+IS_SUPPORTED_CFLAG(-Wundef)
+IS_SUPPORTED_CXXFLAG(-Wundef)
+IS_SUPPORTED_FLAG(-Wshadow)
+IS_SUPPORTED_CFLAG(-Wpointer-arith)
+IS_SUPPORTED_CXXFLAG(-Wpointer-arith)
+
 if test "x$STARPU_DEVEL" != x; then
 	AC_DEFINE(STARPU_DEVEL, [1], [enable developer warnings])
-	IS_SUPPORTED_FLAG(-Wextra)
-	IS_SUPPORTED_FLAG(-Wunused)
-	IS_SUPPORTED_CFLAG(-Wundef)
-	IS_SUPPORTED_CXXFLAG(-Wundef)
-	IS_SUPPORTED_FLAG(-Wshadow)
 	IS_SUPPORTED_CFLAG(-Werror=pointer-arith)
 	IS_SUPPORTED_CXXFLAG(-Werror=pointer-arith)
 	IS_SUPPORTED_FLAG(-fno-common)
@@ -2777,6 +2780,14 @@ AM_CONDITIONAL(STARPU_HAVE_CBLAS_H, test x$have_cblas_h = xyes)
 if test x$have_cblas_h = xyes; then
     AC_DEFINE(STARPU_HAVE_CBLAS_H, [1], [The blas library has blas.h])
 fi
+if test x$blas_lib != xnone; then
+    AC_DEFINE(STARPU_HAVE_BLAS, [1], [The blas library is available])
+    SAVED_LIBS="$LIBS"
+    LIBS="$LIBS -lblas"
+    AC_CHECK_FUNCS([cblas_sgemv])
+    LIBS="$SAVED_LIBS"
+fi
+AM_CONDITIONAL(STARPU_HAVE_CBLAS_SGEMV, test $HAVE_CBLAS_SGEMV = 1)
 
 AM_CONDITIONAL(STARPU_ATLAS_BLAS_LIB, test x$blas_lib = xatlas)
 AM_CONDITIONAL(STARPU_GOTO_BLAS_LIB, test x$blas_lib = xgoto)

+ 15 - 6
examples/Makefile.am

@@ -309,6 +309,12 @@ endif
 endif
 endif
 
+if STARPU_USE_CUDA
+STARPU_EXAMPLES +=				\
+	mult/sgemm 				\
+	mult/dgemm
+endif
+
 if !STARPU_NO_BLAS_LIB
 STARPU_EXAMPLES +=				\
 	mult/sgemm 				\
@@ -358,9 +364,11 @@ STARPU_EXAMPLES +=				\
 endif
 
 if STARPU_HAVE_CBLAS_H
+if STARPU_HAVE_CBLAS_SGEMV
 STARPU_EXAMPLES +=				\
 	spmv/dw_block_spmv
 endif
+endif
 
 if !STARPU_SIMGRID
 if STARPU_HAVE_F77
@@ -670,22 +678,23 @@ endif
 # Mult example #
 ################
 
-if !STARPU_NO_BLAS_LIB
-
 mult_sgemm_SOURCES = 				\
-	mult/sgemm.c				\
-	common/blas.c
+	mult/sgemm.c
 
 mult_sgemm_LDADD =				\
 	$(STARPU_BLAS_LDFLAGS)
 
 mult_dgemm_SOURCES = 				\
-	mult/dgemm.c				\
-	common/blas.c
+	mult/dgemm.c
 
 mult_dgemm_LDADD =				\
 	$(STARPU_BLAS_LDFLAGS)
 
+if !STARPU_NO_BLAS_LIB
+mult_sgemm_SOURCES += 				\
+	common/blas.c
+mult_dgemm_SOURCES += 				\
+	common/blas.c
 endif
 
 ####################

+ 6 - 6
examples/cg/cg.c

@@ -79,12 +79,12 @@ static int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsig
 static TYPE *A, *b, *x;
 static TYPE *r, *d, *q;
 
-static int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks)
+static int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nb)
 {
-	unsigned b;
+	unsigned block;
 
-	for (b = 0; b < nblocks; b++)
-		starpu_data_cpy(starpu_data_get_sub_data(dst, 1, b), starpu_data_get_sub_data(src, 1, b), 1, NULL, NULL);
+	for (block = 0; block < nb; block++)
+		starpu_data_cpy(starpu_data_get_sub_data(dst, 1, block), starpu_data_get_sub_data(src, 1, block), 1, NULL, NULL);
 	return 0;
 }
 
@@ -258,12 +258,12 @@ static void display_matrix(void)
 
 static void display_x_result(void)
 {
-	int j, i;
+	unsigned j, i;
 	starpu_data_handle_t sub;
 
 	FPRINTF(stderr, "Computed X vector:\n");
 
-	int block_size = n / nblocks;
+	unsigned block_size = n / nblocks;
 
 	for (j = 0; j < nblocks; j++)
 	{

+ 50 - 50
examples/cg/cg_kernels.c

@@ -31,7 +31,7 @@ static const TYPE gm1 = -1.0;
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 
-static int nblocks = 8;
+static unsigned nblocks = 8;
 
 #ifdef STARPU_QUICK_CHECK
 static int i_max = 5;
@@ -164,9 +164,9 @@ static void accumulate_vector_cuda(void *descr[], void *cl_arg)
 	(void)cl_arg;
 	TYPE *v_dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v_src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
-	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
 
-	cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(), n, &gp1, v_src, 1, v_dst, 1);
+	cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(), nx, &gp1, v_src, 1, v_dst, 1);
 	if (status != CUBLAS_STATUS_SUCCESS)
 		STARPU_CUBLAS_REPORT_ERROR(status);
 }
@@ -177,9 +177,9 @@ void accumulate_vector_cpu(void *descr[], void *cl_arg)
 	(void)cl_arg;
 	TYPE *v_dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v_src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
-	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
 
-	AXPY(n, (TYPE)1.0, v_src, 1, v_dst, 1);
+	AXPY(nx, (TYPE)1.0, v_src, 1, v_dst, 1);
 }
 
 static struct starpu_perfmodel accumulate_vector_model =
@@ -253,10 +253,10 @@ static void bzero_vector_cuda(void *descr[], void *cl_arg)
 {
 	(void)cl_arg;
 	TYPE *v = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
-	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
 	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
 
-	cudaMemsetAsync(v, 0, n * elemsize, starpu_cuda_get_local_stream());
+	cudaMemsetAsync(v, 0, nx * elemsize, starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -264,9 +264,9 @@ void bzero_vector_cpu(void *descr[], void *cl_arg)
 {
 	(void)cl_arg;
 	TYPE *v = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
-	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
 
-	memset(v, 0, n*sizeof(TYPE));
+	memset(v, 0, nx*sizeof(TYPE));
 }
 
 static struct starpu_perfmodel bzero_vector_model =
@@ -302,12 +302,12 @@ static void dot_kernel_cuda(void *descr[], void *cl_arg)
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[2]);
 
-	unsigned n = STARPU_VECTOR_GET_NX(descr[1]);
+	unsigned nx = STARPU_VECTOR_GET_NX(descr[1]);
 
 	cublasHandle_t handle = starpu_cublas_get_local_handle();
 	cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
 	cublasStatus_t status = cublasdot(handle,
-		n, v1, 1, v2, 1, dot);
+		nx, v1, 1, v2, 1, dot);
 	if (status != CUBLAS_STATUS_SUCCESS)
 		STARPU_CUBLAS_REPORT_ERROR(status);
 	cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
@@ -321,12 +321,12 @@ void dot_kernel_cpu(void *descr[], void *cl_arg)
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[2]);
 
-	unsigned n = STARPU_VECTOR_GET_NX(descr[1]);
+	unsigned nx = STARPU_VECTOR_GET_NX(descr[1]);
 
 	TYPE local_dot;
 	/* Note that we explicitely cast the result of the DOT kernel because
 	 * some BLAS library will return a double for sdot for instance. */
-	local_dot = (TYPE)DOT(n, v1, 1, v2, 1);
+	local_dot = (TYPE)DOT(nx, v1, 1, v2, 1);
 
 	*dot = *dot + local_dot;
 }
@@ -354,7 +354,7 @@ static struct starpu_codelet dot_kernel_cl =
 int dot_kernel(HANDLE_TYPE_VECTOR v1,
 	       HANDLE_TYPE_VECTOR v2,
 	       starpu_data_handle_t s,
-	       unsigned nblocks)
+	       unsigned nb)
 {
 	int ret;
 
@@ -368,14 +368,14 @@ int dot_kernel(HANDLE_TYPE_VECTOR v1,
 		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 
-	unsigned b;
-	for (b = 0; b < nblocks; b++)
+	unsigned block;
+	for (block = 0; block < nb; block++)
 	{
 		ret = TASK_INSERT(&dot_kernel_cl,
 					 use_reduction?STARPU_REDUX:STARPU_RW, s,
-					 STARPU_R, GET_VECTOR_BLOCK(v1, b),
-					 STARPU_R, GET_VECTOR_BLOCK(v2, b),
-					 STARPU_TAG_ONLY, (starpu_tag_t) b,
+					 STARPU_R, GET_VECTOR_BLOCK(v1, block),
+					 STARPU_R, GET_VECTOR_BLOCK(v2, block),
+					 STARPU_TAG_ONLY, (starpu_tag_t) block,
 					 0);
 		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
@@ -393,11 +393,11 @@ static void scal_kernel_cuda(void *descr[], void *cl_arg)
 	starpu_codelet_unpack_args(cl_arg, &p1);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
-	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
 
 	/* v1 = p1 v1 */
 	TYPE alpha = p1;
-	cublasStatus_t status = cublasscal(starpu_cublas_get_local_handle(), n, &alpha, v1, 1);
+	cublasStatus_t status = cublasscal(starpu_cublas_get_local_handle(), nx, &alpha, v1, 1);
 	if (status != CUBLAS_STATUS_SUCCESS)
 		STARPU_CUBLAS_REPORT_ERROR(status);
 }
@@ -409,10 +409,10 @@ void scal_kernel_cpu(void *descr[], void *cl_arg)
 	starpu_codelet_unpack_args(cl_arg, &alpha);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
-	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
 
 	/* v1 = alpha v1 */
-	SCAL(n, alpha, v1, 1);
+	SCAL(nx, alpha, v1, 1);
 }
 
 static struct starpu_perfmodel scal_kernel_model =
@@ -479,14 +479,14 @@ void gemv_kernel_cpu(void *descr[], void *cl_arg)
 	if (worker_size > 1)
 	{
 		/* Parallel CPU task */
-		unsigned rank = starpu_combined_worker_get_rank();
+		unsigned i = starpu_combined_worker_get_rank();
 
-		unsigned block_size = (ny + worker_size - 1)/worker_size;
-		unsigned new_nx = STARPU_MIN(nx, block_size*(rank+1)) - block_size*rank;
+		unsigned bs = (ny + worker_size - 1)/worker_size;
+		unsigned new_nx = STARPU_MIN(nx, bs*(i+1)) - bs*i;
 
 		nx = new_nx;
-		v1 = &v1[block_size*rank];
-		M = &M[block_size*rank];
+		v1 = &v1[bs*i];
+		M = &M[bs*i];
 	}
 
 	/* Compute v1 = alpha M v2 + beta v1 */
@@ -519,12 +519,12 @@ int gemv_kernel(HANDLE_TYPE_VECTOR v1,
 		HANDLE_TYPE_MATRIX matrix,
 		HANDLE_TYPE_VECTOR v2,
 		TYPE p1, TYPE p2,
-		unsigned nblocks)
+		unsigned nb)
 {
 	unsigned b1, b2;
 	int ret;
 
-	for (b2 = 0; b2 < nblocks; b2++)
+	for (b2 = 0; b2 < nb; b2++)
 	{
 		ret = TASK_INSERT(&scal_kernel_cl,
 					 STARPU_RW, GET_VECTOR_BLOCK(v1, b2),
@@ -535,9 +535,9 @@ int gemv_kernel(HANDLE_TYPE_VECTOR v1,
 		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 
-	for (b2 = 0; b2 < nblocks; b2++)
+	for (b2 = 0; b2 < nb; b2++)
 	{
-		for (b1 = 0; b1 < nblocks; b1++)
+		for (b1 = 0; b1 < nb; b1++)
 		{
 			TYPE one = 1.0;
 			ret = TASK_INSERT(&gemv_kernel_cl,
@@ -546,7 +546,7 @@ int gemv_kernel(HANDLE_TYPE_VECTOR v1,
 						 STARPU_R,	GET_VECTOR_BLOCK(v2, b1),
 						 STARPU_VALUE,	&one,	sizeof(one),
 						 STARPU_VALUE,	&p2,	sizeof(p2),
-						 STARPU_TAG_ONLY, ((starpu_tag_t)b2) * nblocks + b1,
+						 STARPU_TAG_ONLY, ((starpu_tag_t)b2) * nb + b1,
 						 0);
 			STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 		}
@@ -566,17 +566,17 @@ static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 
-	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
 
 	/* Compute v1 = p1 * v1 + p2 * v2.
 	 *	v1 = p1 v1
 	 *	v1 = v1 + p2 v2
 	 */
 	cublasStatus_t status;
-	status = cublasscal(starpu_cublas_get_local_handle(), n, &p1, v1, 1);
+	status = cublasscal(starpu_cublas_get_local_handle(), nx, &p1, v1, 1);
 	if (status != CUBLAS_STATUS_SUCCESS)
 		STARPU_CUBLAS_REPORT_ERROR(status);
-	status = cublasaxpy(starpu_cublas_get_local_handle(), n, &p2, v2, 1, v1, 1);
+	status = cublasaxpy(starpu_cublas_get_local_handle(), nx, &p2, v2, 1, v1, 1);
 	if (status != CUBLAS_STATUS_SUCCESS)
 		STARPU_CUBLAS_REPORT_ERROR(status);
 }
@@ -622,18 +622,18 @@ static struct starpu_codelet scal_axpy_kernel_cl =
 
 int scal_axpy_kernel(HANDLE_TYPE_VECTOR v1, TYPE p1,
 		     HANDLE_TYPE_VECTOR v2, TYPE p2,
-		     unsigned nblocks)
+		     unsigned nb)
 {
-	unsigned b;
-	for (b = 0; b < nblocks; b++)
+	unsigned block;
+	for (block = 0; block < nb; block++)
 	{
 		int ret;
 		ret = TASK_INSERT(&scal_axpy_kernel_cl,
-					 STARPU_RW, GET_VECTOR_BLOCK(v1, b),
-					 STARPU_R,  GET_VECTOR_BLOCK(v2, b),
+					 STARPU_RW, GET_VECTOR_BLOCK(v1, block),
+					 STARPU_R,  GET_VECTOR_BLOCK(v2, block),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 STARPU_VALUE, &p2, sizeof(p2),
-					 STARPU_TAG_ONLY, (starpu_tag_t) b,
+					 STARPU_TAG_ONLY, (starpu_tag_t) block,
 					 0);
 		if (ret == -ENODEV) return ret;
 		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
@@ -654,12 +654,12 @@ static void axpy_kernel_cuda(void *descr[], void *cl_arg)
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 
-	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
 
 	/* Compute v1 = v1 + p1 * v2.
 	 */
 	cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(),
-			n, &p1, v2, 1, v1, 1);
+			nx, &p1, v2, 1, v1, 1);
 	if (status != CUBLAS_STATUS_SUCCESS)
 		STARPU_CUBLAS_REPORT_ERROR(status);
 }
@@ -702,17 +702,17 @@ static struct starpu_codelet axpy_kernel_cl =
 
 int axpy_kernel(HANDLE_TYPE_VECTOR v1,
 		HANDLE_TYPE_VECTOR v2, TYPE p1,
-		unsigned nblocks)
+		unsigned nb)
 {
-	unsigned b;
-	for (b = 0; b < nblocks; b++)
+	unsigned block;
+	for (block = 0; block < nb; block++)
 	{
 		int ret;
 		ret = TASK_INSERT(&axpy_kernel_cl,
-					 STARPU_RW, GET_VECTOR_BLOCK(v1, b),
-					 STARPU_R,  GET_VECTOR_BLOCK(v2, b),
+					 STARPU_RW, GET_VECTOR_BLOCK(v1, block),
+					 STARPU_R,  GET_VECTOR_BLOCK(v2, block),
 					 STARPU_VALUE, &p1, sizeof(p1),
-					 STARPU_TAG_ONLY, (starpu_tag_t) b,
+					 STARPU_TAG_ONLY, (starpu_tag_t) block,
 					 0);
 		if (ret == -ENODEV) return ret;
 		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");

+ 3 - 3
examples/interface/complex.c

@@ -150,7 +150,7 @@ int main(void)
 	starpu_task_wait_for_all();
 	if (compare != 0)
 	{
-	     FPRINTF(stderr, "Complex numbers should NOT be similar\n");
+	     _FPRINTF(stderr, "Complex numbers should NOT be similar\n");
 	     goto end;
 	}
 
@@ -183,7 +183,7 @@ int main(void)
 
 	if (compare != 1)
 	{
-	     FPRINTF(stderr, "Complex numbers should be similar\n");
+	     _FPRINTF(stderr, "Complex numbers should be similar\n");
 	}
 
 	/* Put another value again */
@@ -267,7 +267,7 @@ int main(void)
 	starpu_task_wait_for_all();
 	if (compare != 1)
 	{
-	     FPRINTF(stderr, "Complex numbers should be similar\n");
+	     _FPRINTF(stderr, "Complex numbers should be similar\n");
 	     goto end;
 	}
 

+ 3 - 3
examples/interface/complex_codelet.h

@@ -20,7 +20,7 @@
 #ifndef __COMPLEX_CODELET_H
 #define __COMPLEX_CODELET_H
 
-#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+#define _FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 
 /* Dumb performance model for simgrid */
 static double complex_cost_function(struct starpu_task *task, unsigned nimpl)
@@ -87,10 +87,10 @@ void display_complex_codelet(void *descr[], void *_args)
 	if (_args)
 		starpu_codelet_unpack_args(_args, &msg);
 
-	FPRINTF(stderr, "[%s]\n", _args?msg:NULL);
+	_FPRINTF(stderr, "[%s]\n", _args?msg:NULL);
 	for(i=0 ; i<nx ; i++)
 	{
-		FPRINTF(stderr, "\tComplex[%d] = %3.2f + %3.2f i\n", i, real[i], imaginary[i]);
+		_FPRINTF(stderr, "\tComplex[%d] = %3.2f + %3.2f i\n", i, real[i], imaginary[i]);
 	}
 	fflush(stderr);
 }

+ 39 - 2
examples/mult/xgemm.c

@@ -33,7 +33,9 @@
 #include <starpu.h>
 #include <starpu_fxt.h>
 
+#ifdef STARPU_HAVE_BLAS
 #include <common/blas.h>
+#endif
 
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
@@ -68,6 +70,7 @@ static starpu_data_handle_t A_handle, B_handle, C_handle;
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 #define PRINTF(fmt, ...) do { if (!getenv("STARPU_SSILENT")) {printf(fmt, ## __VA_ARGS__); fflush(stdout); }} while(0)
 
+#ifdef STARPU_HAVE_BLAS
 static int check_output(void)
 {
 	/* compute C = C - AB */
@@ -92,6 +95,7 @@ static int check_output(void)
 		return 1;
 	}
 }
+#endif
 
 static void init_problem_data(void)
 {
@@ -225,6 +229,7 @@ static void cublas_gemm(void *descr[], void *arg)
 }
 #endif
 
+#ifdef STARPU_HAVE_BLAS
 void cpu_mult(void *descr[], void *arg, TYPE beta)
 {
 	(void)arg;
@@ -273,6 +278,7 @@ void cpu_gemm(void *descr[], void *arg)
 {
 	cpu_mult(descr, arg, 1.);
 }
+#endif
 
 static struct starpu_perfmodel starpu_gemm_model =
 {
@@ -282,10 +288,12 @@ static struct starpu_perfmodel starpu_gemm_model =
 
 static struct starpu_codelet cl_gemm0 =
 {
+#ifdef STARPU_HAVE_BLAS
 	.type = STARPU_SEQ, /* changed to STARPU_SPMD if -spmd is passed */
 	.max_parallelism = INT_MAX,
 	.cpu_funcs = {cpu_gemm0},
 	.cpu_funcs_name = {"cpu_gemm0"},
+#endif
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cublas_gemm0},
 #elif defined(STARPU_SIMGRID)
@@ -299,10 +307,12 @@ static struct starpu_codelet cl_gemm0 =
 
 static struct starpu_codelet cl_gemm =
 {
+#ifdef STARPU_HAVE_BLAS
 	.type = STARPU_SEQ, /* changed to STARPU_SPMD if -spmd is passed */
 	.max_parallelism = INT_MAX,
 	.cpu_funcs = {cpu_gemm},
 	.cpu_funcs_name = {"cpu_gemm"},
+#endif
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cublas_gemm},
 #elif defined(STARPU_SIMGRID)
@@ -330,24 +340,49 @@ static void parse_args(int argc, char **argv)
 			nslicesx = strtol(argv[++i], &argptr, 10);
 			nslicesy = nslicesx;
 			nslicesz = nslicesx;
+			if (nslicesx == 0)
+			{
+				fprintf(stderr, "the number of blocks in X cannot be 0!\n");
+				exit(EXIT_FAILURE);
+			}
+			if (nslicesy == 0)
+			{
+				fprintf(stderr, "the number of blocks in Y cannot be 0!\n");
+				exit(EXIT_FAILURE);
+			}
 		}
 
 		else if (strcmp(argv[i], "-nblocksx") == 0)
 		{
 			char *argptr;
 			nslicesx = strtol(argv[++i], &argptr, 10);
+			if (nslicesx == 0)
+			{
+				fprintf(stderr, "the number of blocks in X cannot be 0!\n");
+				exit(EXIT_FAILURE);
+			}
 		}
 
 		else if (strcmp(argv[i], "-nblocksy") == 0)
 		{
 			char *argptr;
 			nslicesy = strtol(argv[++i], &argptr, 10);
+			if (nslicesy == 0)
+			{
+				fprintf(stderr, "the number of blocks in Y cannot be 0!\n");
+				exit(EXIT_FAILURE);
+			}
 		}
 
 		else if (strcmp(argv[i], "-nblocksz") == 0)
 		{
 			char *argptr;
 			nslicesz = strtol(argv[++i], &argptr, 10);
+			if (nslicesz == 0)
+			{
+				fprintf(stderr, "the number of blocks in Z cannot be 0!\n");
+				exit(EXIT_FAILURE);
+			}
 		}
 
 		else if (strcmp(argv[i], "-x") == 0)
@@ -416,9 +451,9 @@ static void parse_args(int argc, char **argv)
 		{
 			fprintf(stderr,"Usage: %s [-3d] [-nblocks n] [-nblocksx x] [-nblocksy y] [-nblocksz z] [-x x] [-y y] [-xy n] [-z z] [-size size] [-iter iter] [-bound] [-check] [-spmd] [-hostname] [-nsleeps nsleeps]\n", argv[0]);
 			if (tiled)
-				fprintf(stderr,"Currently selected: %ux%u * %ux%u and %ux%ux%u blocks, %u iterations, %u sleeps\n", zdim, ydim, xdim, zdim, nslicesx, nslicesy, nslicesz, niter, nsleeps);
+				fprintf(stderr,"Currently selected: %ux%u * %ux%u and %ux%ux%u blocks (size %ux%u length %u), %u iterations, %u sleeps\n", zdim, ydim, xdim, zdim, nslicesx, nslicesy, nslicesz, xdim / nslicesx, ydim / nslicesy, zdim / nslicesz, niter, nsleeps);
 			else
-				fprintf(stderr,"Currently selected: %ux%u * %ux%u and %ux%u blocks, %u iterations, %u sleeps\n", zdim, ydim, xdim, zdim, nslicesx, nslicesy, niter, nsleeps);
+				fprintf(stderr,"Currently selected: %ux%u * %ux%u and %ux%u blocks (size %ux%u length %u), %u iterations, %u sleeps\n", zdim, ydim, xdim, zdim, nslicesx, nslicesy, xdim / nslicesx, ydim / nslicesy, zdim, niter, nsleeps);
 			exit(EXIT_SUCCESS);
 		}
 		else
@@ -574,10 +609,12 @@ enodev:
 	starpu_data_unregister(B_handle);
 	starpu_data_unregister(C_handle);
 
+#ifdef STARPU_HAVE_BLAS
 #ifndef STARPU_SIMGRID
 	if (check)
 		ret = check_output();
 #endif
+#endif
 
 	starpu_free_flags(A, zdim*ydim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
 	starpu_free_flags(B, xdim*zdim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);

+ 1 - 0
include/starpu_config.h.in

@@ -127,6 +127,7 @@
 #undef STARPU_ARMPL
 #undef STARPU_SYSTEM_BLAS
 #undef STARPU_HAVE_CBLAS_H
+#undef STARPU_HAVE_BLAS
 
 /**
    Define the directory in which the OpenCL codelets of the

+ 10 - 0
include/starpu_sched_component.h

@@ -138,6 +138,11 @@ struct starpu_sched_component
 	*/
 	int (*can_pull)(struct starpu_sched_component *component);
 
+	/**
+	   This function is called when starpu_do_schedule() is called by the application.
+	*/
+	void (*do_schedule)(struct starpu_sched_component *component);
+
 	int (*notify)(struct starpu_sched_component* component, int message_ID, void* arg);
 
 	/**
@@ -260,6 +265,11 @@ void starpu_sched_tree_add_workers(unsigned sched_ctx_id, int *workerids, unsign
 void starpu_sched_tree_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
 
 /**
+   Run the do_schedule method of the components. This is a helper for starpu_sched_policy::do_schedule.
+*/
+void starpu_sched_tree_do_schedule(unsigned sched_ctx_id);
+
+/**
    Attach component \p child to parent \p parent. Some component may accept only one child, others accept several (e.g. MCT)
 */
 void starpu_sched_component_connect(struct starpu_sched_component *parent, struct starpu_sched_component *child);

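Note on the header change above: a scheduler component can now expose a do_schedule method, and starpu_sched_tree_do_schedule() walks the component tree to invoke it. The following is a minimal sketch of wiring the hook, assuming the usual starpu_sched_component_create() constructor; the hook body and the component name are invented for illustration, not taken from this commit.

    #include <stdio.h>
    #include <starpu.h>
    #include <starpu_sched_component.h>

    /* Invented hook body: a real component would release the tasks it has
     * been holding back when the application calls starpu_do_schedule(). */
    static void example_do_schedule(struct starpu_sched_component *component)
    {
    	fprintf(stderr, "do_schedule reached component %p\n", (void *) component);
    }

    /* Attach the hook to a freshly created component of a modular scheduler. */
    struct starpu_sched_component *example_component_create(struct starpu_sched_tree *tree)
    {
    	struct starpu_sched_component *c = starpu_sched_component_create(tree, "example");
    	c->do_schedule = example_do_schedule;
    	return c;
    }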
+ 9 - 3
include/starpu_task.h

@@ -1050,15 +1050,21 @@ struct starpu_task
 
 	   Set by StarPU.
 	*/
-	unsigned failed:1;
+	unsigned char failed;
 
 	/**
 	   Whether the scheduler has pushed the task on some queue
 
 	   Set by StarPU.
 	*/
-	unsigned scheduled:1;
-	unsigned prefetched:1;
+	unsigned char scheduled;
+
+	/**
+	   Whether the scheduler has prefetched the task's data
+
+	   Set by StarPU.
+	*/
+	unsigned char prefetched;
 
 	/**
 	   Optional field. If the field

+ 86 - 84
mpi/examples/cg/cg.c

@@ -39,7 +39,7 @@ static int copy_handle(starpu_data_handle_t* dst, starpu_data_handle_t* src, uns
 #define BARRIER() starpu_mpi_barrier(MPI_COMM_WORLD);
 #define BARRIER() starpu_mpi_barrier(MPI_COMM_WORLD);
 #define GET_DATA_HANDLE(handle) starpu_mpi_get_data_on_all_nodes_detached(MPI_COMM_WORLD, handle)
 
-static int block_size;
+static unsigned block_size;
 
 static int rank;
 static int nodes_p = 2;
 
 
 #include "../../../examples/cg/cg_kernels.c"
 #include "../../../examples/cg/cg_kernels.c"
 
 
-static int my_distrib(const int y, const int x)
+static int my_distrib(const int yy, const int xx)
 {
 {
-	return (y%nodes_q)*nodes_p + (x%nodes_p);
+	return (yy%nodes_q)*nodes_p + (xx%nodes_p);
 }
 }
 
 
-static int copy_handle(starpu_data_handle_t* dst, starpu_data_handle_t* src, unsigned nblocks)
+static int copy_handle(starpu_data_handle_t* dst, starpu_data_handle_t* src, unsigned nb)
 {
 {
-	unsigned b;
+	unsigned block;
 
 
-	for (b = 0; b < nblocks; b++)
+	for (block = 0; block < nb; block++)
 	{
 	{
-		if (rank == my_distrib(b, 0))
+		if (rank == my_distrib(block, 0))
 		{
 		{
-			starpu_data_cpy(dst[b], src[b], /* asynchronous */ 1, /* without callback */ NULL, NULL);
+			starpu_data_cpy(dst[block], src[block], /* asynchronous */ 1, /* without callback */ NULL, NULL);
 		}
 		}
 	}
 	}
 
 
@@ -82,7 +82,8 @@ static int copy_handle(starpu_data_handle_t* dst, starpu_data_handle_t* src, uns
  */
  */
 static void generate_random_problem(void)
 {
-	unsigned nn, mm, m, n, mpi_rank;
+	unsigned ii, jj, j, i;
+	int mpi_rank;
 
 	A = malloc(nblocks * sizeof(TYPE **));
 	x = malloc(nblocks * sizeof(TYPE *));
@@ -92,47 +93,47 @@ static void generate_random_problem(void)
 	d = malloc(nblocks * sizeof(TYPE *));
 	q = malloc(nblocks * sizeof(TYPE *));
 
-	for (m = 0; m < nblocks; m++)
+	for (j = 0; j < nblocks; j++)
 	{
-		A[m] = malloc(nblocks * sizeof(TYPE*));
+		A[j] = malloc(nblocks * sizeof(TYPE*));
 
-		mpi_rank = my_distrib(m, 0);
+		mpi_rank = my_distrib(j, 0);
 
 		if (mpi_rank == rank || display_result)
 		{
-			starpu_malloc((void**) &x[m], block_size*sizeof(TYPE));
+			starpu_malloc((void**) &x[j], block_size*sizeof(TYPE));
 		}
 
 		if (mpi_rank == rank)
 		{
-			starpu_malloc((void**) &b[m], block_size*sizeof(TYPE));
-			starpu_malloc((void**) &r[m], block_size*sizeof(TYPE));
-			starpu_malloc((void**) &d[m], block_size*sizeof(TYPE));
-			starpu_malloc((void**) &q[m], block_size*sizeof(TYPE));
+			starpu_malloc((void**) &b[j], block_size*sizeof(TYPE));
+			starpu_malloc((void**) &r[j], block_size*sizeof(TYPE));
+			starpu_malloc((void**) &d[j], block_size*sizeof(TYPE));
+			starpu_malloc((void**) &q[j], block_size*sizeof(TYPE));
 
-			for (mm = 0; mm < block_size; mm++)
+			for (jj = 0; jj < block_size; jj++)
 			{
-				x[m][mm] = (TYPE) 0.0;
-				b[m][mm] = (TYPE) 1.0;
-				r[m][mm] = (TYPE) 0.0;
-				d[m][mm] = (TYPE) 0.0;
-				q[m][mm] = (TYPE) 0.0;
+				x[j][jj] = (TYPE) 0.0;
+				b[j][jj] = (TYPE) 1.0;
+				r[j][jj] = (TYPE) 0.0;
+				d[j][jj] = (TYPE) 0.0;
+				q[j][jj] = (TYPE) 0.0;
 			}
 		}
 
-		for (n = 0; n < nblocks; n++)
+		for (i = 0; i < nblocks; i++)
 		{
-			mpi_rank = my_distrib(m, n);
+			mpi_rank = my_distrib(j, i);
 			if (mpi_rank == rank)
 			{
-				starpu_malloc((void**) &A[m][n], block_size*block_size*sizeof(TYPE));
+				starpu_malloc((void**) &A[j][i], block_size*block_size*sizeof(TYPE));
 
-				for (nn = 0; nn < block_size; nn++)
+				for (ii = 0; ii < block_size; ii++)
 				{
-					for (mm = 0; mm < block_size; mm++)
+					for (jj = 0; jj < block_size; jj++)
 					{
 						/* We take Hilbert matrix that is not well conditionned but definite positive: H(i,j) = 1/(1+i+j) */
-						A[m][n][mm + nn*block_size] = (TYPE) (1.0/(1.0+(nn+(m*block_size)+mm+(n*block_size))));
+						A[j][i][jj + ii*block_size] = (TYPE) (1.0/(1.0+(ii+(j*block_size)+jj+(i*block_size))));
 					}
 				}
 			}
@@ -142,35 +143,36 @@ static void generate_random_problem(void)
 
 static void free_data(void)
 {
-	unsigned nn, mm, m, n, mpi_rank;
+	unsigned ii, jj, j, i;
+	int mpi_rank;
 
-	for (m = 0; m < nblocks; m++)
+	for (j = 0; j < nblocks; j++)
 	{
-		mpi_rank = my_distrib(m, 0);
+		mpi_rank = my_distrib(j, 0);
 
 		if (mpi_rank == rank || display_result)
 		{
-			starpu_free((void*) x[m]);
+			starpu_free((void*) x[j]);
 		}
 
 		if (mpi_rank == rank)
 		{
-			starpu_free((void*) b[m]);
-			starpu_free((void*) r[m]);
-			starpu_free((void*) d[m]);
-			starpu_free((void*) q[m]);
+			starpu_free((void*) b[j]);
+			starpu_free((void*) r[j]);
+			starpu_free((void*) d[j]);
+			starpu_free((void*) q[j]);
 		}
 
-		for (n = 0; n < nblocks; n++)
+		for (i = 0; i < nblocks; i++)
 		{
-			mpi_rank = my_distrib(m, n);
+			mpi_rank = my_distrib(j, i);
 			if (mpi_rank == rank)
 			{
-				starpu_free((void*) A[m][n]);
+				starpu_free((void*) A[j][i]);
 			}
 		}
 
-		free(A[m]);
+		free(A[j]);
 	}
 
 	free(A);
@@ -183,7 +185,7 @@ static void free_data(void)
 
 static void register_data(void)
 {
-	unsigned m, n;
+	unsigned j, i;
 	int mpi_rank;
 	starpu_mpi_tag_t mpi_tag = 0;
 
@@ -194,68 +196,68 @@ static void register_data(void)
 	d_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
 	q_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
 
-	for (m = 0; m < nblocks; m++)
+	for (j = 0; j < nblocks; j++)
 	{
-		mpi_rank = my_distrib(m, 0);
-		A_handle[m] = malloc(nblocks*sizeof(starpu_data_handle_t));
+		mpi_rank = my_distrib(j, 0);
+		A_handle[j] = malloc(nblocks*sizeof(starpu_data_handle_t));
 
 		if (mpi_rank == rank || display_result)
 		{
-			starpu_vector_data_register(&x_handle[m], STARPU_MAIN_RAM, (uintptr_t) x[m], block_size, sizeof(TYPE));
+			starpu_vector_data_register(&x_handle[j], STARPU_MAIN_RAM, (uintptr_t) x[j], block_size, sizeof(TYPE));
 		}
 		else if (!display_result)
 		{
 			assert(mpi_rank != rank);
-			starpu_vector_data_register(&x_handle[m], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
+			starpu_vector_data_register(&x_handle[j], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
 		}
 
 		if (mpi_rank == rank)
 		{
-			starpu_vector_data_register(&b_handle[m], STARPU_MAIN_RAM, (uintptr_t) b[m], block_size, sizeof(TYPE));
-			starpu_vector_data_register(&r_handle[m], STARPU_MAIN_RAM, (uintptr_t) r[m], block_size, sizeof(TYPE));
-			starpu_vector_data_register(&d_handle[m], STARPU_MAIN_RAM, (uintptr_t) d[m], block_size, sizeof(TYPE));
-			starpu_vector_data_register(&q_handle[m], STARPU_MAIN_RAM, (uintptr_t) q[m], block_size, sizeof(TYPE));
+			starpu_vector_data_register(&b_handle[j], STARPU_MAIN_RAM, (uintptr_t) b[j], block_size, sizeof(TYPE));
+			starpu_vector_data_register(&r_handle[j], STARPU_MAIN_RAM, (uintptr_t) r[j], block_size, sizeof(TYPE));
+			starpu_vector_data_register(&d_handle[j], STARPU_MAIN_RAM, (uintptr_t) d[j], block_size, sizeof(TYPE));
+			starpu_vector_data_register(&q_handle[j], STARPU_MAIN_RAM, (uintptr_t) q[j], block_size, sizeof(TYPE));
 		}
 		else
 		{
-			starpu_vector_data_register(&b_handle[m], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
-			starpu_vector_data_register(&r_handle[m], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
-			starpu_vector_data_register(&d_handle[m], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
-			starpu_vector_data_register(&q_handle[m], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
+			starpu_vector_data_register(&b_handle[j], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
+			starpu_vector_data_register(&r_handle[j], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
+			starpu_vector_data_register(&d_handle[j], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
+			starpu_vector_data_register(&q_handle[j], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
 		}
 
-		starpu_data_set_coordinates(x_handle[m], 1, m);
-		starpu_mpi_data_register(x_handle[m], ++mpi_tag, mpi_rank);
-		starpu_data_set_coordinates(b_handle[m], 1, m);
-		starpu_mpi_data_register(b_handle[m], ++mpi_tag, mpi_rank);
-		starpu_data_set_coordinates(r_handle[m], 1, m);
-		starpu_mpi_data_register(r_handle[m], ++mpi_tag, mpi_rank);
-		starpu_data_set_coordinates(d_handle[m], 1, m);
-		starpu_mpi_data_register(d_handle[m], ++mpi_tag, mpi_rank);
-		starpu_data_set_coordinates(q_handle[m], 1, m);
-		starpu_mpi_data_register(q_handle[m], ++mpi_tag, mpi_rank);
+		starpu_data_set_coordinates(x_handle[j], 1, j);
+		starpu_mpi_data_register(x_handle[j], ++mpi_tag, mpi_rank);
+		starpu_data_set_coordinates(b_handle[j], 1, j);
+		starpu_mpi_data_register(b_handle[j], ++mpi_tag, mpi_rank);
+		starpu_data_set_coordinates(r_handle[j], 1, j);
+		starpu_mpi_data_register(r_handle[j], ++mpi_tag, mpi_rank);
+		starpu_data_set_coordinates(d_handle[j], 1, j);
+		starpu_mpi_data_register(d_handle[j], ++mpi_tag, mpi_rank);
+		starpu_data_set_coordinates(q_handle[j], 1, j);
+		starpu_mpi_data_register(q_handle[j], ++mpi_tag, mpi_rank);
 
 		if (use_reduction)
 		{
-			starpu_data_set_reduction_methods(q_handle[m], &accumulate_vector_cl, &bzero_vector_cl);
-			starpu_data_set_reduction_methods(r_handle[m], &accumulate_vector_cl, &bzero_vector_cl);
+			starpu_data_set_reduction_methods(q_handle[j], &accumulate_vector_cl, &bzero_vector_cl);
+			starpu_data_set_reduction_methods(r_handle[j], &accumulate_vector_cl, &bzero_vector_cl);
 		}
 
-		for (n = 0; n < nblocks; n++)
+		for (i = 0; i < nblocks; i++)
 		{
-			mpi_rank = my_distrib(m, n);
+			mpi_rank = my_distrib(j, i);
 
 			if (mpi_rank == rank)
 			{
-				starpu_matrix_data_register(&A_handle[m][n], STARPU_MAIN_RAM, (uintptr_t) A[m][n], block_size, block_size, block_size, sizeof(TYPE));
+				starpu_matrix_data_register(&A_handle[j][i], STARPU_MAIN_RAM, (uintptr_t) A[j][i], block_size, block_size, block_size, sizeof(TYPE));
 			}
 			else
 			{
-				starpu_matrix_data_register(&A_handle[m][n], -1, (uintptr_t) NULL, block_size, block_size, block_size, sizeof(TYPE));
+				starpu_matrix_data_register(&A_handle[j][i], -1, (uintptr_t) NULL, block_size, block_size, block_size, sizeof(TYPE));
 			}
 
-			starpu_data_set_coordinates(A_handle[m][n], 2, n, m);
-			starpu_mpi_data_register(A_handle[m][n], ++mpi_tag, mpi_rank);
+			starpu_data_set_coordinates(A_handle[j][i], 2, i, j);
+			starpu_mpi_data_register(A_handle[j][i], ++mpi_tag, mpi_rank);
 		}
 	}
 
@@ -273,22 +275,22 @@ static void register_data(void)
 
 static void unregister_data(void)
 {
-	unsigned m, n;
+	unsigned j, i;
 
-	for (m = 0; m < nblocks; m++)
+	for (j = 0; j < nblocks; j++)
 	{
-		starpu_data_unregister(x_handle[m]);
-		starpu_data_unregister(b_handle[m]);
-		starpu_data_unregister(r_handle[m]);
-		starpu_data_unregister(d_handle[m]);
-		starpu_data_unregister(q_handle[m]);
+		starpu_data_unregister(x_handle[j]);
+		starpu_data_unregister(b_handle[j]);
+		starpu_data_unregister(r_handle[j]);
+		starpu_data_unregister(d_handle[j]);
+		starpu_data_unregister(q_handle[j]);
 
-		for (n = 0; n < nblocks; n++)
+		for (i = 0; i < nblocks; i++)
 		{
-			starpu_data_unregister(A_handle[m][n]);
+			starpu_data_unregister(A_handle[j][i]);
 		}
 
-		free(A_handle[m]);
+		free(A_handle[j]);
 	}
 
 	starpu_data_unregister(dtq_handle);
@@ -304,7 +306,7 @@ static void unregister_data(void)
 
 static void display_x_result(void)
 {
-	int j, i;
+	unsigned j, i;
 
 	for (j = 0; j < nblocks; j++)
 	{

+ 0 - 1
mpi/tests/helper.h

@@ -20,7 +20,6 @@
 #include "../../tests/helper.h"
 #include "../../tests/helper.h"
 
 
 #define PRINTF(fmt, ...) do { if (!getenv("STARPU_SSILENT")) {printf(fmt, ## __VA_ARGS__); fflush(stdout); }} while(0)
 #define PRINTF(fmt, ...) do { if (!getenv("STARPU_SSILENT")) {printf(fmt, ## __VA_ARGS__); fflush(stdout); }} while(0)
-#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 #define FPRINTF_MPI(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) { \
 #define FPRINTF_MPI(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) { \
 			int _disp_rank; starpu_mpi_comm_rank(MPI_COMM_WORLD, &_disp_rank); \
 			int _disp_rank; starpu_mpi_comm_rank(MPI_COMM_WORLD, &_disp_rank); \
 			fprintf(ofile, "[%d][starpu_mpi][%s] " fmt , _disp_rank, __starpu_func__ ,## __VA_ARGS__); \
 			fprintf(ofile, "[%d][starpu_mpi][%s] " fmt , _disp_rank, __starpu_func__ ,## __VA_ARGS__); \

+ 1 - 1
src/core/perfmodel/perfmodel_bus.c

@@ -118,7 +118,7 @@ static char cudadev_direct[STARPU_MAXNODES][STARPU_MAXNODES];
 #endif
 
 #ifndef STARPU_SIMGRID
-static uint64_t opencl_size[STARPU_MAXCUDADEVS];
+static uint64_t opencl_size[STARPU_MAXOPENCLDEVS];
 #endif
 #ifdef STARPU_USE_OPENCL
 /* preference order of cores (logical indexes) */

+ 1 - 1
src/core/workers.c

@@ -1882,7 +1882,7 @@ void starpu_shutdown(void)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&init_mutex);
 
 	/* If the workers are frozen, no progress can be made. */
-	STARPU_ASSERT(_starpu_config.pause_depth <= 0);
+	STARPU_ASSERT_MSG(_starpu_config.pause_depth <= 0, "Did you forget to call starpu_resume before starpu_shutdown?");
 
 	starpu_task_wait_for_no_ready();
 

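The strengthened assertion in starpu_shutdown() fires when starpu_pause() calls are not balanced by starpu_resume() before shutdown. A minimal sketch of the intended pairing (the task-submission part is left out; assumptions beyond the public starpu_init/starpu_pause/starpu_resume/starpu_shutdown calls are none):

    #include <errno.h>
    #include <starpu.h>

    int main(void)
    {
    	int ret = starpu_init(NULL);
    	if (ret == -ENODEV)
    		return 77;

    	starpu_pause();      /* freeze the workers, e.g. while submitting a large batch */
    	/* ... submit tasks here ... */
    	starpu_resume();     /* every starpu_pause() must be matched before shutdown */

    	starpu_task_wait_for_all();
    	starpu_shutdown();   /* would trip the new assertion if still paused */
    	return 0;
    }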
+ 1 - 1
src/datawizard/interfaces/bcsr_filters.c

@@ -91,7 +91,7 @@ void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_inte
 	}
 }
 
-unsigned starpu_bcsr_filter_canonical_block_get_nchildren(struct starpu_data_filter *f, starpu_data_handle_t handle)
+unsigned starpu_bcsr_filter_canonical_block_get_nchildren(STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, starpu_data_handle_t handle)
 {
   return (unsigned)starpu_bcsr_get_nnz(handle);
 }

+ 0 - 1
src/debug/traces/starpu_fxt_mpi.c

@@ -120,7 +120,6 @@ static unsigned mpi_recvs_used[STARPU_FXT_MAX_FILES] = {0};
  * going through the lists from the beginning to match each and every
  * transfer, thus avoiding a quadratic complexity. */
 static unsigned mpi_recvs_matched[STARPU_FXT_MAX_FILES][STARPU_FXT_MAX_FILES] = { {0} };
-static unsigned mpi_sends_matched[STARPU_FXT_MAX_FILES][STARPU_FXT_MAX_FILES] = { {0} };
 
 void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED, long mpi_tag, size_t size, float date, long jobid, unsigned long handle, unsigned type, int prio)
 {

+ 2 - 2
src/drivers/cuda/driver_cuda.c

@@ -106,12 +106,12 @@ static size_t _starpu_cuda_get_global_mem_size(unsigned devid)
 }
 
 #ifdef STARPU_HAVE_LIBNVIDIA_ML
-nvmlDevice_t _starpu_cuda_get_nvmldev(struct cudaDeviceProp *props)
+nvmlDevice_t _starpu_cuda_get_nvmldev(struct cudaDeviceProp *dev_props)
 {
 	char busid[13];
 	nvmlDevice_t ret;
 
-	snprintf(busid, sizeof(busid), "%04x:%02x:%02x.0", props->pciDomainID, props->pciBusID, props->pciDeviceID);
+	snprintf(busid, sizeof(busid), "%04x:%02x:%02x.0", dev_props->pciDomainID, dev_props->pciBusID, dev_props->pciDeviceID);
 	if (nvmlDeviceGetHandleByPciBusId(busid, &ret) != NVML_SUCCESS)
 		ret = NULL;
 

+ 20 - 0
src/sched_policies/component_sched.c

@@ -468,6 +468,26 @@ void starpu_sched_tree_remove_workers(unsigned sched_ctx_id, int *workerids, uns
 	STARPU_COMPONENT_MUTEX_UNLOCK(&t->lock);
 }
 
+static void _starpu_sched_tree_do_schedule(struct starpu_sched_component *component)
+{
+	unsigned i;
+
+	if (component->do_schedule)
+		component->do_schedule(component);
+
+	for (i = 0; i < component->nchildren; i++)
+		_starpu_sched_tree_do_schedule(component->children[i]);
+}
+
+void starpu_sched_tree_do_schedule(unsigned sched_ctx_id)
+{
+	STARPU_ASSERT(sched_ctx_id < STARPU_NMAX_SCHED_CTXS);
+	struct starpu_sched_tree * t = starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	if (t->root)
+		_starpu_sched_tree_do_schedule(t->root);
+}
+
 static struct starpu_sched_tree *trees[STARPU_NMAX_SCHED_CTXS];
 
 struct starpu_sched_tree * starpu_sched_tree_create(unsigned sched_ctx_id)
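The new starpu_sched_tree_do_schedule() helper added above recursively invokes each component's do_schedule hook starting from the tree root. A sketch of how a modular policy might forward the starpu_sched_policy::do_schedule entry (named in the header comment of this commit) to it; the other fields shown are the usual tree helpers, and the policy name and description are made up for illustration:

    #include <starpu.h>
    #include <starpu_sched_component.h>

    /* Only the fields relevant here are filled in; a complete policy also
     * sets init_sched, deinit_sched, push_task, pop_task, ... */
    static struct starpu_sched_policy example_tree_policy =
    {
    	.do_schedule = starpu_sched_tree_do_schedule,
    	.add_workers = starpu_sched_tree_add_workers,
    	.remove_workers = starpu_sched_tree_remove_workers,
    	.policy_name = "example-tree",
    	.policy_description = "sketch: forwards do_schedule to the component tree",
    };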