Browse Source

fix merge issues in tests

Andra Hugo 12 years ago
parent
commit
a9f3cb5acb

+ 1 - 4
examples/Makefile.am

@@ -3,7 +3,7 @@
 # Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
 # Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 # Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 # Copyright (C) 2011  Télécom-SudParis
 # Copyright (C) 2011  Télécom-SudParis
-# Copyright (C) 2011-2012  INRIA
+# Copyright (C) 2012 INRIA
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
 # it under the terms of the GNU Lesser General Public License as published by
@@ -47,7 +47,6 @@ EXTRA_DIST = 					\
 	lu/xlu_implicit_pivot.c			\
 	lu/xlu_implicit_pivot.c			\
 	lu/xlu_kernels.c			\
 	lu/xlu_kernels.c			\
 	lu/lu_example.c				\
 	lu/lu_example.c				\
-	sched_ctx_utils/sched_ctx_utils.c		\
 	incrementer/incrementer_kernels_opencl_kernel.cl 	\
 	incrementer/incrementer_kernels_opencl_kernel.cl 	\
 	basic_examples/variable_kernels_opencl_kernel.cl	\
 	basic_examples/variable_kernels_opencl_kernel.cl	\
 	matvecmult/matvecmult_kernel.cl				\
 	matvecmult/matvecmult_kernel.cl				\
@@ -100,7 +99,6 @@ noinst_HEADERS = 				\
 	lu/complex_double.h			\
 	lu/complex_double.h			\
 	lu/blas_complex.h			\
 	lu/blas_complex.h			\
 	cholesky/cholesky.h			\
 	cholesky/cholesky.h			\
-	sched_ctx_utils/sched_ctx_utils.h	\
 	common/blas_model.h			\
 	common/blas_model.h			\
 	common/blas.h				\
 	common/blas.h				\
 	mult/simple.h				\
 	mult/simple.h				\
@@ -514,7 +512,6 @@ cholesky_cholesky_implicit_SOURCES =		\
 	cholesky/cholesky_implicit.c		\
 	cholesky/cholesky_implicit.c		\
 	cholesky/cholesky_models.c		\
 	cholesky/cholesky_models.c		\
 	cholesky/cholesky_kernels.c		\
 	cholesky/cholesky_kernels.c		\
-	sched_ctx_utils/sched_ctx_utils.c	\
 	common/blas.c
 	common/blas.c
 
 
 cholesky_cholesky_implicit_LDADD =		\
 cholesky_cholesky_implicit_LDADD =		\

+ 1 - 1
examples/basic_examples/variable.c

@@ -45,7 +45,7 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) goto enodev;
 	if (ret == -ENODEV) goto enodev;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	niter /= 100;
 	niter /= 100;
 #endif
 #endif
         if (argc == 2) niter = atoi(argv[1]);
         if (argc == 2) niter = atoi(argv[1]);

+ 1 - 1
examples/cg/cg.c

@@ -406,7 +406,7 @@ int main(int argc, char **argv)
 {
 {
 	int ret;
 	int ret;
 
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	i_max = 16;
 	i_max = 16;
 #endif
 #endif
 
 

+ 154 - 139
examples/cg/cg_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,6 +16,7 @@
 
 
 #include "cg.h"
 #include "cg.h"
 #include <math.h>
 #include <math.h>
+#include <limits.h>
 
 
 #if 0
 #if 0
 static void print_vector_from_descr(unsigned nx, TYPE *v)
 static void print_vector_from_descr(unsigned nx, TYPE *v)
@@ -43,6 +44,23 @@ static void print_matrix_from_descr(unsigned nx, unsigned ny, unsigned ld, TYPE
 }
 }
 #endif
 #endif
 
 
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
+{
+	enum starpu_archtype type = starpu_worker_get_type(workerid);
+	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
+		return 1;
+
+#ifdef STARPU_USE_CUDA
+	/* Cuda device */
+	const struct cudaDeviceProp *props;
+	props = starpu_cuda_get_device_properties(workerid);
+	if (props->major >= 2 || props->minor >= 3)
+		/* At least compute capability 1.3, supports doubles */
+		return 1;
+#endif
+	/* Old card, does not support doubles */
+	return 0;
+}
 
 
 /*
 /*
  *	Reduction accumulation methods
  *	Reduction accumulation methods
@@ -67,16 +85,19 @@ static void accumulate_variable_cpu(void *descr[], void *cl_arg)
 	*v_dst = *v_dst + *v_src;
 	*v_dst = *v_dst + *v_src;
 }
 }
 
 
-static struct starpu_perfmodel_t accumulate_variable_model = {
+static struct starpu_perfmodel accumulate_variable_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "accumulate_variable"
 	.symbol = "accumulate_variable"
 };
 };
 
 
-starpu_codelet accumulate_variable_cl = {
+struct starpu_codelet accumulate_variable_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = accumulate_variable_cpu,
+	.cpu_funcs = {accumulate_variable_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	.cuda_func = accumulate_variable_cuda,
+	.cuda_funcs = {accumulate_variable_cuda, NULL},
 #endif
 #endif
 	.nbuffers = 2,
 	.nbuffers = 2,
 	.model = &accumulate_variable_model
 	.model = &accumulate_variable_model
@@ -103,16 +124,19 @@ static void accumulate_vector_cpu(void *descr[], void *cl_arg)
 	AXPY(n, (TYPE)1.0, v_src, 1, v_dst, 1);
 	AXPY(n, (TYPE)1.0, v_src, 1, v_dst, 1);
 }
 }
 
 
-static struct starpu_perfmodel_t accumulate_vector_model = {
+static struct starpu_perfmodel accumulate_vector_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "accumulate_vector"
 	.symbol = "accumulate_vector"
 };
 };
 
 
-starpu_codelet accumulate_vector_cl = {
+struct starpu_codelet accumulate_vector_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = accumulate_vector_cpu,
+	.cpu_funcs = {accumulate_vector_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	.cuda_func = accumulate_vector_cuda,
+	.cuda_funcs = {accumulate_vector_cuda, NULL},
 #endif
 #endif
 	.nbuffers = 2,
 	.nbuffers = 2,
 	.model = &accumulate_vector_model
 	.model = &accumulate_vector_model
@@ -141,16 +165,19 @@ static void bzero_variable_cpu(void *descr[], void *cl_arg)
 	*v = (TYPE)0.0;
 	*v = (TYPE)0.0;
 }
 }
 
 
-static struct starpu_perfmodel_t bzero_variable_model = {
+static struct starpu_perfmodel bzero_variable_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "bzero_variable"
 	.symbol = "bzero_variable"
 };
 };
 
 
-starpu_codelet bzero_variable_cl = {
+struct starpu_codelet bzero_variable_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = bzero_variable_cpu,
+	.cpu_funcs = {bzero_variable_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	.cuda_func = bzero_variable_cuda,
+	.cuda_funcs = {bzero_variable_cuda, NULL},
 #endif
 #endif
 	.nbuffers = 1,
 	.nbuffers = 1,
 	.model = &bzero_variable_model
 	.model = &bzero_variable_model
@@ -176,16 +203,19 @@ static void bzero_vector_cpu(void *descr[], void *cl_arg)
 	memset(v, 0, n*sizeof(TYPE));
 	memset(v, 0, n*sizeof(TYPE));
 }
 }
 
 
-static struct starpu_perfmodel_t bzero_vector_model = {
+static struct starpu_perfmodel bzero_vector_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "bzero_vector"
 	.symbol = "bzero_vector"
 };
 };
 
 
-starpu_codelet bzero_vector_cl = {
+struct starpu_codelet bzero_vector_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = bzero_vector_cpu,
+	.cpu_funcs = {bzero_vector_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	.cuda_func = bzero_vector_cuda,
+	.cuda_funcs = {bzero_vector_cuda, NULL},
 #endif
 #endif
 	.nbuffers = 1,
 	.nbuffers = 1,
 	.model = &bzero_vector_model
 	.model = &bzero_vector_model
@@ -229,39 +259,48 @@ static void dot_kernel_cpu(void *descr[], void *cl_arg)
 	*dot = *dot + local_dot;
 	*dot = *dot + local_dot;
 }
 }
 
 
-static struct starpu_perfmodel_t dot_kernel_model = {
+static struct starpu_perfmodel dot_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "dot_kernel"
 	.symbol = "dot_kernel"
 };
 };
 
 
-static starpu_codelet dot_kernel_cl = {
+static struct starpu_codelet dot_kernel_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = dot_kernel_cpu,
+	.cpu_funcs = {dot_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	.cuda_func = dot_kernel_cuda,
+	.cuda_funcs = {dot_kernel_cuda, NULL},
 #endif
 #endif
 	.nbuffers = 3,
 	.nbuffers = 3,
 	.model = &dot_kernel_model
 	.model = &dot_kernel_model
 };
 };
 
 
-void dot_kernel(starpu_data_handle v1,
-		starpu_data_handle v2,
-		starpu_data_handle s,
-		unsigned nblocks,
-		int use_reduction)
+int dot_kernel(starpu_data_handle_t v1,
+	       starpu_data_handle_t v2,
+	       starpu_data_handle_t s,
+	       unsigned nblocks,
+	       int use_reduction)
 {
 {
+	int ret;
+
 	/* Blank the accumulation variable */
 	/* Blank the accumulation variable */
-	starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
+	ret = starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
+	if (ret == -ENODEV) return ret;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 
 	unsigned b;
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	for (b = 0; b < nblocks; b++)
 	{
 	{
-		starpu_insert_task(&dot_kernel_cl,
-			use_reduction?STARPU_REDUX:STARPU_RW, s,
-			STARPU_R, starpu_data_get_sub_data(v1, 1, b),
-			STARPU_R, starpu_data_get_sub_data(v2, 1, b),
-			0);
+		ret = starpu_insert_task(&dot_kernel_cl,
+					 use_reduction?STARPU_REDUX:STARPU_RW, s,
+					 STARPU_R, starpu_data_get_sub_data(v1, 1, b),
+					 STARPU_R, starpu_data_get_sub_data(v2, 1, b),
+					 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	}
 	}
+	return 0;
 }
 }
 
 
 /*
 /*
@@ -272,7 +311,7 @@ void dot_kernel(starpu_data_handle v1,
 static void scal_kernel_cuda(void *descr[], void *cl_arg)
 static void scal_kernel_cuda(void *descr[], void *cl_arg)
 {
 {
 	TYPE p1;
 	TYPE p1;
-	starpu_unpack_cl_args(cl_arg, &p1);
+	starpu_codelet_unpack_args(cl_arg, &p1);
 
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
@@ -287,7 +326,7 @@ static void scal_kernel_cuda(void *descr[], void *cl_arg)
 static void scal_kernel_cpu(void *descr[], void *cl_arg)
 static void scal_kernel_cpu(void *descr[], void *cl_arg)
 {
 {
 	TYPE alpha;
 	TYPE alpha;
-	starpu_unpack_cl_args(cl_arg, &alpha);
+	starpu_codelet_unpack_args(cl_arg, &alpha);
 
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
@@ -296,16 +335,19 @@ static void scal_kernel_cpu(void *descr[], void *cl_arg)
 	SCAL(n, alpha, v1, 1);
 	SCAL(n, alpha, v1, 1);
 }
 }
 
 
-static struct starpu_perfmodel_t scal_kernel_model = {
+static struct starpu_perfmodel scal_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "scal_kernel"
 	.symbol = "scal_kernel"
 };
 };
 
 
-static starpu_codelet scal_kernel_cl = {
+static struct starpu_codelet scal_kernel_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = scal_kernel_cpu,
+	.cpu_funcs = {scal_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	.cuda_func = scal_kernel_cuda,
+	.cuda_funcs = {scal_kernel_cuda, NULL},
 #endif
 #endif
 	.nbuffers = 1,
 	.nbuffers = 1,
 	.model = &scal_kernel_model
 	.model = &scal_kernel_model
@@ -327,7 +369,7 @@ static void gemv_kernel_cuda(void *descr[], void *cl_arg)
 	unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);
 	unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);
  
  
 	TYPE alpha, beta;
 	TYPE alpha, beta;
-	starpu_unpack_cl_args(cl_arg, &beta, &alpha);
+	starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
 
 
 	/* Compute v1 = alpha M v2 + beta v1 */
 	/* Compute v1 = alpha M v2 + beta v1 */
 	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
 	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
@@ -346,7 +388,7 @@ static void gemv_kernel_cpu(void *descr[], void *cl_arg)
 	unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);
 	unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);
 
 
 	TYPE alpha, beta;
 	TYPE alpha, beta;
-	starpu_unpack_cl_args(cl_arg, &beta, &alpha);
+	starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
 
 
 	int worker_size = starpu_combined_worker_get_size();
 	int worker_size = starpu_combined_worker_get_size();
 
 
@@ -367,38 +409,44 @@ static void gemv_kernel_cpu(void *descr[], void *cl_arg)
 	GEMV("N", nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
 	GEMV("N", nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
 }
 }
 
 
-static struct starpu_perfmodel_t gemv_kernel_model = {
+static struct starpu_perfmodel gemv_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "gemv_kernel"
 	.symbol = "gemv_kernel"
 };
 };
 
 
-static starpu_codelet gemv_kernel_cl = {
+static struct starpu_codelet gemv_kernel_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.type = STARPU_SPMD,
 	.type = STARPU_SPMD,
 	.max_parallelism = INT_MAX,
 	.max_parallelism = INT_MAX,
-	.cpu_func = gemv_kernel_cpu,
+	.cpu_funcs = {gemv_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	.cuda_func = gemv_kernel_cuda,
+	.cuda_funcs = {gemv_kernel_cuda, NULL},
 #endif
 #endif
 	.nbuffers = 3,
 	.nbuffers = 3,
 	.model = &gemv_kernel_model
 	.model = &gemv_kernel_model
 };
 };
 
 
-void gemv_kernel(starpu_data_handle v1,
-		starpu_data_handle matrix,
-		starpu_data_handle v2,
+int gemv_kernel(starpu_data_handle_t v1,
+		starpu_data_handle_t matrix,
+		starpu_data_handle_t v2,
 		TYPE p1, TYPE p2,
 		TYPE p1, TYPE p2,
 		unsigned nblocks,
 		unsigned nblocks,
 		int use_reduction)
 		int use_reduction)
 {
 {
 	unsigned b1, b2;
 	unsigned b1, b2;
+	int ret;
 
 
 	for (b2 = 0; b2 < nblocks; b2++)
 	for (b2 = 0; b2 < nblocks; b2++)
 	{
 	{
-		starpu_insert_task(&scal_kernel_cl,
-			STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
-			STARPU_VALUE, &p1, sizeof(p1),
-			0);
+		ret = starpu_insert_task(&scal_kernel_cl,
+					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
+					 STARPU_VALUE, &p1, sizeof(p1),
+					 0);
+		if (ret == -ENODEV) return ret;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	}
 	}
 
 
 	for (b2 = 0; b2 < nblocks; b2++)
 	for (b2 = 0; b2 < nblocks; b2++)
@@ -406,15 +454,17 @@ void gemv_kernel(starpu_data_handle v1,
 		for (b1 = 0; b1 < nblocks; b1++)
 		for (b1 = 0; b1 < nblocks; b1++)
 		{
 		{
 			TYPE one = 1.0;
 			TYPE one = 1.0;
-			starpu_insert_task(&gemv_kernel_cl,
-				use_reduction?STARPU_REDUX:STARPU_RW,	starpu_data_get_sub_data(v1, 1, b2),
-				STARPU_R,	starpu_data_get_sub_data(matrix, 2, b2, b1),
-				STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
-				STARPU_VALUE,	&one,	sizeof(one),
-				STARPU_VALUE,	&p2,	sizeof(p2),
-				0);
+			ret = starpu_insert_task(&gemv_kernel_cl,
+						 use_reduction?STARPU_REDUX:STARPU_RW,	starpu_data_get_sub_data(v1, 1, b2),
+						 STARPU_R,	starpu_data_get_sub_data(matrix, 2, b2, b1),
+						 STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
+						 STARPU_VALUE,	&one,	sizeof(one),
+						 STARPU_VALUE,	&p2,	sizeof(p2),
+						 0);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 		}
 		}
 	}
 	}
+	return 0;
 }
 }
 
 
 /*
 /*
@@ -424,7 +474,7 @@ void gemv_kernel(starpu_data_handle v1,
 static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 {
 {
 	TYPE p1, p2;
 	TYPE p1, p2;
-	starpu_unpack_cl_args(cl_arg, &p1, &p2);
+	starpu_codelet_unpack_args(cl_arg, &p1, &p2);
 
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -444,7 +494,7 @@ static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 static void scal_axpy_kernel_cpu(void *descr[], void *cl_arg)
 static void scal_axpy_kernel_cpu(void *descr[], void *cl_arg)
 {
 {
 	TYPE p1, p2;
 	TYPE p1, p2;
-	starpu_unpack_cl_args(cl_arg, &p1, &p2);
+	starpu_codelet_unpack_args(cl_arg, &p1, &p2);
 
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -459,35 +509,42 @@ static void scal_axpy_kernel_cpu(void *descr[], void *cl_arg)
 	AXPY(nx, p2, v2, 1, v1, 1);
 	AXPY(nx, p2, v2, 1, v1, 1);
 }
 }
 
 
-static struct starpu_perfmodel_t scal_axpy_kernel_model = {
+static struct starpu_perfmodel scal_axpy_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "scal_axpy_kernel"
 	.symbol = "scal_axpy_kernel"
 };
 };
 
 
-static starpu_codelet scal_axpy_kernel_cl = {
+static struct starpu_codelet scal_axpy_kernel_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = scal_axpy_kernel_cpu,
+	.cpu_funcs = {scal_axpy_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	.cuda_func = scal_axpy_kernel_cuda,
+	.cuda_funcs = {scal_axpy_kernel_cuda, NULL},
 #endif
 #endif
 	.nbuffers = 2,
 	.nbuffers = 2,
 	.model = &scal_axpy_kernel_model
 	.model = &scal_axpy_kernel_model
 };
 };
 
 
-void scal_axpy_kernel(starpu_data_handle v1, TYPE p1,
-			starpu_data_handle v2, TYPE p2,
-			unsigned nblocks)
+int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
+		     starpu_data_handle_t v2, TYPE p2,
+		     unsigned nblocks)
 {
 {
+	int ret;
 	unsigned b;
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	for (b = 0; b < nblocks; b++)
 	{
 	{
-		starpu_insert_task(&scal_axpy_kernel_cl,
-			STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
-			STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
-			STARPU_VALUE, &p1, sizeof(p1),
-			STARPU_VALUE, &p2, sizeof(p2),
-			0);
+		ret = starpu_insert_task(&scal_axpy_kernel_cl,
+					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
+					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
+					 STARPU_VALUE, &p1, sizeof(p1),
+					 STARPU_VALUE, &p2, sizeof(p2),
+					 0);
+		if (ret == -ENODEV) return ret;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	}
 	}
+	return 0;
 }
 }
 
 
 
 
@@ -498,7 +555,7 @@ void scal_axpy_kernel(starpu_data_handle v1, TYPE p1,
 static void axpy_kernel_cuda(void *descr[], void *cl_arg)
 static void axpy_kernel_cuda(void *descr[], void *cl_arg)
 {
 {
 	TYPE p1;
 	TYPE p1;
-	starpu_unpack_cl_args(cl_arg, &p1);
+	starpu_codelet_unpack_args(cl_arg, &p1);
 
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -515,7 +572,7 @@ static void axpy_kernel_cuda(void *descr[], void *cl_arg)
 static void axpy_kernel_cpu(void *descr[], void *cl_arg)
 static void axpy_kernel_cpu(void *descr[], void *cl_arg)
 {
 {
 	TYPE p1;
 	TYPE p1;
-	starpu_unpack_cl_args(cl_arg, &p1);
+	starpu_codelet_unpack_args(cl_arg, &p1);
 
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -527,89 +584,47 @@ static void axpy_kernel_cpu(void *descr[], void *cl_arg)
 	AXPY(nx, p1, v2, 1, v1, 1);
 	AXPY(nx, p1, v2, 1, v1, 1);
 }
 }
 
 
-static struct starpu_perfmodel_t axpy_kernel_model = {
+static struct starpu_perfmodel axpy_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "axpy_kernel"
 	.symbol = "axpy_kernel"
 };
 };
 
 
-static starpu_codelet axpy_kernel_cl = {
+static struct starpu_codelet axpy_kernel_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = axpy_kernel_cpu,
+	.cpu_funcs = {axpy_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	.cuda_func = axpy_kernel_cuda,
+	.cuda_funcs = {axpy_kernel_cuda, NULL},
 #endif
 #endif
 	.nbuffers = 2,
 	.nbuffers = 2,
 	.model = &axpy_kernel_model
 	.model = &axpy_kernel_model
 };
 };
 
 
-void axpy_kernel(starpu_data_handle v1,
-		starpu_data_handle v2, TYPE p1,
+int axpy_kernel(starpu_data_handle_t v1,
+		starpu_data_handle_t v2, TYPE p1,
 		unsigned nblocks)
 		unsigned nblocks)
 {
 {
+	int ret;
 	unsigned b;
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	for (b = 0; b < nblocks; b++)
 	{
 	{
-		starpu_insert_task(&axpy_kernel_cl,
-			STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
-			STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
-			STARPU_VALUE, &p1, sizeof(p1),
-			0);
+		ret = starpu_insert_task(&axpy_kernel_cl,
+					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
+					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
+					 STARPU_VALUE, &p1, sizeof(p1),
+					 0);
+		if (ret == -ENODEV) return ret;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	}
 	}
+	return 0;
 }
 }
 
 
-
-/*
- *	COPY kernel : vector_dst <- vector_src
- */
-
-static void copy_handle_cpu(void *descr[], void *cl_arg)
-{
-	TYPE *dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
-	TYPE *src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
-	
-	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
-	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
-
-	memcpy(dst, src, nx*elemsize);
-}
-
-#ifdef STARPU_USE_CUDA
-static void copy_handle_cuda(void *descr[], void *cl_arg)
-{
-	TYPE *dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
-	TYPE *src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
-	
-	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
-	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
-
-	cudaMemcpyAsync(dst, src, nx*elemsize, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
-}
-#endif
-
-static struct starpu_perfmodel_t copy_handle_model = {
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "copy_handle"
-};
-
-static starpu_codelet copy_handle_cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = copy_handle_cpu,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = copy_handle_cuda,
-#endif
-	.nbuffers = 2,
-	.model = &copy_handle_model
-};
-
-void copy_handle(starpu_data_handle dst, starpu_data_handle src, unsigned nblocks)
+int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks)
 {
 {
 	unsigned b;
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	for (b = 0; b < nblocks; b++)
-	{
-		starpu_insert_task(&copy_handle_cl,
-			STARPU_W, starpu_data_get_sub_data(dst, 1, b),
-			STARPU_R, starpu_data_get_sub_data(src, 1, b),
-			0);
-	}
-} 
+		starpu_data_cpy(starpu_data_get_sub_data(dst, 1, b), starpu_data_get_sub_data(src, 1, b), 1, NULL, NULL);
+	return 0;
+}

+ 1 - 1
examples/incrementer/incrementer.c

@@ -47,7 +47,7 @@ int main(int argc, char **argv)
 		return 77;
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	niter /= 100;
 	niter /= 100;
 #endif
 #endif
 	if (argc == 2)
 	if (argc == 2)

+ 43 - 8
examples/interface/complex_interface.c

@@ -15,9 +15,6 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
-#include <starpu_cuda.h>
-#include <starpu_opencl.h>
-#include <starpu_hash.h>
 
 
 #include "complex_interface.h"
 #include "complex_interface.h"
 
 
@@ -137,7 +134,7 @@ static starpu_ssize_t complex_allocate_data_on_node(void *data_interface, uint32
 		}
 		}
 #endif
 #endif
 		default:
 		default:
-			STARPU_ASSERT(0);
+			STARPU_ABORT();
 	}
 	}
 
 
 	if (fail)
 	if (fail)
@@ -164,6 +161,43 @@ static uint32_t complex_footprint(starpu_data_handle_t handle)
 	return starpu_crc32_be(starpu_complex_get_nx(handle), 0);
 	return starpu_crc32_be(starpu_complex_get_nx(handle), 0);
 }
 }
 
 
+static void *complex_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	return (void*) complex_interface->real;
+}
+
+static int complex_pack_data(starpu_data_handle_t handle, uint32_t node, void **ptr)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	*ptr = malloc(complex_get_size(handle));
+	memcpy(*ptr, complex_interface->real, complex_interface->nx*sizeof(double));
+	memcpy(*ptr+complex_interface->nx*sizeof(double), complex_interface->imaginary, complex_interface->nx*sizeof(double));
+
+	return 0;
+}
+
+static int complex_unpack_data(starpu_data_handle_t handle, uint32_t node, void *ptr)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	memcpy(complex_interface->real, ptr, complex_interface->nx*sizeof(double));
+	memcpy(complex_interface->imaginary, ptr+complex_interface->nx*sizeof(double), complex_interface->nx*sizeof(double));
+
+	return 0;
+}
+
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 static int copy_cuda_async_sync(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, enum cudaMemcpyKind kind, cudaStream_t stream)
 static int copy_cuda_async_sync(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, enum cudaMemcpyKind kind, cudaStream_t stream)
 {
 {
@@ -204,11 +238,10 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *
 #endif
 #endif
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event)
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
 {
 {
 	struct starpu_complex_interface *src_complex = src_interface;
 	struct starpu_complex_interface *src_complex = src_interface;
 	struct starpu_complex_interface *dst_complex = dst_interface;
 	struct starpu_complex_interface *dst_complex = dst_interface;
-	cl_event *event = (cl_event *)_event;
 	cl_int err;
 	cl_int err;
 	int ret;
 	int ret;
 
 
@@ -244,11 +277,10 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_
         return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
         return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
 }
 }
 
 
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event)
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
 {
 {
 	struct starpu_complex_interface *src_complex = src_interface;
 	struct starpu_complex_interface *src_complex = src_interface;
 	struct starpu_complex_interface *dst_complex = dst_interface;
 	struct starpu_complex_interface *dst_complex = dst_interface;
-	cl_event *event = (cl_event *)_event;
 	cl_int err;
 	cl_int err;
 	int ret;
 	int ret;
 
 
@@ -310,6 +342,9 @@ static struct starpu_data_interface_ops interface_complex_ops =
 	.footprint = complex_footprint,
 	.footprint = complex_footprint,
 	.interfaceid = -1,
 	.interfaceid = -1,
 	.interface_size = sizeof(struct starpu_complex_interface),
 	.interface_size = sizeof(struct starpu_complex_interface),
+	.handle_to_pointer = complex_handle_to_pointer,
+	.pack_data = complex_pack_data,
+	.unpack_data = complex_unpack_data
 };
 };
 
 
 void starpu_complex_data_register(starpu_data_handle_t *handleptr, uint32_t home_node, double *real, double *imaginary, int nx)
 void starpu_complex_data_register(starpu_data_handle_t *handleptr, uint32_t home_node, double *real, double *imaginary, int nx)

+ 1 - 1
examples/lu/lu_example.c

@@ -300,7 +300,7 @@ int main(int argc, char **argv)
 
 
 	parse_args(argc, argv);
 	parse_args(argc, argv);
 
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	size /= 4;
 	size /= 4;
 	nblocks /= 4;
 	nblocks /= 4;
 #endif
 #endif

+ 2 - 2
examples/mult/xgemm.c

@@ -33,7 +33,7 @@
 static unsigned niter = 10;
 static unsigned niter = 10;
 static unsigned nslicesx = 4;
 static unsigned nslicesx = 4;
 static unsigned nslicesy = 4;
 static unsigned nslicesy = 4;
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 static unsigned xdim = 256;
 static unsigned xdim = 256;
 static unsigned ydim = 256;
 static unsigned ydim = 256;
 static unsigned zdim = 64;
 static unsigned zdim = 64;
@@ -287,7 +287,7 @@ int main(int argc, char **argv)
 
 
 	parse_args(argc, argv);
 	parse_args(argc, argv);
 
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	niter /= 10;
 	niter /= 10;
 #endif
 #endif
 
 

+ 1 - 1
examples/pipeline/pipeline.c

@@ -41,7 +41,7 @@
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
 
 /* Vector size */
 /* Vector size */
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 #define N 16
 #define N 16
 #else
 #else
 #define N 1048576
 #define N 1048576

+ 253 - 50
examples/reductions/dot_product.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2012 inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,24 +17,48 @@
 
 
 #include <starpu.h>
 #include <starpu.h>
 #include <assert.h>
 #include <assert.h>
+#include <math.h>
+
+#include <reductions/dot_product.h>
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda.h>
 #include <cublas.h>
 #include <cublas.h>
 #endif
 #endif
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 static float *x;
 static float *x;
 static float *y;
 static float *y;
-static starpu_data_handle *x_handles;
-static starpu_data_handle *y_handles;
+static starpu_data_handle_t *x_handles;
+static starpu_data_handle_t *y_handles;
+#ifdef STARPU_USE_OPENCL
+static struct starpu_opencl_program opencl_program;
+#endif
 
 
 static unsigned nblocks = 4096;
 static unsigned nblocks = 4096;
-static unsigned entries_per_bock = 1024;
-
-#define DOT_TYPE double
+static unsigned entries_per_block = 1024;
 
 
 static DOT_TYPE dot = 0.0f;
 static DOT_TYPE dot = 0.0f;
-static starpu_data_handle dot_handle;
+static starpu_data_handle_t dot_handle;
+
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
+{
+	enum starpu_archtype type = starpu_worker_get_type(workerid);
+	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
+		return 1;
+
+#ifdef STARPU_USE_CUDA
+	/* Cuda device */
+	const struct cudaDeviceProp *props;
+	props = starpu_cuda_get_device_properties(workerid);
+	if (props->major >= 2 || props->minor >= 3)
+		/* At least compute capability 1.3, supports doubles */
+		return 1;
+#endif
+	/* Old card, does not support doubles */
+	return 0;
+}
 
 
 /*
 /*
  *	Codelet to create a neutral element
  *	Codelet to create a neutral element
@@ -49,16 +74,45 @@ void init_cpu_func(void *descr[], void *cl_arg)
 void init_cuda_func(void *descr[], void *cl_arg)
 void init_cuda_func(void *descr[], void *cl_arg)
 {
 {
 	DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	cudaMemset(dot, 0, sizeof(DOT_TYPE));
-	cudaThreadSynchronize();
+	cudaMemsetAsync(dot, 0, sizeof(DOT_TYPE), starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+#ifdef STARPU_USE_OPENCL
+void init_opencl_func(void *buffers[], void *args)
+{
+        cl_int err;
+	cl_command_queue queue;
+
+	cl_mem dot = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]);
+	starpu_opencl_get_current_queue(&queue);
+	DOT_TYPE zero = (DOT_TYPE) 0.0;
+
+	err = clEnqueueWriteBuffer(queue,
+			dot,
+			CL_TRUE,
+			0,
+			sizeof(DOT_TYPE),
+			&zero,
+			0,
+			NULL,
+			NULL);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
 }
 }
 #endif
 #endif
 
 
-static struct starpu_codelet_t init_codelet = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = init_cpu_func,
+static struct starpu_codelet init_codelet =
+{
+	.can_execute = can_execute,
+	.cpu_funcs = {init_cpu_func, NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	.cuda_func = init_cuda_func,
+	.cuda_funcs = {init_cuda_func, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {init_opencl_func, NULL},
 #endif
 #endif
 	.nbuffers = 1
 	.nbuffers = 1
 };
 };
@@ -75,9 +129,71 @@ void redux_cpu_func(void *descr[], void *cl_arg)
 	*dota = *dota + *dotb;
 	*dota = *dota + *dotb;
 }
 }
 
 
-static struct starpu_codelet_t redux_codelet = {
-	.where = STARPU_CPU,
-	.cpu_func = redux_cpu_func,
+#ifdef STARPU_USE_CUDA
+extern void redux_cuda_func(void *descr[], void *_args);
+#endif
+
+#ifdef STARPU_USE_OPENCL
+void redux_opencl_func(void *buffers[], void *args)
+{
+	int id, devid;
+        cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+
+	cl_mem dota = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]);
+	cl_mem dotb = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[1]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_redux_opencl", devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 0, sizeof(dota), &dota);
+	err|= clSetKernelArg(kernel, 1, sizeof(dotb), &dotb);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=1;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+                if (local > global)
+			local=global;
+
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+}
+#endif
+
+static struct starpu_codelet redux_codelet =
+{
+	.can_execute = can_execute,
+	.cpu_funcs = {redux_cpu_func, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {redux_cuda_func, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {redux_opencl_func, NULL},
+#endif
 	.nbuffers = 2
 	.nbuffers = 2
 };
 };
 
 
@@ -116,57 +232,122 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 
 
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 
 
-	cudaMemcpy(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost);
-
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 
 
 	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
 	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
 
 
-	//fprintf(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot);
+	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	current_dot += local_dot;
 	current_dot += local_dot;
 
 
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(dot, &current_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+#ifdef STARPU_USE_OPENCL
+void dot_opencl_func(void *buffers[], void *args)
+{
+	int id, devid;
+        cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+
+	cl_mem x = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
+	cl_mem y = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]);
+	cl_mem dot = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[2]);
+	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_dot_opencl", devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 0, sizeof(x), &x);
+	err|= clSetKernelArg(kernel, 1, sizeof(y), &y);
+	err|= clSetKernelArg(kernel, 2, sizeof(dot), &dot);
+	err|= clSetKernelArg(kernel, 3, sizeof(n), &n);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=1;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+                if (local > global)
+			local=global;
+
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
 
 
-	cudaMemcpy(dot, &current_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice);
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
 
 
-	cudaThreadSynchronize();
+	starpu_opencl_release_kernel(kernel);
 }
 }
 #endif
 #endif
 
 
-static struct starpu_codelet_t dot_codelet = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = dot_cpu_func,
+static struct starpu_codelet dot_codelet =
+{
+	.can_execute = can_execute,
+	.cpu_funcs = {dot_cpu_func, NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	.cuda_func = dot_cuda_func,
+	.cuda_funcs = {dot_cuda_func, NULL},
 #endif
 #endif
-	.nbuffers = 3
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {dot_opencl_func, NULL},
+#endif
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_REDUX}
 };
 };
 
 
 /*
 /*
  *	Tasks initialization
  *	Tasks initialization
  */
  */
 
 
-extern void starpu_data_end_reduction_mode(starpu_data_handle handle);
-
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("examples/reductions/dot_product_opencl_kernels.cl",
+						  &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
 
 
 	starpu_helper_cublas_init();
 	starpu_helper_cublas_init();
 
 
-	unsigned long nelems = nblocks*entries_per_bock;
+	unsigned long nelems = nblocks*entries_per_block;
 	size_t size = nelems*sizeof(float);
 	size_t size = nelems*sizeof(float);
 
 
-	x = malloc(size);
-	y = malloc(size);
+	x = (float *) malloc(size);
+	y = (float *) malloc(size);
 
 
-	x_handles = calloc(nblocks, sizeof(starpu_data_handle));
-	y_handles = calloc(nblocks, sizeof(starpu_data_handle));
+	x_handles = (starpu_data_handle_t *) calloc(nblocks, sizeof(starpu_data_handle_t));
+	y_handles = (starpu_data_handle_t *) calloc(nblocks, sizeof(starpu_data_handle_t));
 
 
 	assert(x && y);
 	assert(x && y);
 
 
         starpu_srand48(0);
         starpu_srand48(0);
-	
+
 	DOT_TYPE reference_dot = 0.0;
 	DOT_TYPE reference_dot = 0.0;
 
 
 	unsigned long i;
 	unsigned long i;
@@ -176,15 +357,15 @@ int main(int argc, char **argv)
 		y[i] = (float)starpu_drand48();
 		y[i] = (float)starpu_drand48();
 
 
 		reference_dot += (DOT_TYPE)x[i]*(DOT_TYPE)y[i];
 		reference_dot += (DOT_TYPE)x[i]*(DOT_TYPE)y[i];
-	} 
-	
+	}
+
 	unsigned block;
 	unsigned block;
 	for (block = 0; block < nblocks; block++)
 	for (block = 0; block < nblocks; block++)
 	{
 	{
 		starpu_vector_data_register(&x_handles[block], 0,
 		starpu_vector_data_register(&x_handles[block], 0,
-			(uintptr_t)&x[entries_per_bock*block], entries_per_bock, sizeof(float));
+			(uintptr_t)&x[entries_per_block*block], entries_per_block, sizeof(float));
 		starpu_vector_data_register(&y_handles[block], 0,
 		starpu_vector_data_register(&y_handles[block], 0,
-			(uintptr_t)&y[entries_per_bock*block], entries_per_bock, sizeof(float));
+			(uintptr_t)&y[entries_per_block*block], entries_per_block, sizeof(float));
 	}
 	}
 
 
 	starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(DOT_TYPE));
 	starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(DOT_TYPE));
@@ -199,25 +380,47 @@ int main(int argc, char **argv)
 		struct starpu_task *task = starpu_task_create();
 		struct starpu_task *task = starpu_task_create();
 
 
 		task->cl = &dot_codelet;
 		task->cl = &dot_codelet;
+		task->destroy = 1;
 
 
-		task->buffers[0].handle = x_handles[block];
-		task->buffers[0].mode = STARPU_R;
-		task->buffers[1].handle = y_handles[block];
-		task->buffers[1].mode = STARPU_R;
-		task->buffers[2].handle = dot_handle;
-		task->buffers[2].mode = STARPU_REDUX;
+		task->handles[0] = x_handles[block];
+		task->handles[1] = y_handles[block];
+		task->handles[2] = dot_handle;
 
 
-		int ret = starpu_task_submit(task);
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV) goto enodev;
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
 	}
 	}
 
 
+	for (block = 0; block < nblocks; block++)
+	{
+		starpu_data_unregister(x_handles[block]);
+		starpu_data_unregister(y_handles[block]);
+	}
 	starpu_data_unregister(dot_handle);
 	starpu_data_unregister(dot_handle);
 
 
-	fprintf(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
+	FPRINTF(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
 
 
 	starpu_helper_cublas_shutdown();
 	starpu_helper_cublas_shutdown();
 
 
+#ifdef STARPU_USE_OPENCL
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
 	starpu_shutdown();
 	starpu_shutdown();
 
 
-	return 0;
+	free(x);
+	free(y);
+	free(x_handles);
+	free(y_handles);
+
+	if (fabs(reference_dot - dot) < reference_dot * 1e-6)
+		return EXIT_SUCCESS;
+	else
+		return EXIT_FAILURE;
+
+enodev:
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	return 77;
 }
 }

+ 1 - 1
examples/reductions/minmax_reduction.c

@@ -19,7 +19,7 @@
 #include <limits.h>
 #include <limits.h>
 #include <starpu.h>
 #include <starpu.h>
 
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 static unsigned nblocks = 512;
 static unsigned nblocks = 512;
 static unsigned entries_per_bock = 64;
 static unsigned entries_per_bock = 64;
 #else
 #else

+ 11 - 7
examples/stencil/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2010-2011  Université de Bordeaux 1
+# Copyright (C) 2010-2012  Université de Bordeaux 1
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
 # it under the terms of the GNU Lesser General Public License as published by
@@ -13,13 +13,14 @@
 #
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 
-AM_CFLAGS = $(HWLOC_CFLAGS)
-LIBS = $(top_builddir)/src/libstarpu.la $(HWLOC_LIBS) @LIBS@
+AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) 
 
 
 if USE_MPI
 if USE_MPI
-LIBS += $(top_builddir)/mpi/libstarpumpi.la
-AM_CPPFLAGS += -I$(top_srcdir)/mpi/
+LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+AM_CPPFLAGS += -I$(top_srcdir)/mpi/include
 endif
 endif
 
 
 CC = $(CC_OR_MPICC)
 CC = $(CC_OR_MPICC)
@@ -34,7 +35,7 @@ NVCCFLAGS += $(HWLOC_CFLAGS)
 
 
 .cu.o:
 .cu.o:
 	$(MKDIR_P) `dirname $@`
 	$(MKDIR_P) `dirname $@`
-	$(NVCC) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I$(top_srcdir)/include/ -I$(top_builddir)/include/
+	$(NVCC) $< -c -o $@ --compiler-options -fno-strict-aliasing -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(NVCCFLAGS)
 
 
 
 
 endif
 endif
@@ -46,7 +47,7 @@ endif
 check_PROGRAMS =				\
 check_PROGRAMS =				\
 	stencil
 	stencil
 
 
-examplebindir = $(libdir)/starpu/examples/
+examplebindir = $(libdir)/starpu/examples/stencil
 
 
 examplebin_PROGRAMS =				\
 examplebin_PROGRAMS =				\
 	stencil
 	stencil
@@ -112,3 +113,6 @@ CLEANFILES = *.xpm
 
 
 view:
 view:
 	feh --zoom 800 -F 0.xpm 0.5.xpm 1.xpm 2.xpm 3.xpm 4.xpm 6.xpm mpi.xpm
 	feh --zoom 800 -F 0.xpm 0.5.xpm 1.xpm 2.xpm 3.xpm 4.xpm 6.xpm mpi.xpm
+
+showcheck:
+	-cat $(TEST_LOGS) /dev/null

+ 103 - 63
examples/stencil/stencil-kernels.c

@@ -1,6 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
  * the Free Software Foundation; either version 2.1 of the License, or (at
  * the Free Software Foundation; either version 2.1 of the License, or (at
@@ -16,17 +18,14 @@
 #include "stencil.h"
 #include "stencil.h"
 #include <sys/time.h>
 #include <sys/time.h>
 
 
-#ifdef STARPU_USE_OPENCL
-#include <CL/cl.h>
-#include <starpu_opencl.h>
-#endif
-
 #ifndef timersub
 #ifndef timersub
 #define	timersub(x, y, res) \
 #define	timersub(x, y, res) \
-	do { \
+	do \
+	{						   \
 		(res)->tv_sec = (x)->tv_sec - (y)->tv_sec; \
 		(res)->tv_sec = (x)->tv_sec - (y)->tv_sec; \
 		(res)->tv_usec = (x)->tv_usec - (y)->tv_usec; \
 		(res)->tv_usec = (x)->tv_usec - (y)->tv_usec; \
-		if ((res)->tv_usec < 0) { \
+		if ((res)->tv_usec < 0) \
+		{			 \
 			(res)->tv_sec--; \
 			(res)->tv_sec--; \
 			(res)->tv_usec += 1000000; \
 			(res)->tv_usec += 1000000; \
 		} \
 		} \
@@ -34,10 +33,12 @@
 #endif
 #endif
 #ifndef timeradd
 #ifndef timeradd
 #define	timeradd(x, y, res) \
 #define	timeradd(x, y, res) \
-	do { \
+	do \
+	{						   \
 		(res)->tv_sec = (x)->tv_sec + (y)->tv_sec; \
 		(res)->tv_sec = (x)->tv_sec + (y)->tv_sec; \
 		(res)->tv_usec = (x)->tv_usec + (y)->tv_usec; \
 		(res)->tv_usec = (x)->tv_usec + (y)->tv_usec; \
-		if ((res)->tv_usec >= 1000000) { \
+		if ((res)->tv_usec >= 1000000) \
+		{			       \
 			(res)->tv_sec++; \
 			(res)->tv_sec++; \
 			(res)->tv_usec -= 1000000; \
 			(res)->tv_usec -= 1000000; \
 		} \
 		} \
@@ -124,6 +125,9 @@ int *who_runs_what;
 int *who_runs_what_index;
 int *who_runs_what_index;
 struct timeval *last_tick;
 struct timeval *last_tick;
 
 
+/* Achieved iterations */
+static int achieved_iter;
+
 /* Record how many updates each worker performed */
 /* Record how many updates each worker performed */
 unsigned update_per_worker[STARPU_NMAXWORKERS];
 unsigned update_per_worker[STARPU_NMAXWORKERS];
 
 
@@ -135,7 +139,8 @@ static void record_who_runs_what(struct block_description *block)
 	gettimeofday(&tv, NULL);
 	gettimeofday(&tv, NULL);
 	timersub(&tv, &start, &tv2);
 	timersub(&tv, &start, &tv2);
 	timersub(&tv2, &last_tick[block->bz], &diff);
 	timersub(&tv2, &last_tick[block->bz], &diff);
-	while (timercmp(&diff, &delta, >=)) {
+	while (timercmp(&diff, &delta, >=))
+	{
 		timeradd(&last_tick[block->bz], &delta, &last_tick[block->bz]);
 		timeradd(&last_tick[block->bz], &delta, &last_tick[block->bz]);
 		timersub(&tv2, &last_tick[block->bz], &diff);
 		timersub(&tv2, &last_tick[block->bz], &diff);
 		if (who_runs_what_index[block->bz] < who_runs_what_len)
 		if (who_runs_what_index[block->bz] < who_runs_what_len)
@@ -146,7 +151,7 @@ static void record_who_runs_what(struct block_description *block)
 		who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid);
 		who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid);
 }
 }
 
 
-static void check_load(starpu_block_interface_t *block, starpu_block_interface_t *boundary)
+static void check_load(struct starpu_block_interface *block, struct starpu_block_interface *boundary)
 {
 {
 	/* Sanity checks */
 	/* Sanity checks */
 	STARPU_ASSERT(block->nx == boundary->nx);
 	STARPU_ASSERT(block->nx == boundary->nx);
@@ -162,10 +167,12 @@ static void check_load(starpu_block_interface_t *block, starpu_block_interface_t
 /*
 /*
  * Load a neighbour's boundary into block, CPU version
  * Load a neighbour's boundary into block, CPU version
  */
  */
-static void load_subblock_from_buffer_cpu(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_from_buffer_cpu(void *_block,
+					void *_boundary,
 					unsigned firstz)
 					unsigned firstz)
 {
 {
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
 	check_load(block, boundary);
 	check_load(block, boundary);
 
 
 	/* We do a contiguous memory transfer */
 	/* We do a contiguous memory transfer */
@@ -181,10 +188,12 @@ static void load_subblock_from_buffer_cpu(starpu_block_interface_t *block,
  * Load a neighbour's boundary into block, CUDA version
  * Load a neighbour's boundary into block, CUDA version
  */
  */
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-static void load_subblock_from_buffer_cuda(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_from_buffer_cuda(void *_block,
+					void *_boundary,
 					unsigned firstz)
 					unsigned firstz)
 {
 {
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
 	check_load(block, boundary);
 	check_load(block, boundary);
 
 
 	/* We do a contiguous memory transfer */
 	/* We do a contiguous memory transfer */
@@ -241,17 +250,17 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
 
 
 	for (i=1; i<=K; i++)
 	for (i=1; i<=K; i++)
 	{
 	{
-		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
+		struct starpu_block_interface *oldb = descr[i%2], *newb = descr[(i+1)%2];
+		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
 
 
 		/* Shadow data */
 		/* Shadow data */
 		cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 		cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 
 
 		/* And perform actual computation */
 		/* And perform actual computation */
 #ifdef LIFE
 #ifdef LIFE
-		cuda_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+		cuda_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 #else
 #else
-		cudaMemcpyAsync(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+		cudaMemcpyAsync(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
 #endif /* LIFE */
 #endif /* LIFE */
 	}
 	}
 
 
@@ -259,6 +268,8 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
 	if ((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess)
 	if ((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(cures);
 		STARPU_CUDA_REPORT_ERROR(cures);
 
 
+	if (block->bz == 0)
+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
 }
 }
 #endif /* STARPU_USE_CUDA */
 #endif /* STARPU_USE_CUDA */
 
 
@@ -266,8 +277,8 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
  * Load a neighbour's boundary into block, OpenCL version
  * Load a neighbour's boundary into block, OpenCL version
  */
  */
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-static void load_subblock_from_buffer_opencl(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_from_buffer_opencl(struct starpu_block_interface *block,
+					struct starpu_block_interface *boundary,
 					unsigned firstz)
 					unsigned firstz)
 {
 {
 	check_load(block, boundary);
 	check_load(block, boundary);
@@ -278,10 +289,14 @@ static void load_subblock_from_buffer_opencl(starpu_block_interface_t *block,
 	unsigned offset = firstz*block->ldz;
 	unsigned offset = firstz*block->ldz;
 	cl_mem block_data = (cl_mem)block->ptr;
 	cl_mem block_data = (cl_mem)block->ptr;
 	cl_mem boundary_data = (cl_mem)boundary->ptr;
 	cl_mem boundary_data = (cl_mem)boundary->ptr;
+	cl_event event;
 
 
         cl_command_queue cq;
         cl_command_queue cq;
         starpu_opencl_get_current_queue(&cq);
         starpu_opencl_get_current_queue(&cq);
-        clEnqueueCopyBuffer(cq, boundary_data, block_data, 0, offset, boundary_size, 0, NULL, NULL);
+        clEnqueueCopyBuffer(cq, boundary_data, block_data, 0, offset, boundary_size, 0, NULL, &event);
+
+	clWaitForEvents(1, &event);
+	clReleaseEvent(event);
 }
 }
 
 
 /*
 /*
@@ -332,17 +347,20 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
 
 
 	for (i=1; i<=K; i++)
 	for (i=1; i<=K; i++)
 	{
 	{
-		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
+		struct starpu_block_interface *oldb = descr[i%2], *newb = descr[(i+1)%2];
+		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
 
 
 		/* Shadow data */
 		/* Shadow data */
 		opencl_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 		opencl_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 
 
 		/* And perform actual computation */
 		/* And perform actual computation */
 #ifdef LIFE
 #ifdef LIFE
-		opencl_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+		opencl_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 #else
 #else
-                clEnqueueCopyBuffer(cq, old, new, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), 0, NULL, NULL);
+		cl_event event;
+                clEnqueueCopyBuffer(cq, old, newer, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), 0, NULL, &event);
+		clWaitForEvents(1, &event);
+		clReleaseEvent(event);
 #endif /* LIFE */
 #endif /* LIFE */
 	}
 	}
 
 
@@ -350,6 +368,8 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
 	if ((err = clFinish(cq)))
 	if ((err = clFinish(cq)))
 		STARPU_OPENCL_REPORT_ERROR(err);
 		STARPU_OPENCL_REPORT_ERROR(err);
 
 
+	if (block->bz == 0)
+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
 }
 }
 #endif /* STARPU_USE_OPENCL */
 #endif /* STARPU_USE_OPENCL */
 
 
@@ -358,7 +378,7 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
  */
  */
 static void update_func_cpu(void *descr[], void *arg)
 static void update_func_cpu(void *descr[], void *arg)
 {
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
 	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
 	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
 	if (block->bz == 0)
 	if (block->bz == 0)
@@ -398,8 +418,8 @@ fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 
 
 	for (i=1; i<=K; i++)
 	for (i=1; i<=K; i++)
 	{
 	{
-		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
+		struct starpu_block_interface *oldb = (struct starpu_block_interface *) descr[i%2], *newb = (struct starpu_block_interface *) descr[(i+1)%2];
+		TYPE *old = (TYPE*) oldb->ptr, *newer = (TYPE*) newb->ptr;
 
 
 		/* Shadow data */
 		/* Shadow data */
 		unsigned ldy = oldb->ldy, ldz = oldb->ldz;
 		unsigned ldy = oldb->ldy, ldz = oldb->ldz;
@@ -417,20 +437,25 @@ fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 
 
 		/* And perform actual computation */
 		/* And perform actual computation */
 #ifdef LIFE
 #ifdef LIFE
-		life_update(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+		life_update(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 #else
 #else
-		memcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new));
+		memcpy(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer));
 #endif /* LIFE */
 #endif /* LIFE */
 	}
 	}
+
+	if (block->bz == 0)
+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
 }
 }
 
 
 /* Performance model and codelet structure */
 /* Performance model and codelet structure */
-static struct starpu_perfmodel_t cl_update_model = {
+static struct starpu_perfmodel cl_update_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "cl_update" 
 	.symbol = "cl_update" 
 };
 };
 
 
-starpu_codelet cl_update = {
+struct starpu_codelet cl_update =
+{
 	.where = 0 |
 	.where = 0 |
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		STARPU_CUDA|
 		STARPU_CUDA|
@@ -439,15 +464,16 @@ starpu_codelet cl_update = {
                 STARPU_OPENCL|
                 STARPU_OPENCL|
 #endif
 #endif
 		STARPU_CPU,
 		STARPU_CPU,
-	.cpu_func = update_func_cpu,
+	.cpu_funcs = {update_func_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	.cuda_func = update_func_cuda,
+	.cuda_funcs = {update_func_cuda, NULL},
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = update_func_opencl,
+	.opencl_funcs = {update_func_opencl, NULL},
 #endif
 #endif
 	.model = &cl_update_model,
 	.model = &cl_update_model,
-	.nbuffers = 6
+	.nbuffers = 6,
+	.modes = {STARPU_RW, STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
 };
 };
 
 
 /*
 /*
@@ -455,10 +481,12 @@ starpu_codelet cl_update = {
  */
  */
 
 
 /* CPU version */
 /* CPU version */
-static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_into_buffer_cpu(void *_block,
+					void *_boundary,
 					unsigned firstz)
 					unsigned firstz)
 {
 {
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
 	check_load(block, boundary);
 	check_load(block, boundary);
 
 
 	/* We do a contiguous memory transfer */
 	/* We do a contiguous memory transfer */
@@ -472,10 +500,12 @@ static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
 
 
 /* CUDA version */
 /* CUDA version */
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_into_buffer_cuda(void *_block,
+					void *_boundary,
 					unsigned firstz)
 					unsigned firstz)
 {
 {
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
 	check_load(block, boundary);
 	check_load(block, boundary);
 
 
 	/* We do a contiguous memory transfer */
 	/* We do a contiguous memory transfer */
@@ -490,8 +520,8 @@ static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
 
 
 /* OPENCL version */
 /* OPENCL version */
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-static void load_subblock_into_buffer_opencl(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_into_buffer_opencl(struct starpu_block_interface *block,
+					struct starpu_block_interface *boundary,
 					unsigned firstz)
 					unsigned firstz)
 {
 {
 	check_load(block, boundary);
 	check_load(block, boundary);
@@ -505,8 +535,12 @@ static void load_subblock_into_buffer_opencl(starpu_block_interface_t *block,
 
 
         cl_command_queue cq;
         cl_command_queue cq;
         starpu_opencl_get_current_queue(&cq);
         starpu_opencl_get_current_queue(&cq);
+	cl_event event;
+
+        clEnqueueCopyBuffer(cq, block_data, boundary_data, offset, 0, boundary_size, 0, NULL, &event);
 
 
-        clEnqueueCopyBuffer(cq, block_data, boundary_data, offset, 0, boundary_size, 0, NULL, NULL);
+	clWaitForEvents(1, &event);
+	clReleaseEvent(event);
 }
 }
 #endif /* STARPU_USE_OPENCL */
 #endif /* STARPU_USE_OPENCL */
 
 
@@ -517,7 +551,7 @@ unsigned bottom_per_worker[STARPU_NMAXWORKERS];
 /* top save, CPU version */
 /* top save, CPU version */
 static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
 static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
 {
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
 	top_per_worker[workerid]++;
 	top_per_worker[workerid]++;
 
 
@@ -533,7 +567,7 @@ static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
 /* bottom save, CPU version */
 /* bottom save, CPU version */
 static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *arg)
 static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *arg)
 {
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
 	bottom_per_worker[workerid]++;
 	bottom_per_worker[workerid]++;
 
 
@@ -547,7 +581,7 @@ static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *a
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg)
 static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg)
 {
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
 	top_per_worker[workerid]++;
 	top_per_worker[workerid]++;
 
 
@@ -564,7 +598,7 @@ static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg
 /* bottom save, CUDA version */
 /* bottom save, CUDA version */
 static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *arg)
 static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *arg)
 {
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
 	bottom_per_worker[workerid]++;
 	bottom_per_worker[workerid]++;
 
 
@@ -580,7 +614,7 @@ static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 static void dummy_func_top_opencl(void *descr[] __attribute__((unused)), void *arg)
 static void dummy_func_top_opencl(void *descr[] __attribute__((unused)), void *arg)
 {
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
 	top_per_worker[workerid]++;
 	top_per_worker[workerid]++;
 
 
@@ -600,7 +634,7 @@ static void dummy_func_top_opencl(void *descr[] __attribute__((unused)), void *a
 /* bottom save, OPENCL version */
 /* bottom save, OPENCL version */
 static void dummy_func_bottom_opencl(void *descr[] __attribute__((unused)), void *arg)
 static void dummy_func_bottom_opencl(void *descr[] __attribute__((unused)), void *arg)
 {
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
 	bottom_per_worker[workerid]++;
 	bottom_per_worker[workerid]++;
 
 
@@ -616,17 +650,20 @@ static void dummy_func_bottom_opencl(void *descr[] __attribute__((unused)), void
 #endif /* STARPU_USE_OPENCL */
 #endif /* STARPU_USE_OPENCL */
 
 
 /* Performance models and codelet for save */
 /* Performance models and codelet for save */
-static struct starpu_perfmodel_t save_cl_bottom_model = {
+static struct starpu_perfmodel save_cl_bottom_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "save_cl_bottom" 
 	.symbol = "save_cl_bottom" 
 };
 };
 
 
-static struct starpu_perfmodel_t save_cl_top_model = {
+static struct starpu_perfmodel save_cl_top_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "save_cl_top" 
 	.symbol = "save_cl_top" 
 };
 };
 
 
-starpu_codelet save_cl_bottom = {
+struct starpu_codelet save_cl_bottom =
+{
 	.where = 0 |
 	.where = 0 |
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		STARPU_CUDA|
 		STARPU_CUDA|
@@ -635,18 +672,20 @@ starpu_codelet save_cl_bottom = {
 		STARPU_OPENCL|
 		STARPU_OPENCL|
 #endif
 #endif
 		STARPU_CPU,
 		STARPU_CPU,
-	.cpu_func = dummy_func_bottom_cpu,
+	.cpu_funcs = {dummy_func_bottom_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	.cuda_func = dummy_func_bottom_cuda,
+	.cuda_funcs = {dummy_func_bottom_cuda, NULL},
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = dummy_func_bottom_opencl,
+	.opencl_funcs = {dummy_func_bottom_opencl, NULL},
 #endif
 #endif
 	.model = &save_cl_bottom_model,
 	.model = &save_cl_bottom_model,
-	.nbuffers = 4
+	.nbuffers = 4,
+	.modes = {STARPU_R, STARPU_R, STARPU_W, STARPU_W}
 };
 };
 
 
-starpu_codelet save_cl_top = {
+struct starpu_codelet save_cl_top =
+{
 	.where = 0|
 	.where = 0|
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		STARPU_CUDA|
 		STARPU_CUDA|
@@ -655,13 +694,14 @@ starpu_codelet save_cl_top = {
 		STARPU_OPENCL|
 		STARPU_OPENCL|
 #endif
 #endif
 		STARPU_CPU,
 		STARPU_CPU,
-	.cpu_func = dummy_func_top_cpu,
+	.cpu_funcs = {dummy_func_top_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	.cuda_func = dummy_func_top_cuda,
+	.cuda_funcs = {dummy_func_top_cuda, NULL},
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = dummy_func_top_opencl,
+	.opencl_funcs = {dummy_func_top_opencl, NULL},
 #endif
 #endif
 	.model = &save_cl_top_model,
 	.model = &save_cl_top_model,
-	.nbuffers = 4
+	.nbuffers = 4,
+	.modes = {STARPU_R, STARPU_R, STARPU_W, STARPU_W}
 };
 };

+ 47 - 49
examples/stencil/stencil-tasks.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -48,17 +49,13 @@ static void create_task_save_local(unsigned iter, unsigned z, int dir, unsigned
 	save_task->cl_arg = descr;
 	save_task->cl_arg = descr;
 
 
 	/* Saving our border... */
 	/* Saving our border... */
-	save_task->buffers[0].handle = descr->layers_handle[0];
-	save_task->buffers[0].mode = STARPU_R;
-	save_task->buffers[1].handle = descr->layers_handle[1];
-	save_task->buffers[1].mode = STARPU_R;
+	save_task->handles[0] = descr->layers_handle[0];
+	save_task->handles[1] = descr->layers_handle[1];
 
 
 	/* ... to the neighbour's copy */
 	/* ... to the neighbour's copy */
 	struct block_description *neighbour = descr->boundary_blocks[(1+dir)/2];
 	struct block_description *neighbour = descr->boundary_blocks[(1+dir)/2];
-	save_task->buffers[2].handle = neighbour->boundaries_handle[(1-dir)/2][0];
-	save_task->buffers[2].mode = STARPU_W;
-	save_task->buffers[3].handle = neighbour->boundaries_handle[(1-dir)/2][1];
-	save_task->buffers[3].mode = STARPU_W;
+	save_task->handles[2] = neighbour->boundaries_handle[(1-dir)/2][0];
+	save_task->handles[3] = neighbour->boundaries_handle[(1-dir)/2][1];
 
 
 	/* Bind */
 	/* Bind */
 	if (iter <= BIND_LAST)
 	if (iter <= BIND_LAST)
@@ -69,14 +66,15 @@ static void create_task_save_local(unsigned iter, unsigned z, int dir, unsigned
 	if (ret)
 	if (ret)
 	{
 	{
 		fprintf(stderr, "Could not submit task save: %d\n", ret);
 		fprintf(stderr, "Could not submit task save: %d\n", ret);
-		STARPU_ASSERT(0);
+		STARPU_ABORT();
 	}
 	}
 }
 }
 
 
 /* R(z) = local & R(z+d) != local */
 /* R(z) = local & R(z+d) != local */
 /* We need to send our save over MPI */
 /* We need to send our save over MPI */
 
 
-static void send_done(void *arg) {
+static void send_done(void *arg)
+{
 	uintptr_t z = (uintptr_t) arg;
 	uintptr_t z = (uintptr_t) arg;
 	DEBUG("DO SEND %d\n", (int)z);
 	DEBUG("DO SEND %d\n", (int)z);
 }
 }
@@ -93,8 +91,8 @@ static void create_task_save_mpi_send(unsigned iter, unsigned z, int dir, unsign
 	STARPU_ASSERT(neighbour->mpi_node != local_rank);
 	STARPU_ASSERT(neighbour->mpi_node != local_rank);
 
 
 	/* Send neighbour's border copy to the neighbour */
 	/* Send neighbour's border copy to the neighbour */
-	starpu_data_handle handle0 = neighbour->boundaries_handle[(1-dir)/2][0];
-	starpu_data_handle handle1 = neighbour->boundaries_handle[(1-dir)/2][1];
+	starpu_data_handle_t handle0 = neighbour->boundaries_handle[(1-dir)/2][0];
+	starpu_data_handle_t handle1 = neighbour->boundaries_handle[(1-dir)/2][1];
 
 
 	starpu_mpi_isend_detached(handle0, dest, MPI_TAG0(z, iter, dir), MPI_COMM_WORLD, send_done, (void*)(uintptr_t)z);
 	starpu_mpi_isend_detached(handle0, dest, MPI_TAG0(z, iter, dir), MPI_COMM_WORLD, send_done, (void*)(uintptr_t)z);
 	starpu_mpi_isend_detached(handle1, dest, MPI_TAG1(z, iter, dir), MPI_COMM_WORLD, send_done, (void*)(uintptr_t)z);
 	starpu_mpi_isend_detached(handle1, dest, MPI_TAG1(z, iter, dir), MPI_COMM_WORLD, send_done, (void*)(uintptr_t)z);
@@ -103,7 +101,8 @@ static void create_task_save_mpi_send(unsigned iter, unsigned z, int dir, unsign
 /* R(z) != local & R(z+d) = local */
 /* R(z) != local & R(z+d) = local */
 /* We need to receive over MPI */
 /* We need to receive over MPI */
 
 
-static void recv_done(void *arg) {
+static void recv_done(void *arg)
+{
 	uintptr_t z = (uintptr_t) arg;
 	uintptr_t z = (uintptr_t) arg;
 	DEBUG("DO RECV %d\n", (int)z);
 	DEBUG("DO RECV %d\n", (int)z);
 }
 }
@@ -119,13 +118,13 @@ static void create_task_save_mpi_recv(unsigned iter, unsigned z, int dir, unsign
 	STARPU_ASSERT(neighbour->mpi_node == local_rank);
 	STARPU_ASSERT(neighbour->mpi_node == local_rank);
 
 
 	/* Receive our neighbour's border in our neighbour copy */
 	/* Receive our neighbour's border in our neighbour copy */
-	starpu_data_handle handle0 = neighbour->boundaries_handle[(1-dir)/2][0];
-	starpu_data_handle handle1 = neighbour->boundaries_handle[(1-dir)/2][1];
+	starpu_data_handle_t handle0 = neighbour->boundaries_handle[(1-dir)/2][0];
+	starpu_data_handle_t handle1 = neighbour->boundaries_handle[(1-dir)/2][1];
 
 
 	starpu_mpi_irecv_detached(handle0, source, MPI_TAG0(z, iter, dir), MPI_COMM_WORLD, recv_done, (void*)(uintptr_t)z);
 	starpu_mpi_irecv_detached(handle0, source, MPI_TAG0(z, iter, dir), MPI_COMM_WORLD, recv_done, (void*)(uintptr_t)z);
 	starpu_mpi_irecv_detached(handle1, source, MPI_TAG1(z, iter, dir), MPI_COMM_WORLD, recv_done, (void*)(uintptr_t)z);
 	starpu_mpi_irecv_detached(handle1, source, MPI_TAG1(z, iter, dir), MPI_COMM_WORLD, recv_done, (void*)(uintptr_t)z);
 }
 }
-#endif // STARPU_USE_MPI
+#endif /* STARPU_USE_MPI */
 
 
 /*
 /*
  * Schedule saving boundaries of blocks to communication buffers
  * Schedule saving boundaries of blocks to communication buffers
@@ -141,26 +140,28 @@ void create_task_save(unsigned iter, unsigned z, int dir, unsigned local_rank)
 		/* Save data from update */
 		/* Save data from update */
 		create_task_save_local(iter, z, dir, local_rank);
 		create_task_save_local(iter, z, dir, local_rank);
 		if (node_z_and_d != local_rank)
 		if (node_z_and_d != local_rank)
-		{ // R(z) = local & R(z+d) != local, We have to send the data
+		{ /* R(z) = local & R(z+d) != local, We have to send the data */
 			create_task_save_mpi_send(iter, z, dir, local_rank);
 			create_task_save_mpi_send(iter, z, dir, local_rank);
 		}
 		}
 
 
 	}
 	}
-	else {	// node_z != local_rank, this MPI node doesn't have the saved data
+	else
+	{	/* node_z != local_rank, this MPI node doesn't have the saved data */
 		if (node_z_and_d == local_rank)
 		if (node_z_and_d == local_rank)
 		{
 		{
 			create_task_save_mpi_recv(iter, z, dir, local_rank);
 			create_task_save_mpi_recv(iter, z, dir, local_rank);
 		}
 		}
-		else {  // R(z) != local & R(z+d) != local We don't have
-			// the saved data and don't need it, we shouldn't
-			// even have been called!
-			STARPU_ASSERT(0);
+		else
+		{ /* R(z) != local & R(z+d) != local We don't have
+			      the saved data and don't need it, we shouldn't
+			      even have been called! */
+			STARPU_ABORT();
 		}
 		}
 	}
 	}
-#else // !STARPU_USE_MPI
+#else /* !STARPU_USE_MPI */
 	STARPU_ASSERT((node_z == local_rank) && (node_z_and_d == local_rank));
 	STARPU_ASSERT((node_z == local_rank) && (node_z_and_d == local_rank));
 	create_task_save_local(iter, z, dir, local_rank);
 	create_task_save_local(iter, z, dir, local_rank);
-#endif // STARPU_USE_MPI
+#endif /* STARPU_USE_MPI */
 }
 }
 
 
 /*
 /*
@@ -176,7 +177,8 @@ void create_task_update(unsigned iter, unsigned z, unsigned local_rank)
 	unsigned niter = get_niter();
 	unsigned niter = get_niter();
 
 
 	/* We are going to synchronize with the last tasks */
 	/* We are going to synchronize with the last tasks */
-	if (iter == niter) {
+	if (iter == niter)
+	{
 		task->detach = 0;
 		task->detach = 0;
 		task->use_tag = 1;
 		task->use_tag = 1;
 		task->tag_id = TAG_FINISH(z);
 		task->tag_id = TAG_FINISH(z);
@@ -186,20 +188,14 @@ void create_task_update(unsigned iter, unsigned z, unsigned local_rank)
 	unsigned new_layer = (old_layer + 1) % 2;
 	unsigned new_layer = (old_layer + 1) % 2;
 
 
 	struct block_description *descr = get_block_description(z);
 	struct block_description *descr = get_block_description(z);
-	task->buffers[0].handle = descr->layers_handle[new_layer];
-	task->buffers[0].mode = STARPU_RW;
-	task->buffers[1].handle = descr->layers_handle[old_layer];
-	task->buffers[1].mode = STARPU_RW;
+	task->handles[0] = descr->layers_handle[new_layer];
+	task->handles[1] = descr->layers_handle[old_layer];
 
 
-	task->buffers[2].handle = descr->boundaries_handle[T][new_layer];
-	task->buffers[2].mode = STARPU_R;
-	task->buffers[3].handle = descr->boundaries_handle[T][old_layer];
-	task->buffers[3].mode = STARPU_R;
+	task->handles[2] = descr->boundaries_handle[T][new_layer];
+	task->handles[3] = descr->boundaries_handle[T][old_layer];
 
 
-	task->buffers[4].handle = descr->boundaries_handle[B][new_layer];
-	task->buffers[4].mode = STARPU_R;
-	task->buffers[5].handle = descr->boundaries_handle[B][old_layer];
-	task->buffers[5].mode = STARPU_R;
+	task->handles[4] = descr->boundaries_handle[B][new_layer];
+	task->handles[5] = descr->boundaries_handle[B][old_layer];
 
 
 	task->cl = &cl_update;
 	task->cl = &cl_update;
 	task->cl_arg = descr;
 	task->cl_arg = descr;
@@ -212,21 +208,24 @@ void create_task_update(unsigned iter, unsigned z, unsigned local_rank)
 	if (ret)
 	if (ret)
 	{
 	{
 		fprintf(stderr, "Could not submit task update block: %d\n", ret);
 		fprintf(stderr, "Could not submit task update block: %d\n", ret);
-		STARPU_ASSERT(0);
+		STARPU_ABORT();
 	}
 	}
 }
 }
 
 
 /* Dummy empty codelet taking one buffer */
 /* Dummy empty codelet taking one buffer */
 static void null_func(void *descr[] __attribute__((unused)), void *arg __attribute__((unused))) { }
 static void null_func(void *descr[] __attribute__((unused)), void *arg __attribute__((unused))) { }
-static starpu_codelet null = {
+static struct starpu_codelet null =
+{
+	.modes = { STARPU_W, STARPU_W },
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
-	.cpu_func = null_func,
-	.cuda_func = null_func,
-	.opencl_func = null_func,
+	.cpu_funcs = {null_func, NULL},
+	.cuda_funcs = {null_func, NULL},
+	.opencl_funcs = {null_func, NULL},
 	.nbuffers = 2
 	.nbuffers = 2
 };
 };
 
 
-void create_start_task(int z, int dir) {
+void create_start_task(int z, int dir)
+{
 	/* Dumb task depending on the init task and simulating writing the
 	/* Dumb task depending on the init task and simulating writing the
 	   neighbour buffers, to avoid communications and computation running
 	   neighbour buffers, to avoid communications and computation running
 	   before we start measuring time */
 	   before we start measuring time */
@@ -236,17 +235,15 @@ void create_start_task(int z, int dir) {
 	wait_init->cl = &null;
 	wait_init->cl = &null;
 	wait_init->use_tag = 1;
 	wait_init->use_tag = 1;
 	wait_init->tag_id = TAG_START(z, dir);
 	wait_init->tag_id = TAG_START(z, dir);
-	wait_init->buffers[0].handle = descr->boundaries_handle[(1+dir)/2][0];
-	wait_init->buffers[0].mode = STARPU_W;
-	wait_init->buffers[1].handle = descr->boundaries_handle[(1+dir)/2][1];
-	wait_init->buffers[1].mode = STARPU_W;
+	wait_init->handles[0] = descr->boundaries_handle[(1 + dir) / 2][0];
+	wait_init->handles[1] = descr->boundaries_handle[(1 + dir) / 2][1];
 	starpu_tag_declare_deps_array(wait_init->tag_id, 1, &tag_init);
 	starpu_tag_declare_deps_array(wait_init->tag_id, 1, &tag_init);
 
 
 	int ret = starpu_task_submit(wait_init);
 	int ret = starpu_task_submit(wait_init);
 	if (ret)
 	if (ret)
 	{
 	{
 		fprintf(stderr, "Could not submit task initial wait: %d\n", ret);
 		fprintf(stderr, "Could not submit task initial wait: %d\n", ret);
-		STARPU_ASSERT(0);
+		STARPU_ABORT();
 	}
 	}
 }
 }
 
 
@@ -261,7 +258,8 @@ void create_tasks(int rank)
 	int niter = get_niter();
 	int niter = get_niter();
 	int nbz = get_nbz();
 	int nbz = get_nbz();
 
 
-	for (bz = 0; bz < nbz; bz++) {
+	for (bz = 0; bz < nbz; bz++)
+	{
 		if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz+1) == rank))
 		if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz+1) == rank))
 			create_start_task(bz, +1);
 			create_start_task(bz, +1);
 		if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz-1) == rank))
 		if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz-1) == rank))

+ 1 - 1
examples/tag_example/tag_example.c

@@ -43,7 +43,7 @@
 
 
 struct starpu_codelet cl = {};
 struct starpu_codelet cl = {};
 
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 #define Ni	32
 #define Ni	32
 #define Nj	32
 #define Nj	32
 #define Nk	32
 #define Nk	32

+ 1 - 1
examples/tag_example/tag_example2.c

@@ -120,7 +120,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 		exit(77);
 		exit(77);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	ni /= 4;
 	ni /= 4;
 	nk /= 16;
 	nk /= 16;
 #endif
 #endif

+ 1 - 1
examples/tag_example/tag_example3.c

@@ -122,7 +122,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 		exit(77);
 		exit(77);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	ni /= 4;
 	ni /= 4;
 	nk /= 16;
 	nk /= 16;
 #endif
 #endif

+ 1 - 1
examples/tag_example/tag_restartable.c

@@ -129,7 +129,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 		return 77;
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	ni /= 4;
 	ni /= 4;
 	nk /= 16;
 	nk /= 16;
 #endif
 #endif

+ 1 - 1
tests/microbenchs/prefetch_data_on_node.c

@@ -23,7 +23,7 @@
 #include <pthread.h>
 #include <pthread.h>
 #include "../helper.h"
 #include "../helper.h"
 
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 #define N		100
 #define N		100
 #else
 #else
 #define N		1000
 #define N		1000

+ 1 - 1
tests/microbenchs/sync_tasks_overhead.c

@@ -93,7 +93,7 @@ int main(int argc, char **argv)
 	struct timeval start;
 	struct timeval start;
 	struct timeval end;
 	struct timeval end;
 
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	ntasks = 128;
 	ntasks = 128;
 #endif
 #endif