Andra Hugo, 12 years ago
Commit a9f3cb5acb

+ 1 - 4
examples/Makefile.am

@@ -3,7 +3,7 @@
 # Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 # Copyright (C) 2011  Télécom-SudParis
-# Copyright (C) 2011-2012  INRIA
+# Copyright (C) 2012 INRIA
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -47,7 +47,6 @@ EXTRA_DIST = 					\
 	lu/xlu_implicit_pivot.c			\
 	lu/xlu_kernels.c			\
 	lu/lu_example.c				\
-	sched_ctx_utils/sched_ctx_utils.c		\
 	incrementer/incrementer_kernels_opencl_kernel.cl 	\
 	basic_examples/variable_kernels_opencl_kernel.cl	\
 	matvecmult/matvecmult_kernel.cl				\
@@ -100,7 +99,6 @@ noinst_HEADERS = 				\
 	lu/complex_double.h			\
 	lu/blas_complex.h			\
 	cholesky/cholesky.h			\
-	sched_ctx_utils/sched_ctx_utils.h	\
 	common/blas_model.h			\
 	common/blas.h				\
 	mult/simple.h				\
@@ -514,7 +512,6 @@ cholesky_cholesky_implicit_SOURCES =		\
 	cholesky/cholesky_implicit.c		\
 	cholesky/cholesky_models.c		\
 	cholesky/cholesky_kernels.c		\
-	sched_ctx_utils/sched_ctx_utils.c	\
 	common/blas.c
 
 cholesky_cholesky_implicit_LDADD =		\

+ 1 - 1
examples/basic_examples/variable.c

@@ -45,7 +45,7 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) goto enodev;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	niter /= 100;
 #endif
         if (argc == 2) niter = atoi(argv[1]);
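
The hunk above and several later ones swap the old STARPU_SLOW_MACHINE guard for STARPU_QUICK_CHECK, which shrinks the workload when fast test runs are requested. A minimal sketch of the pattern, assuming the macro comes from StarPU's configuration header (pulled in by starpu.h):

#include <starpu.h>

/* Scale down an iteration count when StarPU was configured for quick checks. */
static unsigned pick_niter(unsigned requested)
{
#ifdef STARPU_QUICK_CHECK
	return requested > 100 ? requested / 100 : 1;
#else
	return requested;
#endif
}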

+ 1 - 1
examples/cg/cg.c

@@ -406,7 +406,7 @@ int main(int argc, char **argv)
 {
 	int ret;
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	i_max = 16;
 #endif
 

+ 154 - 139
examples/cg/cg_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,6 +16,7 @@
 
 #include "cg.h"
 #include <math.h>
+#include <limits.h>
 
 #if 0
 static void print_vector_from_descr(unsigned nx, TYPE *v)
@@ -43,6 +44,23 @@ static void print_matrix_from_descr(unsigned nx, unsigned ny, unsigned ld, TYPE
 }
 #endif
 
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
+{
+	enum starpu_archtype type = starpu_worker_get_type(workerid);
+	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
+		return 1;
+
+#ifdef STARPU_USE_CUDA
+	/* Cuda device */
+	const struct cudaDeviceProp *props;
+	props = starpu_cuda_get_device_properties(workerid);
+	if (props->major >= 2 || props->minor >= 3)
+		/* At least compute capability 1.3, supports doubles */
+		return 1;
+#endif
+	/* Old card, does not support doubles */
+	return 0;
+}
 
 /*
  *	Reduction accumulation methods
@@ -67,16 +85,19 @@ static void accumulate_variable_cpu(void *descr[], void *cl_arg)
 	*v_dst = *v_dst + *v_src;
 }
 
-static struct starpu_perfmodel_t accumulate_variable_model = {
+static struct starpu_perfmodel accumulate_variable_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "accumulate_variable"
 };
 
-starpu_codelet accumulate_variable_cl = {
+struct starpu_codelet accumulate_variable_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = accumulate_variable_cpu,
+	.cpu_funcs = {accumulate_variable_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = accumulate_variable_cuda,
+	.cuda_funcs = {accumulate_variable_cuda, NULL},
 #endif
 	.nbuffers = 2,
 	.model = &accumulate_variable_model
@@ -103,16 +124,19 @@ static void accumulate_vector_cpu(void *descr[], void *cl_arg)
 	AXPY(n, (TYPE)1.0, v_src, 1, v_dst, 1);
 }
 
-static struct starpu_perfmodel_t accumulate_vector_model = {
+static struct starpu_perfmodel accumulate_vector_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "accumulate_vector"
 };
 
-starpu_codelet accumulate_vector_cl = {
+struct starpu_codelet accumulate_vector_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = accumulate_vector_cpu,
+	.cpu_funcs = {accumulate_vector_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = accumulate_vector_cuda,
+	.cuda_funcs = {accumulate_vector_cuda, NULL},
 #endif
 	.nbuffers = 2,
 	.model = &accumulate_vector_model
@@ -141,16 +165,19 @@ static void bzero_variable_cpu(void *descr[], void *cl_arg)
 	*v = (TYPE)0.0;
 }
 
-static struct starpu_perfmodel_t bzero_variable_model = {
+static struct starpu_perfmodel bzero_variable_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "bzero_variable"
 };
 
-starpu_codelet bzero_variable_cl = {
+struct starpu_codelet bzero_variable_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = bzero_variable_cpu,
+	.cpu_funcs = {bzero_variable_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = bzero_variable_cuda,
+	.cuda_funcs = {bzero_variable_cuda, NULL},
 #endif
 	.nbuffers = 1,
 	.model = &bzero_variable_model
@@ -176,16 +203,19 @@ static void bzero_vector_cpu(void *descr[], void *cl_arg)
 	memset(v, 0, n*sizeof(TYPE));
 }
 
-static struct starpu_perfmodel_t bzero_vector_model = {
+static struct starpu_perfmodel bzero_vector_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "bzero_vector"
 };
 
-starpu_codelet bzero_vector_cl = {
+struct starpu_codelet bzero_vector_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = bzero_vector_cpu,
+	.cpu_funcs = {bzero_vector_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = bzero_vector_cuda,
+	.cuda_funcs = {bzero_vector_cuda, NULL},
 #endif
 	.nbuffers = 1,
 	.model = &bzero_vector_model
@@ -229,39 +259,48 @@ static void dot_kernel_cpu(void *descr[], void *cl_arg)
 	*dot = *dot + local_dot;
 }
 
-static struct starpu_perfmodel_t dot_kernel_model = {
+static struct starpu_perfmodel dot_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "dot_kernel"
 };
 
-static starpu_codelet dot_kernel_cl = {
+static struct starpu_codelet dot_kernel_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = dot_kernel_cpu,
+	.cpu_funcs = {dot_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = dot_kernel_cuda,
+	.cuda_funcs = {dot_kernel_cuda, NULL},
 #endif
 	.nbuffers = 3,
 	.model = &dot_kernel_model
 };
 
-void dot_kernel(starpu_data_handle v1,
-		starpu_data_handle v2,
-		starpu_data_handle s,
-		unsigned nblocks,
-		int use_reduction)
+int dot_kernel(starpu_data_handle_t v1,
+	       starpu_data_handle_t v2,
+	       starpu_data_handle_t s,
+	       unsigned nblocks,
+	       int use_reduction)
 {
+	int ret;
+
 	/* Blank the accumulation variable */
-	starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
+	ret = starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
+	if (ret == -ENODEV) return ret;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
-		starpu_insert_task(&dot_kernel_cl,
-			use_reduction?STARPU_REDUX:STARPU_RW, s,
-			STARPU_R, starpu_data_get_sub_data(v1, 1, b),
-			STARPU_R, starpu_data_get_sub_data(v2, 1, b),
-			0);
+		ret = starpu_insert_task(&dot_kernel_cl,
+					 use_reduction?STARPU_REDUX:STARPU_RW, s,
+					 STARPU_R, starpu_data_get_sub_data(v1, 1, b),
+					 STARPU_R, starpu_data_get_sub_data(v2, 1, b),
+					 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	}
+	return 0;
 }
 
 /*
@@ -272,7 +311,7 @@ void dot_kernel(starpu_data_handle v1,
 static void scal_kernel_cuda(void *descr[], void *cl_arg)
 {
 	TYPE p1;
-	starpu_unpack_cl_args(cl_arg, &p1);
+	starpu_codelet_unpack_args(cl_arg, &p1);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
@@ -287,7 +326,7 @@ static void scal_kernel_cuda(void *descr[], void *cl_arg)
 static void scal_kernel_cpu(void *descr[], void *cl_arg)
 {
 	TYPE alpha;
-	starpu_unpack_cl_args(cl_arg, &alpha);
+	starpu_codelet_unpack_args(cl_arg, &alpha);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
@@ -296,16 +335,19 @@ static void scal_kernel_cpu(void *descr[], void *cl_arg)
 	SCAL(n, alpha, v1, 1);
 }
 
-static struct starpu_perfmodel_t scal_kernel_model = {
+static struct starpu_perfmodel scal_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "scal_kernel"
 };
 
-static starpu_codelet scal_kernel_cl = {
+static struct starpu_codelet scal_kernel_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = scal_kernel_cpu,
+	.cpu_funcs = {scal_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = scal_kernel_cuda,
+	.cuda_funcs = {scal_kernel_cuda, NULL},
 #endif
 	.nbuffers = 1,
 	.model = &scal_kernel_model
@@ -327,7 +369,7 @@ static void gemv_kernel_cuda(void *descr[], void *cl_arg)
 	unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);
  
 	TYPE alpha, beta;
-	starpu_unpack_cl_args(cl_arg, &beta, &alpha);
+	starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
 
 	/* Compute v1 = alpha M v2 + beta v1 */
 	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
@@ -346,7 +388,7 @@ static void gemv_kernel_cpu(void *descr[], void *cl_arg)
 	unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);
 
 	TYPE alpha, beta;
-	starpu_unpack_cl_args(cl_arg, &beta, &alpha);
+	starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
 
 	int worker_size = starpu_combined_worker_get_size();
 
@@ -367,38 +409,44 @@ static void gemv_kernel_cpu(void *descr[], void *cl_arg)
 	GEMV("N", nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
 }
 
-static struct starpu_perfmodel_t gemv_kernel_model = {
+static struct starpu_perfmodel gemv_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "gemv_kernel"
 };
 
-static starpu_codelet gemv_kernel_cl = {
+static struct starpu_codelet gemv_kernel_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.type = STARPU_SPMD,
 	.max_parallelism = INT_MAX,
-	.cpu_func = gemv_kernel_cpu,
+	.cpu_funcs = {gemv_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = gemv_kernel_cuda,
+	.cuda_funcs = {gemv_kernel_cuda, NULL},
 #endif
 	.nbuffers = 3,
 	.model = &gemv_kernel_model
 };
 
-void gemv_kernel(starpu_data_handle v1,
-		starpu_data_handle matrix,
-		starpu_data_handle v2,
+int gemv_kernel(starpu_data_handle_t v1,
+		starpu_data_handle_t matrix,
+		starpu_data_handle_t v2,
 		TYPE p1, TYPE p2,
 		unsigned nblocks,
 		int use_reduction)
 {
 	unsigned b1, b2;
+	int ret;
 
 	for (b2 = 0; b2 < nblocks; b2++)
 	{
-		starpu_insert_task(&scal_kernel_cl,
-			STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
-			STARPU_VALUE, &p1, sizeof(p1),
-			0);
+		ret = starpu_insert_task(&scal_kernel_cl,
+					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
+					 STARPU_VALUE, &p1, sizeof(p1),
+					 0);
+		if (ret == -ENODEV) return ret;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	}
 
 	for (b2 = 0; b2 < nblocks; b2++)
@@ -406,15 +454,17 @@ void gemv_kernel(starpu_data_handle v1,
 		for (b1 = 0; b1 < nblocks; b1++)
 		{
 			TYPE one = 1.0;
-			starpu_insert_task(&gemv_kernel_cl,
-				use_reduction?STARPU_REDUX:STARPU_RW,	starpu_data_get_sub_data(v1, 1, b2),
-				STARPU_R,	starpu_data_get_sub_data(matrix, 2, b2, b1),
-				STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
-				STARPU_VALUE,	&one,	sizeof(one),
-				STARPU_VALUE,	&p2,	sizeof(p2),
-				0);
+			ret = starpu_insert_task(&gemv_kernel_cl,
+						 use_reduction?STARPU_REDUX:STARPU_RW,	starpu_data_get_sub_data(v1, 1, b2),
+						 STARPU_R,	starpu_data_get_sub_data(matrix, 2, b2, b1),
+						 STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
+						 STARPU_VALUE,	&one,	sizeof(one),
+						 STARPU_VALUE,	&p2,	sizeof(p2),
+						 0);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 		}
 	}
+	return 0;
 }
 
 /*
@@ -424,7 +474,7 @@ void gemv_kernel(starpu_data_handle v1,
 static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 {
 	TYPE p1, p2;
-	starpu_unpack_cl_args(cl_arg, &p1, &p2);
+	starpu_codelet_unpack_args(cl_arg, &p1, &p2);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -444,7 +494,7 @@ static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 static void scal_axpy_kernel_cpu(void *descr[], void *cl_arg)
 {
 	TYPE p1, p2;
-	starpu_unpack_cl_args(cl_arg, &p1, &p2);
+	starpu_codelet_unpack_args(cl_arg, &p1, &p2);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -459,35 +509,42 @@ static void scal_axpy_kernel_cpu(void *descr[], void *cl_arg)
 	AXPY(nx, p2, v2, 1, v1, 1);
 }
 
-static struct starpu_perfmodel_t scal_axpy_kernel_model = {
+static struct starpu_perfmodel scal_axpy_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "scal_axpy_kernel"
 };
 
-static starpu_codelet scal_axpy_kernel_cl = {
+static struct starpu_codelet scal_axpy_kernel_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = scal_axpy_kernel_cpu,
+	.cpu_funcs = {scal_axpy_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = scal_axpy_kernel_cuda,
+	.cuda_funcs = {scal_axpy_kernel_cuda, NULL},
 #endif
 	.nbuffers = 2,
 	.model = &scal_axpy_kernel_model
 };
 
-void scal_axpy_kernel(starpu_data_handle v1, TYPE p1,
-			starpu_data_handle v2, TYPE p2,
-			unsigned nblocks)
+int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
+		     starpu_data_handle_t v2, TYPE p2,
+		     unsigned nblocks)
 {
+	int ret;
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
-		starpu_insert_task(&scal_axpy_kernel_cl,
-			STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
-			STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
-			STARPU_VALUE, &p1, sizeof(p1),
-			STARPU_VALUE, &p2, sizeof(p2),
-			0);
+		ret = starpu_insert_task(&scal_axpy_kernel_cl,
+					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
+					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
+					 STARPU_VALUE, &p1, sizeof(p1),
+					 STARPU_VALUE, &p2, sizeof(p2),
+					 0);
+		if (ret == -ENODEV) return ret;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	}
+	return 0;
 }
 
 
@@ -498,7 +555,7 @@ void scal_axpy_kernel(starpu_data_handle v1, TYPE p1,
 static void axpy_kernel_cuda(void *descr[], void *cl_arg)
 {
 	TYPE p1;
-	starpu_unpack_cl_args(cl_arg, &p1);
+	starpu_codelet_unpack_args(cl_arg, &p1);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -515,7 +572,7 @@ static void axpy_kernel_cuda(void *descr[], void *cl_arg)
 static void axpy_kernel_cpu(void *descr[], void *cl_arg)
 {
 	TYPE p1;
-	starpu_unpack_cl_args(cl_arg, &p1);
+	starpu_codelet_unpack_args(cl_arg, &p1);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -527,89 +584,47 @@ static void axpy_kernel_cpu(void *descr[], void *cl_arg)
 	AXPY(nx, p1, v2, 1, v1, 1);
 }
 
-static struct starpu_perfmodel_t axpy_kernel_model = {
+static struct starpu_perfmodel axpy_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "axpy_kernel"
 };
 
-static starpu_codelet axpy_kernel_cl = {
+static struct starpu_codelet axpy_kernel_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = axpy_kernel_cpu,
+	.cpu_funcs = {axpy_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = axpy_kernel_cuda,
+	.cuda_funcs = {axpy_kernel_cuda, NULL},
 #endif
 	.nbuffers = 2,
 	.model = &axpy_kernel_model
 };
 
-void axpy_kernel(starpu_data_handle v1,
-		starpu_data_handle v2, TYPE p1,
+int axpy_kernel(starpu_data_handle_t v1,
+		starpu_data_handle_t v2, TYPE p1,
 		unsigned nblocks)
 {
+	int ret;
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
-		starpu_insert_task(&axpy_kernel_cl,
-			STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
-			STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
-			STARPU_VALUE, &p1, sizeof(p1),
-			0);
+		ret = starpu_insert_task(&axpy_kernel_cl,
+					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
+					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
+					 STARPU_VALUE, &p1, sizeof(p1),
+					 0);
+		if (ret == -ENODEV) return ret;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	}
+	return 0;
 }
 
-
-/*
- *	COPY kernel : vector_dst <- vector_src
- */
-
-static void copy_handle_cpu(void *descr[], void *cl_arg)
-{
-	TYPE *dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
-	TYPE *src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
-	
-	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
-	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
-
-	memcpy(dst, src, nx*elemsize);
-}
-
-#ifdef STARPU_USE_CUDA
-static void copy_handle_cuda(void *descr[], void *cl_arg)
-{
-	TYPE *dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
-	TYPE *src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
-	
-	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
-	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
-
-	cudaMemcpyAsync(dst, src, nx*elemsize, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
-}
-#endif
-
-static struct starpu_perfmodel_t copy_handle_model = {
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "copy_handle"
-};
-
-static starpu_codelet copy_handle_cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = copy_handle_cpu,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = copy_handle_cuda,
-#endif
-	.nbuffers = 2,
-	.model = &copy_handle_model
-};
-
-void copy_handle(starpu_data_handle dst, starpu_data_handle src, unsigned nblocks)
+int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks)
 {
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
-	{
-		starpu_insert_task(&copy_handle_cl,
-			STARPU_W, starpu_data_get_sub_data(dst, 1, b),
-			STARPU_R, starpu_data_get_sub_data(src, 1, b),
-			0);
-	}
-} 
+		starpu_data_cpy(starpu_data_get_sub_data(dst, 1, b), starpu_data_get_sub_data(src, 1, b), 1, NULL, NULL);
+	return 0;
+}

+ 1 - 1
examples/incrementer/incrementer.c

@@ -47,7 +47,7 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	niter /= 100;
 #endif
 	if (argc == 2)

+ 43 - 8
examples/interface/complex_interface.c

@@ -15,9 +15,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_cuda.h>
-#include <starpu_opencl.h>
-#include <starpu_hash.h>
 
 #include "complex_interface.h"
 
@@ -137,7 +134,7 @@ static starpu_ssize_t complex_allocate_data_on_node(void *data_interface, uint32
 		}
 #endif
 		default:
-			STARPU_ASSERT(0);
+			STARPU_ABORT();
 	}
 
 	if (fail)
@@ -164,6 +161,43 @@ static uint32_t complex_footprint(starpu_data_handle_t handle)
 	return starpu_crc32_be(starpu_complex_get_nx(handle), 0);
 }
 
+static void *complex_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	return (void*) complex_interface->real;
+}
+
+static int complex_pack_data(starpu_data_handle_t handle, uint32_t node, void **ptr)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	*ptr = malloc(complex_get_size(handle));
+	memcpy(*ptr, complex_interface->real, complex_interface->nx*sizeof(double));
+	memcpy(*ptr+complex_interface->nx*sizeof(double), complex_interface->imaginary, complex_interface->nx*sizeof(double));
+
+	return 0;
+}
+
+static int complex_unpack_data(starpu_data_handle_t handle, uint32_t node, void *ptr)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	memcpy(complex_interface->real, ptr, complex_interface->nx*sizeof(double));
+	memcpy(complex_interface->imaginary, ptr+complex_interface->nx*sizeof(double), complex_interface->nx*sizeof(double));
+
+	return 0;
+}
+
 #ifdef STARPU_USE_CUDA
 static int copy_cuda_async_sync(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, enum cudaMemcpyKind kind, cudaStream_t stream)
 {
@@ -204,11 +238,10 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *
 #endif
 
 #ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event)
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
 {
 	struct starpu_complex_interface *src_complex = src_interface;
 	struct starpu_complex_interface *dst_complex = dst_interface;
-	cl_event *event = (cl_event *)_event;
 	cl_int err;
 	int ret;
 
@@ -244,11 +277,10 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_
         return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
 }
 
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event)
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
 {
 	struct starpu_complex_interface *src_complex = src_interface;
 	struct starpu_complex_interface *dst_complex = dst_interface;
-	cl_event *event = (cl_event *)_event;
 	cl_int err;
 	int ret;
 
@@ -310,6 +342,9 @@ static struct starpu_data_interface_ops interface_complex_ops =
 	.footprint = complex_footprint,
 	.interfaceid = -1,
 	.interface_size = sizeof(struct starpu_complex_interface),
+	.handle_to_pointer = complex_handle_to_pointer,
+	.pack_data = complex_pack_data,
+	.unpack_data = complex_unpack_data
 };
 
 void starpu_complex_data_register(starpu_data_handle_t *handleptr, uint32_t home_node, double *real, double *imaginary, int nx)
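
The new pack/unpack methods serialize a complex handle as its nx real parts followed by its nx imaginary parts. A small usage sketch for the registration function declared above (starpu_init is assumed to have been called already; the values are arbitrary):

#include <starpu.h>
#include "complex_interface.h"

void register_two_complex_numbers(void)
{
	double real[2]      = {0.0, 1.0};
	double imaginary[2] = {2.0, 3.0};
	starpu_data_handle_t handle;

	/* Home node 0 (main memory), nx = 2 complex numbers */
	starpu_complex_data_register(&handle, 0, real, imaginary, 2);

	/* A buffer produced by complex_pack_data for this handle holds
	 * real[0], real[1], imaginary[0], imaginary[1], in that order. */

	starpu_data_unregister(handle);
}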

+ 1 - 1
examples/lu/lu_example.c

@@ -300,7 +300,7 @@ int main(int argc, char **argv)
 
 	parse_args(argc, argv);
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	size /= 4;
 	nblocks /= 4;
 #endif

+ 2 - 2
examples/mult/xgemm.c

@@ -33,7 +33,7 @@
 static unsigned niter = 10;
 static unsigned nslicesx = 4;
 static unsigned nslicesy = 4;
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 static unsigned xdim = 256;
 static unsigned ydim = 256;
 static unsigned zdim = 64;
@@ -287,7 +287,7 @@ int main(int argc, char **argv)
 
 	parse_args(argc, argv);
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	niter /= 10;
 #endif
 

+ 1 - 1
examples/pipeline/pipeline.c

@@ -41,7 +41,7 @@
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
 /* Vector size */
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 #define N 16
 #else
 #define N 1048576

+ 253 - 50
examples/reductions/dot_product.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2012 inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,24 +17,48 @@
 
 #include <starpu.h>
 #include <assert.h>
+#include <math.h>
+
+#include <reductions/dot_product.h>
 
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cublas.h>
 #endif
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 static float *x;
 static float *y;
-static starpu_data_handle *x_handles;
-static starpu_data_handle *y_handles;
+static starpu_data_handle_t *x_handles;
+static starpu_data_handle_t *y_handles;
+#ifdef STARPU_USE_OPENCL
+static struct starpu_opencl_program opencl_program;
+#endif
 
 static unsigned nblocks = 4096;
-static unsigned entries_per_bock = 1024;
-
-#define DOT_TYPE double
+static unsigned entries_per_block = 1024;
 
 static DOT_TYPE dot = 0.0f;
-static starpu_data_handle dot_handle;
+static starpu_data_handle_t dot_handle;
+
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
+{
+	enum starpu_archtype type = starpu_worker_get_type(workerid);
+	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
+		return 1;
+
+#ifdef STARPU_USE_CUDA
+	/* Cuda device */
+	const struct cudaDeviceProp *props;
+	props = starpu_cuda_get_device_properties(workerid);
+	if (props->major >= 2 || props->minor >= 3)
+		/* At least compute capability 1.3, supports doubles */
+		return 1;
+#endif
+	/* Old card, does not support doubles */
+	return 0;
+}
 
 /*
  *	Codelet to create a neutral element
@@ -49,16 +74,45 @@ void init_cpu_func(void *descr[], void *cl_arg)
 void init_cuda_func(void *descr[], void *cl_arg)
 {
 	DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	cudaMemset(dot, 0, sizeof(DOT_TYPE));
-	cudaThreadSynchronize();
+	cudaMemsetAsync(dot, 0, sizeof(DOT_TYPE), starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+#ifdef STARPU_USE_OPENCL
+void init_opencl_func(void *buffers[], void *args)
+{
+        cl_int err;
+	cl_command_queue queue;
+
+	cl_mem dot = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]);
+	starpu_opencl_get_current_queue(&queue);
+	DOT_TYPE zero = (DOT_TYPE) 0.0;
+
+	err = clEnqueueWriteBuffer(queue,
+			dot,
+			CL_TRUE,
+			0,
+			sizeof(DOT_TYPE),
+			&zero,
+			0,
+			NULL,
+			NULL);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
 }
 #endif
 
-static struct starpu_codelet_t init_codelet = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = init_cpu_func,
+static struct starpu_codelet init_codelet =
+{
+	.can_execute = can_execute,
+	.cpu_funcs = {init_cpu_func, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = init_cuda_func,
+	.cuda_funcs = {init_cuda_func, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {init_opencl_func, NULL},
 #endif
 	.nbuffers = 1
 };
@@ -75,9 +129,71 @@ void redux_cpu_func(void *descr[], void *cl_arg)
 	*dota = *dota + *dotb;
 }
 
-static struct starpu_codelet_t redux_codelet = {
-	.where = STARPU_CPU,
-	.cpu_func = redux_cpu_func,
+#ifdef STARPU_USE_CUDA
+extern void redux_cuda_func(void *descr[], void *_args);
+#endif
+
+#ifdef STARPU_USE_OPENCL
+void redux_opencl_func(void *buffers[], void *args)
+{
+	int id, devid;
+        cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+
+	cl_mem dota = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]);
+	cl_mem dotb = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[1]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_redux_opencl", devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 0, sizeof(dota), &dota);
+	err|= clSetKernelArg(kernel, 1, sizeof(dotb), &dotb);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=1;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+                if (local > global)
+			local=global;
+
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+}
+#endif
+
+static struct starpu_codelet redux_codelet =
+{
+	.can_execute = can_execute,
+	.cpu_funcs = {redux_cpu_func, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {redux_cuda_func, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {redux_opencl_func, NULL},
+#endif
 	.nbuffers = 2
 };
 
@@ -116,57 +232,122 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 
-	cudaMemcpy(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost);
-
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 
 	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
 
-	//fprintf(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot);
+	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	current_dot += local_dot;
 
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(dot, &current_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+#ifdef STARPU_USE_OPENCL
+void dot_opencl_func(void *buffers[], void *args)
+{
+	int id, devid;
+        cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+
+	cl_mem x = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
+	cl_mem y = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]);
+	cl_mem dot = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[2]);
+	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_dot_opencl", devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 0, sizeof(x), &x);
+	err|= clSetKernelArg(kernel, 1, sizeof(y), &y);
+	err|= clSetKernelArg(kernel, 2, sizeof(dot), &dot);
+	err|= clSetKernelArg(kernel, 3, sizeof(n), &n);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=1;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+                if (local > global)
+			local=global;
+
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
 
-	cudaMemcpy(dot, &current_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice);
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
 
-	cudaThreadSynchronize();
+	starpu_opencl_release_kernel(kernel);
 }
 #endif
 
-static struct starpu_codelet_t dot_codelet = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = dot_cpu_func,
+static struct starpu_codelet dot_codelet =
+{
+	.can_execute = can_execute,
+	.cpu_funcs = {dot_cpu_func, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = dot_cuda_func,
+	.cuda_funcs = {dot_cuda_func, NULL},
 #endif
-	.nbuffers = 3
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {dot_opencl_func, NULL},
+#endif
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_REDUX}
 };
 
 /*
  *	Tasks initialization
  */
 
-extern void starpu_data_end_reduction_mode(starpu_data_handle handle);
-
 int main(int argc, char **argv)
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("examples/reductions/dot_product_opencl_kernels.cl",
+						  &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
 
 	starpu_helper_cublas_init();
 
-	unsigned long nelems = nblocks*entries_per_bock;
+	unsigned long nelems = nblocks*entries_per_block;
 	size_t size = nelems*sizeof(float);
 
-	x = malloc(size);
-	y = malloc(size);
+	x = (float *) malloc(size);
+	y = (float *) malloc(size);
 
-	x_handles = calloc(nblocks, sizeof(starpu_data_handle));
-	y_handles = calloc(nblocks, sizeof(starpu_data_handle));
+	x_handles = (starpu_data_handle_t *) calloc(nblocks, sizeof(starpu_data_handle_t));
+	y_handles = (starpu_data_handle_t *) calloc(nblocks, sizeof(starpu_data_handle_t));
 
 	assert(x && y);
 
         starpu_srand48(0);
-	
+
 	DOT_TYPE reference_dot = 0.0;
 
 	unsigned long i;
@@ -176,15 +357,15 @@ int main(int argc, char **argv)
 		y[i] = (float)starpu_drand48();
 
 		reference_dot += (DOT_TYPE)x[i]*(DOT_TYPE)y[i];
-	} 
-	
+	}
+
 	unsigned block;
 	for (block = 0; block < nblocks; block++)
 	{
 		starpu_vector_data_register(&x_handles[block], 0,
-			(uintptr_t)&x[entries_per_bock*block], entries_per_bock, sizeof(float));
+			(uintptr_t)&x[entries_per_block*block], entries_per_block, sizeof(float));
 		starpu_vector_data_register(&y_handles[block], 0,
-			(uintptr_t)&y[entries_per_bock*block], entries_per_bock, sizeof(float));
+			(uintptr_t)&y[entries_per_block*block], entries_per_block, sizeof(float));
 	}
 
 	starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(DOT_TYPE));
@@ -199,25 +380,47 @@ int main(int argc, char **argv)
 		struct starpu_task *task = starpu_task_create();
 
 		task->cl = &dot_codelet;
+		task->destroy = 1;
 
-		task->buffers[0].handle = x_handles[block];
-		task->buffers[0].mode = STARPU_R;
-		task->buffers[1].handle = y_handles[block];
-		task->buffers[1].mode = STARPU_R;
-		task->buffers[2].handle = dot_handle;
-		task->buffers[2].mode = STARPU_REDUX;
+		task->handles[0] = x_handles[block];
+		task->handles[1] = y_handles[block];
+		task->handles[2] = dot_handle;
 
-		int ret = starpu_task_submit(task);
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV) goto enodev;
 		STARPU_ASSERT(!ret);
 	}
 
+	for (block = 0; block < nblocks; block++)
+	{
+		starpu_data_unregister(x_handles[block]);
+		starpu_data_unregister(y_handles[block]);
+	}
 	starpu_data_unregister(dot_handle);
 
-	fprintf(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
+	FPRINTF(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
 
 	starpu_helper_cublas_shutdown();
 
+#ifdef STARPU_USE_OPENCL
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
 	starpu_shutdown();
 
-	return 0;
+	free(x);
+	free(y);
+	free(x_handles);
+	free(y_handles);
+
+	if (fabs(reference_dot - dot) < reference_dot * 1e-6)
+		return EXIT_SUCCESS;
+	else
+		return EXIT_FAILURE;
+
+enodev:
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	return 77;
 }
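
The rewritten dot product funnels every per-block contribution into dot_handle through the STARPU_REDUX access mode: init_codelet provides the neutral element, redux_codelet combines two partial sums, and dot_codelet.modes marks the third buffer as STARPU_REDUX. A condensed sketch of that wiring with hypothetical acc_* names; starpu_data_set_reduction_methods() is assumed to be the call that binds the two codelets to the handle (it is not visible in the hunk above):

#include <starpu.h>

static double acc = 0.0;

static void acc_init_cpu(void *descr[], void *cl_arg)
{
	double *a = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
	*a = 0.0;					/* neutral element */
}

static void acc_redux_cpu(void *descr[], void *cl_arg)
{
	double *a = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
	double *b = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
	*a += *b;					/* combine two partial sums */
}

static struct starpu_codelet acc_init_cl  = { .cpu_funcs = {acc_init_cpu, NULL},  .nbuffers = 1 };
static struct starpu_codelet acc_redux_cl = { .cpu_funcs = {acc_redux_cpu, NULL}, .nbuffers = 2 };

static void setup_reduction(starpu_data_handle_t *handle)
{
	starpu_variable_data_register(handle, 0, (uintptr_t)&acc, sizeof(acc));
	starpu_data_set_reduction_methods(*handle, &acc_redux_cl, &acc_init_cl);
	/* Tasks then list *handle in task->handles[] under the STARPU_REDUX mode,
	 * exactly as dot_codelet.modes[2] declares above. */
}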

+ 1 - 1
examples/reductions/minmax_reduction.c

@@ -19,7 +19,7 @@
 #include <limits.h>
 #include <starpu.h>
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 static unsigned nblocks = 512;
 static unsigned entries_per_bock = 64;
 #else

+ 11 - 7
examples/stencil/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010-2011  Université de Bordeaux 1
+# Copyright (C) 2010-2012  Université de Bordeaux 1
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -13,13 +13,14 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-AM_CFLAGS = $(HWLOC_CFLAGS)
-LIBS = $(top_builddir)/src/libstarpu.la $(HWLOC_LIBS) @LIBS@
+AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) 
 
 if USE_MPI
-LIBS += $(top_builddir)/mpi/libstarpumpi.la
-AM_CPPFLAGS += -I$(top_srcdir)/mpi/
+LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+AM_CPPFLAGS += -I$(top_srcdir)/mpi/include
 endif
 
 CC = $(CC_OR_MPICC)
@@ -34,7 +35,7 @@ NVCCFLAGS += $(HWLOC_CFLAGS)
 
 .cu.o:
 	$(MKDIR_P) `dirname $@`
-	$(NVCC) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I$(top_srcdir)/include/ -I$(top_builddir)/include/
+	$(NVCC) $< -c -o $@ --compiler-options -fno-strict-aliasing -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(NVCCFLAGS)
 
 
 endif
@@ -46,7 +47,7 @@ endif
 check_PROGRAMS =				\
 	stencil
 
-examplebindir = $(libdir)/starpu/examples/
+examplebindir = $(libdir)/starpu/examples/stencil
 
 examplebin_PROGRAMS =				\
 	stencil
@@ -112,3 +113,6 @@ CLEANFILES = *.xpm
 
 view:
 	feh --zoom 800 -F 0.xpm 0.5.xpm 1.xpm 2.xpm 3.xpm 4.xpm 6.xpm mpi.xpm
+
+showcheck:
+	-cat $(TEST_LOGS) /dev/null

+ 103 - 63
examples/stencil/stencil-kernels.c

@@ -1,6 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * the Free Software Foundation; either version 2.1 of the License, or (at
@@ -16,17 +18,14 @@
 #include "stencil.h"
 #include <sys/time.h>
 
-#ifdef STARPU_USE_OPENCL
-#include <CL/cl.h>
-#include <starpu_opencl.h>
-#endif
-
 #ifndef timersub
 #define	timersub(x, y, res) \
-	do { \
+	do \
+	{						   \
 		(res)->tv_sec = (x)->tv_sec - (y)->tv_sec; \
 		(res)->tv_usec = (x)->tv_usec - (y)->tv_usec; \
-		if ((res)->tv_usec < 0) { \
+		if ((res)->tv_usec < 0) \
+		{			 \
 			(res)->tv_sec--; \
 			(res)->tv_usec += 1000000; \
 		} \
@@ -34,10 +33,12 @@
 #endif
 #ifndef timeradd
 #define	timeradd(x, y, res) \
-	do { \
+	do \
+	{						   \
 		(res)->tv_sec = (x)->tv_sec + (y)->tv_sec; \
 		(res)->tv_usec = (x)->tv_usec + (y)->tv_usec; \
-		if ((res)->tv_usec >= 1000000) { \
+		if ((res)->tv_usec >= 1000000) \
+		{			       \
 			(res)->tv_sec++; \
 			(res)->tv_usec -= 1000000; \
 		} \
@@ -124,6 +125,9 @@ int *who_runs_what;
 int *who_runs_what_index;
 struct timeval *last_tick;
 
+/* Achieved iterations */
+static int achieved_iter;
+
 /* Record how many updates each worker performed */
 unsigned update_per_worker[STARPU_NMAXWORKERS];
 
@@ -135,7 +139,8 @@ static void record_who_runs_what(struct block_description *block)
 	gettimeofday(&tv, NULL);
 	timersub(&tv, &start, &tv2);
 	timersub(&tv2, &last_tick[block->bz], &diff);
-	while (timercmp(&diff, &delta, >=)) {
+	while (timercmp(&diff, &delta, >=))
+	{
 		timeradd(&last_tick[block->bz], &delta, &last_tick[block->bz]);
 		timersub(&tv2, &last_tick[block->bz], &diff);
 		if (who_runs_what_index[block->bz] < who_runs_what_len)
@@ -146,7 +151,7 @@ static void record_who_runs_what(struct block_description *block)
 		who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid);
 }
 
-static void check_load(starpu_block_interface_t *block, starpu_block_interface_t *boundary)
+static void check_load(struct starpu_block_interface *block, struct starpu_block_interface *boundary)
 {
 	/* Sanity checks */
 	STARPU_ASSERT(block->nx == boundary->nx);
@@ -162,10 +167,12 @@ static void check_load(starpu_block_interface_t *block, starpu_block_interface_t
 /*
  * Load a neighbour's boundary into block, CPU version
  */
-static void load_subblock_from_buffer_cpu(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_from_buffer_cpu(void *_block,
+					void *_boundary,
 					unsigned firstz)
 {
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
 	check_load(block, boundary);
 
 	/* We do a contiguous memory transfer */
@@ -181,10 +188,12 @@ static void load_subblock_from_buffer_cpu(starpu_block_interface_t *block,
  * Load a neighbour's boundary into block, CUDA version
  */
 #ifdef STARPU_USE_CUDA
-static void load_subblock_from_buffer_cuda(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_from_buffer_cuda(void *_block,
+					void *_boundary,
 					unsigned firstz)
 {
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
 	check_load(block, boundary);
 
 	/* We do a contiguous memory transfer */
@@ -241,17 +250,17 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
 
 	for (i=1; i<=K; i++)
 	{
-		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
+		struct starpu_block_interface *oldb = descr[i%2], *newb = descr[(i+1)%2];
+		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
 
 		/* Shadow data */
 		cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 
 		/* And perform actual computation */
 #ifdef LIFE
-		cuda_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+		cuda_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 #else
-		cudaMemcpyAsync(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+		cudaMemcpyAsync(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
 #endif /* LIFE */
 	}
 
@@ -259,6 +268,8 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
 	if ((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(cures);
 
+	if (block->bz == 0)
+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
 }
 #endif /* STARPU_USE_CUDA */
 
@@ -266,8 +277,8 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
  * Load a neighbour's boundary into block, OpenCL version
  */
 #ifdef STARPU_USE_OPENCL
-static void load_subblock_from_buffer_opencl(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_from_buffer_opencl(struct starpu_block_interface *block,
+					struct starpu_block_interface *boundary,
 					unsigned firstz)
 {
 	check_load(block, boundary);
@@ -278,10 +289,14 @@ static void load_subblock_from_buffer_opencl(starpu_block_interface_t *block,
 	unsigned offset = firstz*block->ldz;
 	cl_mem block_data = (cl_mem)block->ptr;
 	cl_mem boundary_data = (cl_mem)boundary->ptr;
+	cl_event event;
 
         cl_command_queue cq;
         starpu_opencl_get_current_queue(&cq);
-        clEnqueueCopyBuffer(cq, boundary_data, block_data, 0, offset, boundary_size, 0, NULL, NULL);
+        clEnqueueCopyBuffer(cq, boundary_data, block_data, 0, offset, boundary_size, 0, NULL, &event);
+
+	clWaitForEvents(1, &event);
+	clReleaseEvent(event);
 }
 
 /*
@@ -332,17 +347,20 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
 
 	for (i=1; i<=K; i++)
 	{
-		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
+		struct starpu_block_interface *oldb = descr[i%2], *newb = descr[(i+1)%2];
+		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
 
 		/* Shadow data */
 		opencl_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 
 		/* And perform actual computation */
 #ifdef LIFE
-		opencl_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+		opencl_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 #else
-                clEnqueueCopyBuffer(cq, old, new, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), 0, NULL, NULL);
+		cl_event event;
+                clEnqueueCopyBuffer(cq, old, newer, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), 0, NULL, &event);
+		clWaitForEvents(1, &event);
+		clReleaseEvent(event);
 #endif /* LIFE */
 	}
 
@@ -350,6 +368,8 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
 	if ((err = clFinish(cq)))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
+	if (block->bz == 0)
+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
 }
 #endif /* STARPU_USE_OPENCL */
 
@@ -358,7 +378,7 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
  */
 static void update_func_cpu(void *descr[], void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
 	if (block->bz == 0)
@@ -398,8 +418,8 @@ fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 
 	for (i=1; i<=K; i++)
 	{
-		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
+		struct starpu_block_interface *oldb = (struct starpu_block_interface *) descr[i%2], *newb = (struct starpu_block_interface *) descr[(i+1)%2];
+		TYPE *old = (TYPE*) oldb->ptr, *newer = (TYPE*) newb->ptr;
 
 		/* Shadow data */
 		unsigned ldy = oldb->ldy, ldz = oldb->ldz;
@@ -417,20 +437,25 @@ fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 
 		/* And perform actual computation */
 #ifdef LIFE
-		life_update(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+		life_update(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 #else
-		memcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new));
+		memcpy(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer));
 #endif /* LIFE */
 	}
+
+	if (block->bz == 0)
+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
 }
 
 /* Performance model and codelet structure */
-static struct starpu_perfmodel_t cl_update_model = {
+static struct starpu_perfmodel cl_update_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "cl_update" 
 };
 
-starpu_codelet cl_update = {
+struct starpu_codelet cl_update =
+{
 	.where = 0 |
 #ifdef STARPU_USE_CUDA
 		STARPU_CUDA|
@@ -439,15 +464,16 @@ starpu_codelet cl_update = {
                 STARPU_OPENCL|
 #endif
 		STARPU_CPU,
-	.cpu_func = update_func_cpu,
+	.cpu_funcs = {update_func_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = update_func_cuda,
+	.cuda_funcs = {update_func_cuda, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = update_func_opencl,
+	.opencl_funcs = {update_func_opencl, NULL},
 #endif
 	.model = &cl_update_model,
-	.nbuffers = 6
+	.nbuffers = 6,
+	.modes = {STARPU_RW, STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
 };
 
 /*
@@ -455,10 +481,12 @@ starpu_codelet cl_update = {
  */
 
 /* CPU version */
-static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_into_buffer_cpu(void *_block,
+					void *_boundary,
 					unsigned firstz)
 {
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
 	check_load(block, boundary);
 
 	/* We do a contiguous memory transfer */
@@ -472,10 +500,12 @@ static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
 
 /* CUDA version */
 #ifdef STARPU_USE_CUDA
-static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_into_buffer_cuda(void *_block,
+					void *_boundary,
 					unsigned firstz)
 {
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
 	check_load(block, boundary);
 
 	/* We do a contiguous memory transfer */
@@ -490,8 +520,8 @@ static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
 
 /* OPENCL version */
 #ifdef STARPU_USE_OPENCL
-static void load_subblock_into_buffer_opencl(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_into_buffer_opencl(struct starpu_block_interface *block,
+					struct starpu_block_interface *boundary,
 					unsigned firstz)
 {
 	check_load(block, boundary);
@@ -505,8 +535,12 @@ static void load_subblock_into_buffer_opencl(starpu_block_interface_t *block,
 
         cl_command_queue cq;
         starpu_opencl_get_current_queue(&cq);
+	cl_event event;
+
+        clEnqueueCopyBuffer(cq, block_data, boundary_data, offset, 0, boundary_size, 0, NULL, &event);
 
-        clEnqueueCopyBuffer(cq, block_data, boundary_data, offset, 0, boundary_size, 0, NULL, NULL);
+	clWaitForEvents(1, &event);
+	clReleaseEvent(event);
 }
 #endif /* STARPU_USE_OPENCL */
 
@@ -517,7 +551,7 @@ unsigned bottom_per_worker[STARPU_NMAXWORKERS];
 /* top save, CPU version */
 static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	top_per_worker[workerid]++;
 
@@ -533,7 +567,7 @@ static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
 /* bottom save, CPU version */
 static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	bottom_per_worker[workerid]++;
 
@@ -547,7 +581,7 @@ static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *a
 #ifdef STARPU_USE_CUDA
 static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	top_per_worker[workerid]++;
 
@@ -564,7 +598,7 @@ static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg
 /* bottom save, CUDA version */
 static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	bottom_per_worker[workerid]++;
 
@@ -580,7 +614,7 @@ static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *
 #ifdef STARPU_USE_OPENCL
 static void dummy_func_top_opencl(void *descr[] __attribute__((unused)), void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	top_per_worker[workerid]++;
 
@@ -600,7 +634,7 @@ static void dummy_func_top_opencl(void *descr[] __attribute__((unused)), void *a
 /* bottom save, OPENCL version */
 static void dummy_func_bottom_opencl(void *descr[] __attribute__((unused)), void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	bottom_per_worker[workerid]++;
 
@@ -616,17 +650,20 @@ static void dummy_func_bottom_opencl(void *descr[] __attribute__((unused)), void
 #endif /* STARPU_USE_OPENCL */
 
 /* Performance models and codelet for save */
-static struct starpu_perfmodel_t save_cl_bottom_model = {
+static struct starpu_perfmodel save_cl_bottom_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "save_cl_bottom" 
 };
 
-static struct starpu_perfmodel_t save_cl_top_model = {
+static struct starpu_perfmodel save_cl_top_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "save_cl_top" 
 };
 
-starpu_codelet save_cl_bottom = {
+struct starpu_codelet save_cl_bottom =
+{
 	.where = 0 |
 #ifdef STARPU_USE_CUDA
 		STARPU_CUDA|
@@ -635,18 +672,20 @@ starpu_codelet save_cl_bottom = {
 		STARPU_OPENCL|
 #endif
 		STARPU_CPU,
-	.cpu_func = dummy_func_bottom_cpu,
+	.cpu_funcs = {dummy_func_bottom_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = dummy_func_bottom_cuda,
+	.cuda_funcs = {dummy_func_bottom_cuda, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = dummy_func_bottom_opencl,
+	.opencl_funcs = {dummy_func_bottom_opencl, NULL},
 #endif
 	.model = &save_cl_bottom_model,
-	.nbuffers = 4
+	.nbuffers = 4,
+	.modes = {STARPU_R, STARPU_R, STARPU_W, STARPU_W}
 };
 
-starpu_codelet save_cl_top = {
+struct starpu_codelet save_cl_top =
+{
 	.where = 0|
 #ifdef STARPU_USE_CUDA
 		STARPU_CUDA|
@@ -655,13 +694,14 @@ starpu_codelet save_cl_top = {
 		STARPU_OPENCL|
 #endif
 		STARPU_CPU,
-	.cpu_func = dummy_func_top_cpu,
+	.cpu_funcs = {dummy_func_top_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = dummy_func_top_cuda,
+	.cuda_funcs = {dummy_func_top_cuda, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = dummy_func_top_opencl,
+	.opencl_funcs = {dummy_func_top_opencl, NULL},
 #endif
 	.model = &save_cl_top_model,
-	.nbuffers = 4
+	.nbuffers = 4,
+	.modes = {STARPU_R, STARPU_R, STARPU_W, STARPU_W}
 };

+ 47 - 49
examples/stencil/stencil-tasks.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -48,17 +49,13 @@ static void create_task_save_local(unsigned iter, unsigned z, int dir, unsigned
 	save_task->cl_arg = descr;
 
 	/* Saving our border... */
-	save_task->buffers[0].handle = descr->layers_handle[0];
-	save_task->buffers[0].mode = STARPU_R;
-	save_task->buffers[1].handle = descr->layers_handle[1];
-	save_task->buffers[1].mode = STARPU_R;
+	save_task->handles[0] = descr->layers_handle[0];
+	save_task->handles[1] = descr->layers_handle[1];
 
 	/* ... to the neighbour's copy */
 	struct block_description *neighbour = descr->boundary_blocks[(1+dir)/2];
-	save_task->buffers[2].handle = neighbour->boundaries_handle[(1-dir)/2][0];
-	save_task->buffers[2].mode = STARPU_W;
-	save_task->buffers[3].handle = neighbour->boundaries_handle[(1-dir)/2][1];
-	save_task->buffers[3].mode = STARPU_W;
+	save_task->handles[2] = neighbour->boundaries_handle[(1-dir)/2][0];
+	save_task->handles[3] = neighbour->boundaries_handle[(1-dir)/2][1];
 
 	/* Bind */
 	if (iter <= BIND_LAST)
@@ -69,14 +66,15 @@ static void create_task_save_local(unsigned iter, unsigned z, int dir, unsigned
 	if (ret)
 	{
 		fprintf(stderr, "Could not submit task save: %d\n", ret);
-		STARPU_ASSERT(0);
+		STARPU_ABORT();
 	}
 }
 
 /* R(z) = local & R(z+d) != local */
 /* We need to send our save over MPI */
 
-static void send_done(void *arg) {
+static void send_done(void *arg)
+{
 	uintptr_t z = (uintptr_t) arg;
 	DEBUG("DO SEND %d\n", (int)z);
 }
@@ -93,8 +91,8 @@ static void create_task_save_mpi_send(unsigned iter, unsigned z, int dir, unsign
 	STARPU_ASSERT(neighbour->mpi_node != local_rank);
 
 	/* Send neighbour's border copy to the neighbour */
-	starpu_data_handle handle0 = neighbour->boundaries_handle[(1-dir)/2][0];
-	starpu_data_handle handle1 = neighbour->boundaries_handle[(1-dir)/2][1];
+	starpu_data_handle_t handle0 = neighbour->boundaries_handle[(1-dir)/2][0];
+	starpu_data_handle_t handle1 = neighbour->boundaries_handle[(1-dir)/2][1];
 
 	starpu_mpi_isend_detached(handle0, dest, MPI_TAG0(z, iter, dir), MPI_COMM_WORLD, send_done, (void*)(uintptr_t)z);
 	starpu_mpi_isend_detached(handle1, dest, MPI_TAG1(z, iter, dir), MPI_COMM_WORLD, send_done, (void*)(uintptr_t)z);
@@ -103,7 +101,8 @@ static void create_task_save_mpi_send(unsigned iter, unsigned z, int dir, unsign
 /* R(z) != local & R(z+d) = local */
 /* We need to receive over MPI */
 
-static void recv_done(void *arg) {
+static void recv_done(void *arg)
+{
 	uintptr_t z = (uintptr_t) arg;
 	DEBUG("DO RECV %d\n", (int)z);
 }
@@ -119,13 +118,13 @@ static void create_task_save_mpi_recv(unsigned iter, unsigned z, int dir, unsign
 	STARPU_ASSERT(neighbour->mpi_node == local_rank);
 
 	/* Receive our neighbour's border in our neighbour copy */
-	starpu_data_handle handle0 = neighbour->boundaries_handle[(1-dir)/2][0];
-	starpu_data_handle handle1 = neighbour->boundaries_handle[(1-dir)/2][1];
+	starpu_data_handle_t handle0 = neighbour->boundaries_handle[(1-dir)/2][0];
+	starpu_data_handle_t handle1 = neighbour->boundaries_handle[(1-dir)/2][1];
 
 	starpu_mpi_irecv_detached(handle0, source, MPI_TAG0(z, iter, dir), MPI_COMM_WORLD, recv_done, (void*)(uintptr_t)z);
 	starpu_mpi_irecv_detached(handle1, source, MPI_TAG1(z, iter, dir), MPI_COMM_WORLD, recv_done, (void*)(uintptr_t)z);
 }
-#endif // STARPU_USE_MPI
+#endif /* STARPU_USE_MPI */
 
 /*
  * Schedule saving boundaries of blocks to communication buffers
@@ -141,26 +140,28 @@ void create_task_save(unsigned iter, unsigned z, int dir, unsigned local_rank)
 		/* Save data from update */
 		create_task_save_local(iter, z, dir, local_rank);
 		if (node_z_and_d != local_rank)
-		{ // R(z) = local & R(z+d) != local, We have to send the data
+		{ /* R(z) = local & R(z+d) != local, We have to send the data */
 			create_task_save_mpi_send(iter, z, dir, local_rank);
 		}
 
 	}
-	else {	// node_z != local_rank, this MPI node doesn't have the saved data
+	else
+	{	/* node_z != local_rank, this MPI node doesn't have the saved data */
 		if (node_z_and_d == local_rank)
 		{
 			create_task_save_mpi_recv(iter, z, dir, local_rank);
 		}
-		else {  // R(z) != local & R(z+d) != local We don't have
-			// the saved data and don't need it, we shouldn't
-			// even have been called!
-			STARPU_ASSERT(0);
+		else
+		{ /* R(z) != local & R(z+d) != local We don't have
+			      the saved data and don't need it, we shouldn't
+			      even have been called! */
+			STARPU_ABORT();
 		}
 	}
-#else // !STARPU_USE_MPI
+#else /* !STARPU_USE_MPI */
 	STARPU_ASSERT((node_z == local_rank) && (node_z_and_d == local_rank));
 	create_task_save_local(iter, z, dir, local_rank);
-#endif // STARPU_USE_MPI
+#endif /* STARPU_USE_MPI */
 }
 
 /*
@@ -176,7 +177,8 @@ void create_task_update(unsigned iter, unsigned z, unsigned local_rank)
 	unsigned niter = get_niter();
 
 	/* We are going to synchronize with the last tasks */
-	if (iter == niter) {
+	if (iter == niter)
+	{
 		task->detach = 0;
 		task->use_tag = 1;
 		task->tag_id = TAG_FINISH(z);
@@ -186,20 +188,14 @@ void create_task_update(unsigned iter, unsigned z, unsigned local_rank)
 	unsigned new_layer = (old_layer + 1) % 2;
 
 	struct block_description *descr = get_block_description(z);
-	task->buffers[0].handle = descr->layers_handle[new_layer];
-	task->buffers[0].mode = STARPU_RW;
-	task->buffers[1].handle = descr->layers_handle[old_layer];
-	task->buffers[1].mode = STARPU_RW;
+	task->handles[0] = descr->layers_handle[new_layer];
+	task->handles[1] = descr->layers_handle[old_layer];
 
-	task->buffers[2].handle = descr->boundaries_handle[T][new_layer];
-	task->buffers[2].mode = STARPU_R;
-	task->buffers[3].handle = descr->boundaries_handle[T][old_layer];
-	task->buffers[3].mode = STARPU_R;
+	task->handles[2] = descr->boundaries_handle[T][new_layer];
+	task->handles[3] = descr->boundaries_handle[T][old_layer];
 
-	task->buffers[4].handle = descr->boundaries_handle[B][new_layer];
-	task->buffers[4].mode = STARPU_R;
-	task->buffers[5].handle = descr->boundaries_handle[B][old_layer];
-	task->buffers[5].mode = STARPU_R;
+	task->handles[4] = descr->boundaries_handle[B][new_layer];
+	task->handles[5] = descr->boundaries_handle[B][old_layer];
 
 	task->cl = &cl_update;
 	task->cl_arg = descr;
@@ -212,21 +208,24 @@ void create_task_update(unsigned iter, unsigned z, unsigned local_rank)
 	if (ret)
 	{
 		fprintf(stderr, "Could not submit task update block: %d\n", ret);
-		STARPU_ASSERT(0);
+		STARPU_ABORT();
 	}
 }
 
 /* Dummy empty codelet taking one buffer */
 static void null_func(void *descr[] __attribute__((unused)), void *arg __attribute__((unused))) { }
-static starpu_codelet null = {
+static struct starpu_codelet null =
+{
+	.modes = { STARPU_W, STARPU_W },
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
-	.cpu_func = null_func,
-	.cuda_func = null_func,
-	.opencl_func = null_func,
+	.cpu_funcs = {null_func, NULL},
+	.cuda_funcs = {null_func, NULL},
+	.opencl_funcs = {null_func, NULL},
 	.nbuffers = 2
 };
 
-void create_start_task(int z, int dir) {
+void create_start_task(int z, int dir)
+{
 	/* Dumb task depending on the init task and simulating writing the
 	   neighbour buffers, to avoid communications and computation running
 	   before we start measuring time */
@@ -236,17 +235,15 @@ void create_start_task(int z, int dir) {
 	wait_init->cl = &null;
 	wait_init->use_tag = 1;
 	wait_init->tag_id = TAG_START(z, dir);
-	wait_init->buffers[0].handle = descr->boundaries_handle[(1+dir)/2][0];
-	wait_init->buffers[0].mode = STARPU_W;
-	wait_init->buffers[1].handle = descr->boundaries_handle[(1+dir)/2][1];
-	wait_init->buffers[1].mode = STARPU_W;
+	wait_init->handles[0] = descr->boundaries_handle[(1 + dir) / 2][0];
+	wait_init->handles[1] = descr->boundaries_handle[(1 + dir) / 2][1];
 	starpu_tag_declare_deps_array(wait_init->tag_id, 1, &tag_init);
 
 	int ret = starpu_task_submit(wait_init);
 	if (ret)
 	{
 		fprintf(stderr, "Could not submit task initial wait: %d\n", ret);
-		STARPU_ASSERT(0);
+		STARPU_ABORT();
 	}
 }
 
@@ -261,7 +258,8 @@ void create_tasks(int rank)
 	int niter = get_niter();
 	int nbz = get_nbz();
 
-	for (bz = 0; bz < nbz; bz++) {
+	for (bz = 0; bz < nbz; bz++)
+	{
 		if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz+1) == rank))
 			create_start_task(bz, +1);
 		if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz-1) == rank))
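
The task-side counterpart of the buffers-to-handles migration in this file, as a minimal sketch; cl_copy, src_handle and dst_handle are hypothetical names, not taken from stencil-tasks.c.

	#include <stdio.h>
	#include <starpu.h>

	/* Assumed to be defined elsewhere: a two-buffer codelet whose .modes
	 * is {STARPU_R, STARPU_W}, and two registered data handles (note the
	 * 1.0 typedef starpu_data_handle_t). */
	extern struct starpu_codelet cl_copy;
	extern starpu_data_handle_t src_handle, dst_handle;

	static void submit_copy_task(void)
	{
		struct starpu_task *task = starpu_task_create();

		task->cl = &cl_copy;
		/* Tasks now only list handles; the access mode of each slot
		 * comes from the codelet's .modes array. */
		task->handles[0] = src_handle;
		task->handles[1] = dst_handle;

		int ret = starpu_task_submit(task);
		if (ret)
		{
			fprintf(stderr, "Could not submit copy task: %d\n", ret);
			STARPU_ABORT();
		}
	}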

+ 1 - 1
examples/tag_example/tag_example.c

@@ -43,7 +43,7 @@
 
 struct starpu_codelet cl = {};
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 #define Ni	32
 #define Nj	32
 #define Nk	32

+ 1 - 1
examples/tag_example/tag_example2.c

@@ -120,7 +120,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 		exit(77);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	ni /= 4;
 	nk /= 16;
 #endif

+ 1 - 1
examples/tag_example/tag_example3.c

@@ -122,7 +122,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 		exit(77);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	ni /= 4;
 	nk /= 16;
 #endif

+ 1 - 1
examples/tag_example/tag_restartable.c

@@ -129,7 +129,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	ni /= 4;
 	nk /= 16;
 #endif

+ 1 - 1
tests/microbenchs/prefetch_data_on_node.c

@@ -23,7 +23,7 @@
 #include <pthread.h>
 #include "../helper.h"
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 #define N		100
 #else
 #define N		1000

+ 1 - 1
tests/microbenchs/sync_tasks_overhead.c

@@ -93,7 +93,7 @@ int main(int argc, char **argv)
 	struct timeval start;
 	struct timeval end;
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	ntasks = 128;
 #endif
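
All of the STARPU_SLOW_MACHINE to STARPU_QUICK_CHECK hunks above follow the same pattern; a self-contained sketch of how such a test typically shrinks its workload when the macro is defined (the default iteration count and the divisor are illustrative assumptions):

	#include <errno.h>
	#include <stdlib.h>
	#include <starpu.h>

	static unsigned niter = 4096;	/* illustrative default workload */

	int main(int argc, char **argv)
	{
		int ret = starpu_init(NULL);
		if (ret == -ENODEV)
			return 77;	/* no device available: skip the test */
		STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	#ifdef STARPU_QUICK_CHECK
		/* Defined when StarPU is configured for quick test runs: cut
		 * the amount of work so the test finishes quickly. */
		niter /= 100;
	#endif
		if (argc == 2)
			niter = atoi(argv[1]);

		/* ... run niter iterations of the benchmark here ... */

		starpu_shutdown();
		return 0;
	}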