12 years ago · a9f3cb5acb
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -3,7 +3,7 @@
 
																 # Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
															
 
																 # Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																 # Copyright (C) 2011  Télécom-SudParis
															
 
																-# Copyright (C) 2011-2012  INRIA
															
 
																+# Copyright (C) 2012 INRIA
															
 
																 #
															
 
																 # StarPU is free software; you can redistribute it and/or modify
															
 
																 # it under the terms of the GNU Lesser General Public License as published by
															
@@ -47,7 +47,6 @@ EXTRA_DIST = 					\
 
																 	lu/xlu_implicit_pivot.c			\
															
 
																 	lu/xlu_kernels.c			\
															
 
																 	lu/lu_example.c				\
															
 
																-	sched_ctx_utils/sched_ctx_utils.c		\
															
 
																 	incrementer/incrementer_kernels_opencl_kernel.cl 	\
															
 
																 	basic_examples/variable_kernels_opencl_kernel.cl	\
															
 
																 	matvecmult/matvecmult_kernel.cl				\
															
@@ -100,7 +99,6 @@ noinst_HEADERS = 				\
 
																 	lu/complex_double.h			\
															
 
																 	lu/blas_complex.h			\
															
 
																 	cholesky/cholesky.h			\
															
 
																-	sched_ctx_utils/sched_ctx_utils.h	\
															
 
																 	common/blas_model.h			\
															
 
																 	common/blas.h				\
															
 
																 	mult/simple.h				\
															
@@ -514,7 +512,6 @@ cholesky_cholesky_implicit_SOURCES =		\
 
																 	cholesky/cholesky_implicit.c		\
															
 
																 	cholesky/cholesky_models.c		\
															
 
																 	cholesky/cholesky_kernels.c		\
															
 
																-	sched_ctx_utils/sched_ctx_utils.c	\
															
 
																 	common/blas.c
															
 
																 cholesky_cholesky_implicit_LDADD =		\
															
--- a/examples/basic_examples/variable.c
+++ b/examples/basic_examples/variable.c
@@ -45,7 +45,7 @@ int main(int argc, char **argv)
 
																 	if (ret == -ENODEV) goto enodev;
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
															
 
																-#ifdef STARPU_SLOW_MACHINE
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																 	niter /= 100;
															
 
																 #endif
															
 
																         if (argc == 2) niter = atoi(argv[1]);
															
--- a/examples/cg/cg.c
+++ b/examples/cg/cg.c
@@ -406,7 +406,7 @@ int main(int argc, char **argv)
 
																 {
															
 
																 	int ret;
															
 
																-#ifdef STARPU_SLOW_MACHINE
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																 	i_max = 16;
															
 
																 #endif
															
--- a/examples/cg/cg_kernels.c
+++ b/examples/cg/cg_kernels.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -16,6 +16,7 @@
 
																 #include "cg.h"
															
 
																 #include <math.h>
															
 
																+#include <limits.h>
															
 
																 #if 0
															
 
																 static void print_vector_from_descr(unsigned nx, TYPE *v)
															
@@ -43,6 +44,23 @@ static void print_matrix_from_descr(unsigned nx, unsigned ny, unsigned ld, TYPE
 
																 }
															
 
																 #endif
															
 
																+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
															
 
																+{
															
 
																+	enum starpu_archtype type = starpu_worker_get_type(workerid);
															
 
																+	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
															
 
																+		return 1;
															
 
																+
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	/* CUDA device: accept it only if it has compute capability >= 1.3 */
															
 
																+	const struct cudaDeviceProp *props;
															
 
																+	props = starpu_cuda_get_device_properties(workerid);
															
 
																+	if (props->major >= 2 || (props->major == 1 && props->minor >= 3))
															
 
																+		/* At least compute capability 1.3, supports doubles */
															
 
																+		return 1;
															
 
																+#endif
															
 
																+	/* Neither a CPU/OpenCL worker nor a CUDA card that supports doubles */
															
 
																+	return 0;
															
 
																+}
															
 
																 /*
															
 
																  *	Reduction accumulation methods
															
@@ -67,16 +85,19 @@ static void accumulate_variable_cpu(void *descr[], void *cl_arg)
 
																 	*v_dst = *v_dst + *v_src;
															
 
																 }
															
 
																-static struct starpu_perfmodel_t accumulate_variable_model = {
															
 
																+static struct starpu_perfmodel accumulate_variable_model =
															
 
																+{
															
 
																 	.type = STARPU_HISTORY_BASED,
															
 
																 	.symbol = "accumulate_variable"
															
 
																 };
															
 
																-starpu_codelet accumulate_variable_cl = {
															
 
																+struct starpu_codelet accumulate_variable_cl =
															
 
																+{
															
 
																+	.can_execute = can_execute,
															
 
																 	.where = STARPU_CPU|STARPU_CUDA,
															
 
																-	.cpu_func = accumulate_variable_cpu,
															
 
																+	.cpu_funcs = {accumulate_variable_cpu, NULL},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-	.cuda_func = accumulate_variable_cuda,
															
 
																+	.cuda_funcs = {accumulate_variable_cuda, NULL},
															
 
																 #endif
															
 
																 	.nbuffers = 2,
															
 
																 	.model = &accumulate_variable_model
															
@@ -103,16 +124,19 @@ static void accumulate_vector_cpu(void *descr[], void *cl_arg)
 
																 	AXPY(n, (TYPE)1.0, v_src, 1, v_dst, 1);
															
 
																 }
															
 
																-static struct starpu_perfmodel_t accumulate_vector_model = {
															
 
																+static struct starpu_perfmodel accumulate_vector_model =
															
 
																+{
															
 
																 	.type = STARPU_HISTORY_BASED,
															
 
																 	.symbol = "accumulate_vector"
															
 
																 };
															
 
																-starpu_codelet accumulate_vector_cl = {
															
 
																+struct starpu_codelet accumulate_vector_cl =
															
 
																+{
															
 
																+	.can_execute = can_execute,
															
 
																 	.where = STARPU_CPU|STARPU_CUDA,
															
 
																-	.cpu_func = accumulate_vector_cpu,
															
 
																+	.cpu_funcs = {accumulate_vector_cpu, NULL},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-	.cuda_func = accumulate_vector_cuda,
															
 
																+	.cuda_funcs = {accumulate_vector_cuda, NULL},
															
 
																 #endif
															
 
																 	.nbuffers = 2,
															
 
																 	.model = &accumulate_vector_model
															
@@ -141,16 +165,19 @@ static void bzero_variable_cpu(void *descr[], void *cl_arg)
 
																 	*v = (TYPE)0.0;
															
 
																 }
															
 
																-static struct starpu_perfmodel_t bzero_variable_model = {
															
 
																+static struct starpu_perfmodel bzero_variable_model =
															
 
																+{
															
 
																 	.type = STARPU_HISTORY_BASED,
															
 
																 	.symbol = "bzero_variable"
															
 
																 };
															
 
																-starpu_codelet bzero_variable_cl = {
															
 
																+struct starpu_codelet bzero_variable_cl =
															
 
																+{
															
 
																+	.can_execute = can_execute,
															
 
																 	.where = STARPU_CPU|STARPU_CUDA,
															
 
																-	.cpu_func = bzero_variable_cpu,
															
 
																+	.cpu_funcs = {bzero_variable_cpu, NULL},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-	.cuda_func = bzero_variable_cuda,
															
 
																+	.cuda_funcs = {bzero_variable_cuda, NULL},
															
 
																 #endif
															
 
																 	.nbuffers = 1,
															
 
																 	.model = &bzero_variable_model
															
@@ -176,16 +203,19 @@ static void bzero_vector_cpu(void *descr[], void *cl_arg)
 
																 	memset(v, 0, n*sizeof(TYPE));
															
 
																 }
															
 
																-static struct starpu_perfmodel_t bzero_vector_model = {
															
 
																+static struct starpu_perfmodel bzero_vector_model =
															
 
																+{
															
 
																 	.type = STARPU_HISTORY_BASED,
															
 
																 	.symbol = "bzero_vector"
															
 
																 };
															
 
																-starpu_codelet bzero_vector_cl = {
															
 
																+struct starpu_codelet bzero_vector_cl =
															
 
																+{
															
 
																+	.can_execute = can_execute,
															
 
																 	.where = STARPU_CPU|STARPU_CUDA,
															
 
																-	.cpu_func = bzero_vector_cpu,
															
 
																+	.cpu_funcs = {bzero_vector_cpu, NULL},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-	.cuda_func = bzero_vector_cuda,
															
 
																+	.cuda_funcs = {bzero_vector_cuda, NULL},
															
 
																 #endif
															
 
																 	.nbuffers = 1,
															
 
																 	.model = &bzero_vector_model
															
@@ -229,39 +259,48 @@ static void dot_kernel_cpu(void *descr[], void *cl_arg)
 
																 	*dot = *dot + local_dot;
															
 
																 }
															
 
																-static struct starpu_perfmodel_t dot_kernel_model = {
															
 
																+static struct starpu_perfmodel dot_kernel_model =
															
 
																+{
															
 
																 	.type = STARPU_HISTORY_BASED,
															
 
																 	.symbol = "dot_kernel"
															
 
																 };
															
 
																-static starpu_codelet dot_kernel_cl = {
															
 
																+static struct starpu_codelet dot_kernel_cl =
															
 
																+{
															
 
																+	.can_execute = can_execute,
															
 
																 	.where = STARPU_CPU|STARPU_CUDA,
															
 
																-	.cpu_func = dot_kernel_cpu,
															
 
																+	.cpu_funcs = {dot_kernel_cpu, NULL},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-	.cuda_func = dot_kernel_cuda,
															
 
																+	.cuda_funcs = {dot_kernel_cuda, NULL},
															
 
																 #endif
															
 
																 	.nbuffers = 3,
															
 
																 	.model = &dot_kernel_model
															
 
																 };
															
 
																-void dot_kernel(starpu_data_handle v1,
															
 
																-		starpu_data_handle v2,
															
 
																-		starpu_data_handle s,
															
 
																-		unsigned nblocks,
															
 
																-		int use_reduction)
															
 
																+int dot_kernel(starpu_data_handle_t v1,
															
 
																+	       starpu_data_handle_t v2,
															
 
																+	       starpu_data_handle_t s,
															
 
																+	       unsigned nblocks,
															
 
																+	       int use_reduction)
															
 
																 {
															
 
																+	int ret;
															
 
																+
															
 
																 	/* Blank the accumulation variable */
															
 
																-	starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
															
 
																+	ret = starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
															
 
																+	if (ret == -ENODEV) return ret;
															
 
																+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
															
 
																 	unsigned b;
															
 
																 	for (b = 0; b < nblocks; b++)
															
 
																 	{
															
 
																-		starpu_insert_task(&dot_kernel_cl,
															
 
																-			use_reduction?STARPU_REDUX:STARPU_RW, s,
															
 
																-			STARPU_R, starpu_data_get_sub_data(v1, 1, b),
															
 
																-			STARPU_R, starpu_data_get_sub_data(v2, 1, b),
															
 
																-			0);
															
 
																+		ret = starpu_insert_task(&dot_kernel_cl,
															
 
																+					 use_reduction?STARPU_REDUX:STARPU_RW, s,
															
 
																+					 STARPU_R, starpu_data_get_sub_data(v1, 1, b),
															
 
																+					 STARPU_R, starpu_data_get_sub_data(v2, 1, b),
															
 
																+					 0);
															
 
																+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
															
 
																 	}
															
 
																+	return 0;
															
 
																 }
															
 
																 /*
															
@@ -272,7 +311,7 @@ void dot_kernel(starpu_data_handle v1,
 
																 static void scal_kernel_cuda(void *descr[], void *cl_arg)
															
 
																 {
															
 
																 	TYPE p1;
															
 
																-	starpu_unpack_cl_args(cl_arg, &p1);
															
 
																+	starpu_codelet_unpack_args(cl_arg, &p1);
															
 
																 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
															
@@ -287,7 +326,7 @@ static void scal_kernel_cuda(void *descr[], void *cl_arg)
 
																 static void scal_kernel_cpu(void *descr[], void *cl_arg)
															
 
																 {
															
 
																 	TYPE alpha;
															
 
																-	starpu_unpack_cl_args(cl_arg, &alpha);
															
 
																+	starpu_codelet_unpack_args(cl_arg, &alpha);
															
 
																 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
															
@@ -296,16 +335,19 @@ static void scal_kernel_cpu(void *descr[], void *cl_arg)
 
																 	SCAL(n, alpha, v1, 1);
															
 
																 }
															
 
																-static struct starpu_perfmodel_t scal_kernel_model = {
															
 
																+static struct starpu_perfmodel scal_kernel_model =
															
 
																+{
															
 
																 	.type = STARPU_HISTORY_BASED,
															
 
																 	.symbol = "scal_kernel"
															
 
																 };
															
 
																-static starpu_codelet scal_kernel_cl = {
															
 
																+static struct starpu_codelet scal_kernel_cl =
															
 
																+{
															
 
																+	.can_execute = can_execute,
															
 
																 	.where = STARPU_CPU|STARPU_CUDA,
															
 
																-	.cpu_func = scal_kernel_cpu,
															
 
																+	.cpu_funcs = {scal_kernel_cpu, NULL},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-	.cuda_func = scal_kernel_cuda,
															
 
																+	.cuda_funcs = {scal_kernel_cuda, NULL},
															
 
																 #endif
															
 
																 	.nbuffers = 1,
															
 
																 	.model = &scal_kernel_model
															
@@ -327,7 +369,7 @@ static void gemv_kernel_cuda(void *descr[], void *cl_arg)
 
																 	unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);
															
 
																 	TYPE alpha, beta;
															
 
																-	starpu_unpack_cl_args(cl_arg, &beta, &alpha);
															
 
																+	starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
															
 
																 	/* Compute v1 = alpha M v2 + beta v1 */
															
 
																 	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
															
@@ -346,7 +388,7 @@ static void gemv_kernel_cpu(void *descr[], void *cl_arg)
 
																 	unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);
															
 
																 	TYPE alpha, beta;
															
 
																-	starpu_unpack_cl_args(cl_arg, &beta, &alpha);
															
 
																+	starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
															
 
																 	int worker_size = starpu_combined_worker_get_size();
															
@@ -367,38 +409,44 @@ static void gemv_kernel_cpu(void *descr[], void *cl_arg)
 
																 	GEMV("N", nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
															
 
																 }
															
 
																-static struct starpu_perfmodel_t gemv_kernel_model = {
															
 
																+static struct starpu_perfmodel gemv_kernel_model =
															
 
																+{
															
 
																 	.type = STARPU_HISTORY_BASED,
															
 
																 	.symbol = "gemv_kernel"
															
 
																 };
															
 
																-static starpu_codelet gemv_kernel_cl = {
															
 
																+static struct starpu_codelet gemv_kernel_cl =
															
 
																+{
															
 
																+	.can_execute = can_execute,
															
 
																 	.where = STARPU_CPU|STARPU_CUDA,
															
 
																 	.type = STARPU_SPMD,
															
 
																 	.max_parallelism = INT_MAX,
															
 
																-	.cpu_func = gemv_kernel_cpu,
															
 
																+	.cpu_funcs = {gemv_kernel_cpu, NULL},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-	.cuda_func = gemv_kernel_cuda,
															
 
																+	.cuda_funcs = {gemv_kernel_cuda, NULL},
															
 
																 #endif
															
 
																 	.nbuffers = 3,
															
 
																 	.model = &gemv_kernel_model
															
 
																 };
															
 
																-void gemv_kernel(starpu_data_handle v1,
															
 
																-		starpu_data_handle matrix,
															
 
																-		starpu_data_handle v2,
															
 
																+int gemv_kernel(starpu_data_handle_t v1,
															
 
																+		starpu_data_handle_t matrix,
															
 
																+		starpu_data_handle_t v2,
															
 
																 		TYPE p1, TYPE p2,
															
 
																 		unsigned nblocks,
															
 
																 		int use_reduction)
															
 
																 {
															
 
																 	unsigned b1, b2;
															
 
																+	int ret;
															
 
																 	for (b2 = 0; b2 < nblocks; b2++)
															
 
																 	{
															
 
																-		starpu_insert_task(&scal_kernel_cl,
															
 
																-			STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
															
 
																-			STARPU_VALUE, &p1, sizeof(p1),
															
 
																-			0);
															
 
																+		ret = starpu_insert_task(&scal_kernel_cl,
															
 
																+					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
															
 
																+					 STARPU_VALUE, &p1, sizeof(p1),
															
 
																+					 0);
															
 
																+		if (ret == -ENODEV) return ret;
															
 
																+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
															
 
																 	}
															
 
																 	for (b2 = 0; b2 < nblocks; b2++)
															
@@ -406,15 +454,17 @@ void gemv_kernel(starpu_data_handle v1,
 
																 		for (b1 = 0; b1 < nblocks; b1++)
															
 
																 		{
															
 
																 			TYPE one = 1.0;
															
 
																-			starpu_insert_task(&gemv_kernel_cl,
															
 
																-				use_reduction?STARPU_REDUX:STARPU_RW,	starpu_data_get_sub_data(v1, 1, b2),
															
 
																-				STARPU_R,	starpu_data_get_sub_data(matrix, 2, b2, b1),
															
 
																-				STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
															
 
																-				STARPU_VALUE,	&one,	sizeof(one),
															
 
																-				STARPU_VALUE,	&p2,	sizeof(p2),
															
 
																-				0);
															
 
																+			ret = starpu_insert_task(&gemv_kernel_cl,
															
 
																+						 use_reduction?STARPU_REDUX:STARPU_RW,	starpu_data_get_sub_data(v1, 1, b2),
															
 
																+						 STARPU_R,	starpu_data_get_sub_data(matrix, 2, b2, b1),
															
 
																+						 STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
															
 
																+						 STARPU_VALUE,	&one,	sizeof(one),
															
 
																+						 STARPU_VALUE,	&p2,	sizeof(p2),
															
 
																+						 0);
															
 
																+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
															
 
																 		}
															
 
																 	}
															
 
																+	return 0;
															
 
																 }
															
 
																 /*
															
@@ -424,7 +474,7 @@ void gemv_kernel(starpu_data_handle v1,
 
																 static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
															
 
																 {
															
 
																 	TYPE p1, p2;
															
 
																-	starpu_unpack_cl_args(cl_arg, &p1, &p2);
															
 
																+	starpu_codelet_unpack_args(cl_arg, &p1, &p2);
															
 
																 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
															
@@ -444,7 +494,7 @@ static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 
																 static void scal_axpy_kernel_cpu(void *descr[], void *cl_arg)
															
 
																 {
															
 
																 	TYPE p1, p2;
															
 
																-	starpu_unpack_cl_args(cl_arg, &p1, &p2);
															
 
																+	starpu_codelet_unpack_args(cl_arg, &p1, &p2);
															
 
																 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
															
@@ -459,35 +509,42 @@ static void scal_axpy_kernel_cpu(void *descr[], void *cl_arg)
 
																 	AXPY(nx, p2, v2, 1, v1, 1);
															
 
																 }
															
 
																-static struct starpu_perfmodel_t scal_axpy_kernel_model = {
															
 
																+static struct starpu_perfmodel scal_axpy_kernel_model =
															
 
																+{
															
 
																 	.type = STARPU_HISTORY_BASED,
															
 
																 	.symbol = "scal_axpy_kernel"
															
 
																 };
															
 
																-static starpu_codelet scal_axpy_kernel_cl = {
															
 
																+static struct starpu_codelet scal_axpy_kernel_cl =
															
 
																+{
															
 
																+	.can_execute = can_execute,
															
 
																 	.where = STARPU_CPU|STARPU_CUDA,
															
 
																-	.cpu_func = scal_axpy_kernel_cpu,
															
 
																+	.cpu_funcs = {scal_axpy_kernel_cpu, NULL},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-	.cuda_func = scal_axpy_kernel_cuda,
															
 
																+	.cuda_funcs = {scal_axpy_kernel_cuda, NULL},
															
 
																 #endif
															
 
																 	.nbuffers = 2,
															
 
																 	.model = &scal_axpy_kernel_model
															
 
																 };
															
 
																-void scal_axpy_kernel(starpu_data_handle v1, TYPE p1,
															
 
																-			starpu_data_handle v2, TYPE p2,
															
 
																-			unsigned nblocks)
															
 
																+int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
															
 
																+		     starpu_data_handle_t v2, TYPE p2,
															
 
																+		     unsigned nblocks)
															
 
																 {
															
 
																+	int ret;
															
 
																 	unsigned b;
															
 
																 	for (b = 0; b < nblocks; b++)
															
 
																 	{
															
 
																-		starpu_insert_task(&scal_axpy_kernel_cl,
															
 
																-			STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
															
 
																-			STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
															
 
																-			STARPU_VALUE, &p1, sizeof(p1),
															
 
																-			STARPU_VALUE, &p2, sizeof(p2),
															
 
																-			0);
															
 
																+		ret = starpu_insert_task(&scal_axpy_kernel_cl,
															
 
																+					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
															
 
																+					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
															
 
																+					 STARPU_VALUE, &p1, sizeof(p1),
															
 
																+					 STARPU_VALUE, &p2, sizeof(p2),
															
 
																+					 0);
															
 
																+		if (ret == -ENODEV) return ret;
															
 
																+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
															
 
																 	}
															
 
																+	return 0;
															
 
																 }
															
@@ -498,7 +555,7 @@ void scal_axpy_kernel(starpu_data_handle v1, TYPE p1,
 
																 static void axpy_kernel_cuda(void *descr[], void *cl_arg)
															
 
																 {
															
 
																 	TYPE p1;
															
 
																-	starpu_unpack_cl_args(cl_arg, &p1);
															
 
																+	starpu_codelet_unpack_args(cl_arg, &p1);
															
 
																 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
															
@@ -515,7 +572,7 @@ static void axpy_kernel_cuda(void *descr[], void *cl_arg)
 
																 static void axpy_kernel_cpu(void *descr[], void *cl_arg)
															
 
																 {
															
 
																 	TYPE p1;
															
 
																-	starpu_unpack_cl_args(cl_arg, &p1);
															
 
																+	starpu_codelet_unpack_args(cl_arg, &p1);
															
 
																 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
															
@@ -527,89 +584,47 @@ static void axpy_kernel_cpu(void *descr[], void *cl_arg)
 
																 	AXPY(nx, p1, v2, 1, v1, 1);
															
 
																 }
															
 
																-static struct starpu_perfmodel_t axpy_kernel_model = {
															
 
																+static struct starpu_perfmodel axpy_kernel_model =
															
 
																+{
															
 
																 	.type = STARPU_HISTORY_BASED,
															
 
																 	.symbol = "axpy_kernel"
															
 
																 };
															
 
																-static starpu_codelet axpy_kernel_cl = {
															
 
																+static struct starpu_codelet axpy_kernel_cl =
															
 
																+{
															
 
																+	.can_execute = can_execute,
															
 
																 	.where = STARPU_CPU|STARPU_CUDA,
															
 
																-	.cpu_func = axpy_kernel_cpu,
															
 
																+	.cpu_funcs = {axpy_kernel_cpu, NULL},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-	.cuda_func = axpy_kernel_cuda,
															
 
																+	.cuda_funcs = {axpy_kernel_cuda, NULL},
															
 
																 #endif
															
 
																 	.nbuffers = 2,
															
 
																 	.model = &axpy_kernel_model
															
 
																 };
															
 
																-void axpy_kernel(starpu_data_handle v1,
															
 
																-		starpu_data_handle v2, TYPE p1,
															
 
																+int axpy_kernel(starpu_data_handle_t v1,
															
 
																+		starpu_data_handle_t v2, TYPE p1,
															
 
																 		unsigned nblocks)
															
 
																 {
															
 
																+	int ret;
															
 
																 	unsigned b;
															
 
																 	for (b = 0; b < nblocks; b++)
															
 
																 	{
															
 
																-		starpu_insert_task(&axpy_kernel_cl,
															
 
																-			STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
															
 
																-			STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
															
 
																-			STARPU_VALUE, &p1, sizeof(p1),
															
 
																-			0);
															
 
																+		ret = starpu_insert_task(&axpy_kernel_cl,
															
 
																+					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
															
 
																+					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
															
 
																+					 STARPU_VALUE, &p1, sizeof(p1),
															
 
																+					 0);
															
 
																+		if (ret == -ENODEV) return ret;
															
 
																+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
															
 
																 	}
															
 
																+	return 0;
															
 
																 }
															
 
																-
															
 
																-/*
															
 
																- *	COPY kernel : vector_dst <- vector_src
															
 
																- */
															
 
																-
															
 
																-static void copy_handle_cpu(void *descr[], void *cl_arg)
															
 
																-{
															
 
																-	TYPE *dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																-	TYPE *src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
															
 
																-	
															
 
																-	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
															
 
																-	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
															
 
																-
															
 
																-	memcpy(dst, src, nx*elemsize);
															
 
																-}
															
 
																-
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-static void copy_handle_cuda(void *descr[], void *cl_arg)
															
 
																-{
															
 
																-	TYPE *dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																-	TYPE *src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
															
 
																-	
															
 
																-	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
															
 
																-	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
															
 
																-
															
 
																-	cudaMemcpyAsync(dst, src, nx*elemsize, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
															
 
																-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																-}
															
 
																-#endif
															
 
																-
															
 
																-static struct starpu_perfmodel_t copy_handle_model = {
															
 
																-	.type = STARPU_HISTORY_BASED,
															
 
																-	.symbol = "copy_handle"
															
 
																-};
															
 
																-
															
 
																-static starpu_codelet copy_handle_cl = {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																-	.cpu_func = copy_handle_cpu,
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-	.cuda_func = copy_handle_cuda,
															
 
																-#endif
															
 
																-	.nbuffers = 2,
															
 
																-	.model = &copy_handle_model
															
 
																-};
															
 
																-
															
 
																-void copy_handle(starpu_data_handle dst, starpu_data_handle src, unsigned nblocks)
															
 
																+int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks)
															
 
																 {
															
 
																 	unsigned b;
															
 
																 	for (b = 0; b < nblocks; b++)
															
 
																-	{
															
 
																-		starpu_insert_task(&copy_handle_cl,
															
 
																-			STARPU_W, starpu_data_get_sub_data(dst, 1, b),
															
 
																-			STARPU_R, starpu_data_get_sub_data(src, 1, b),
															
 
																-			0);
															
 
																-	}
															
 
																-} 
															
 
																+		starpu_data_cpy(starpu_data_get_sub_data(dst, 1, b), starpu_data_get_sub_data(src, 1, b), 1, NULL, NULL);
															
 
																+	return 0;
															
 
																+}
															
--- a/examples/incrementer/incrementer.c
+++ b/examples/incrementer/incrementer.c
@@ -47,7 +47,7 @@ int main(int argc, char **argv)
 
																 		return 77;
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
															
 
																-#ifdef STARPU_SLOW_MACHINE
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																 	niter /= 100;
															
 
																 #endif
															
 
																 	if (argc == 2)
															
--- a/examples/interface/complex_interface.c
+++ b/examples/interface/complex_interface.c
@@ -15,9 +15,6 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																-#include <starpu_cuda.h>
															
 
																-#include <starpu_opencl.h>
															
 
																-#include <starpu_hash.h>
															
 
																 #include "complex_interface.h"
															
@@ -137,7 +134,7 @@ static starpu_ssize_t complex_allocate_data_on_node(void *data_interface, uint32
 
																 		}
															
 
																 #endif
															
 
																 		default:
															
 
																-			STARPU_ASSERT(0);
															
 
																+			STARPU_ABORT();
															
 
																 	}
															
 
																 	if (fail)
															
@@ -164,6 +161,43 @@ static uint32_t complex_footprint(starpu_data_handle_t handle)
 
																 	return starpu_crc32_be(starpu_complex_get_nx(handle), 0);
															
 
																 }
															
 
																+static void *complex_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
															
 
																+{
															
 
																+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
															
 
																+
															
 
																+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
															
 
																+		starpu_data_get_interface_on_node(handle, node);
															
 
																+
															
 
																+	return (void*) complex_interface->real;
															
 
																+}
															
 
																+
															
 
																+static int complex_pack_data(starpu_data_handle_t handle, uint32_t node, void **ptr)
															
 
																+{
															
 
																+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
															
 
																+
															
 
																+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
															
 
																+		starpu_data_get_interface_on_node(handle, node);
															
 
																+
															
 
																+	*ptr = malloc(complex_get_size(handle));
															
 
																+	memcpy(*ptr, complex_interface->real, complex_interface->nx*sizeof(double));
															
 
																+	memcpy(*ptr+complex_interface->nx*sizeof(double), complex_interface->imaginary, complex_interface->nx*sizeof(double));
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+static int complex_unpack_data(starpu_data_handle_t handle, uint32_t node, void *ptr)
															
 
																+{
															
 
																+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
															
 
																+
															
 
																+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
															
 
																+		starpu_data_get_interface_on_node(handle, node);
															
 
																+
															
 
																+	memcpy(complex_interface->real, ptr, complex_interface->nx*sizeof(double));
															
 
																+	memcpy(complex_interface->imaginary, ptr+complex_interface->nx*sizeof(double), complex_interface->nx*sizeof(double));
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 static int copy_cuda_async_sync(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, enum cudaMemcpyKind kind, cudaStream_t stream)
															
 
																 {
															
@@ -204,11 +238,10 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *
 
																 #endif
															
 
																 #ifdef STARPU_USE_OPENCL
															
 
																-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event)
															
 
																+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
															
 
																 {
															
 
																 	struct starpu_complex_interface *src_complex = src_interface;
															
 
																 	struct starpu_complex_interface *dst_complex = dst_interface;
															
 
																-	cl_event *event = (cl_event *)_event;
															
 
																 	cl_int err;
															
 
																 	int ret;
															
@@ -244,11 +277,10 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_
 
																         return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
															
 
																 }
															
 
																-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event)
															
 
																+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
															
 
																 {
															
 
																 	struct starpu_complex_interface *src_complex = src_interface;
															
 
																 	struct starpu_complex_interface *dst_complex = dst_interface;
															
 
																-	cl_event *event = (cl_event *)_event;
															
 
																 	cl_int err;
															
 
																 	int ret;
															
@@ -310,6 +342,9 @@ static struct starpu_data_interface_ops interface_complex_ops =
 
																 	.footprint = complex_footprint,
															
 
																 	.interfaceid = -1,
															
 
																 	.interface_size = sizeof(struct starpu_complex_interface),
															
 
																+	.handle_to_pointer = complex_handle_to_pointer,
															
 
																+	.pack_data = complex_pack_data,
															
 
																+	.unpack_data = complex_unpack_data
															
 
																 };
															
 
																 void starpu_complex_data_register(starpu_data_handle_t *handleptr, uint32_t home_node, double *real, double *imaginary, int nx)
															
--- a/examples/lu/lu_example.c
+++ b/examples/lu/lu_example.c
@@ -300,7 +300,7 @@ int main(int argc, char **argv)
 
																 	parse_args(argc, argv);
															
 
																-#ifdef STARPU_SLOW_MACHINE
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																 	size /= 4;
															
 
																 	nblocks /= 4;
															
 
																 #endif
															
--- a/examples/mult/xgemm.c
+++ b/examples/mult/xgemm.c
@@ -33,7 +33,7 @@
 
																 static unsigned niter = 10;
															
 
																 static unsigned nslicesx = 4;
															
 
																 static unsigned nslicesy = 4;
															
 
																-#ifdef STARPU_SLOW_MACHINE
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																 static unsigned xdim = 256;
															
 
																 static unsigned ydim = 256;
															
 
																 static unsigned zdim = 64;
															
@@ -287,7 +287,7 @@ int main(int argc, char **argv)
 
																 	parse_args(argc, argv);
															
 
																-#ifdef STARPU_SLOW_MACHINE
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																 	niter /= 10;
															
 
																 #endif
															
--- a/examples/pipeline/pipeline.c
+++ b/examples/pipeline/pipeline.c
@@ -41,7 +41,7 @@
 
																 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																 /* Vector size */
															
 
																-#ifdef STARPU_SLOW_MACHINE
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																 #define N 16
															
 
																 #else
															
 
																 #define N 1048576
															
--- a/examples/reductions/dot_product.c
+++ b/examples/reductions/dot_product.c
@@ -1,6 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010-2012  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2012 inria
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -16,24 +17,48 @@
 
																 #include <starpu.h>
															
 
																 #include <assert.h>
															
 
																+#include <math.h>
															
 
																+
															
 
																+#include <reductions/dot_product.h>
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 #include <cuda.h>
															
 
																 #include <cublas.h>
															
 
																 #endif
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																 static float *x;
															
 
																 static float *y;
															
 
																-static starpu_data_handle *x_handles;
															
 
																-static starpu_data_handle *y_handles;
															
 
																+static starpu_data_handle_t *x_handles;
															
 
																+static starpu_data_handle_t *y_handles;
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+static struct starpu_opencl_program opencl_program;
															
 
																+#endif
															
 
																 static unsigned nblocks = 4096;
															
 
																-static unsigned entries_per_bock = 1024;
															
 
																-
															
 
																-#define DOT_TYPE double
															
 
																+static unsigned entries_per_block = 1024;
															
 
																 static DOT_TYPE dot = 0.0f;
															
 
																-static starpu_data_handle dot_handle;
															
 
																+static starpu_data_handle_t dot_handle;
															
 
																+
															
 
																+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
															
 
																+{
															
 
																+	enum starpu_archtype type = starpu_worker_get_type(workerid);
															
 
																+	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
															
 
																+		return 1;
															
 
																+
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	/* Cuda device */
															
 
																+	const struct cudaDeviceProp *props;
															
 
																+	props = starpu_cuda_get_device_properties(workerid);
															
 
																+	if (props->major >= 2 || props->minor >= 3)
															
 
																+		/* At least compute capability 1.3, supports doubles */
															
 
																+		return 1;
															
 
																+#endif
															
 
																+	/* Old card, does not support doubles */
															
 
																+	return 0;
															
 
																+}
															
 
																 /*
															
 
																  *	Codelet to create a neutral element
															
@@ -49,16 +74,45 @@ void init_cpu_func(void *descr[], void *cl_arg)
 
																 void init_cuda_func(void *descr[], void *cl_arg)
															
 
																 {
															
 
																 	DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
															
 
																-	cudaMemset(dot, 0, sizeof(DOT_TYPE));
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaMemsetAsync(dot, 0, sizeof(DOT_TYPE), starpu_cuda_get_local_stream());
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																+}
															
 
																+#endif
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+void init_opencl_func(void *buffers[], void *args)
															
 
																+{
															
 
																+        cl_int err;
															
 
																+	cl_command_queue queue;
															
 
																+
															
 
																+	cl_mem dot = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]);
															
 
																+	starpu_opencl_get_current_queue(&queue);
															
 
																+	DOT_TYPE zero = (DOT_TYPE) 0.0;
															
 
																+
															
 
																+	err = clEnqueueWriteBuffer(queue,
															
 
																+			dot,
															
 
																+			CL_TRUE,
															
 
																+			0,
															
 
																+			sizeof(DOT_TYPE),
															
 
																+			&zero,
															
 
																+			0,
															
 
																+			NULL,
															
 
																+			NULL);
															
 
																+	if (err != CL_SUCCESS)
															
 
																+		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																 }
															
 
																 #endif
															
 
																-static struct starpu_codelet_t init_codelet = {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																-	.cpu_func = init_cpu_func,
															
 
																+static struct starpu_codelet init_codelet =
															
 
																+{
															
 
																+	.can_execute = can_execute,
															
 
																+	.cpu_funcs = {init_cpu_func, NULL},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-	.cuda_func = init_cuda_func,
															
 
																+	.cuda_funcs = {init_cuda_func, NULL},
															
 
																+#endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	.opencl_funcs = {init_opencl_func, NULL},
															
 
																 #endif
															
 
																 	.nbuffers = 1
															
 
																 };
															
@@ -75,9 +129,71 @@ void redux_cpu_func(void *descr[], void *cl_arg)
 
																 	*dota = *dota + *dotb;
															
 
																 }
															
 
																-static struct starpu_codelet_t redux_codelet = {
															
 
																-	.where = STARPU_CPU,
															
 
																-	.cpu_func = redux_cpu_func,
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+extern void redux_cuda_func(void *descr[], void *_args);
															
 
																+#endif
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+void redux_opencl_func(void *buffers[], void *args)
															
 
																+{
															
 
																+	int id, devid;
															
 
																+        cl_int err;
															
 
																+	cl_kernel kernel;
															
 
																+	cl_command_queue queue;
															
 
																+	cl_event event;
															
 
																+
															
 
																+	cl_mem dota = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]);
															
 
																+	cl_mem dotb = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[1]);
															
 
																+
															
 
																+	id = starpu_worker_get_id();
															
 
																+	devid = starpu_worker_get_devid(id);
															
 
																+
															
 
																+	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_redux_opencl", devid);
															
 
																+	if (err != CL_SUCCESS)
															
 
																+		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	err = clSetKernelArg(kernel, 0, sizeof(dota), &dota);
															
 
																+	err|= clSetKernelArg(kernel, 1, sizeof(dotb), &dotb);
															
 
																+	if (err != CL_SUCCESS)
															
 
																+		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	{
															
 
																+		size_t global=1;
															
 
																+		size_t local;
															
 
																+                size_t s;
															
 
																+                cl_device_id device;
															
 
																+
															
 
																+                starpu_opencl_get_device(devid, &device);
															
 
																+
															
 
																+                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
															
 
																+                if (err != CL_SUCCESS)
															
 
																+			STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+                if (local > global)
															
 
																+			local=global;
															
 
																+
															
 
																+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
															
 
																+		if (err != CL_SUCCESS)
															
 
																+			STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+	}
															
 
																+
															
 
																+	clFinish(queue);
															
 
																+	starpu_opencl_collect_stats(event);
															
 
																+	clReleaseEvent(event);
															
 
																+
															
 
																+	starpu_opencl_release_kernel(kernel);
															
 
																+}
															
 
																+#endif
															
 
																+
															
 
																+static struct starpu_codelet redux_codelet =
															
 
																+{
															
 
																+	.can_execute = can_execute,
															
 
																+	.cpu_funcs = {redux_cpu_func, NULL},
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	.cuda_funcs = {redux_cuda_func, NULL},
															
 
																+#endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	.opencl_funcs = {redux_opencl_func, NULL},
															
 
																+#endif
															
 
																 	.nbuffers = 2
															
 
																 };
															
@@ -116,57 +232,122 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 
																 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
															
 
																-	cudaMemcpy(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost);
															
 
																-
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaMemcpyAsync(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
															
 
																 	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
															
 
																-	//fprintf(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot);
															
 
																+	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 	current_dot += local_dot;
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaMemcpyAsync(dot, &current_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																+}
															
 
																+#endif
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+void dot_opencl_func(void *buffers[], void *args)
															
 
																+{
															
 
																+	int id, devid;
															
 
																+        cl_int err;
															
 
																+	cl_kernel kernel;
															
 
																+	cl_command_queue queue;
															
 
																+	cl_event event;
															
 
																+
															
 
																+	cl_mem x = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
															
 
																+	cl_mem y = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]);
															
 
																+	cl_mem dot = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[2]);
															
 
																+	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
															
 
																+
															
 
																+	id = starpu_worker_get_id();
															
 
																+	devid = starpu_worker_get_devid(id);
															
 
																+
															
 
																+	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_dot_opencl", devid);
															
 
																+	if (err != CL_SUCCESS)
															
 
																+		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	err = clSetKernelArg(kernel, 0, sizeof(x), &x);
															
 
																+	err|= clSetKernelArg(kernel, 1, sizeof(y), &y);
															
 
																+	err|= clSetKernelArg(kernel, 2, sizeof(dot), &dot);
															
 
																+	err|= clSetKernelArg(kernel, 3, sizeof(n), &n);
															
 
																+	if (err != CL_SUCCESS)
															
 
																+		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	{
															
 
																+		size_t global=1;
															
 
																+		size_t local;
															
 
																+                size_t s;
															
 
																+                cl_device_id device;
															
 
																+
															
 
																+                starpu_opencl_get_device(devid, &device);
															
 
																+
															
 
																+                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
															
 
																+                if (err != CL_SUCCESS)
															
 
																+			STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+                if (local > global)
															
 
																+			local=global;
															
 
																+
															
 
																+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
															
 
																+		if (err != CL_SUCCESS)
															
 
																+			STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+	}
															
 
																-	cudaMemcpy(dot, &current_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice);
															
 
																+	clFinish(queue);
															
 
																+	starpu_opencl_collect_stats(event);
															
 
																+	clReleaseEvent(event);
															
 
																-	cudaThreadSynchronize();
															
 
																+	starpu_opencl_release_kernel(kernel);
															
 
																 }
															
 
																 #endif
															
 
																-static struct starpu_codelet_t dot_codelet = {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																-	.cpu_func = dot_cpu_func,
															
 
																+static struct starpu_codelet dot_codelet =
															
 
																+{
															
 
																+	.can_execute = can_execute,
															
 
																+	.cpu_funcs = {dot_cpu_func, NULL},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-	.cuda_func = dot_cuda_func,
															
 
																+	.cuda_funcs = {dot_cuda_func, NULL},
															
 
																 #endif
															
 
																-	.nbuffers = 3
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	.opencl_funcs = {dot_opencl_func, NULL},
															
 
																+#endif
															
 
																+	.nbuffers = 3,
															
 
																+	.modes = {STARPU_R, STARPU_R, STARPU_REDUX}
															
 
																 };
															
 
																 /*
															
 
																  *	Tasks initialization
															
 
																  */
															
 
																-extern void starpu_data_end_reduction_mode(starpu_data_handle handle);
															
 
																-
															
 
																 int main(int argc, char **argv)
															
 
																 {
															
 
																-	starpu_init(NULL);
															
 
																+	int ret;
															
 
																+
															
 
																+	ret = starpu_init(NULL);
															
 
																+	if (ret == -ENODEV)
															
 
																+		return 77;
															
 
																+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	ret = starpu_opencl_load_opencl_from_file("examples/reductions/dot_product_opencl_kernels.cl",
															
 
																+						  &opencl_program, NULL);
															
 
																+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
															
 
																+#endif
															
 
																 	starpu_helper_cublas_init();
															
 
																-	unsigned long nelems = nblocks*entries_per_bock;
															
 
																+	unsigned long nelems = nblocks*entries_per_block;
															
 
																 	size_t size = nelems*sizeof(float);
															
 
																-	x = malloc(size);
															
 
																-	y = malloc(size);
															
 
																+	x = (float *) malloc(size);
															
 
																+	y = (float *) malloc(size);
															
 
																-	x_handles = calloc(nblocks, sizeof(starpu_data_handle));
															
 
																-	y_handles = calloc(nblocks, sizeof(starpu_data_handle));
															
 
																+	x_handles = (starpu_data_handle_t *) calloc(nblocks, sizeof(starpu_data_handle_t));
															
 
																+	y_handles = (starpu_data_handle_t *) calloc(nblocks, sizeof(starpu_data_handle_t));
															
 
																 	assert(x && y);
															
 
																         starpu_srand48(0);
															
 
																-	
															
 
																+
															
 
																 	DOT_TYPE reference_dot = 0.0;
															
 
																 	unsigned long i;
															
@@ -176,15 +357,15 @@ int main(int argc, char **argv)
 
																 		y[i] = (float)starpu_drand48();
															
 
																 		reference_dot += (DOT_TYPE)x[i]*(DOT_TYPE)y[i];
															
 
																-	} 
															
 
																-	
															
 
																+	}
															
 
																+
															
 
																 	unsigned block;
															
 
																 	for (block = 0; block < nblocks; block++)
															
 
																 	{
															
 
																 		starpu_vector_data_register(&x_handles[block], 0,
															
 
																-			(uintptr_t)&x[entries_per_bock*block], entries_per_bock, sizeof(float));
															
 
																+			(uintptr_t)&x[entries_per_block*block], entries_per_block, sizeof(float));
															
 
																 		starpu_vector_data_register(&y_handles[block], 0,
															
 
																-			(uintptr_t)&y[entries_per_bock*block], entries_per_bock, sizeof(float));
															
 
																+			(uintptr_t)&y[entries_per_block*block], entries_per_block, sizeof(float));
															
 
																 	}
															
 
																 	starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(DOT_TYPE));
															
@@ -199,25 +380,47 @@ int main(int argc, char **argv)
 
																 		struct starpu_task *task = starpu_task_create();
															
 
																 		task->cl = &dot_codelet;
															
 
																+		task->destroy = 1;
															
 
																-		task->buffers[0].handle = x_handles[block];
															
 
																-		task->buffers[0].mode = STARPU_R;
															
 
																-		task->buffers[1].handle = y_handles[block];
															
 
																-		task->buffers[1].mode = STARPU_R;
															
 
																-		task->buffers[2].handle = dot_handle;
															
 
																-		task->buffers[2].mode = STARPU_REDUX;
															
 
																+		task->handles[0] = x_handles[block];
															
 
																+		task->handles[1] = y_handles[block];
															
 
																+		task->handles[2] = dot_handle;
															
 
																-		int ret = starpu_task_submit(task);
															
 
																+		ret = starpu_task_submit(task);
															
 
																+		if (ret == -ENODEV) goto enodev;
															
 
																 		STARPU_ASSERT(!ret);
															
 
																 	}
															
 
																+	for (block = 0; block < nblocks; block++)
															
 
																+	{
															
 
																+		starpu_data_unregister(x_handles[block]);
															
 
																+		starpu_data_unregister(y_handles[block]);
															
 
																+	}
															
 
																 	starpu_data_unregister(dot_handle);
															
 
																-	fprintf(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
															
 
																+	FPRINTF(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
															
 
																 	starpu_helper_cublas_shutdown();
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+        ret = starpu_opencl_unload_opencl(&opencl_program);
															
 
																+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
															
 
																+#endif
															
 
																 	starpu_shutdown();
															
 
																-	return 0;
															
 
																+	free(x);
															
 
																+	free(y);
															
 
																+	free(x_handles);
															
 
																+	free(y_handles);
															
 
																+
															
 
																+	if (fabs(reference_dot - dot) < reference_dot * 1e-6)
															
 
																+		return EXIT_SUCCESS;
															
 
																+	else
															
 
																+		return EXIT_FAILURE;
															
 
																+
															
 
																+enodev:
															
 
																+	fprintf(stderr, "WARNING: No one can execute this task\n");
															
 
																+	/* yes, we do not perform the computation but we did detect that no one
															
 
																+ 	 * could perform the kernel, so this is not an error from StarPU */
															
 
																+	return 77;
															
 
																 }
															
--- a/examples/reductions/minmax_reduction.c
+++ b/examples/reductions/minmax_reduction.c
@@ -19,7 +19,7 @@
 
																 #include <limits.h>
															
 
																 #include <starpu.h>
															
 
																-#ifdef STARPU_SLOW_MACHINE
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																 static unsigned nblocks = 512;
															
 
																 static unsigned entries_per_bock = 64;
															
 
																 #else
															
--- a/examples/stencil/Makefile.am
+++ b/examples/stencil/Makefile.am
@@ -1,6 +1,6 @@
 
																 # StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																 #
															
 
																-# Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																+# Copyright (C) 2010-2012  Université de Bordeaux 1
															
 
																 #
															
 
																 # StarPU is free software; you can redistribute it and/or modify
															
 
																 # it under the terms of the GNU Lesser General Public License as published by
															
@@ -13,13 +13,14 @@
 
																 #
															
 
																 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																-AM_CFLAGS = $(HWLOC_CFLAGS)
															
 
																-LIBS = $(top_builddir)/src/libstarpu.la $(HWLOC_LIBS) @LIBS@
															
 
																+AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
															
 
																+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) @LIBS@
															
 
																 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
															
 
																+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) 
															
 
																 if USE_MPI
															
 
																-LIBS += $(top_builddir)/mpi/libstarpumpi.la
															
 
																-AM_CPPFLAGS += -I$(top_srcdir)/mpi/
															
 
																+LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
															
 
																+AM_CPPFLAGS += -I$(top_srcdir)/mpi/include
															
 
																 endif
															
 
																 CC = $(CC_OR_MPICC)
															
@@ -34,7 +35,7 @@ NVCCFLAGS += $(HWLOC_CFLAGS)
 
																 .cu.o:
															
 
																 	$(MKDIR_P) `dirname $@`
															
 
																-	$(NVCC) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I$(top_srcdir)/include/ -I$(top_builddir)/include/
															
 
																+	$(NVCC) $< -c -o $@ --compiler-options -fno-strict-aliasing -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(NVCCFLAGS)
															
 
																 endif
															
@@ -46,7 +47,7 @@ endif
 
																 check_PROGRAMS =				\
															
 
																 	stencil
															
 
																-examplebindir = $(libdir)/starpu/examples/
															
 
																+examplebindir = $(libdir)/starpu/examples/stencil
															
 
																 examplebin_PROGRAMS =				\
															
 
																 	stencil
															
@@ -112,3 +113,6 @@ CLEANFILES = *.xpm
 
																 view:
															
 
																 	feh --zoom 800 -F 0.xpm 0.5.xpm 1.xpm 2.xpm 3.xpm 4.xpm 6.xpm mpi.xpm
															
 
																+
															
 
																+showcheck:
															
 
																+	-cat $(TEST_LOGS) /dev/null
															
--- a/examples/stencil/stencil-kernels.c
+++ b/examples/stencil/stencil-kernels.c
@@ -1,6 +1,8 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010-2012  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
 
																  * the Free Software Foundation; either version 2.1 of the License, or (at
															
@@ -16,17 +18,14 @@
 
																 #include "stencil.h"
															
 
																 #include <sys/time.h>
															
 
																-#ifdef STARPU_USE_OPENCL
															
 
																-#include <CL/cl.h>
															
 
																-#include <starpu_opencl.h>
															
 
																-#endif
															
 
																-
															
 
																 #ifndef timersub
															
 
																 #define	timersub(x, y, res) \
															
 
																-	do { \
															
 
																+	do \
															
 
																+	{						   \
															
 
																 		(res)->tv_sec = (x)->tv_sec - (y)->tv_sec; \
															
 
																 		(res)->tv_usec = (x)->tv_usec - (y)->tv_usec; \
															
 
																-		if ((res)->tv_usec < 0) { \
															
 
																+		if ((res)->tv_usec < 0) \
															
 
																+		{			 \
															
 
																 			(res)->tv_sec--; \
															
 
																 			(res)->tv_usec += 1000000; \
															
 
																 		} \
															
@@ -34,10 +33,12 @@
 
																 #endif
															
 
																 #ifndef timeradd
															
 
																 #define	timeradd(x, y, res) \
															
 
																-	do { \
															
 
																+	do \
															
 
																+	{						   \
															
 
																 		(res)->tv_sec = (x)->tv_sec + (y)->tv_sec; \
															
 
																 		(res)->tv_usec = (x)->tv_usec + (y)->tv_usec; \
															
 
																-		if ((res)->tv_usec >= 1000000) { \
															
 
																+		if ((res)->tv_usec >= 1000000) \
															
 
																+		{			       \
															
 
																 			(res)->tv_sec++; \
															
 
																 			(res)->tv_usec -= 1000000; \
															
 
																 		} \
															
@@ -124,6 +125,9 @@ int *who_runs_what;
 
																 int *who_runs_what_index;
															
 
																 struct timeval *last_tick;
															
 
																+/* Achieved iterations */
															
 
																+static int achieved_iter;
															
 
																+
															
 
																 /* Record how many updates each worker performed */
															
 
																 unsigned update_per_worker[STARPU_NMAXWORKERS];
															
@@ -135,7 +139,8 @@ static void record_who_runs_what(struct block_description *block)
 
																 	gettimeofday(&tv, NULL);
															
 
																 	timersub(&tv, &start, &tv2);
															
 
																 	timersub(&tv2, &last_tick[block->bz], &diff);
															
 
																-	while (timercmp(&diff, &delta, >=)) {
															
 
																+	while (timercmp(&diff, &delta, >=))
															
 
																+	{
															
 
																 		timeradd(&last_tick[block->bz], &delta, &last_tick[block->bz]);
															
 
																 		timersub(&tv2, &last_tick[block->bz], &diff);
															
 
																 		if (who_runs_what_index[block->bz] < who_runs_what_len)
															
@@ -146,7 +151,7 @@ static void record_who_runs_what(struct block_description *block)
 
																 		who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid);
															
 
																 }
															
 
																-static void check_load(starpu_block_interface_t *block, starpu_block_interface_t *boundary)
															
 
																+static void check_load(struct starpu_block_interface *block, struct starpu_block_interface *boundary)
															
 
																 {
															
 
																 	/* Sanity checks */
															
 
																 	STARPU_ASSERT(block->nx == boundary->nx);
															
@@ -162,10 +167,12 @@ static void check_load(starpu_block_interface_t *block, starpu_block_interface_t
 
																 /*
															
 
																  * Load a neighbour's boundary into block, CPU version
															
 
																  */
															
 
																-static void load_subblock_from_buffer_cpu(starpu_block_interface_t *block,
															
 
																-					starpu_block_interface_t *boundary,
															
 
																+static void load_subblock_from_buffer_cpu(void *_block,
															
 
																+					void *_boundary,
															
 
																 					unsigned firstz)
															
 
																 {
															
 
																+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
															
 
																+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
															
 
																 	check_load(block, boundary);
															
 
																 	/* We do a contiguous memory transfer */
															
@@ -181,10 +188,12 @@ static void load_subblock_from_buffer_cpu(starpu_block_interface_t *block,
 
																  * Load a neighbour's boundary into block, CUDA version
															
 
																  */
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-static void load_subblock_from_buffer_cuda(starpu_block_interface_t *block,
															
 
																-					starpu_block_interface_t *boundary,
															
 
																+static void load_subblock_from_buffer_cuda(void *_block,
															
 
																+					void *_boundary,
															
 
																 					unsigned firstz)
															
 
																 {
															
 
																+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
															
 
																+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
															
 
																 	check_load(block, boundary);
															
 
																 	/* We do a contiguous memory transfer */
															
@@ -241,17 +250,17 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
 
																 	for (i=1; i<=K; i++)
															
 
																 	{
															
 
																-		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
															
 
																-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
															
 
																+		struct starpu_block_interface *oldb = descr[i%2], *newb = descr[(i+1)%2];
															
 
																+		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
															
 
																 		/* Shadow data */
															
 
																 		cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
															
 
																 		/* And perform actual computation */
															
 
																 #ifdef LIFE
															
 
																-		cuda_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
															
 
																+		cuda_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
															
 
																 #else
															
 
																-		cudaMemcpyAsync(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
															
 
																+		cudaMemcpyAsync(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
															
 
																 #endif /* LIFE */
															
 
																 	}
															
@@ -259,6 +268,8 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
 
																 	if ((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess)
															
 
																 		STARPU_CUDA_REPORT_ERROR(cures);
															
 
																+	if (block->bz == 0)
															
 
																+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
															
 
																 }
															
 
																 #endif /* STARPU_USE_CUDA */
															
@@ -266,8 +277,8 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
 
																  * Load a neighbour's boundary into block, OpenCL version
															
 
																  */
															
 
																 #ifdef STARPU_USE_OPENCL
															
 
																-static void load_subblock_from_buffer_opencl(starpu_block_interface_t *block,
															
 
																-					starpu_block_interface_t *boundary,
															
 
																+static void load_subblock_from_buffer_opencl(struct starpu_block_interface *block,
															
 
																+					struct starpu_block_interface *boundary,
															
 
																 					unsigned firstz)
															
 
																 {
															
 
																 	check_load(block, boundary);
															
@@ -278,10 +289,14 @@ static void load_subblock_from_buffer_opencl(starpu_block_interface_t *block,
 
																 	unsigned offset = firstz*block->ldz;
															
 
																 	cl_mem block_data = (cl_mem)block->ptr;
															
 
																 	cl_mem boundary_data = (cl_mem)boundary->ptr;
															
 
																+	cl_event event;
															
 
																         cl_command_queue cq;
															
 
																         starpu_opencl_get_current_queue(&cq);
															
 
																-        clEnqueueCopyBuffer(cq, boundary_data, block_data, 0, offset, boundary_size, 0, NULL, NULL);
															
 
																+        clEnqueueCopyBuffer(cq, boundary_data, block_data, 0, offset, boundary_size, 0, NULL, &event);
															
 
																+
															
 
																+	clWaitForEvents(1, &event);
															
 
																+	clReleaseEvent(event);
															
 
																 }
															
 
																 /*
															
@@ -332,17 +347,20 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
 
																 	for (i=1; i<=K; i++)
															
 
																 	{
															
 
																-		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
															
 
																-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
															
 
																+		struct starpu_block_interface *oldb = descr[i%2], *newb = descr[(i+1)%2];
															
 
																+		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
															
 
																 		/* Shadow data */
															
 
																 		opencl_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
															
 
																 		/* And perform actual computation */
															
 
																 #ifdef LIFE
															
 
																-		opencl_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
															
 
																+		opencl_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
															
 
																 #else
															
 
																-                clEnqueueCopyBuffer(cq, old, new, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), 0, NULL, NULL);
															
 
																+		cl_event event;
															
 
																+                clEnqueueCopyBuffer(cq, old, newer, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), 0, NULL, &event);
															
 
																+		clWaitForEvents(1, &event);
															
 
																+		clReleaseEvent(event);
															
 
																 #endif /* LIFE */
															
 
																 	}
															
@@ -350,6 +368,8 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
 
																 	if ((err = clFinish(cq)))
															
 
																 		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+	if (block->bz == 0)
															
 
																+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
															
 
																 }
															
 
																 #endif /* STARPU_USE_OPENCL */
															
@@ -358,7 +378,7 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
 
																  */
															
 
																 static void update_func_cpu(void *descr[], void *arg)
															
 
																 {
															
 
																-	struct block_description *block = arg;
															
 
																+	struct block_description *block = (struct block_description *) arg;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																 	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
															
 
																 	if (block->bz == 0)
															
@@ -398,8 +418,8 @@ fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 
																 	for (i=1; i<=K; i++)
															
 
																 	{
															
 
																-		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
															
 
																-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
															
 
																+		struct starpu_block_interface *oldb = (struct starpu_block_interface *) descr[i%2], *newb = (struct starpu_block_interface *) descr[(i+1)%2];
															
 
																+		TYPE *old = (TYPE*) oldb->ptr, *newer = (TYPE*) newb->ptr;
															
 
																 		/* Shadow data */
															
 
																 		unsigned ldy = oldb->ldy, ldz = oldb->ldz;
															
@@ -417,20 +437,25 @@ fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 
																 		/* And perform actual computation */
															
 
																 #ifdef LIFE
															
 
																-		life_update(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
															
 
																+		life_update(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
															
 
																 #else
															
 
																-		memcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new));
															
 
																+		memcpy(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer));
															
 
																 #endif /* LIFE */
															
 
																 	}
															
 
																+
															
 
																+	if (block->bz == 0)
															
 
																+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
															
 
																 }
															
 
																 /* Performance model and codelet structure */
															
 
																-static struct starpu_perfmodel_t cl_update_model = {
															
 
																+static struct starpu_perfmodel cl_update_model =
															
 
																+{
															
 
																 	.type = STARPU_HISTORY_BASED,
															
 
																 	.symbol = "cl_update" 
															
 
																 };
															
 
																-starpu_codelet cl_update = {
															
 
																+struct starpu_codelet cl_update =
															
 
																+{
															
 
																 	.where = 0 |
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		STARPU_CUDA|
															
@@ -439,15 +464,16 @@ starpu_codelet cl_update = {
 
																                 STARPU_OPENCL|
															
 
																 #endif
															
 
																 		STARPU_CPU,
															
 
																-	.cpu_func = update_func_cpu,
															
 
																+	.cpu_funcs = {update_func_cpu, NULL},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-	.cuda_func = update_func_cuda,
															
 
																+	.cuda_funcs = {update_func_cuda, NULL},
															
 
																 #endif
															
 
																 #ifdef STARPU_USE_OPENCL
															
 
																-	.opencl_func = update_func_opencl,
															
 
																+	.opencl_funcs = {update_func_opencl, NULL},
															
 
																 #endif
															
 
																 	.model = &cl_update_model,
															
 
																-	.nbuffers = 6
															
 
																+	.nbuffers = 6,
															
 
																+	.modes = {STARPU_RW, STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
															
 
																 };
															
 
																 /*
															
@@ -455,10 +481,12 @@ starpu_codelet cl_update = {
 
																  */
															
 
																 /* CPU version */
															
 
																-static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
															
 
																-					starpu_block_interface_t *boundary,
															
 
																+static void load_subblock_into_buffer_cpu(void *_block,
															
 
																+					void *_boundary,
															
 
																 					unsigned firstz)
															
 
																 {
															
 
																+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
															
 
																+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
															
 
																 	check_load(block, boundary);
															
 
																 	/* We do a contiguous memory transfer */
															
@@ -472,10 +500,12 @@ static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
 
																 /* CUDA version */
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
															
 
																-					starpu_block_interface_t *boundary,
															
 
																+static void load_subblock_into_buffer_cuda(void *_block,
															
 
																+					void *_boundary,
															
 
																 					unsigned firstz)
															
 
																 {
															
 
																+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
															
 
																+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
															
 
																 	check_load(block, boundary);
															
 
																 	/* We do a contiguous memory transfer */
															
@@ -490,8 +520,8 @@ static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
 
																 /* OPENCL version */
															
 
																 #ifdef STARPU_USE_OPENCL
															
 
																-static void load_subblock_into_buffer_opencl(starpu_block_interface_t *block,
															
 
																-					starpu_block_interface_t *boundary,
															
 
																+static void load_subblock_into_buffer_opencl(struct starpu_block_interface *block,
															
 
																+					struct starpu_block_interface *boundary,
															
 
																 					unsigned firstz)
															
 
																 {
															
 
																 	check_load(block, boundary);
															
@@ -505,8 +535,12 @@ static void load_subblock_into_buffer_opencl(starpu_block_interface_t *block,
 
																         cl_command_queue cq;
															
 
																         starpu_opencl_get_current_queue(&cq);
															
 
																+	cl_event event;
															
 
																+
															
 
																+        clEnqueueCopyBuffer(cq, block_data, boundary_data, offset, 0, boundary_size, 0, NULL, &event);
															
 
																-        clEnqueueCopyBuffer(cq, block_data, boundary_data, offset, 0, boundary_size, 0, NULL, NULL);
															
 
																+	clWaitForEvents(1, &event);
															
 
																+	clReleaseEvent(event);
															
 
																 }
															
 
																 #endif /* STARPU_USE_OPENCL */
															
@@ -517,7 +551,7 @@ unsigned bottom_per_worker[STARPU_NMAXWORKERS];
 
																 /* top save, CPU version */
															
 
																 static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
															
 
																 {
															
 
																-	struct block_description *block = arg;
															
 
																+	struct block_description *block = (struct block_description *) arg;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																 	top_per_worker[workerid]++;
															
@@ -533,7 +567,7 @@ static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
 
																 /* bottom save, CPU version */
															
 
																 static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *arg)
															
 
																 {
															
 
																-	struct block_description *block = arg;
															
 
																+	struct block_description *block = (struct block_description *) arg;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																 	bottom_per_worker[workerid]++;
															
@@ -547,7 +581,7 @@ static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *a
 
																 #ifdef STARPU_USE_CUDA
															
 
																 static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg)
															
 
																 {
															
 
																-	struct block_description *block = arg;
															
 
																+	struct block_description *block = (struct block_description *) arg;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																 	top_per_worker[workerid]++;
															
@@ -564,7 +598,7 @@ static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg
 
																 /* bottom save, CUDA version */
															
 
																 static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *arg)
															
 
																 {
															
 
																-	struct block_description *block = arg;
															
 
																+	struct block_description *block = (struct block_description *) arg;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																 	bottom_per_worker[workerid]++;
															
@@ -580,7 +614,7 @@ static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *
 
																 #ifdef STARPU_USE_OPENCL
															
 
																 static void dummy_func_top_opencl(void *descr[] __attribute__((unused)), void *arg)
															
 
																 {
															
 
																-	struct block_description *block = arg;
															
 
																+	struct block_description *block = (struct block_description *) arg;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																 	top_per_worker[workerid]++;
															
@@ -600,7 +634,7 @@ static void dummy_func_top_opencl(void *descr[] __attribute__((unused)), void *a
 
																 /* bottom save, OPENCL version */
															
 
																 static void dummy_func_bottom_opencl(void *descr[] __attribute__((unused)), void *arg)
															
 
																 {
															
 
																-	struct block_description *block = arg;
															
 
																+	struct block_description *block = (struct block_description *) arg;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																 	bottom_per_worker[workerid]++;
															
@@ -616,17 +650,20 @@ static void dummy_func_bottom_opencl(void *descr[] __attribute__((unused)), void
 
																 #endif /* STARPU_USE_OPENCL */
															
 
																 /* Performance models and codelet for save */
															
 
																-static struct starpu_perfmodel_t save_cl_bottom_model = {
															
 
																+static struct starpu_perfmodel save_cl_bottom_model =
															
 
																+{
															
 
																 	.type = STARPU_HISTORY_BASED,
															
 
																 	.symbol = "save_cl_bottom" 
															
 
																 };
															
 
																-static struct starpu_perfmodel_t save_cl_top_model = {
															
 
																+static struct starpu_perfmodel save_cl_top_model =
															
 
																+{
															
 
																 	.type = STARPU_HISTORY_BASED,
															
 
																 	.symbol = "save_cl_top" 
															
 
																 };
															
 
																-starpu_codelet save_cl_bottom = {
															
 
																+struct starpu_codelet save_cl_bottom =
															
 
																+{
															
 
																 	.where = 0 |
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		STARPU_CUDA|
															
@@ -635,18 +672,20 @@ starpu_codelet save_cl_bottom = {
 
																 		STARPU_OPENCL|
															
 
																 #endif
															
 
																 		STARPU_CPU,
															
 
																-	.cpu_func = dummy_func_bottom_cpu,
															
 
																+	.cpu_funcs = {dummy_func_bottom_cpu, NULL},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-	.cuda_func = dummy_func_bottom_cuda,
															
 
																+	.cuda_funcs = {dummy_func_bottom_cuda, NULL},
															
 
																 #endif
															
 
																 #ifdef STARPU_USE_OPENCL
															
 
																-	.opencl_func = dummy_func_bottom_opencl,
															
 
																+	.opencl_funcs = {dummy_func_bottom_opencl, NULL},
															
 
																 #endif
															
 
																 	.model = &save_cl_bottom_model,
															
 
																-	.nbuffers = 4
															
 
																+	.nbuffers = 4,
															
 
																+	.modes = {STARPU_R, STARPU_R, STARPU_W, STARPU_W}
															
 
																 };
															
 
																-starpu_codelet save_cl_top = {
															
 
																+struct starpu_codelet save_cl_top =
															
 
																+{
															
 
																 	.where = 0|
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		STARPU_CUDA|
															
@@ -655,13 +694,14 @@ starpu_codelet save_cl_top = {
 
																 		STARPU_OPENCL|
															
 
																 #endif
															
 
																 		STARPU_CPU,
															
 
																-	.cpu_func = dummy_func_top_cpu,
															
 
																+	.cpu_funcs = {dummy_func_top_cpu, NULL},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-	.cuda_func = dummy_func_top_cuda,
															
 
																+	.cuda_funcs = {dummy_func_top_cuda, NULL},
															
 
																 #endif
															
 
																 #ifdef STARPU_USE_OPENCL
															
 
																-	.opencl_func = dummy_func_top_opencl,
															
 
																+	.opencl_funcs = {dummy_func_top_opencl, NULL},
															
 
																 #endif
															
 
																 	.model = &save_cl_top_model,
															
 
																-	.nbuffers = 4
															
 
																+	.nbuffers = 4,
															
 
																+	.modes = {STARPU_R, STARPU_R, STARPU_W, STARPU_W}
															
 
																 };
															
--- a/examples/stencil/stencil-tasks.c
+++ b/examples/stencil/stencil-tasks.c
@@ -1,6 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -48,17 +49,13 @@ static void create_task_save_local(unsigned iter, unsigned z, int dir, unsigned
 
																 	save_task->cl_arg = descr;
															
 
																 	/* Saving our border... */
															
 
																-	save_task->buffers[0].handle = descr->layers_handle[0];
															
 
																-	save_task->buffers[0].mode = STARPU_R;
															
 
																-	save_task->buffers[1].handle = descr->layers_handle[1];
															
 
																-	save_task->buffers[1].mode = STARPU_R;
															
 
																+	save_task->handles[0] = descr->layers_handle[0];
															
 
																+	save_task->handles[1] = descr->layers_handle[1];
															
 
																 	/* ... to the neighbour's copy */
															
 
																 	struct block_description *neighbour = descr->boundary_blocks[(1+dir)/2];
															
 
																-	save_task->buffers[2].handle = neighbour->boundaries_handle[(1-dir)/2][0];
															
 
																-	save_task->buffers[2].mode = STARPU_W;
															
 
																-	save_task->buffers[3].handle = neighbour->boundaries_handle[(1-dir)/2][1];
															
 
																-	save_task->buffers[3].mode = STARPU_W;
															
 
																+	save_task->handles[2] = neighbour->boundaries_handle[(1-dir)/2][0];
															
 
																+	save_task->handles[3] = neighbour->boundaries_handle[(1-dir)/2][1];
															
 
																 	/* Bind */
															
 
																 	if (iter <= BIND_LAST)
															
@@ -69,14 +66,15 @@ static void create_task_save_local(unsigned iter, unsigned z, int dir, unsigned
 
																 	if (ret)
															
 
																 	{
															
 
																 		fprintf(stderr, "Could not submit task save: %d\n", ret);
															
 
																-		STARPU_ASSERT(0);
															
 
																+		STARPU_ABORT();
															
 
																 	}
															
 
																 }
															
 
																 /* R(z) = local & R(z+d) != local */
															
 
																 /* We need to send our save over MPI */
															
 
																-static void send_done(void *arg) {
															
 
																+static void send_done(void *arg)
															
 
																+{
															
 
																 	uintptr_t z = (uintptr_t) arg;
															
 
																 	DEBUG("DO SEND %d\n", (int)z);
															
 
																 }
															
@@ -93,8 +91,8 @@ static void create_task_save_mpi_send(unsigned iter, unsigned z, int dir, unsign
 
																 	STARPU_ASSERT(neighbour->mpi_node != local_rank);
															
 
																 	/* Send neighbour's border copy to the neighbour */
															
 
																-	starpu_data_handle handle0 = neighbour->boundaries_handle[(1-dir)/2][0];
															
 
																-	starpu_data_handle handle1 = neighbour->boundaries_handle[(1-dir)/2][1];
															
 
																+	starpu_data_handle_t handle0 = neighbour->boundaries_handle[(1-dir)/2][0];
															
 
																+	starpu_data_handle_t handle1 = neighbour->boundaries_handle[(1-dir)/2][1];
															
 
																 	starpu_mpi_isend_detached(handle0, dest, MPI_TAG0(z, iter, dir), MPI_COMM_WORLD, send_done, (void*)(uintptr_t)z);
															
 
																 	starpu_mpi_isend_detached(handle1, dest, MPI_TAG1(z, iter, dir), MPI_COMM_WORLD, send_done, (void*)(uintptr_t)z);
															
@@ -103,7 +101,8 @@ static void create_task_save_mpi_send(unsigned iter, unsigned z, int dir, unsign
 
																 /* R(z) != local & R(z+d) = local */
															
 
																 /* We need to receive over MPI */
															
 
																-static void recv_done(void *arg) {
															
 
																+static void recv_done(void *arg)
															
 
																+{
															
 
																 	uintptr_t z = (uintptr_t) arg;
															
 
																 	DEBUG("DO RECV %d\n", (int)z);
															
 
																 }
															
@@ -119,13 +118,13 @@ static void create_task_save_mpi_recv(unsigned iter, unsigned z, int dir, unsign
 
																 	STARPU_ASSERT(neighbour->mpi_node == local_rank);
															
 
																 	/* Receive our neighbour's border in our neighbour copy */
															
 
																-	starpu_data_handle handle0 = neighbour->boundaries_handle[(1-dir)/2][0];
															
 
																-	starpu_data_handle handle1 = neighbour->boundaries_handle[(1-dir)/2][1];
															
 
																+	starpu_data_handle_t handle0 = neighbour->boundaries_handle[(1-dir)/2][0];
															
 
																+	starpu_data_handle_t handle1 = neighbour->boundaries_handle[(1-dir)/2][1];
															
 
																 	starpu_mpi_irecv_detached(handle0, source, MPI_TAG0(z, iter, dir), MPI_COMM_WORLD, recv_done, (void*)(uintptr_t)z);
															
 
																 	starpu_mpi_irecv_detached(handle1, source, MPI_TAG1(z, iter, dir), MPI_COMM_WORLD, recv_done, (void*)(uintptr_t)z);
															
 
																 }
															
 
																-#endif // STARPU_USE_MPI
															
 
																+#endif /* STARPU_USE_MPI */
															
 
																 /*
															
 
																  * Schedule saving boundaries of blocks to communication buffers
															
@@ -141,26 +140,28 @@ void create_task_save(unsigned iter, unsigned z, int dir, unsigned local_rank)
 
																 		/* Save data from update */
															
 
																 		create_task_save_local(iter, z, dir, local_rank);
															
 
																 		if (node_z_and_d != local_rank)
															
 
																-		{ // R(z) = local & R(z+d) != local, We have to send the data
															
 
																+		{ /* R(z) = local & R(z+d) != local, We have to send the data */
															
 
																 			create_task_save_mpi_send(iter, z, dir, local_rank);
															
 
																 		}
															
 
																 	}
															
 
																-	else {	// node_z != local_rank, this MPI node doesn't have the saved data
															
 
																+	else
															
 
																+	{	/* node_z != local_rank, this MPI node doesn't have the saved data */
															
 
																 		if (node_z_and_d == local_rank)
															
 
																 		{
															
 
																 			create_task_save_mpi_recv(iter, z, dir, local_rank);
															
 
																 		}
															
 
																-		else {  // R(z) != local & R(z+d) != local We don't have
															
 
																-			// the saved data and don't need it, we shouldn't
															
 
																-			// even have been called!
															
 
																-			STARPU_ASSERT(0);
															
 
																+		else
															
 
																+		{ /* R(z) != local & R(z+d) != local We don't have
															
 
																+			      the saved data and don't need it, we shouldn't
															
 
																+			      even have been called! */
															
 
																+			STARPU_ABORT();
															
 
																 		}
															
 
																 	}
															
 
																-#else // !STARPU_USE_MPI
															
 
																+#else /* !STARPU_USE_MPI */
															
 
																 	STARPU_ASSERT((node_z == local_rank) && (node_z_and_d == local_rank));
															
 
																 	create_task_save_local(iter, z, dir, local_rank);
															
 
																-#endif // STARPU_USE_MPI
															
 
																+#endif /* STARPU_USE_MPI */
															
 
																 }
															
 
																 /*
															
@@ -176,7 +177,8 @@ void create_task_update(unsigned iter, unsigned z, unsigned local_rank)
 
																 	unsigned niter = get_niter();
															
 
																 	/* We are going to synchronize with the last tasks */
															
 
																-	if (iter == niter) {
															
 
																+	if (iter == niter)
															
 
																+	{
															
 
																 		task->detach = 0;
															
 
																 		task->use_tag = 1;
															
 
																 		task->tag_id = TAG_FINISH(z);
															
@@ -186,20 +188,14 @@ void create_task_update(unsigned iter, unsigned z, unsigned local_rank)
 
																 	unsigned new_layer = (old_layer + 1) % 2;
															
 
																 	struct block_description *descr = get_block_description(z);
															
 
																-	task->buffers[0].handle = descr->layers_handle[new_layer];
															
 
																-	task->buffers[0].mode = STARPU_RW;
															
 
																-	task->buffers[1].handle = descr->layers_handle[old_layer];
															
 
																-	task->buffers[1].mode = STARPU_RW;
															
 
																+	task->handles[0] = descr->layers_handle[new_layer];
															
 
																+	task->handles[1] = descr->layers_handle[old_layer];
															
 
																-	task->buffers[2].handle = descr->boundaries_handle[T][new_layer];
															
 
																-	task->buffers[2].mode = STARPU_R;
															
 
																-	task->buffers[3].handle = descr->boundaries_handle[T][old_layer];
															
 
																-	task->buffers[3].mode = STARPU_R;
															
 
																+	task->handles[2] = descr->boundaries_handle[T][new_layer];
															
 
																+	task->handles[3] = descr->boundaries_handle[T][old_layer];
															
 
																-	task->buffers[4].handle = descr->boundaries_handle[B][new_layer];
															
 
																-	task->buffers[4].mode = STARPU_R;
															
 
																-	task->buffers[5].handle = descr->boundaries_handle[B][old_layer];
															
 
																-	task->buffers[5].mode = STARPU_R;
															
 
																+	task->handles[4] = descr->boundaries_handle[B][new_layer];
															
 
																+	task->handles[5] = descr->boundaries_handle[B][old_layer];
															
 
																 	task->cl = &cl_update;
															
 
																 	task->cl_arg = descr;
															
@@ -212,21 +208,24 @@ void create_task_update(unsigned iter, unsigned z, unsigned local_rank)
 
																 	if (ret)
															
 
																 	{
															
 
																 		fprintf(stderr, "Could not submit task update block: %d\n", ret);
															
 
																-		STARPU_ASSERT(0);
															
 
																+		STARPU_ABORT();
															
 
																 	}
															
 
																 }
															
 
																 /* Dummy empty codelet taking one buffer */
															
 
																 static void null_func(void *descr[] __attribute__((unused)), void *arg __attribute__((unused))) { }
															
 
																-static starpu_codelet null = {
															
 
																+static struct starpu_codelet null =
															
 
																+{
															
 
																+	.modes = { STARPU_W, STARPU_W },
															
 
																 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																-	.cpu_func = null_func,
															
 
																-	.cuda_func = null_func,
															
 
																-	.opencl_func = null_func,
															
 
																+	.cpu_funcs = {null_func, NULL},
															
 
																+	.cuda_funcs = {null_func, NULL},
															
 
																+	.opencl_funcs = {null_func, NULL},
															
 
																 	.nbuffers = 2
															
 
																 };
															
 
																-void create_start_task(int z, int dir) {
															
 
																+void create_start_task(int z, int dir)
															
 
																+{
															
 
																 	/* Dumb task depending on the init task and simulating writing the
															
 
																 	   neighbour buffers, to avoid communications and computation running
															
 
																 	   before we start measuring time */
															
@@ -236,17 +235,15 @@ void create_start_task(int z, int dir) {
 
																 	wait_init->cl = &null;
															
 
																 	wait_init->use_tag = 1;
															
 
																 	wait_init->tag_id = TAG_START(z, dir);
															
 
																-	wait_init->buffers[0].handle = descr->boundaries_handle[(1+dir)/2][0];
															
 
																-	wait_init->buffers[0].mode = STARPU_W;
															
 
																-	wait_init->buffers[1].handle = descr->boundaries_handle[(1+dir)/2][1];
															
 
																-	wait_init->buffers[1].mode = STARPU_W;
															
 
																+	wait_init->handles[0] = descr->boundaries_handle[(1 + dir) / 2][0];
															
 
																+	wait_init->handles[1] = descr->boundaries_handle[(1 + dir) / 2][1];
															
 
																 	starpu_tag_declare_deps_array(wait_init->tag_id, 1, &tag_init);
															
 
																 	int ret = starpu_task_submit(wait_init);
															
 
																 	if (ret)
															
 
																 	{
															
 
																 		fprintf(stderr, "Could not submit task initial wait: %d\n", ret);
															
 
																-		STARPU_ASSERT(0);
															
 
																+		STARPU_ABORT();
															
 
																 	}
															
 
																 }
															
@@ -261,7 +258,8 @@ void create_tasks(int rank)
 
																 	int niter = get_niter();
															
 
																 	int nbz = get_nbz();
															
 
																-	for (bz = 0; bz < nbz; bz++) {
															
 
																+	for (bz = 0; bz < nbz; bz++)
															
 
																+	{
															
 
																 		if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz+1) == rank))
															
 
																 			create_start_task(bz, +1);
															
 
																 		if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz-1) == rank))
															
--- a/examples/tag_example/tag_example.c
+++ b/examples/tag_example/tag_example.c
@@ -43,7 +43,7 @@
 
																 struct starpu_codelet cl = {};
															
 
																-#ifdef STARPU_SLOW_MACHINE
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																 #define Ni	32
															
 
																 #define Nj	32
															
 
																 #define Nk	32
															
--- a/examples/tag_example/tag_example2.c
+++ b/examples/tag_example/tag_example2.c
@@ -120,7 +120,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 
																 		exit(77);
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
															
 
																-#ifdef STARPU_SLOW_MACHINE
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																 	ni /= 4;
															
 
																 	nk /= 16;
															
 
																 #endif
															
--- a/examples/tag_example/tag_example3.c
+++ b/examples/tag_example/tag_example3.c
@@ -122,7 +122,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 
																 		exit(77);
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
															
 
																-#ifdef STARPU_SLOW_MACHINE
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																 	ni /= 4;
															
 
																 	nk /= 16;
															
 
																 #endif
															
--- a/examples/tag_example/tag_restartable.c
+++ b/examples/tag_example/tag_restartable.c
@@ -129,7 +129,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 
																 		return 77;
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
															
 
																-#ifdef STARPU_SLOW_MACHINE
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																 	ni /= 4;
															
 
																 	nk /= 16;
															
 
																 #endif
															
--- a/tests/microbenchs/prefetch_data_on_node.c
+++ b/tests/microbenchs/prefetch_data_on_node.c
@@ -23,7 +23,7 @@
 
																 #include <pthread.h>
															
 
																 #include "../helper.h"
															
 
																-#ifdef STARPU_SLOW_MACHINE
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																 #define N		100
															
 
																 #else
															
 
																 #define N		1000
															
--- a/tests/microbenchs/sync_tasks_overhead.c
+++ b/tests/microbenchs/sync_tasks_overhead.c
@@ -93,7 +93,7 @@ int main(int argc, char **argv)
 
																 	struct timeval start;
															
 
																 	struct timeval end;
															
 
																-#ifdef STARPU_SLOW_MACHINE
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																 	ntasks = 128;
															
 
																 #endif