Andra Hugo, 12 years ago
Commit a9f3cb5acb

+ 1 - 4
examples/Makefile.am

@@ -3,7 +3,7 @@
 # Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 # Copyright (C) 2011  Télécom-SudParis
-# Copyright (C) 2011-2012  INRIA
+# Copyright (C) 2012 INRIA
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -47,7 +47,6 @@ EXTRA_DIST = 					\
 	lu/xlu_implicit_pivot.c			\
 	lu/xlu_kernels.c			\
 	lu/lu_example.c				\
-	sched_ctx_utils/sched_ctx_utils.c		\
 	incrementer/incrementer_kernels_opencl_kernel.cl 	\
 	basic_examples/variable_kernels_opencl_kernel.cl	\
 	matvecmult/matvecmult_kernel.cl				\
@@ -100,7 +99,6 @@ noinst_HEADERS = 				\
 	lu/complex_double.h			\
 	lu/blas_complex.h			\
 	cholesky/cholesky.h			\
-	sched_ctx_utils/sched_ctx_utils.h	\
 	common/blas_model.h			\
 	common/blas.h				\
 	mult/simple.h				\
@@ -514,7 +512,6 @@ cholesky_cholesky_implicit_SOURCES =		\
 	cholesky/cholesky_implicit.c		\
 	cholesky/cholesky_models.c		\
 	cholesky/cholesky_kernels.c		\
-	sched_ctx_utils/sched_ctx_utils.c	\
 	common/blas.c
 
 cholesky_cholesky_implicit_LDADD =		\

+ 1 - 1
examples/basic_examples/variable.c

@@ -45,7 +45,7 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) goto enodev;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	niter /= 100;
 #endif
         if (argc == 2) niter = atoi(argv[1]);
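
The hunk above and several later ones swap the old STARPU_SLOW_MACHINE guard for STARPU_QUICK_CHECK, which shrinks the workload when fast test runs are requested. A minimal sketch of the pattern, assuming the macro comes from StarPU's configuration header (pulled in by starpu.h):

#include <starpu.h>

/* Scale down an iteration count when StarPU was configured for quick checks. */
static unsigned pick_niter(unsigned requested)
{
#ifdef STARPU_QUICK_CHECK
	return requested > 100 ? requested / 100 : 1;
#else
	return requested;
#endif
}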

+ 1 - 1
examples/cg/cg.c

@@ -406,7 +406,7 @@ int main(int argc, char **argv)
 {
 	int ret;
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	i_max = 16;
 #endif
 

+ 154 - 139
examples/cg/cg_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,6 +16,7 @@
 
 #include "cg.h"
 #include <math.h>
+#include <limits.h>
 
 #if 0
 static void print_vector_from_descr(unsigned nx, TYPE *v)
@@ -43,6 +44,23 @@ static void print_matrix_from_descr(unsigned nx, unsigned ny, unsigned ld, TYPE
 }
 #endif
 
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
+{
+	enum starpu_archtype type = starpu_worker_get_type(workerid);
+	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
+		return 1;
+
+#ifdef STARPU_USE_CUDA
+	/* Cuda device */
+	const struct cudaDeviceProp *props;
+	props = starpu_cuda_get_device_properties(workerid);
+	if (props->major >= 2 || props->minor >= 3)
+		/* At least compute capability 1.3, supports doubles */
+		return 1;
+#endif
+	/* Old card, does not support doubles */
+	return 0;
+}
 
 /*
  *	Reduction accumulation methods
@@ -67,16 +85,19 @@ static void accumulate_variable_cpu(void *descr[], void *cl_arg)
 	*v_dst = *v_dst + *v_src;
 }
 
-static struct starpu_perfmodel_t accumulate_variable_model = {
+static struct starpu_perfmodel accumulate_variable_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "accumulate_variable"
 };
 
-starpu_codelet accumulate_variable_cl = {
+struct starpu_codelet accumulate_variable_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = accumulate_variable_cpu,
+	.cpu_funcs = {accumulate_variable_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = accumulate_variable_cuda,
+	.cuda_funcs = {accumulate_variable_cuda, NULL},
 #endif
 	.nbuffers = 2,
 	.model = &accumulate_variable_model
@@ -103,16 +124,19 @@ static void accumulate_vector_cpu(void *descr[], void *cl_arg)
 	AXPY(n, (TYPE)1.0, v_src, 1, v_dst, 1);
 }
 
-static struct starpu_perfmodel_t accumulate_vector_model = {
+static struct starpu_perfmodel accumulate_vector_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "accumulate_vector"
 };
 
-starpu_codelet accumulate_vector_cl = {
+struct starpu_codelet accumulate_vector_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = accumulate_vector_cpu,
+	.cpu_funcs = {accumulate_vector_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = accumulate_vector_cuda,
+	.cuda_funcs = {accumulate_vector_cuda, NULL},
 #endif
 	.nbuffers = 2,
 	.model = &accumulate_vector_model
@@ -141,16 +165,19 @@ static void bzero_variable_cpu(void *descr[], void *cl_arg)
 	*v = (TYPE)0.0;
 }
 
-static struct starpu_perfmodel_t bzero_variable_model = {
+static struct starpu_perfmodel bzero_variable_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "bzero_variable"
 };
 
-starpu_codelet bzero_variable_cl = {
+struct starpu_codelet bzero_variable_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = bzero_variable_cpu,
+	.cpu_funcs = {bzero_variable_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = bzero_variable_cuda,
+	.cuda_funcs = {bzero_variable_cuda, NULL},
 #endif
 	.nbuffers = 1,
 	.model = &bzero_variable_model
@@ -176,16 +203,19 @@ static void bzero_vector_cpu(void *descr[], void *cl_arg)
 	memset(v, 0, n*sizeof(TYPE));
 }
 
-static struct starpu_perfmodel_t bzero_vector_model = {
+static struct starpu_perfmodel bzero_vector_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "bzero_vector"
 };
 
-starpu_codelet bzero_vector_cl = {
+struct starpu_codelet bzero_vector_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = bzero_vector_cpu,
+	.cpu_funcs = {bzero_vector_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = bzero_vector_cuda,
+	.cuda_funcs = {bzero_vector_cuda, NULL},
 #endif
 	.nbuffers = 1,
 	.model = &bzero_vector_model
@@ -229,39 +259,48 @@ static void dot_kernel_cpu(void *descr[], void *cl_arg)
 	*dot = *dot + local_dot;
 }
 
-static struct starpu_perfmodel_t dot_kernel_model = {
+static struct starpu_perfmodel dot_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "dot_kernel"
 };
 
-static starpu_codelet dot_kernel_cl = {
+static struct starpu_codelet dot_kernel_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = dot_kernel_cpu,
+	.cpu_funcs = {dot_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = dot_kernel_cuda,
+	.cuda_funcs = {dot_kernel_cuda, NULL},
 #endif
 	.nbuffers = 3,
 	.model = &dot_kernel_model
 };
 
-void dot_kernel(starpu_data_handle v1,
-		starpu_data_handle v2,
-		starpu_data_handle s,
-		unsigned nblocks,
-		int use_reduction)
+int dot_kernel(starpu_data_handle_t v1,
+	       starpu_data_handle_t v2,
+	       starpu_data_handle_t s,
+	       unsigned nblocks,
+	       int use_reduction)
 {
+	int ret;
+
 	/* Blank the accumulation variable */
-	starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
+	ret = starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
+	if (ret == -ENODEV) return ret;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
-		starpu_insert_task(&dot_kernel_cl,
-			use_reduction?STARPU_REDUX:STARPU_RW, s,
-			STARPU_R, starpu_data_get_sub_data(v1, 1, b),
-			STARPU_R, starpu_data_get_sub_data(v2, 1, b),
-			0);
+		ret = starpu_insert_task(&dot_kernel_cl,
+					 use_reduction?STARPU_REDUX:STARPU_RW, s,
+					 STARPU_R, starpu_data_get_sub_data(v1, 1, b),
+					 STARPU_R, starpu_data_get_sub_data(v2, 1, b),
+					 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	}
+	return 0;
 }
 
 /*
@@ -272,7 +311,7 @@ void dot_kernel(starpu_data_handle v1,
 static void scal_kernel_cuda(void *descr[], void *cl_arg)
 {
 	TYPE p1;
-	starpu_unpack_cl_args(cl_arg, &p1);
+	starpu_codelet_unpack_args(cl_arg, &p1);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
@@ -287,7 +326,7 @@ static void scal_kernel_cuda(void *descr[], void *cl_arg)
 static void scal_kernel_cpu(void *descr[], void *cl_arg)
 {
 	TYPE alpha;
-	starpu_unpack_cl_args(cl_arg, &alpha);
+	starpu_codelet_unpack_args(cl_arg, &alpha);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
@@ -296,16 +335,19 @@ static void scal_kernel_cpu(void *descr[], void *cl_arg)
 	SCAL(n, alpha, v1, 1);
 }
 
-static struct starpu_perfmodel_t scal_kernel_model = {
+static struct starpu_perfmodel scal_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "scal_kernel"
 };
 
-static starpu_codelet scal_kernel_cl = {
+static struct starpu_codelet scal_kernel_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = scal_kernel_cpu,
+	.cpu_funcs = {scal_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = scal_kernel_cuda,
+	.cuda_funcs = {scal_kernel_cuda, NULL},
 #endif
 	.nbuffers = 1,
 	.model = &scal_kernel_model
@@ -327,7 +369,7 @@ static void gemv_kernel_cuda(void *descr[], void *cl_arg)
 	unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);
  
 	TYPE alpha, beta;
-	starpu_unpack_cl_args(cl_arg, &beta, &alpha);
+	starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
 
 	/* Compute v1 = alpha M v2 + beta v1 */
 	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
@@ -346,7 +388,7 @@ static void gemv_kernel_cpu(void *descr[], void *cl_arg)
 	unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);
 
 	TYPE alpha, beta;
-	starpu_unpack_cl_args(cl_arg, &beta, &alpha);
+	starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
 
 	int worker_size = starpu_combined_worker_get_size();
 
@@ -367,38 +409,44 @@ static void gemv_kernel_cpu(void *descr[], void *cl_arg)
 	GEMV("N", nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
 }
 
-static struct starpu_perfmodel_t gemv_kernel_model = {
+static struct starpu_perfmodel gemv_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "gemv_kernel"
 };
 
-static starpu_codelet gemv_kernel_cl = {
+static struct starpu_codelet gemv_kernel_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.type = STARPU_SPMD,
 	.max_parallelism = INT_MAX,
-	.cpu_func = gemv_kernel_cpu,
+	.cpu_funcs = {gemv_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = gemv_kernel_cuda,
+	.cuda_funcs = {gemv_kernel_cuda, NULL},
 #endif
 	.nbuffers = 3,
 	.model = &gemv_kernel_model
 };
 
-void gemv_kernel(starpu_data_handle v1,
-		starpu_data_handle matrix,
-		starpu_data_handle v2,
+int gemv_kernel(starpu_data_handle_t v1,
+		starpu_data_handle_t matrix,
+		starpu_data_handle_t v2,
 		TYPE p1, TYPE p2,
 		unsigned nblocks,
 		int use_reduction)
 {
 	unsigned b1, b2;
+	int ret;
 
 	for (b2 = 0; b2 < nblocks; b2++)
 	{
-		starpu_insert_task(&scal_kernel_cl,
-			STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
-			STARPU_VALUE, &p1, sizeof(p1),
-			0);
+		ret = starpu_insert_task(&scal_kernel_cl,
+					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
+					 STARPU_VALUE, &p1, sizeof(p1),
+					 0);
+		if (ret == -ENODEV) return ret;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	}
 
 	for (b2 = 0; b2 < nblocks; b2++)
@@ -406,15 +454,17 @@ void gemv_kernel(starpu_data_handle v1,
 		for (b1 = 0; b1 < nblocks; b1++)
 		{
 			TYPE one = 1.0;
-			starpu_insert_task(&gemv_kernel_cl,
-				use_reduction?STARPU_REDUX:STARPU_RW,	starpu_data_get_sub_data(v1, 1, b2),
-				STARPU_R,	starpu_data_get_sub_data(matrix, 2, b2, b1),
-				STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
-				STARPU_VALUE,	&one,	sizeof(one),
-				STARPU_VALUE,	&p2,	sizeof(p2),
-				0);
+			ret = starpu_insert_task(&gemv_kernel_cl,
+						 use_reduction?STARPU_REDUX:STARPU_RW,	starpu_data_get_sub_data(v1, 1, b2),
+						 STARPU_R,	starpu_data_get_sub_data(matrix, 2, b2, b1),
+						 STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
+						 STARPU_VALUE,	&one,	sizeof(one),
+						 STARPU_VALUE,	&p2,	sizeof(p2),
+						 0);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 		}
 	}
+	return 0;
 }
 
 /*
@@ -424,7 +474,7 @@ void gemv_kernel(starpu_data_handle v1,
 static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 {
 	TYPE p1, p2;
-	starpu_unpack_cl_args(cl_arg, &p1, &p2);
+	starpu_codelet_unpack_args(cl_arg, &p1, &p2);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -444,7 +494,7 @@ static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 static void scal_axpy_kernel_cpu(void *descr[], void *cl_arg)
 {
 	TYPE p1, p2;
-	starpu_unpack_cl_args(cl_arg, &p1, &p2);
+	starpu_codelet_unpack_args(cl_arg, &p1, &p2);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -459,35 +509,42 @@ static void scal_axpy_kernel_cpu(void *descr[], void *cl_arg)
 	AXPY(nx, p2, v2, 1, v1, 1);
 }
 
-static struct starpu_perfmodel_t scal_axpy_kernel_model = {
+static struct starpu_perfmodel scal_axpy_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "scal_axpy_kernel"
 };
 
-static starpu_codelet scal_axpy_kernel_cl = {
+static struct starpu_codelet scal_axpy_kernel_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = scal_axpy_kernel_cpu,
+	.cpu_funcs = {scal_axpy_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = scal_axpy_kernel_cuda,
+	.cuda_funcs = {scal_axpy_kernel_cuda, NULL},
 #endif
 	.nbuffers = 2,
 	.model = &scal_axpy_kernel_model
 };
 
-void scal_axpy_kernel(starpu_data_handle v1, TYPE p1,
-			starpu_data_handle v2, TYPE p2,
-			unsigned nblocks)
+int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
+		     starpu_data_handle_t v2, TYPE p2,
+		     unsigned nblocks)
 {
+	int ret;
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
-		starpu_insert_task(&scal_axpy_kernel_cl,
-			STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
-			STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
-			STARPU_VALUE, &p1, sizeof(p1),
-			STARPU_VALUE, &p2, sizeof(p2),
-			0);
+		ret = starpu_insert_task(&scal_axpy_kernel_cl,
+					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
+					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
+					 STARPU_VALUE, &p1, sizeof(p1),
+					 STARPU_VALUE, &p2, sizeof(p2),
+					 0);
+		if (ret == -ENODEV) return ret;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	}
+	return 0;
 }
 
 
@@ -498,7 +555,7 @@ void scal_axpy_kernel(starpu_data_handle v1, TYPE p1,
 static void axpy_kernel_cuda(void *descr[], void *cl_arg)
 {
 	TYPE p1;
-	starpu_unpack_cl_args(cl_arg, &p1);
+	starpu_codelet_unpack_args(cl_arg, &p1);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -515,7 +572,7 @@ static void axpy_kernel_cuda(void *descr[], void *cl_arg)
 static void axpy_kernel_cpu(void *descr[], void *cl_arg)
 {
 	TYPE p1;
-	starpu_unpack_cl_args(cl_arg, &p1);
+	starpu_codelet_unpack_args(cl_arg, &p1);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -527,89 +584,47 @@ static void axpy_kernel_cpu(void *descr[], void *cl_arg)
 	AXPY(nx, p1, v2, 1, v1, 1);
 }
 
-static struct starpu_perfmodel_t axpy_kernel_model = {
+static struct starpu_perfmodel axpy_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "axpy_kernel"
 };
 
-static starpu_codelet axpy_kernel_cl = {
+static struct starpu_codelet axpy_kernel_cl =
+{
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = axpy_kernel_cpu,
+	.cpu_funcs = {axpy_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = axpy_kernel_cuda,
+	.cuda_funcs = {axpy_kernel_cuda, NULL},
 #endif
 	.nbuffers = 2,
 	.model = &axpy_kernel_model
 };
 
-void axpy_kernel(starpu_data_handle v1,
-		starpu_data_handle v2, TYPE p1,
+int axpy_kernel(starpu_data_handle_t v1,
+		starpu_data_handle_t v2, TYPE p1,
 		unsigned nblocks)
 {
+	int ret;
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
-		starpu_insert_task(&axpy_kernel_cl,
-			STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
-			STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
-			STARPU_VALUE, &p1, sizeof(p1),
-			0);
+		ret = starpu_insert_task(&axpy_kernel_cl,
+					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
+					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
+					 STARPU_VALUE, &p1, sizeof(p1),
+					 0);
+		if (ret == -ENODEV) return ret;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	}
+	return 0;
 }
 
-
-/*
- *	COPY kernel : vector_dst <- vector_src
- */
-
-static void copy_handle_cpu(void *descr[], void *cl_arg)
-{
-	TYPE *dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
-	TYPE *src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
-	
-	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
-	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
-
-	memcpy(dst, src, nx*elemsize);
-}
-
-#ifdef STARPU_USE_CUDA
-static void copy_handle_cuda(void *descr[], void *cl_arg)
-{
-	TYPE *dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
-	TYPE *src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
-	
-	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
-	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
-
-	cudaMemcpyAsync(dst, src, nx*elemsize, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
-}
-#endif
-
-static struct starpu_perfmodel_t copy_handle_model = {
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "copy_handle"
-};
-
-static starpu_codelet copy_handle_cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = copy_handle_cpu,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = copy_handle_cuda,
-#endif
-	.nbuffers = 2,
-	.model = &copy_handle_model
-};
-
-void copy_handle(starpu_data_handle dst, starpu_data_handle src, unsigned nblocks)
+int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks)
 {
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
-	{
-		starpu_insert_task(&copy_handle_cl,
-			STARPU_W, starpu_data_get_sub_data(dst, 1, b),
-			STARPU_R, starpu_data_get_sub_data(src, 1, b),
-			0);
-	}
-} 
+		starpu_data_cpy(starpu_data_get_sub_data(dst, 1, b), starpu_data_get_sub_data(src, 1, b), 1, NULL, NULL);
+	return 0;
+}

+ 1 - 1
examples/incrementer/incrementer.c

@@ -47,7 +47,7 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	niter /= 100;
 #endif
 	if (argc == 2)

+ 43 - 8
examples/interface/complex_interface.c

@@ -15,9 +15,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_cuda.h>
-#include <starpu_opencl.h>
-#include <starpu_hash.h>
 
 #include "complex_interface.h"
 
@@ -137,7 +134,7 @@ static starpu_ssize_t complex_allocate_data_on_node(void *data_interface, uint32
 		}
 #endif
 		default:
-			STARPU_ASSERT(0);
+			STARPU_ABORT();
 	}
 
 	if (fail)
@@ -164,6 +161,43 @@ static uint32_t complex_footprint(starpu_data_handle_t handle)
 	return starpu_crc32_be(starpu_complex_get_nx(handle), 0);
 }
 
+static void *complex_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	return (void*) complex_interface->real;
+}
+
+static int complex_pack_data(starpu_data_handle_t handle, uint32_t node, void **ptr)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	*ptr = malloc(complex_get_size(handle));
+	memcpy(*ptr, complex_interface->real, complex_interface->nx*sizeof(double));
+	memcpy(*ptr+complex_interface->nx*sizeof(double), complex_interface->imaginary, complex_interface->nx*sizeof(double));
+
+	return 0;
+}
+
+static int complex_unpack_data(starpu_data_handle_t handle, uint32_t node, void *ptr)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	memcpy(complex_interface->real, ptr, complex_interface->nx*sizeof(double));
+	memcpy(complex_interface->imaginary, ptr+complex_interface->nx*sizeof(double), complex_interface->nx*sizeof(double));
+
+	return 0;
+}
+
 #ifdef STARPU_USE_CUDA
 static int copy_cuda_async_sync(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, enum cudaMemcpyKind kind, cudaStream_t stream)
 {
@@ -204,11 +238,10 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *
 #endif
 
 #ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event)
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
 {
 	struct starpu_complex_interface *src_complex = src_interface;
 	struct starpu_complex_interface *dst_complex = dst_interface;
-	cl_event *event = (cl_event *)_event;
 	cl_int err;
 	int ret;
 
@@ -244,11 +277,10 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_
         return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
 }
 
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event)
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
 {
 	struct starpu_complex_interface *src_complex = src_interface;
 	struct starpu_complex_interface *dst_complex = dst_interface;
-	cl_event *event = (cl_event *)_event;
 	cl_int err;
 	int ret;
 
@@ -310,6 +342,9 @@ static struct starpu_data_interface_ops interface_complex_ops =
 	.footprint = complex_footprint,
 	.interfaceid = -1,
 	.interface_size = sizeof(struct starpu_complex_interface),
+	.handle_to_pointer = complex_handle_to_pointer,
+	.pack_data = complex_pack_data,
+	.unpack_data = complex_unpack_data
 };
 
 void starpu_complex_data_register(starpu_data_handle_t *handleptr, uint32_t home_node, double *real, double *imaginary, int nx)
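
The new pack/unpack methods serialize a complex handle as its nx real parts followed by its nx imaginary parts. A small usage sketch for the registration function declared above (starpu_init is assumed to have been called already; the values are arbitrary):

#include <starpu.h>
#include "complex_interface.h"

void register_two_complex_numbers(void)
{
	double real[2]      = {0.0, 1.0};
	double imaginary[2] = {2.0, 3.0};
	starpu_data_handle_t handle;

	/* Home node 0 (main memory), nx = 2 complex numbers */
	starpu_complex_data_register(&handle, 0, real, imaginary, 2);

	/* A buffer produced by complex_pack_data for this handle holds
	 * real[0], real[1], imaginary[0], imaginary[1], in that order. */

	starpu_data_unregister(handle);
}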

+ 1 - 1
examples/lu/lu_example.c

@@ -300,7 +300,7 @@ int main(int argc, char **argv)
 
 	parse_args(argc, argv);
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	size /= 4;
 	nblocks /= 4;
 #endif

+ 2 - 2
examples/mult/xgemm.c

@@ -33,7 +33,7 @@
 static unsigned niter = 10;
 static unsigned nslicesx = 4;
 static unsigned nslicesy = 4;
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 static unsigned xdim = 256;
 static unsigned ydim = 256;
 static unsigned zdim = 64;
@@ -287,7 +287,7 @@ int main(int argc, char **argv)
 
 	parse_args(argc, argv);
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	niter /= 10;
 #endif
 

+ 1 - 1
examples/pipeline/pipeline.c

@@ -41,7 +41,7 @@
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
 /* Vector size */
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 #define N 16
 #else
 #define N 1048576

+ 253 - 50
examples/reductions/dot_product.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2012 inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,24 +17,48 @@
 
 #include <starpu.h>
 #include <assert.h>
+#include <math.h>
+
+#include <reductions/dot_product.h>
 
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cublas.h>
 #endif
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 static float *x;
 static float *y;
-static starpu_data_handle *x_handles;
-static starpu_data_handle *y_handles;
+static starpu_data_handle_t *x_handles;
+static starpu_data_handle_t *y_handles;
+#ifdef STARPU_USE_OPENCL
+static struct starpu_opencl_program opencl_program;
+#endif
 
 static unsigned nblocks = 4096;
-static unsigned entries_per_bock = 1024;
-
-#define DOT_TYPE double
+static unsigned entries_per_block = 1024;
 
 static DOT_TYPE dot = 0.0f;
-static starpu_data_handle dot_handle;
+static starpu_data_handle_t dot_handle;
+
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
+{
+	enum starpu_archtype type = starpu_worker_get_type(workerid);
+	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
+		return 1;
+
+#ifdef STARPU_USE_CUDA
+	/* Cuda device */
+	const struct cudaDeviceProp *props;
+	props = starpu_cuda_get_device_properties(workerid);
+	if (props->major >= 2 || props->minor >= 3)
+		/* At least compute capability 1.3, supports doubles */
+		return 1;
+#endif
+	/* Old card, does not support doubles */
+	return 0;
+}
 
 /*
  *	Codelet to create a neutral element
@@ -49,16 +74,45 @@ void init_cpu_func(void *descr[], void *cl_arg)
 void init_cuda_func(void *descr[], void *cl_arg)
 {
 	DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	cudaMemset(dot, 0, sizeof(DOT_TYPE));
-	cudaThreadSynchronize();
+	cudaMemsetAsync(dot, 0, sizeof(DOT_TYPE), starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+#ifdef STARPU_USE_OPENCL
+void init_opencl_func(void *buffers[], void *args)
+{
+        cl_int err;
+	cl_command_queue queue;
+
+	cl_mem dot = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]);
+	starpu_opencl_get_current_queue(&queue);
+	DOT_TYPE zero = (DOT_TYPE) 0.0;
+
+	err = clEnqueueWriteBuffer(queue,
+			dot,
+			CL_TRUE,
+			0,
+			sizeof(DOT_TYPE),
+			&zero,
+			0,
+			NULL,
+			NULL);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
 }
 #endif
 
-static struct starpu_codelet_t init_codelet = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = init_cpu_func,
+static struct starpu_codelet init_codelet =
+{
+	.can_execute = can_execute,
+	.cpu_funcs = {init_cpu_func, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = init_cuda_func,
+	.cuda_funcs = {init_cuda_func, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {init_opencl_func, NULL},
 #endif
 	.nbuffers = 1
 };
@@ -75,9 +129,71 @@ void redux_cpu_func(void *descr[], void *cl_arg)
 	*dota = *dota + *dotb;
 }
 
-static struct starpu_codelet_t redux_codelet = {
-	.where = STARPU_CPU,
-	.cpu_func = redux_cpu_func,
+#ifdef STARPU_USE_CUDA
+extern void redux_cuda_func(void *descr[], void *_args);
+#endif
+
+#ifdef STARPU_USE_OPENCL
+void redux_opencl_func(void *buffers[], void *args)
+{
+	int id, devid;
+        cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+
+	cl_mem dota = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]);
+	cl_mem dotb = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[1]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_redux_opencl", devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 0, sizeof(dota), &dota);
+	err|= clSetKernelArg(kernel, 1, sizeof(dotb), &dotb);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=1;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+                if (local > global)
+			local=global;
+
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+}
+#endif
+
+static struct starpu_codelet redux_codelet =
+{
+	.can_execute = can_execute,
+	.cpu_funcs = {redux_cpu_func, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {redux_cuda_func, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {redux_opencl_func, NULL},
+#endif
 	.nbuffers = 2
 };
 
@@ -116,57 +232,122 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 
-	cudaMemcpy(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost);
-
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 
 	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
 
-	//fprintf(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot);
+	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	current_dot += local_dot;
 
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(dot, &current_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+#ifdef STARPU_USE_OPENCL
+void dot_opencl_func(void *buffers[], void *args)
+{
+	int id, devid;
+        cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+
+	cl_mem x = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
+	cl_mem y = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]);
+	cl_mem dot = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[2]);
+	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_dot_opencl", devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 0, sizeof(x), &x);
+	err|= clSetKernelArg(kernel, 1, sizeof(y), &y);
+	err|= clSetKernelArg(kernel, 2, sizeof(dot), &dot);
+	err|= clSetKernelArg(kernel, 3, sizeof(n), &n);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=1;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+                if (local > global)
+			local=global;
+
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
 
-	cudaMemcpy(dot, &current_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice);
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
 
-	cudaThreadSynchronize();
+	starpu_opencl_release_kernel(kernel);
 }
 #endif
 
-static struct starpu_codelet_t dot_codelet = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = dot_cpu_func,
+static struct starpu_codelet dot_codelet =
+{
+	.can_execute = can_execute,
+	.cpu_funcs = {dot_cpu_func, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = dot_cuda_func,
+	.cuda_funcs = {dot_cuda_func, NULL},
 #endif
-	.nbuffers = 3
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {dot_opencl_func, NULL},
+#endif
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_REDUX}
 };
 
 /*
  *	Tasks initialization
  */
 
-extern void starpu_data_end_reduction_mode(starpu_data_handle handle);
-
 int main(int argc, char **argv)
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("examples/reductions/dot_product_opencl_kernels.cl",
+						  &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
 
 	starpu_helper_cublas_init();
 
-	unsigned long nelems = nblocks*entries_per_bock;
+	unsigned long nelems = nblocks*entries_per_block;
 	size_t size = nelems*sizeof(float);
 
-	x = malloc(size);
-	y = malloc(size);
+	x = (float *) malloc(size);
+	y = (float *) malloc(size);
 
-	x_handles = calloc(nblocks, sizeof(starpu_data_handle));
-	y_handles = calloc(nblocks, sizeof(starpu_data_handle));
+	x_handles = (starpu_data_handle_t *) calloc(nblocks, sizeof(starpu_data_handle_t));
+	y_handles = (starpu_data_handle_t *) calloc(nblocks, sizeof(starpu_data_handle_t));
 
 	assert(x && y);
 
         starpu_srand48(0);
-	
+
 	DOT_TYPE reference_dot = 0.0;
 
 	unsigned long i;
@@ -176,15 +357,15 @@ int main(int argc, char **argv)
 		y[i] = (float)starpu_drand48();
 
 		reference_dot += (DOT_TYPE)x[i]*(DOT_TYPE)y[i];
-	} 
-	
+	}
+
 	unsigned block;
 	for (block = 0; block < nblocks; block++)
 	{
 		starpu_vector_data_register(&x_handles[block], 0,
-			(uintptr_t)&x[entries_per_bock*block], entries_per_bock, sizeof(float));
+			(uintptr_t)&x[entries_per_block*block], entries_per_block, sizeof(float));
 		starpu_vector_data_register(&y_handles[block], 0,
-			(uintptr_t)&y[entries_per_bock*block], entries_per_bock, sizeof(float));
+			(uintptr_t)&y[entries_per_block*block], entries_per_block, sizeof(float));
 	}
 
 	starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(DOT_TYPE));
@@ -199,25 +380,47 @@ int main(int argc, char **argv)
 		struct starpu_task *task = starpu_task_create();
 
 		task->cl = &dot_codelet;
+		task->destroy = 1;
 
-		task->buffers[0].handle = x_handles[block];
-		task->buffers[0].mode = STARPU_R;
-		task->buffers[1].handle = y_handles[block];
-		task->buffers[1].mode = STARPU_R;
-		task->buffers[2].handle = dot_handle;
-		task->buffers[2].mode = STARPU_REDUX;
+		task->handles[0] = x_handles[block];
+		task->handles[1] = y_handles[block];
+		task->handles[2] = dot_handle;
 
-		int ret = starpu_task_submit(task);
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV) goto enodev;
 		STARPU_ASSERT(!ret);
 	}
 
+	for (block = 0; block < nblocks; block++)
+	{
+		starpu_data_unregister(x_handles[block]);
+		starpu_data_unregister(y_handles[block]);
+	}
 	starpu_data_unregister(dot_handle);
 
-	fprintf(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
+	FPRINTF(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
 
 	starpu_helper_cublas_shutdown();
 
+#ifdef STARPU_USE_OPENCL
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
 	starpu_shutdown();
 
-	return 0;
+	free(x);
+	free(y);
+	free(x_handles);
+	free(y_handles);
+
+	if (fabs(reference_dot - dot) < reference_dot * 1e-6)
+		return EXIT_SUCCESS;
+	else
+		return EXIT_FAILURE;
+
+enodev:
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	return 77;
 }
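
The rewritten dot product funnels every per-block contribution into dot_handle through the STARPU_REDUX access mode: init_codelet provides the neutral element, redux_codelet combines two partial sums, and dot_codelet.modes marks the third buffer as STARPU_REDUX. A condensed sketch of that wiring with hypothetical acc_* names; starpu_data_set_reduction_methods() is assumed to be the call that binds the two codelets to the handle (it is not visible in the hunk above):

#include <starpu.h>

static double acc = 0.0;

static void acc_init_cpu(void *descr[], void *cl_arg)
{
	double *a = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
	*a = 0.0;					/* neutral element */
}

static void acc_redux_cpu(void *descr[], void *cl_arg)
{
	double *a = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
	double *b = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
	*a += *b;					/* combine two partial sums */
}

static struct starpu_codelet acc_init_cl  = { .cpu_funcs = {acc_init_cpu, NULL},  .nbuffers = 1 };
static struct starpu_codelet acc_redux_cl = { .cpu_funcs = {acc_redux_cpu, NULL}, .nbuffers = 2 };

static void setup_reduction(starpu_data_handle_t *handle)
{
	starpu_variable_data_register(handle, 0, (uintptr_t)&acc, sizeof(acc));
	starpu_data_set_reduction_methods(*handle, &acc_redux_cl, &acc_init_cl);
	/* Tasks then list *handle in task->handles[] under the STARPU_REDUX mode,
	 * exactly as dot_codelet.modes[2] declares above. */
}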

+ 1 - 1
examples/reductions/minmax_reduction.c

@@ -19,7 +19,7 @@
 #include <limits.h>
 #include <starpu.h>
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 static unsigned nblocks = 512;
 static unsigned entries_per_bock = 64;
 #else

+ 11 - 7
examples/stencil/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010-2011  Université de Bordeaux 1
+# Copyright (C) 2010-2012  Université de Bordeaux 1
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -13,13 +13,14 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-AM_CFLAGS = $(HWLOC_CFLAGS)
-LIBS = $(top_builddir)/src/libstarpu.la $(HWLOC_LIBS) @LIBS@
+AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) 
 
 if USE_MPI
-LIBS += $(top_builddir)/mpi/libstarpumpi.la
-AM_CPPFLAGS += -I$(top_srcdir)/mpi/
+LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+AM_CPPFLAGS += -I$(top_srcdir)/mpi/include
 endif
 
 CC = $(CC_OR_MPICC)
@@ -34,7 +35,7 @@ NVCCFLAGS += $(HWLOC_CFLAGS)
 
 .cu.o:
 	$(MKDIR_P) `dirname $@`
-	$(NVCC) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I$(top_srcdir)/include/ -I$(top_builddir)/include/
+	$(NVCC) $< -c -o $@ --compiler-options -fno-strict-aliasing -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(NVCCFLAGS)
 
 
 endif
@@ -46,7 +47,7 @@ endif
 check_PROGRAMS =				\
 	stencil
 
-examplebindir = $(libdir)/starpu/examples/
+examplebindir = $(libdir)/starpu/examples/stencil
 
 examplebin_PROGRAMS =				\
 	stencil
@@ -112,3 +113,6 @@ CLEANFILES = *.xpm
 
 view:
 	feh --zoom 800 -F 0.xpm 0.5.xpm 1.xpm 2.xpm 3.xpm 4.xpm 6.xpm mpi.xpm
+
+showcheck:
+	-cat $(TEST_LOGS) /dev/null

+ 103 - 63
examples/stencil/stencil-kernels.c

@@ -1,6 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * the Free Software Foundation; either version 2.1 of the License, or (at
@@ -16,17 +18,14 @@
 #include "stencil.h"
 #include <sys/time.h>
 
-#ifdef STARPU_USE_OPENCL
-#include <CL/cl.h>
-#include <starpu_opencl.h>
-#endif
-
 #ifndef timersub
 #define	timersub(x, y, res) \
-	do { \
+	do \
+	{						   \
 		(res)->tv_sec = (x)->tv_sec - (y)->tv_sec; \
 		(res)->tv_usec = (x)->tv_usec - (y)->tv_usec; \
-		if ((res)->tv_usec < 0) { \
+		if ((res)->tv_usec < 0) \
+		{			 \
 			(res)->tv_sec--; \
 			(res)->tv_usec += 1000000; \
 		} \
@@ -34,10 +33,12 @@
 #endif
 #ifndef timeradd
 #define	timeradd(x, y, res) \
-	do { \
+	do \
+	{						   \
 		(res)->tv_sec = (x)->tv_sec + (y)->tv_sec; \
 		(res)->tv_usec = (x)->tv_usec + (y)->tv_usec; \
-		if ((res)->tv_usec >= 1000000) { \
+		if ((res)->tv_usec >= 1000000) \
+		{			       \
 			(res)->tv_sec++; \
 			(res)->tv_usec -= 1000000; \
 		} \
@@ -124,6 +125,9 @@ int *who_runs_what;
 int *who_runs_what_index;
 struct timeval *last_tick;
 
+/* Achieved iterations */
+static int achieved_iter;
+
 /* Record how many updates each worker performed */
 unsigned update_per_worker[STARPU_NMAXWORKERS];
 
@@ -135,7 +139,8 @@ static void record_who_runs_what(struct block_description *block)
 	gettimeofday(&tv, NULL);
 	timersub(&tv, &start, &tv2);
 	timersub(&tv2, &last_tick[block->bz], &diff);
-	while (timercmp(&diff, &delta, >=)) {
+	while (timercmp(&diff, &delta, >=))
+	{
 		timeradd(&last_tick[block->bz], &delta, &last_tick[block->bz]);
 		timersub(&tv2, &last_tick[block->bz], &diff);
 		if (who_runs_what_index[block->bz] < who_runs_what_len)
@@ -146,7 +151,7 @@ static void record_who_runs_what(struct block_description *block)
 		who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid);
 }
 
-static void check_load(starpu_block_interface_t *block, starpu_block_interface_t *boundary)
+static void check_load(struct starpu_block_interface *block, struct starpu_block_interface *boundary)
 {
 	/* Sanity checks */
 	STARPU_ASSERT(block->nx == boundary->nx);
@@ -162,10 +167,12 @@ static void check_load(starpu_block_interface_t *block, starpu_block_interface_t
 /*
  * Load a neighbour's boundary into block, CPU version
  */
-static void load_subblock_from_buffer_cpu(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_from_buffer_cpu(void *_block,
+					void *_boundary,
 					unsigned firstz)
 {
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
 	check_load(block, boundary);
 
 	/* We do a contiguous memory transfer */
@@ -181,10 +188,12 @@ static void load_subblock_from_buffer_cpu(starpu_block_interface_t *block,
  * Load a neighbour's boundary into block, CUDA version
  */
 #ifdef STARPU_USE_CUDA
-static void load_subblock_from_buffer_cuda(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_from_buffer_cuda(void *_block,
+					void *_boundary,
 					unsigned firstz)
 {
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
 	check_load(block, boundary);
 
 	/* We do a contiguous memory transfer */
@@ -241,17 +250,17 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
 
 	for (i=1; i<=K; i++)
 	{
-		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
+		struct starpu_block_interface *oldb = descr[i%2], *newb = descr[(i+1)%2];
+		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
 
 		/* Shadow data */
 		cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 
 		/* And perform actual computation */
 #ifdef LIFE
-		cuda_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+		cuda_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 #else
-		cudaMemcpyAsync(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+		cudaMemcpyAsync(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
 #endif /* LIFE */
 	}
 
@@ -259,6 +268,8 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
 	if ((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(cures);
 
+	if (block->bz == 0)
+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
 }
 #endif /* STARPU_USE_CUDA */
 
@@ -266,8 +277,8 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
  * Load a neighbour's boundary into block, OpenCL version
  */
 #ifdef STARPU_USE_OPENCL
-static void load_subblock_from_buffer_opencl(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_from_buffer_opencl(struct starpu_block_interface *block,
+					struct starpu_block_interface *boundary,
 					unsigned firstz)
 {
 	check_load(block, boundary);
@@ -278,10 +289,14 @@ static void load_subblock_from_buffer_opencl(starpu_block_interface_t *block,
 	unsigned offset = firstz*block->ldz;
 	cl_mem block_data = (cl_mem)block->ptr;
 	cl_mem boundary_data = (cl_mem)boundary->ptr;
+	cl_event event;
 
         cl_command_queue cq;
         starpu_opencl_get_current_queue(&cq);
-        clEnqueueCopyBuffer(cq, boundary_data, block_data, 0, offset, boundary_size, 0, NULL, NULL);
+        clEnqueueCopyBuffer(cq, boundary_data, block_data, 0, offset, boundary_size, 0, NULL, &event);
+
+	clWaitForEvents(1, &event);
+	clReleaseEvent(event);
 }
 
 /*
@@ -332,17 +347,20 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
 
 	for (i=1; i<=K; i++)
 	{
-		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
+		struct starpu_block_interface *oldb = descr[i%2], *newb = descr[(i+1)%2];
+		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
 
 		/* Shadow data */
 		opencl_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 
 		/* And perform actual computation */
 #ifdef LIFE
-		opencl_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+		opencl_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 #else
-                clEnqueueCopyBuffer(cq, old, new, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), 0, NULL, NULL);
+		cl_event event;
+                clEnqueueCopyBuffer(cq, old, newer, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), 0, NULL, &event);
+		clWaitForEvents(1, &event);
+		clReleaseEvent(event);
 #endif /* LIFE */
 	}
 
@@ -350,6 +368,8 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
 	if ((err = clFinish(cq)))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
+	if (block->bz == 0)
+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
 }
 #endif /* STARPU_USE_OPENCL */
 
@@ -358,7 +378,7 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
  */
 static void update_func_cpu(void *descr[], void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
 	if (block->bz == 0)
@@ -398,8 +418,8 @@ fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 
 	for (i=1; i<=K; i++)
 	{
-		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
+		struct starpu_block_interface *oldb = (struct starpu_block_interface *) descr[i%2], *newb = (struct starpu_block_interface *) descr[(i+1)%2];
+		TYPE *old = (TYPE*) oldb->ptr, *newer = (TYPE*) newb->ptr;
 
 		/* Shadow data */
 		unsigned ldy = oldb->ldy, ldz = oldb->ldz;
@@ -417,20 +437,25 @@ fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 
 		/* And perform actual computation */
 #ifdef LIFE
-		life_update(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+		life_update(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 #else
-		memcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new));
+		memcpy(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer));
 #endif /* LIFE */
 	}
+
+	if (block->bz == 0)
+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
 }
 
 /* Performance model and codelet structure */
-static struct starpu_perfmodel_t cl_update_model = {
+static struct starpu_perfmodel cl_update_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "cl_update" 
 };
 
-starpu_codelet cl_update = {
+struct starpu_codelet cl_update =
+{
 	.where = 0 |
 #ifdef STARPU_USE_CUDA
 		STARPU_CUDA|
@@ -439,15 +464,16 @@ starpu_codelet cl_update = {
                 STARPU_OPENCL|
 #endif
 		STARPU_CPU,
-	.cpu_func = update_func_cpu,
+	.cpu_funcs = {update_func_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = update_func_cuda,
+	.cuda_funcs = {update_func_cuda, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = update_func_opencl,
+	.opencl_funcs = {update_func_opencl, NULL},
 #endif
 	.model = &cl_update_model,
-	.nbuffers = 6
+	.nbuffers = 6,
+	.modes = {STARPU_RW, STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
 };
 
 /*
@@ -455,10 +481,12 @@ starpu_codelet cl_update = {
  */
 
 /* CPU version */
-static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_into_buffer_cpu(void *_block,
+					void *_boundary,
 					unsigned firstz)
 {
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
 	check_load(block, boundary);
 
 	/* We do a contiguous memory transfer */
@@ -472,10 +500,12 @@ static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
 
 /* CUDA version */
 #ifdef STARPU_USE_CUDA
-static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_into_buffer_cuda(void *_block,
+					void *_boundary,
 					unsigned firstz)
 {
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
 	check_load(block, boundary);
 
 	/* We do a contiguous memory transfer */
@@ -490,8 +520,8 @@ static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
 
 /* OPENCL version */
 #ifdef STARPU_USE_OPENCL
-static void load_subblock_into_buffer_opencl(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_into_buffer_opencl(struct starpu_block_interface *block,
+					struct starpu_block_interface *boundary,
 					unsigned firstz)
 {
 	check_load(block, boundary);
@@ -505,8 +535,12 @@ static void load_subblock_into_buffer_opencl(starpu_block_interface_t *block,
 
         cl_command_queue cq;
         starpu_opencl_get_current_queue(&cq);
+	cl_event event;
+
+        clEnqueueCopyBuffer(cq, block_data, boundary_data, offset, 0, boundary_size, 0, NULL, &event);
 
-        clEnqueueCopyBuffer(cq, block_data, boundary_data, offset, 0, boundary_size, 0, NULL, NULL);
+	clWaitForEvents(1, &event);
+	clReleaseEvent(event);
 }
 #endif /* STARPU_USE_OPENCL */
 
@@ -517,7 +551,7 @@ unsigned bottom_per_worker[STARPU_NMAXWORKERS];
 /* top save, CPU version */
 static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	top_per_worker[workerid]++;
 
@@ -533,7 +567,7 @@ static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
 /* bottom save, CPU version */
 static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	bottom_per_worker[workerid]++;
 
@@ -547,7 +581,7 @@ static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *a
 #ifdef STARPU_USE_CUDA
 static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	top_per_worker[workerid]++;
 
@@ -564,7 +598,7 @@ static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg
 /* bottom save, CUDA version */
 static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	bottom_per_worker[workerid]++;
 
@@ -580,7 +614,7 @@ static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *
 #ifdef STARPU_USE_OPENCL
 static void dummy_func_top_opencl(void *descr[] __attribute__((unused)), void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	top_per_worker[workerid]++;
 
@@ -600,7 +634,7 @@ static void dummy_func_top_opencl(void *descr[] __attribute__((unused)), void *a
 /* bottom save, OPENCL version */
 static void dummy_func_bottom_opencl(void *descr[] __attribute__((unused)), void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	bottom_per_worker[workerid]++;
 
@@ -616,17 +650,20 @@ static void dummy_func_bottom_opencl(void *descr[] __attribute__((unused)), void
 #endif /* STARPU_USE_OPENCL */
 
 /* Performance models and codelet for save */
-static struct starpu_perfmodel_t save_cl_bottom_model = {
+static struct starpu_perfmodel save_cl_bottom_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "save_cl_bottom" 
 };
 
-static struct starpu_perfmodel_t save_cl_top_model = {
+static struct starpu_perfmodel save_cl_top_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "save_cl_top" 
 };
 
-starpu_codelet save_cl_bottom = {
+struct starpu_codelet save_cl_bottom =
+{
 	.where = 0 |
 #ifdef STARPU_USE_CUDA
 		STARPU_CUDA|
@@ -635,18 +672,20 @@ starpu_codelet save_cl_bottom = {
 		STARPU_OPENCL|
 #endif
 		STARPU_CPU,
-	.cpu_func = dummy_func_bottom_cpu,
+	.cpu_funcs = {dummy_func_bottom_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = dummy_func_bottom_cuda,
+	.cuda_funcs = {dummy_func_bottom_cuda, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = dummy_func_bottom_opencl,
+	.opencl_funcs = {dummy_func_bottom_opencl, NULL},
 #endif
 	.model = &save_cl_bottom_model,
-	.nbuffers = 4
+	.nbuffers = 4,
+	.modes = {STARPU_R, STARPU_R, STARPU_W, STARPU_W}
 };
 
-starpu_codelet save_cl_top = {
+struct starpu_codelet save_cl_top =
+{
 	.where = 0|
 #ifdef STARPU_USE_CUDA
 		STARPU_CUDA|
@@ -655,13 +694,14 @@ starpu_codelet save_cl_top = {
 		STARPU_OPENCL|
 #endif
 		STARPU_CPU,
-	.cpu_func = dummy_func_top_cpu,
+	.cpu_funcs = {dummy_func_top_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = dummy_func_top_cuda,
+	.cuda_funcs = {dummy_func_top_cuda, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = dummy_func_top_opencl,
+	.opencl_funcs = {dummy_func_top_opencl, NULL},
 #endif
 	.model = &save_cl_top_model,
-	.nbuffers = 4
+	.nbuffers = 4,
+	.modes = {STARPU_R, STARPU_R, STARPU_W, STARPU_W}
 };

+ 47 - 49
examples/stencil/stencil-tasks.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -48,17 +49,13 @@ static void create_task_save_local(unsigned iter, unsigned z, int dir, unsigned
 	save_task->cl_arg = descr;
 
 	/* Saving our border... */
-	save_task->buffers[0].handle = descr->layers_handle[0];
-	save_task->buffers[0].mode = STARPU_R;
-	save_task->buffers[1].handle = descr->layers_handle[1];
-	save_task->buffers[1].mode = STARPU_R;
+	save_task->handles[0] = descr->layers_handle[0];
+	save_task->handles[1] = descr->layers_handle[1];
 
 	/* ... to the neighbour's copy */
 	struct block_description *neighbour = descr->boundary_blocks[(1+dir)/2];
-	save_task->buffers[2].handle = neighbour->boundaries_handle[(1-dir)/2][0];
-	save_task->buffers[2].mode = STARPU_W;
-	save_task->buffers[3].handle = neighbour->boundaries_handle[(1-dir)/2][1];
-	save_task->buffers[3].mode = STARPU_W;
+	save_task->handles[2] = neighbour->boundaries_handle[(1-dir)/2][0];
+	save_task->handles[3] = neighbour->boundaries_handle[(1-dir)/2][1];
 
 	/* Bind */
 	if (iter <= BIND_LAST)
@@ -69,14 +66,15 @@ static void create_task_save_local(unsigned iter, unsigned z, int dir, unsigned
 	if (ret)
 	{
 		fprintf(stderr, "Could not submit task save: %d\n", ret);
-		STARPU_ASSERT(0);
+		STARPU_ABORT();
 	}
 }
 
 /* R(z) = local & R(z+d) != local */
 /* We need to send our save over MPI */
 
-static void send_done(void *arg) {
+static void send_done(void *arg)
+{
 	uintptr_t z = (uintptr_t) arg;
 	DEBUG("DO SEND %d\n", (int)z);
 }
@@ -93,8 +91,8 @@ static void create_task_save_mpi_send(unsigned iter, unsigned z, int dir, unsign
 	STARPU_ASSERT(neighbour->mpi_node != local_rank);
 
 	/* Send neighbour's border copy to the neighbour */
-	starpu_data_handle handle0 = neighbour->boundaries_handle[(1-dir)/2][0];
-	starpu_data_handle handle1 = neighbour->boundaries_handle[(1-dir)/2][1];
+	starpu_data_handle_t handle0 = neighbour->boundaries_handle[(1-dir)/2][0];
+	starpu_data_handle_t handle1 = neighbour->boundaries_handle[(1-dir)/2][1];
 
 	starpu_mpi_isend_detached(handle0, dest, MPI_TAG0(z, iter, dir), MPI_COMM_WORLD, send_done, (void*)(uintptr_t)z);
 	starpu_mpi_isend_detached(handle1, dest, MPI_TAG1(z, iter, dir), MPI_COMM_WORLD, send_done, (void*)(uintptr_t)z);
@@ -103,7 +101,8 @@ static void create_task_save_mpi_send(unsigned iter, unsigned z, int dir, unsign
 /* R(z) != local & R(z+d) = local */
 /* We need to receive over MPI */
 
-static void recv_done(void *arg) {
+static void recv_done(void *arg)
+{
 	uintptr_t z = (uintptr_t) arg;
 	DEBUG("DO RECV %d\n", (int)z);
 }
@@ -119,13 +118,13 @@ static void create_task_save_mpi_recv(unsigned iter, unsigned z, int dir, unsign
 	STARPU_ASSERT(neighbour->mpi_node == local_rank);
 
 	/* Receive our neighbour's border in our neighbour copy */
-	starpu_data_handle handle0 = neighbour->boundaries_handle[(1-dir)/2][0];
-	starpu_data_handle handle1 = neighbour->boundaries_handle[(1-dir)/2][1];
+	starpu_data_handle_t handle0 = neighbour->boundaries_handle[(1-dir)/2][0];
+	starpu_data_handle_t handle1 = neighbour->boundaries_handle[(1-dir)/2][1];
 
 	starpu_mpi_irecv_detached(handle0, source, MPI_TAG0(z, iter, dir), MPI_COMM_WORLD, recv_done, (void*)(uintptr_t)z);
 	starpu_mpi_irecv_detached(handle1, source, MPI_TAG1(z, iter, dir), MPI_COMM_WORLD, recv_done, (void*)(uintptr_t)z);
 }
-#endif // STARPU_USE_MPI
+#endif /* STARPU_USE_MPI */
 
 /*
  * Schedule saving boundaries of blocks to communication buffers
@@ -141,26 +140,28 @@ void create_task_save(unsigned iter, unsigned z, int dir, unsigned local_rank)
 		/* Save data from update */
 		create_task_save_local(iter, z, dir, local_rank);
 		if (node_z_and_d != local_rank)
-		{ // R(z) = local & R(z+d) != local, We have to send the data
+		{ /* R(z) = local & R(z+d) != local, We have to send the data */
 			create_task_save_mpi_send(iter, z, dir, local_rank);
 		}
 
 	}
-	else {	// node_z != local_rank, this MPI node doesn't have the saved data
+	else
+	{	/* node_z != local_rank, this MPI node doesn't have the saved data */
 		if (node_z_and_d == local_rank)
 		{
 			create_task_save_mpi_recv(iter, z, dir, local_rank);
 		}
-		else {  // R(z) != local & R(z+d) != local We don't have
-			// the saved data and don't need it, we shouldn't
-			// even have been called!
-			STARPU_ASSERT(0);
+		else
+		{ /* R(z) != local & R(z+d) != local We don't have
+			      the saved data and don't need it, we shouldn't
+			      even have been called! */
+			STARPU_ABORT();
 		}
 	}
-#else // !STARPU_USE_MPI
+#else /* !STARPU_USE_MPI */
 	STARPU_ASSERT((node_z == local_rank) && (node_z_and_d == local_rank));
 	create_task_save_local(iter, z, dir, local_rank);
-#endif // STARPU_USE_MPI
+#endif /* STARPU_USE_MPI */
 }
 
 /*
@@ -176,7 +177,8 @@ void create_task_update(unsigned iter, unsigned z, unsigned local_rank)
 	unsigned niter = get_niter();
 
 	/* We are going to synchronize with the last tasks */
-	if (iter == niter) {
+	if (iter == niter)
+	{
 		task->detach = 0;
 		task->use_tag = 1;
 		task->tag_id = TAG_FINISH(z);
@@ -186,20 +188,14 @@ void create_task_update(unsigned iter, unsigned z, unsigned local_rank)
 	unsigned new_layer = (old_layer + 1) % 2;
 
 	struct block_description *descr = get_block_description(z);
-	task->buffers[0].handle = descr->layers_handle[new_layer];
-	task->buffers[0].mode = STARPU_RW;
-	task->buffers[1].handle = descr->layers_handle[old_layer];
-	task->buffers[1].mode = STARPU_RW;
+	task->handles[0] = descr->layers_handle[new_layer];
+	task->handles[1] = descr->layers_handle[old_layer];
 
-	task->buffers[2].handle = descr->boundaries_handle[T][new_layer];
-	task->buffers[2].mode = STARPU_R;
-	task->buffers[3].handle = descr->boundaries_handle[T][old_layer];
-	task->buffers[3].mode = STARPU_R;
+	task->handles[2] = descr->boundaries_handle[T][new_layer];
+	task->handles[3] = descr->boundaries_handle[T][old_layer];
 
-	task->buffers[4].handle = descr->boundaries_handle[B][new_layer];
-	task->buffers[4].mode = STARPU_R;
-	task->buffers[5].handle = descr->boundaries_handle[B][old_layer];
-	task->buffers[5].mode = STARPU_R;
+	task->handles[4] = descr->boundaries_handle[B][new_layer];
+	task->handles[5] = descr->boundaries_handle[B][old_layer];
 
 	task->cl = &cl_update;
 	task->cl_arg = descr;
@@ -212,21 +208,24 @@ void create_task_update(unsigned iter, unsigned z, unsigned local_rank)
 	if (ret)
 	{
 		fprintf(stderr, "Could not submit task update block: %d\n", ret);
-		STARPU_ASSERT(0);
+		STARPU_ABORT();
 	}
 }
 
 /* Dummy empty codelet taking one buffer */
 static void null_func(void *descr[] __attribute__((unused)), void *arg __attribute__((unused))) { }
-static starpu_codelet null = {
+static struct starpu_codelet null =
+{
+	.modes = { STARPU_W, STARPU_W },
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
-	.cpu_func = null_func,
-	.cuda_func = null_func,
-	.opencl_func = null_func,
+	.cpu_funcs = {null_func, NULL},
+	.cuda_funcs = {null_func, NULL},
+	.opencl_funcs = {null_func, NULL},
 	.nbuffers = 2
 };
 
-void create_start_task(int z, int dir) {
+void create_start_task(int z, int dir)
+{
 	/* Dumb task depending on the init task and simulating writing the
 	   neighbour buffers, to avoid communications and computation running
 	   before we start measuring time */
@@ -236,17 +235,15 @@ void create_start_task(int z, int dir) {
 	wait_init->cl = &null;
 	wait_init->use_tag = 1;
 	wait_init->tag_id = TAG_START(z, dir);
-	wait_init->buffers[0].handle = descr->boundaries_handle[(1+dir)/2][0];
-	wait_init->buffers[0].mode = STARPU_W;
-	wait_init->buffers[1].handle = descr->boundaries_handle[(1+dir)/2][1];
-	wait_init->buffers[1].mode = STARPU_W;
+	wait_init->handles[0] = descr->boundaries_handle[(1 + dir) / 2][0];
+	wait_init->handles[1] = descr->boundaries_handle[(1 + dir) / 2][1];
 	starpu_tag_declare_deps_array(wait_init->tag_id, 1, &tag_init);
 
 	int ret = starpu_task_submit(wait_init);
 	if (ret)
 	{
 		fprintf(stderr, "Could not submit task initial wait: %d\n", ret);
-		STARPU_ASSERT(0);
+		STARPU_ABORT();
 	}
 }
 
@@ -261,7 +258,8 @@ void create_tasks(int rank)
 	int niter = get_niter();
 	int nbz = get_nbz();
 
-	for (bz = 0; bz < nbz; bz++) {
+	for (bz = 0; bz < nbz; bz++)
+	{
 		if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz+1) == rank))
 			create_start_task(bz, +1);
 		if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz-1) == rank))
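
The task-side counterpart of the buffers-to-handles migration in this file, as a minimal sketch; cl_copy, src_handle and dst_handle are hypothetical names, not taken from stencil-tasks.c.

	#include <stdio.h>
	#include <starpu.h>

	/* Assumed to be defined elsewhere: a two-buffer codelet whose .modes
	 * is {STARPU_R, STARPU_W}, and two registered data handles (note the
	 * 1.0 typedef starpu_data_handle_t). */
	extern struct starpu_codelet cl_copy;
	extern starpu_data_handle_t src_handle, dst_handle;

	static void submit_copy_task(void)
	{
		struct starpu_task *task = starpu_task_create();

		task->cl = &cl_copy;
		/* Tasks now only list handles; the access mode of each slot
		 * comes from the codelet's .modes array. */
		task->handles[0] = src_handle;
		task->handles[1] = dst_handle;

		int ret = starpu_task_submit(task);
		if (ret)
		{
			fprintf(stderr, "Could not submit copy task: %d\n", ret);
			STARPU_ABORT();
		}
	}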

+ 1 - 1
examples/tag_example/tag_example.c

@@ -43,7 +43,7 @@
 
 struct starpu_codelet cl = {};
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 #define Ni	32
 #define Nj	32
 #define Nk	32

+ 1 - 1
examples/tag_example/tag_example2.c

@@ -120,7 +120,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 		exit(77);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	ni /= 4;
 	nk /= 16;
 #endif

+ 1 - 1
examples/tag_example/tag_example3.c

@@ -122,7 +122,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 		exit(77);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	ni /= 4;
 	nk /= 16;
 #endif

+ 1 - 1
examples/tag_example/tag_restartable.c

@@ -129,7 +129,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	ni /= 4;
 	nk /= 16;
 #endif

+ 1 - 1
tests/microbenchs/prefetch_data_on_node.c

@@ -23,7 +23,7 @@
 #include <pthread.h>
 #include "../helper.h"
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 #define N		100
 #else
 #define N		1000

+ 1 - 1
tests/microbenchs/sync_tasks_overhead.c

@@ -93,7 +93,7 @@ int main(int argc, char **argv)
 	struct timeval start;
 	struct timeval end;
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	ntasks = 128;
 #endif
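
All of the STARPU_SLOW_MACHINE to STARPU_QUICK_CHECK hunks above follow the same pattern; a self-contained sketch of how such a test typically shrinks its workload when the macro is defined (the default iteration count and the divisor are illustrative assumptions):

	#include <errno.h>
	#include <stdlib.h>
	#include <starpu.h>

	static unsigned niter = 4096;	/* illustrative default workload */

	int main(int argc, char **argv)
	{
		int ret = starpu_init(NULL);
		if (ret == -ENODEV)
			return 77;	/* no device available: skip the test */
		STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	#ifdef STARPU_QUICK_CHECK
		/* Defined when StarPU is configured for quick test runs: cut
		 * the amount of work so the test finishes quickly. */
		niter /= 100;
	#endif
		if (argc == 2)
			niter = atoi(argv[1]);

		/* ... run niter iterations of the benchmark here ... */

		starpu_shutdown();
		return 0;
	}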