13 年前 · 0ca1520d7f
--- a/sched_ctx_hypervisor/examples/cholesky/cholesky.h
+++ b/sched_ctx_hypervisor/examples/cholesky/cholesky.h
@@ -30,6 +30,7 @@
 
				 
			
 
				 #include <common/blas.h>
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_bound.h>
			
 
				 
			
 
				 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				 #define NMAXBLOCKS	32
			
@@ -61,6 +62,7 @@ static unsigned nbigblocks = 8;
 
				 static unsigned pinned = 0;
			
 
				 static unsigned noprio = 0;
			
 
				 static unsigned check = 0;
			
 
				+static unsigned bound = 0;
			
 
				 static unsigned with_ctxs = 0;
			
 
				 static unsigned with_noctxs = 0;
			
 
				 static unsigned chole1 = 0;
			
@@ -76,64 +78,81 @@ void chol_cublas_codelet_update_u21(void *descr[], void *_args);
 
				 void chol_cublas_codelet_update_u22(void *descr[], void *_args);
			
 
				 #endif
			
 
				 
			
 
				-extern struct starpu_perfmodel_t chol_model_11;
			
 
				-extern struct starpu_perfmodel_t chol_model_21;
			
 
				-extern struct starpu_perfmodel_t chol_model_22;
			
 
				+extern struct starpu_perfmodel chol_model_11;
			
 
				+extern struct starpu_perfmodel chol_model_21;
			
 
				+extern struct starpu_perfmodel chol_model_22;
			
 
				 
			
 
				 static void __attribute__((unused)) parse_args(int argc, char **argv)
			
 
				 {
			
 
				 	int i;
			
 
				-	for (i = 1; i < argc; i++) {
			
 
				-		if (strcmp(argv[i], "-with_ctxs") == 0) {
			
 
				+	for (i = 1; i < argc; i++)
			
 
				+	{
			
 
				+		if (strcmp(argv[i], "-with_ctxs") == 0) 
			
 
				+		{
			
 
				 			with_ctxs = 1;
			
 
				 			break;
			
 
				 		}
			
 
				-		if (strcmp(argv[i], "-with_noctxs") == 0) {
			
 
				+		if (strcmp(argv[i], "-with_noctxs") == 0) 
			
 
				+		{
			
 
				 			with_noctxs = 1;
			
 
				 			break;
			
 
				 		}
			
 
				 		
			
 
				-		if (strcmp(argv[i], "-chole1") == 0) {
			
 
				+		if (strcmp(argv[i], "-chole1") == 0) 
			
 
				+		{
			
 
				 			chole1 = 1;
			
 
				 			break;
			
 
				 		}
			
 
				 
			
 
				-		if (strcmp(argv[i], "-chole2") == 0) {
			
 
				+		if (strcmp(argv[i], "-chole2") == 0) 
			
 
				+		{
			
 
				 			chole2 = 1;
			
 
				 			break;
			
 
				 		}
			
 
				 
			
 
				-		if (strcmp(argv[i], "-size") == 0) {
			
 
				-			char *argptr;
			
 
				+		if (strcmp(argv[i], "-size") == 0)
			
 
				+		{
			
 
				+		        char *argptr;
			
 
				 			size = strtol(argv[++i], &argptr, 10);
			
 
				 		}
			
 
				-		
			
 
				-		if (strcmp(argv[i], "-nblocks") == 0) {
			
 
				-			char *argptr;
			
 
				+
			
 
				+		if (strcmp(argv[i], "-nblocks") == 0)
			
 
				+		{
			
 
				+		        char *argptr;
			
 
				 			nblocks = strtol(argv[++i], &argptr, 10);
			
 
				 		}
			
 
				-		
			
 
				-		if (strcmp(argv[i], "-nbigblocks") == 0) {
			
 
				-			char *argptr;
			
 
				+
			
 
				+		if (strcmp(argv[i], "-nbigblocks") == 0)
			
 
				+		{
			
 
				+		        char *argptr;
			
 
				 			nbigblocks = strtol(argv[++i], &argptr, 10);
			
 
				 		}
			
 
				-		
			
 
				-		if (strcmp(argv[i], "-pin") == 0) {
			
 
				+
			
 
				+		if (strcmp(argv[i], "-pin") == 0)
			
 
				+		{
			
 
				 			pinned = 1;
			
 
				 		}
			
 
				-		
			
 
				-		if (strcmp(argv[i], "-no-prio") == 0) {
			
 
				+
			
 
				+		if (strcmp(argv[i], "-no-prio") == 0)
			
 
				+		{
			
 
				 			noprio = 1;
			
 
				 		}
			
 
				-		
			
 
				-		if (strcmp(argv[i], "-check") == 0) {
			
 
				+
			
 
				+		if (strcmp(argv[i], "-bound") == 0)
			
 
				+		{
			
 
				+			bound = 1;
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-check") == 0)
			
 
				+		{
			
 
				 			check = 1;
			
 
				 		}
			
 
				-		
			
 
				-		if (strcmp(argv[i], "-h") == 0) {
			
 
				+
			
 
				+		if (strcmp(argv[i], "-h") == 0)
			
 
				+		{
			
 
				 			printf("usage : %s [-pin] [-size size] [-nblocks nblocks] [-check]\n", argv[0]);
			
 
				-		}	
			
 
				-	}	
			
 
				+		}
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 #endif /* __DW_CHOLESKY_H__ */
			
--- a/sched_ctx_hypervisor/examples/cholesky/cholesky_grain_tag.c
+++ b/sched_ctx_hypervisor/examples/cholesky/cholesky_grain_tag.c
@@ -1,8 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -36,91 +36,100 @@ static struct starpu_task *create_task(starpu_tag_t id)
 
				  *	Create the codelets
			
 
				  */
			
 
				 
			
 
				-static starpu_codelet cl11 =
			
 
				+static struct starpu_codelet cl11 =
			
 
				 {
			
 
				+	.modes = { STARPU_RW },
			
 
				 	.where = STARPU_CPU|STARPU_CUDA,
			
 
				-	.cpu_func = chol_cpu_codelet_update_u11,
			
 
				+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	.cuda_func = chol_cublas_codelet_update_u11,
			
 
				+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
			
 
				 #endif
			
 
				 	.nbuffers = 1,
			
 
				 	.model = &chol_model_11
			
 
				 };
			
 
				 
			
 
				-static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k, unsigned reclevel)
			
 
				+static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned k, unsigned reclevel)
			
 
				 {
			
 
				 /*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
			
 
				 
			
 
				 	struct starpu_task *task = create_task(TAG11_AUX(k, reclevel));
			
 
				-	
			
 
				+
			
 
				 	task->cl = &cl11;
			
 
				 
			
 
				 	/* which sub-data is manipulated ? */
			
 
				-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k);
			
 
				-	task->buffers[0].mode = STARPU_RW;
			
 
				+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
			
 
				 
			
 
				 	/* this is an important task */
			
 
				 	task->priority = STARPU_MAX_PRIO;
			
 
				 
			
 
				 	/* enforce dependencies ... */
			
 
				-	if (k > 0) {
			
 
				+	if (k > 0)
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG11_AUX(k, reclevel), 1, TAG22_AUX(k-1, k, k, reclevel));
			
 
				 	}
			
 
				 
			
 
				 	return task;
			
 
				 }
			
 
				 
			
 
				-static starpu_codelet cl21 =
			
 
				+static struct starpu_codelet cl21 =
			
 
				 {
			
 
				+	.modes = { STARPU_R, STARPU_RW },
			
 
				 	.where = STARPU_CPU|STARPU_CUDA,
			
 
				-	.cpu_func = chol_cpu_codelet_update_u21,
			
 
				+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	.cuda_func = chol_cublas_codelet_update_u21,
			
 
				+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
			
 
				 #endif
			
 
				 	.nbuffers = 2,
			
 
				 	.model = &chol_model_21
			
 
				 };
			
 
				 
			
 
				-static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j, unsigned reclevel)
			
 
				+static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, unsigned reclevel)
			
 
				 {
			
 
				+	int ret;
			
 
				+
			
 
				 	struct starpu_task *task = create_task(TAG21_AUX(k, j, reclevel));
			
 
				 
			
 
				-	task->cl = &cl21;	
			
 
				+	task->cl = &cl21;
			
 
				 
			
 
				 	/* which sub-data is manipulated ? */
			
 
				-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k); 
			
 
				-	task->buffers[0].mode = STARPU_R;
			
 
				-	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
			
 
				-	task->buffers[1].mode = STARPU_RW;
			
 
				+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
			
 
				+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
			
 
				 
			
 
				-	if (j == k+1) {
			
 
				+	if (j == k+1)
			
 
				+	{
			
 
				 		task->priority = STARPU_MAX_PRIO;
			
 
				 	}
			
 
				 
			
 
				 	/* enforce dependencies ... */
			
 
				-	if (k > 0) {
			
 
				+	if (k > 0)
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 2, TAG11_AUX(k, reclevel), TAG22_AUX(k-1, k, j, reclevel));
			
 
				 	}
			
 
				-	else {
			
 
				+	else
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 1, TAG11_AUX(k, reclevel));
			
 
				 	}
			
 
				 
			
 
				-	starpu_task_submit(task);
			
 
				+	ret = starpu_task_submit(task);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 }
			
 
				 
			
 
				-static starpu_codelet cl22 =
			
 
				+static struct starpu_codelet cl22 =
			
 
				 {
			
 
				+	.modes = { STARPU_R, STARPU_R, STARPU_RW },
			
 
				 	.where = STARPU_CPU|STARPU_CUDA,
			
 
				-	.cpu_func = chol_cpu_codelet_update_u22,
			
 
				+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	.cuda_func = chol_cublas_codelet_update_u22,
			
 
				+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
			
 
				 #endif
			
 
				 	.nbuffers = 3,
			
 
				 	.model = &chol_model_22
			
 
				 };
			
 
				 
			
 
				-static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
			
 
				+static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
			
 
				 {
			
 
				+	int ret;
			
 
				+
			
 
				 /*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j)); */
			
 
				 
			
 
				 	struct starpu_task *task = create_task(TAG22_AUX(k, i, j, reclevel));
			
@@ -128,44 +137,47 @@ static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, uns
 
				 	task->cl = &cl22;
			
 
				 
			
 
				 	/* which sub-data is manipulated ? */
			
 
				-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, i); 
			
 
				-	task->buffers[0].mode = STARPU_R;
			
 
				-	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
			
 
				-	task->buffers[1].mode = STARPU_R;
			
 
				-	task->buffers[2].handle = starpu_data_get_sub_data(dataA, 2, i, j); 
			
 
				-	task->buffers[2].mode = STARPU_RW;
			
 
				-
			
 
				-	if ( (i == k + 1) && (j == k +1) ) {
			
 
				+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i);
			
 
				+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
			
 
				+	task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j);
			
 
				+
			
 
				+	if ( (i == k + 1) && (j == k +1) )
			
 
				+	{
			
 
				 		task->priority = STARPU_MAX_PRIO;
			
 
				 	}
			
 
				 
			
 
				 	/* enforce dependencies ... */
			
 
				-	if (k > 0) {
			
 
				+	if (k > 0)
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 3, TAG22_AUX(k-1, i, j, reclevel), TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
			
 
				 	}
			
 
				-	else {
			
 
				+	else
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 2, TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
			
 
				 	}
			
 
				 
			
 
				-	starpu_task_submit(task);
			
 
				+	ret = starpu_task_submit(task);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 }
			
 
				 
			
 
				 
			
 
				 
			
 
				 /*
			
 
				- *	code to bootstrap the factorization 
			
 
				+ *	code to bootstrap the factorization
			
 
				  *	and construct the DAG
			
 
				  */
			
 
				 
			
 
				 static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned reclevel)
			
 
				 {
			
 
				+	int ret;
			
 
				+
			
 
				 	/* create a new codelet */
			
 
				 	struct starpu_task *entry_task = NULL;
			
 
				 
			
 
				 	/* create all the DAG nodes */
			
 
				 	unsigned i,j,k;
			
 
				 
			
 
				-	starpu_data_handle dataA;
			
 
				+	starpu_data_handle_t dataA;
			
 
				 
			
 
				 	/* monitor and partition the A matrix into blocks :
			
 
				 	 * one block is now determined by 2 unsigned (i,j) */
			
@@ -173,12 +185,14 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
				 
			
 
				 	starpu_data_set_sequential_consistency_flag(dataA, 0);
			
 
				 
			
 
				-	struct starpu_data_filter f = {
			
 
				+	struct starpu_data_filter f =
			
 
				+	{
			
 
				 		.filter_func = starpu_vertical_block_filter_func,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
 
				-	struct starpu_data_filter f2 = {
			
 
				+	struct starpu_data_filter f2 =
			
 
				+	{
			
 
				 		.filter_func = starpu_block_filter_func,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
@@ -189,13 +203,16 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
				 	{
			
 
				 		struct starpu_task *task = create_task_11(dataA, k, reclevel);
			
 
				 		/* we defer the launch of the first task */
			
 
				-		if (k == 0) {
			
 
				+		if (k == 0)
			
 
				+		{
			
 
				 			entry_task = task;
			
 
				 		}
			
 
				-		else {
			
 
				-			starpu_task_submit(task);
			
 
				+		else
			
 
				+		{
			
 
				+			ret = starpu_task_submit(task);
			
 
				+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 		}
			
 
				-		
			
 
				+
			
 
				 		for (j = k+1; j<nblocks; j++)
			
 
				 		{
			
 
				 			create_task_21(dataA, k, j, reclevel);
			
@@ -209,7 +226,7 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
				 	}
			
 
				 
			
 
				 	/* schedule the codelet */
			
 
				-	int ret = starpu_task_submit(entry_task);
			
 
				+	ret = starpu_task_submit(entry_task);
			
 
				 	if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				 	{
			
 
				 		FPRINTF(stderr, "No worker may execute this task\n");
			
@@ -221,9 +238,11 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
				 		/* stall the application until the end of computations */
			
 
				 		starpu_tag_wait(TAG11_AUX(nblocks-1, reclevel));
			
 
				 		starpu_data_unpartition(dataA, 0);
			
 
				+		starpu_data_unregister(dataA);
			
 
				 		return;
			
 
				 	}
			
 
				-	else {
			
 
				+	else
			
 
				+	{
			
 
				 		STARPU_ASSERT(reclevel == 0);
			
 
				 		unsigned ndeps_tags = (nblocks - nbigblocks)*(nblocks - nbigblocks);
			
 
				 
			
@@ -253,20 +272,26 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
				 
			
 
				 static void initialize_system(float **A, unsigned dim, unsigned pinned)
			
 
				 {
			
 
				-	starpu_init(NULL);
			
 
				+	int ret;
			
 
				+
			
 
				+	ret = starpu_init(NULL);
			
 
				+	if (ret == -ENODEV)
			
 
				+		exit(77);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				 	starpu_helper_cublas_init();
			
 
				 
			
 
				 	if (pinned)
			
 
				 	{
			
 
				 		starpu_malloc((void **)A, dim*dim*sizeof(float));
			
 
				-	} 
			
 
				-	else {
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				 		*A = malloc(dim*dim*sizeof(float));
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				-void cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks)
			
 
				+void cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned pinned)
			
 
				 {
			
 
				 	struct timeval start;
			
 
				 	struct timeval end;
			
@@ -284,8 +309,20 @@ void cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, u
 
				 	double flop = (1.0f*size*size*size)/3.0f;
			
 
				 	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				 
			
 
				-	starpu_helper_cublas_shutdown();
			
 
				+}
			
 
				 
			
 
				+static void shutdown_system(float **matA, unsigned pinned)
			
 
				+{
			
 
				+	if (pinned)
			
 
				+	{
			
 
				+	     starpu_free(*matA);
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+	     free(*matA);
			
 
				+	}
			
 
				+
			
 
				+	starpu_helper_cublas_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 }
			
 
				 
			
@@ -299,8 +336,6 @@ int main(int argc, char **argv)
 
				 	parse_args(argc, argv);
			
 
				 
			
 
				 	float *mat;
			
 
				-
			
 
				-	mat = malloc(size*size*sizeof(float));
			
 
				 	initialize_system(&mat, size, pinned);
			
 
				 
			
 
				 	unsigned i,j;
			
@@ -321,10 +356,12 @@ int main(int argc, char **argv)
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				-			if (i <= j) {
			
 
				+			if (i <= j)
			
 
				+			{
			
 
				 				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
			
 
				 			}
			
 
				-			else {
			
 
				+			else
			
 
				+			{
			
 
				 				FPRINTF(stdout, ".\t");
			
 
				 			}
			
 
				 		}
			
@@ -332,8 +369,7 @@ int main(int argc, char **argv)
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				-
			
 
				-	cholesky_grain(mat, size, size, nblocks, nbigblocks);
			
 
				+	cholesky_grain(mat, size, size, nblocks, nbigblocks, pinned);
			
 
				 
			
 
				 #ifdef CHECK_OUTPUT
			
 
				 	FPRINTF(stdout, "Results :\n");
			
@@ -342,10 +378,12 @@ int main(int argc, char **argv)
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				-			if (i <= j) {
			
 
				+			if (i <= j)
			
 
				+			{
			
 
				 				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
			
 
				 			}
			
 
				-			else {
			
 
				+			else
			
 
				+			{
			
 
				 				FPRINTF(stdout, ".\t");
			
 
				 				mat[j+i*size] = 0.0f; /* debug */
			
 
				 			}
			
@@ -357,7 +395,7 @@ int main(int argc, char **argv)
 
				 	float *test_mat = malloc(size*size*sizeof(float));
			
 
				 	STARPU_ASSERT(test_mat);
			
 
				 
			
 
				-	SSYRK("L", "N", size, size, 1.0f, 
			
 
				+	SSYRK("L", "N", size, size, 1.0f,
			
 
				 				mat, size, 0.0f, test_mat, size);
			
 
				 
			
 
				 	FPRINTF(stderr, "comparing results ...\n");
			
@@ -365,16 +403,20 @@ int main(int argc, char **argv)
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				-			if (i <= j) {
			
 
				+			if (i <= j)
			
 
				+			{
			
 
				                                 FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
			
 
				 			}
			
 
				-			else {
			
 
				+			else
			
 
				+			{
			
 
				 				FPRINTF(stdout, ".\t");
			
 
				 			}
			
 
				 		}
			
 
				 		FPRINTF(stdout, "\n");
			
 
				 	}
			
 
				+	free(test_mat);
			
 
				 #endif
			
 
				 
			
 
				+	shutdown_system(&mat, pinned);
			
 
				 	return 0;
			
 
				 }
			
--- a/sched_ctx_hypervisor/examples/cholesky/cholesky_implicit.c
+++ b/sched_ctx_hypervisor/examples/cholesky/cholesky_implicit.c
@@ -1,8 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -17,45 +17,48 @@
 
				  */
			
 
				 
			
 
				 #include "cholesky.h"
			
 
				-#include "../sched_ctx_utils/sched_ctx_utils.h"
			
 
				+
			
 
				 /*
			
 
				  *	Create the codelets
			
 
				  */
			
 
				 
			
 
				-static starpu_codelet cl11 =
			
 
				+static struct starpu_codelet cl11 =
			
 
				 {
			
 
				 	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.type = STARPU_SEQ,
			
 
				-	.cpu_func = chol_cpu_codelet_update_u11,
			
 
				+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	.cuda_func = chol_cublas_codelet_update_u11,
			
 
				+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
			
 
				 #endif
			
 
				 	.nbuffers = 1,
			
 
				+	.modes = {STARPU_RW},
			
 
				 	.model = &chol_model_11
			
 
				 };
			
 
				 
			
 
				-static starpu_codelet cl21 =
			
 
				+static struct starpu_codelet cl21 =
			
 
				 {
			
 
				 	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.type = STARPU_SEQ,
			
 
				-	.cpu_func = chol_cpu_codelet_update_u21,
			
 
				+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	.cuda_func = chol_cublas_codelet_update_u21,
			
 
				+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
			
 
				 #endif
			
 
				 	.nbuffers = 2,
			
 
				+	.modes = {STARPU_R, STARPU_RW},
			
 
				 	.model = &chol_model_21
			
 
				 };
			
 
				 
			
 
				-static starpu_codelet cl22 =
			
 
				+static struct starpu_codelet cl22 =
			
 
				 {
			
 
				 	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.type = STARPU_SEQ,
			
 
				 	.max_parallelism = INT_MAX,
			
 
				-	.cpu_func = chol_cpu_codelet_update_u22,
			
 
				+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	.cuda_func = chol_cublas_codelet_update_u22,
			
 
				+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
			
 
				 #endif
			
 
				 	.nbuffers = 3,
			
 
				+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
			
 
				 	.model = &chol_model_22
			
 
				 };
			
 
				 
			
@@ -69,9 +72,9 @@ static void callback_turn_spmd_on(void *arg __attribute__ ((unused)))
 
				 	cl22.type = STARPU_SPMD;
			
 
				 }
			
 
				 
			
 
				-int hypervisor_tag = 1;
			
 
				-static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
			
 
				+static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
			
 
				 {
			
 
				+	int ret;
			
 
				 	struct timeval start;
			
 
				 	struct timeval end;
			
 
				 
			
@@ -81,19 +84,22 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
				 
			
 
				 	gettimeofday(&start, NULL);
			
 
				 
			
 
				+	if (bound)
			
 
				+		starpu_bound_start(0, 0);
			
 
				 	/* create all the DAG nodes */
			
 
				 	for (k = 0; k < nblocks; k++)
			
 
				 	{
			
 
				-                starpu_data_handle sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
			
 
				+                starpu_data_handle_t sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
			
 
				 		if(k == 0 && with_ctxs)
			
 
				 		{
			
 
				-			starpu_insert_task(&cl11,
			
 
				+			 ret = starpu_insert_task(&cl11,
			
 
				 					   STARPU_PRIORITY, prio_level,
			
 
				 					   STARPU_RW, sdatakk,
			
 
				 					   STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
			
 
				 					   STARPU_HYPERVISOR_TAG, hypervisor_tag,
			
 
				 					   0);
			
 
				 			set_hypervisor_conf(START_BENCH, hypervisor_tag++);
			
 
				+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				 		}
			
 
				 		else
			
 
				 			starpu_insert_task(&cl11,
			
@@ -104,49 +110,53 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
				 
			
 
				 		for (j = k+1; j<nblocks; j++)
			
 
				 		{
			
 
				-                        starpu_data_handle sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
			
 
				+                        starpu_data_handle_t sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
			
 
				 
			
 
				-                        starpu_insert_task(&cl21,
			
 
				-                                           STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
			
 
				-                                           STARPU_R, sdatakk,
			
 
				-                                           STARPU_RW, sdatakj,
			
 
				-                                           0);
			
 
				+                        ret = starpu_insert_task(&cl21,
			
 
				+						 STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
			
 
				+						 STARPU_R, sdatakk,
			
 
				+						 STARPU_RW, sdatakj,
			
 
				+						 0);
			
 
				+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				 
			
 
				 			for (i = k+1; i<nblocks; i++)
			
 
				 			{
			
 
				 				if (i <= j)
			
 
				                                 {
			
 
				-					starpu_data_handle sdataki = starpu_data_get_sub_data(dataA, 2, k, i);
			
 
				-					starpu_data_handle sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
			
 
				+					starpu_data_handle_t sdataki = starpu_data_get_sub_data(dataA, 2, k, i);
			
 
				+					starpu_data_handle_t sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
			
 
				 
			
 
				 					if(k == (nblocks-2) && j == (nblocks-1) &&
			
 
				 					   i == (k + 1) && with_ctxs)
			
 
				 					{
			
 
				-						starpu_insert_task(&cl22,
			
 
				+						ret = starpu_insert_task(&cl22,
			
 
				 								   STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
			
 
				 								   STARPU_R, sdataki,
			
 
				 								   STARPU_R, sdatakj,
			
 
				 								   STARPU_RW, sdataij,
			
 
				 								   STARPU_HYPERVISOR_TAG, hypervisor_tag,
			
 
				 								   0);
			
 
				-						
			
 
				+						STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				 						set_hypervisor_conf(END_BENCH, hypervisor_tag++);
			
 
				 					}
			
 
				 					
			
 
				 					else
			
 
				-						starpu_insert_task(&cl22,
			
 
				+						ret = starpu_insert_task(&cl22,
			
 
				 								   STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
			
 
				 								   STARPU_R, sdataki,
			
 
				 								   STARPU_R, sdatakj,
			
 
				 								   STARPU_RW, sdataij,
			
 
				 								   0);
			
 
				+						STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				 					
			
 
				-                                }
			
 
				+                   }
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				 	starpu_task_wait_for_all();
			
 
				+	if (bound)
			
 
				+		starpu_bound_stop();
			
 
				 
			
 
				 	starpu_data_unpartition(dataA, 0);
			
 
				 
			
@@ -166,23 +176,31 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
				 		FPRINTF(stdout, "%2.2f\n", timing/1000);
			
 
				 	
			
 
				 		FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				+		if (bound)
			
 
				+		{
			
 
				+			double res;
			
 
				+			starpu_bound_compute(&res, NULL, 0);
			
 
				+			FPRINTF(stderr, "Theoretical GFlops: %2.2f\n", (flop/res/1000000.0f));
			
 
				+		}
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				 static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
			
 
				 {
			
 
				-	starpu_data_handle dataA;
			
 
				+	starpu_data_handle_t dataA;
			
 
				 
			
 
				 	/* monitor and partition the A matrix into blocks :
			
 
				 	 * one block is now determined by 2 unsigned (i,j) */
			
 
				 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
			
 
				 
			
 
				-	struct starpu_data_filter f = {
			
 
				+	struct starpu_data_filter f =
			
 
				+	{
			
 
				 		.filter_func = starpu_vertical_block_filter_func,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
 
				-	struct starpu_data_filter f2 = {
			
 
				+	struct starpu_data_filter f2 =
			
 
				+	{
			
 
				 		.filter_func = starpu_block_filter_func,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
@@ -190,11 +208,25 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
				 	starpu_data_map_filters(dataA, 2, &f, &f2);
			
 
				 
			
 
				 	_cholesky(dataA, nblocks);
			
 
				+
			
 
				+	starpu_data_unregister(dataA);
			
 
				 }
			
 
				 
			
 
				-static void execute_cholesky(float *mat, unsigned size, unsigned nblocks)
			
 
				+static void execute_cholesky(unsigned size, unsigned nblocks)
			
 
				 {
			
 
				+	float *mat;
			
 
				+	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
			
 
				+
			
 
				 	unsigned i,j;
			
 
				+	for (i = 0; i < size; i++)
			
 
				+	{
			
 
				+		for (j = 0; j < size; j++)
			
 
				+		{
			
 
				+			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
			
 
				+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				 /* #define PRINT_OUTPUT */
			
 
				 #ifdef PRINT_OUTPUT
			
 
				 	FPRINTF(stdout, "Input :\n");
			
@@ -203,16 +235,19 @@ static void execute_cholesky(float *mat, unsigned size, unsigned nblocks)
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				-			if (i <= j) {
			
 
				+			if (i <= j)
			
 
				+			{
			
 
				 				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
			
 
				 			}
			
 
				-			else {
			
 
				+			else
			
 
				+			{
			
 
				 				FPRINTF(stdout, ".\t");
			
 
				 			}
			
 
				 		}
			
 
				 		FPRINTF(stdout, "\n");
			
 
				 	}
			
 
				 #endif
			
 
				+
			
 
				 	cholesky(mat, size, size, nblocks);
			
 
				 
			
 
				 #ifdef PRINT_OUTPUT
			
@@ -221,10 +256,12 @@ static void execute_cholesky(float *mat, unsigned size, unsigned nblocks)
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				-			if (i <= j) {
			
 
				+			if (i <= j)
			
 
				+			{
			
 
				 				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
			
 
				 			}
			
 
				-			else {
			
 
				+			else
			
 
				+			{
			
 
				 				FPRINTF(stdout, ".\t");
			
 
				 				mat[j+i*size] = 0.0f; /* debug */
			
 
				 			}
			
@@ -240,50 +277,56 @@ static void execute_cholesky(float *mat, unsigned size, unsigned nblocks)
 
				 		{
			
 
				 			for (i = 0; i < size; i++)
			
 
				 			{
			
 
				-				if (i > j) {
			
 
				+				if (i > j)
			
 
				+				{
			
 
				 					mat[j+i*size] = 0.0f; /* debug */
			
 
				 				}
			
 
				 			}
			
 
				 		}
			
 
				 		float *test_mat = malloc(size*size*sizeof(float));
			
 
				 		STARPU_ASSERT(test_mat);
			
 
				-	
			
 
				+
			
 
				 		SSYRK("L", "N", size, size, 1.0f,
			
 
				 					mat, size, 0.0f, test_mat, size);
			
 
				-	
			
 
				+
			
 
				 		FPRINTF(stderr, "comparing results ...\n");
			
 
				 #ifdef PRINT_OUTPUT
			
 
				 		for (j = 0; j < size; j++)
			
 
				 		{
			
 
				 			for (i = 0; i < size; i++)
			
 
				 			{
			
 
				-				if (i <= j) {
			
 
				+				if (i <= j)
			
 
				+				{
			
 
				 					FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
			
 
				 				}
			
 
				-				else {
			
 
				+				else
			
 
				+				{
			
 
				 					FPRINTF(stdout, ".\t");
			
 
				 				}
			
 
				 			}
			
 
				 			FPRINTF(stdout, "\n");
			
 
				 		}
			
 
				 #endif
			
 
				-	
			
 
				+
			
 
				 		for (j = 0; j < size; j++)
			
 
				 		{
			
 
				 			for (i = 0; i < size; i++)
			
 
				 			{
			
 
				-				if (i <= j) {
			
 
				+				if (i <= j)
			
 
				+				{
			
 
				 	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
			
 
				 	                                float err = abs(test_mat[j +i*size] - orig);
			
 
				-	                                if (err > 0.00001) {
			
 
				+	                                if (err > 0.00001)
			
 
				+					{
			
 
				 	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", i, j, test_mat[j +i*size], orig, err);
			
 
				 	                                        assert(0);
			
 
				 	                                }
			
 
				 	                        }
			
 
				 			}
			
 
				 	        }
			
 
				+		free(test_mat);
			
 
				 	}
			
 
				-
			
 
				+	starpu_free(mat);
			
 
				 }
			
 
				 
			
 
				 int main(int argc, char **argv)
			
@@ -314,23 +357,7 @@ int main(int argc, char **argv)
 
				 	else if(chole2)
			
 
				 		start_2ndbench(execute_cholesky);
			
 
				 	else
			
 
				-	{
			
 
				-		float *mat;
			
 
				-		starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
			
 
				-		
			
 
				-		unsigned i,j;
			
 
				-		for (i = 0; i < size; i++)
			
 
				-		{
			
 
				-			for (j = 0; j < size; j++)
			
 
				-			{
			
 
				-				mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
			
 
				-				/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
			
 
				-			}
			
 
				-		}
			
 
				-
			
 
				-		execute_cholesky(mat, size, nblocks);
			
 
				-	}
			
 
				-
			
 
				+		execute_cholesky(size, nblocks);
			
 
				 
			
 
				 	starpu_helper_cublas_shutdown();
			
 
				 	starpu_shutdown();
			
--- a/sched_ctx_hypervisor/examples/cholesky/cholesky_kernels.c
+++ b/sched_ctx_hypervisor/examples/cholesky/cholesky_kernels.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -17,7 +17,7 @@
 
				 
			
 
				 #include <starpu_config.h>
			
 
				 #include "cholesky.h"
			
 
				-//#include "../common/blas.h"
			
 
				+#include "../common/blas.h"
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 #include <starpu_cuda.h>
			
 
				 #ifdef STARPU_HAVE_MAGMA
			
@@ -55,7 +55,8 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __at
 
				 			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
			
 
				 				right, ld12, 1.0f, center, ld22);
			
 
				 		}
			
 
				-		else {
			
 
				+		else
			
 
				+		{
			
 
				 			/* Parallel CPU kernel */
			
 
				 			int rank = starpu_combined_worker_get_rank();
			
 
				 
			
@@ -113,7 +114,8 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, __attrib
 
				 	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
			
 
				 	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
			
 
				 
			
 
				-	switch (s) {
			
 
				+	switch (s)
			
 
				+	{
			
 
				 		case 0:
			
 
				 			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
			
 
				 			break;
			
@@ -157,7 +159,8 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 
				 
			
 
				 	unsigned z;
			
 
				 
			
 
				-	switch (s) {
			
 
				+	switch (s)
			
 
				+	{
			
 
				 		case 0:
			
 
				 
			
 
				 			/*
			
@@ -188,7 +191,8 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 
				 			int ret;
			
 
				 			int info;
			
 
				 			ret = magma_spotrf_gpu('L', nx, sub11, ld, &info);
			
 
				-			if (ret != MAGMA_SUCCESS) {
			
 
				+			if (ret != MAGMA_SUCCESS)
			
 
				+			{
			
 
				 				fprintf(stderr, "Error in Magma: %d\n", ret);
			
 
				 				STARPU_ABORT();
			
 
				 			}
			
--- a/sched_ctx_hypervisor/examples/cholesky/cholesky_models.c
+++ b/sched_ctx_hypervisor/examples/cholesky/cholesky_models.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -17,8 +17,8 @@
 
				  */
			
 
				 
			
 
				 /*
			
 
				- * As a convention, in that file, descr[0] is represented by A,
			
 
				- * 				  descr[1] is B ...
			
 
				+ * As a convention, in that file, buffers[0] is represented by A,
			
 
				+ * 				  buffers[1] is B ...
			
 
				  */
			
 
				 
			
 
				 /*
			
@@ -26,6 +26,7 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#include "cholesky.h"
			
 
				 
			
 
				 /* #define USE_PERTURBATION	1 */
			
 
				 
			
@@ -35,11 +36,11 @@
 
				 #define PERTURBATE(a)	(a)
			
 
				 #endif
			
 
				 
			
 
				-static double cpu_chol_task_11_cost(starpu_buffer_descr *descr)
			
 
				+static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
 
				-	n = starpu_matrix_get_nx(descr[0].handle);
			
 
				+	n = starpu_matrix_get_nx(task->handles[0]);
			
 
				 
			
 
				 	double cost = (((double)(n)*n*n)/1000.0f*0.894/0.79176);
			
 
				 
			
@@ -50,11 +51,11 @@ static double cpu_chol_task_11_cost(starpu_buffer_descr *descr)
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-static double cuda_chol_task_11_cost(starpu_buffer_descr *descr)
			
 
				+static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
 
				-	n = starpu_matrix_get_nx(descr[0].handle);
			
 
				+	n = starpu_matrix_get_nx(task->handles[0]);
			
 
				 
			
 
				 	double cost = (((double)(n)*n*n)/50.0f/10.75/5.088633/0.9883);
			
 
				 
			
@@ -65,11 +66,11 @@ static double cuda_chol_task_11_cost(starpu_buffer_descr *descr)
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-static double cpu_chol_task_21_cost(starpu_buffer_descr *descr)
			
 
				+static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
 
				-	n = starpu_matrix_get_nx(descr[0].handle);
			
 
				+	n = starpu_matrix_get_nx(task->handles[0]);
			
 
				 
			
 
				 	double cost = (((double)(n)*n*n)/7706.674/0.95/0.9965);
			
 
				 
			
@@ -80,11 +81,11 @@ static double cpu_chol_task_21_cost(starpu_buffer_descr *descr)
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-static double cuda_chol_task_21_cost(starpu_buffer_descr *descr)
			
 
				+static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
 
				-	n = starpu_matrix_get_nx(descr[0].handle);
			
 
				+	n = starpu_matrix_get_nx(task->handles[0]);
			
 
				 
			
 
				 	double cost = (((double)(n)*n*n)/50.0f/10.75/87.29520);
			
 
				 
			
@@ -95,11 +96,11 @@ static double cuda_chol_task_21_cost(starpu_buffer_descr *descr)
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-static double cpu_chol_task_22_cost(starpu_buffer_descr *descr)
			
 
				+static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
 
				-	n = starpu_matrix_get_nx(descr[0].handle);
			
 
				+	n = starpu_matrix_get_nx(task->handles[0]);
			
 
				 
			
 
				 	double cost = (((double)(n)*n*n)/50.0f/10.75/8.0760);
			
 
				 
			
@@ -110,11 +111,11 @@ static double cpu_chol_task_22_cost(starpu_buffer_descr *descr)
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
			
 
				+static double cuda_chol_task_22_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
 
				-	n = starpu_matrix_get_nx(descr[0].handle);
			
 
				+	n = starpu_matrix_get_nx(task->handles[0]);
			
 
				 
			
 
				 	double cost = (((double)(n)*n*n)/50.0f/10.75/76.30666);
			
 
				 
			
@@ -125,28 +126,34 @@ static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-struct starpu_perfmodel_t chol_model_11 = {
			
 
				-	.per_arch = {
			
 
				-		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_11_cost },
			
 
				-		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_11_cost }
			
 
				+struct starpu_perfmodel chol_model_11 =
			
 
				+{
			
 
				+	.per_arch =
			
 
				+	{
			
 
				+		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_11_cost },
			
 
				+		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_11_cost }
			
 
				 	},
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 	.symbol = "chol_model_11"
			
 
				 };
			
 
				 
			
 
				-struct starpu_perfmodel_t chol_model_21 = {
			
 
				-	.per_arch = {
			
 
				-		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_21_cost },
			
 
				-		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_21_cost }
			
 
				+struct starpu_perfmodel chol_model_21 =
			
 
				+{
			
 
				+	.per_arch =
			
 
				+	{
			
 
				+		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_21_cost },
			
 
				+		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_21_cost }
			
 
				 	},
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 	.symbol = "chol_model_21"
			
 
				 };
			
 
				 
			
 
				-struct starpu_perfmodel_t chol_model_22 = {
			
 
				-	.per_arch = {
			
 
				-		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_22_cost },
			
 
				-		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_22_cost }
			
 
				+struct starpu_perfmodel chol_model_22 =
			
 
				+{
			
 
				+	.per_arch =
			
 
				+	{
			
 
				+		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_22_cost },
			
 
				+		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_22_cost }
			
 
				 	},
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 	.symbol = "chol_model_22"
			
--- a/sched_ctx_hypervisor/examples/cholesky/cholesky_tag.c
+++ b/sched_ctx_hypervisor/examples/cholesky/cholesky_tag.c
@@ -1,8 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -36,96 +36,101 @@ static struct starpu_task *create_task(starpu_tag_t id)
 
				  *	Create the codelets
			
 
				  */
			
 
				 
			
 
				-static starpu_codelet cl11 =
			
 
				+static struct starpu_codelet cl11 =
			
 
				 {
			
 
				+	.modes = { STARPU_RW },
			
 
				 	.where = STARPU_CPU|STARPU_CUDA,
			
 
				-	.cpu_func = chol_cpu_codelet_update_u11,
			
 
				+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	.cuda_func = chol_cublas_codelet_update_u11,
			
 
				+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
			
 
				 #endif
			
 
				 	.nbuffers = 1,
			
 
				 	.model = &chol_model_11
			
 
				 };
			
 
				 
			
 
				-static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k)
			
 
				+static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned k)
			
 
				 {
			
 
				 /*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
			
 
				 
			
 
				 	struct starpu_task *task = create_task(TAG11(k));
			
 
				-	
			
 
				+
			
 
				 	task->cl = &cl11;
			
 
				 
			
 
				 	/* which sub-data is manipulated ? */
			
 
				-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k);
			
 
				-	task->buffers[0].mode = STARPU_RW;
			
 
				+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
			
 
				 
			
 
				 	/* this is an important task */
			
 
				 	if (!noprio)
			
 
				 		task->priority = STARPU_MAX_PRIO;
			
 
				 
			
 
				 	/* enforce dependencies ... */
			
 
				-	if (k > 0) {
			
 
				+	if (k > 0)
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
			
 
				 	}
			
 
				 
			
 
				 	return task;
			
 
				 }
			
 
				 
			
 
				-static starpu_codelet cl21 =
			
 
				+static struct starpu_codelet cl21 =
			
 
				 {
			
 
				+	.modes = { STARPU_R, STARPU_RW },
			
 
				 	.where = STARPU_CPU|STARPU_CUDA,
			
 
				-	.cpu_func = chol_cpu_codelet_update_u21,
			
 
				+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	.cuda_func = chol_cublas_codelet_update_u21,
			
 
				+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
			
 
				 #endif
			
 
				 	.nbuffers = 2,
			
 
				 	.model = &chol_model_21
			
 
				 };
			
 
				 
			
 
				-static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j)
			
 
				+static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
			
 
				 {
			
 
				 	struct starpu_task *task = create_task(TAG21(k, j));
			
 
				 
			
 
				-	task->cl = &cl21;	
			
 
				+	task->cl = &cl21;
			
 
				 
			
 
				 	/* which sub-data is manipulated ? */
			
 
				-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k); 
			
 
				-	task->buffers[0].mode = STARPU_R;
			
 
				-	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
			
 
				-	task->buffers[1].mode = STARPU_RW;
			
 
				+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
			
 
				+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
			
 
				 
			
 
				-	if (!noprio && (j == k+1)) {
			
 
				+	if (!noprio && (j == k+1))
			
 
				+	{
			
 
				 		task->priority = STARPU_MAX_PRIO;
			
 
				 	}
			
 
				 
			
 
				 	/* enforce dependencies ... */
			
 
				-	if (k > 0) {
			
 
				+	if (k > 0)
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
			
 
				 	}
			
 
				-	else {
			
 
				+	else
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
			
 
				 	}
			
 
				 
			
 
				 	int ret = starpu_task_submit(task);
			
 
				-        if (STARPU_UNLIKELY(ret == -ENODEV)) {
			
 
				+        if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				+	{
			
 
				                 FPRINTF(stderr, "No worker may execute this task\n");
			
 
				                 exit(0);
			
 
				         }
			
 
				 
			
 
				 }
			
 
				 
			
 
				-static starpu_codelet cl22 =
			
 
				+static struct starpu_codelet cl22 =
			
 
				 {
			
 
				+	.modes = { STARPU_R, STARPU_R, STARPU_RW },
			
 
				 	.where = STARPU_CPU|STARPU_CUDA,
			
 
				-	.cpu_func = chol_cpu_codelet_update_u22,
			
 
				+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	.cuda_func = chol_cublas_codelet_update_u22,
			
 
				+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
			
 
				 #endif
			
 
				 	.nbuffers = 3,
			
 
				 	.model = &chol_model_22
			
 
				 };
			
 
				 
			
 
				-static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
			
 
				+static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j)
			
 
				 {
			
 
				 /*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
			
 
				 
			
@@ -134,27 +139,28 @@ static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, uns
 
				 	task->cl = &cl22;
			
 
				 
			
 
				 	/* which sub-data is manipulated ? */
			
 
				-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, i); 
			
 
				-	task->buffers[0].mode = STARPU_R;
			
 
				-	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
			
 
				-	task->buffers[1].mode = STARPU_R;
			
 
				-	task->buffers[2].handle = starpu_data_get_sub_data(dataA, 2, i, j); 
			
 
				-	task->buffers[2].mode = STARPU_RW;
			
 
				-
			
 
				-	if (!noprio && (i == k + 1) && (j == k +1) ) {
			
 
				+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i);
			
 
				+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
			
 
				+	task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j);
			
 
				+
			
 
				+	if (!noprio && (i == k + 1) && (j == k +1) )
			
 
				+	{
			
 
				 		task->priority = STARPU_MAX_PRIO;
			
 
				 	}
			
 
				 
			
 
				 	/* enforce dependencies ... */
			
 
				-	if (k > 0) {
			
 
				+	if (k > 0)
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
			
 
				 	}
			
 
				-	else {
			
 
				+	else
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
			
 
				 	}
			
 
				 
			
 
				 	int ret = starpu_task_submit(task);
			
 
				-        if (STARPU_UNLIKELY(ret == -ENODEV)) {
			
 
				+        if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				+	{
			
 
				                 FPRINTF(stderr, "No worker may execute this task\n");
			
 
				                 exit(0);
			
 
				         }
			
@@ -163,11 +169,11 @@ static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, uns
 
				 
			
 
				 
			
 
				 /*
			
 
				- *	code to bootstrap the factorization 
			
 
				+ *	code to bootstrap the factorization
			
 
				  *	and construct the DAG
			
 
				  */
			
 
				 
			
 
				-static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
			
 
				+static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
			
 
				 {
			
 
				 	struct timeval start;
			
 
				 	struct timeval end;
			
@@ -183,18 +189,21 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
				 	{
			
 
				 		struct starpu_task *task = create_task_11(dataA, k);
			
 
				 		/* we defer the launch of the first task */
			
 
				-		if (k == 0) {
			
 
				+		if (k == 0)
			
 
				+		{
			
 
				 			entry_task = task;
			
 
				 		}
			
 
				-		else {
			
 
				+		else
			
 
				+		{
			
 
				 			int ret = starpu_task_submit(task);
			
 
				-                        if (STARPU_UNLIKELY(ret == -ENODEV)) {
			
 
				+                        if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				+			{
			
 
				                                 FPRINTF(stderr, "No worker may execute this task\n");
			
 
				                                 exit(0);
			
 
				                         }
			
 
				 
			
 
				 		}
			
 
				-		
			
 
				+
			
 
				 		for (j = k+1; j<nblocks; j++)
			
 
				 		{
			
 
				 			create_task_21(dataA, k, j);
			
@@ -209,7 +218,8 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
				 
			
 
				 	/* schedule the codelet */
			
 
				 	int ret = starpu_task_submit(entry_task);
			
 
				-        if (STARPU_UNLIKELY(ret == -ENODEV)) {
			
 
				+        if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				+	{
			
 
				                 FPRINTF(stderr, "No worker may execute this task\n");
			
 
				                 exit(0);
			
 
				         }
			
@@ -233,24 +243,31 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
				 	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				 }
			
 
				 
			
 
				-static void initialize_system(float **A, unsigned dim, unsigned pinned)
			
 
				+static int initialize_system(float **A, unsigned dim, unsigned pinned)
			
 
				 {
			
 
				-	starpu_init(NULL);
			
 
				-	
			
 
				+	int ret;
			
 
				+
			
 
				+	ret = starpu_init(NULL);
			
 
				+	if (ret == -ENODEV)
			
 
				+		return 77;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+
			
 
				 	starpu_helper_cublas_init();
			
 
				 
			
 
				 	if (pinned)
			
 
				 	{
			
 
				 		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
			
 
				-	} 
			
 
				-	else {
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				 		*A = malloc(dim*dim*sizeof(float));
			
 
				 	}
			
 
				+	return 0;
			
 
				 }
			
 
				 
			
 
				 static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
			
 
				 {
			
 
				-	starpu_data_handle dataA;
			
 
				+	starpu_data_handle_t dataA;
			
 
				 
			
 
				 	/* monitor and partition the A matrix into blocks :
			
 
				 	 * one block is now determined by 2 unsigned (i,j) */
			
@@ -258,12 +275,14 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
				 
			
 
				 	starpu_data_set_sequential_consistency_flag(dataA, 0);
			
 
				 
			
 
				-	struct starpu_data_filter f = {
			
 
				+	struct starpu_data_filter f =
			
 
				+	{
			
 
				 		.filter_func = starpu_vertical_block_filter_func,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
 
				-	struct starpu_data_filter f2 = {
			
 
				+	struct starpu_data_filter f2 =
			
 
				+	{
			
 
				 		.filter_func = starpu_block_filter_func,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
@@ -272,8 +291,21 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
				 
			
 
				 	_cholesky(dataA, nblocks);
			
 
				 
			
 
				-	starpu_helper_cublas_shutdown();
			
 
				+	starpu_data_unregister(dataA);
			
 
				+}
			
 
				+
			
 
				+static void shutdown_system(float **matA, unsigned pinned)
			
 
				+{
			
 
				+	if (pinned)
			
 
				+	{
			
 
				+		starpu_free(*matA);
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		free(*matA);
			
 
				+	}
			
 
				 
			
 
				+	starpu_helper_cublas_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 }
			
 
				 
			
@@ -287,9 +319,8 @@ int main(int argc, char **argv)
 
				 	parse_args(argc, argv);
			
 
				 
			
 
				 	float *mat;
			
 
				-
			
 
				-	mat = malloc(size*size*sizeof(float));
			
 
				-	initialize_system(&mat, size, pinned);
			
 
				+	int ret = initialize_system(&mat, size, pinned);
			
 
				+	if (ret) return ret;
			
 
				 
			
 
				 	unsigned i,j;
			
 
				 	for (i = 0; i < size; i++)
			
@@ -309,10 +340,12 @@ int main(int argc, char **argv)
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				-			if (i <= j) {
			
 
				+			if (i <= j)
			
 
				+			{
			
 
				 				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
			
 
				 			}
			
 
				-			else {
			
 
				+			else
			
 
				+			{
			
 
				 				FPRINTF(stdout, ".\t");
			
 
				 			}
			
 
				 		}
			
@@ -330,10 +363,12 @@ int main(int argc, char **argv)
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				-			if (i <= j) {
			
 
				+			if (i <= j)
			
 
				+			{
			
 
				 				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
			
 
				 			}
			
 
				-			else {
			
 
				+			else
			
 
				+			{
			
 
				 				FPRINTF(stdout, ".\t");
			
 
				 				mat[j+i*size] = 0.0f; /* debug */
			
 
				 			}
			
@@ -345,7 +380,7 @@ int main(int argc, char **argv)
 
				 	float *test_mat = malloc(size*size*sizeof(float));
			
 
				 	STARPU_ASSERT(test_mat);
			
 
				 
			
 
				-	SSYRK("L", "N", size, size, 1.0f, 
			
 
				+	SSYRK("L", "N", size, size, 1.0f,
			
 
				 				mat, size, 0.0f, test_mat, size);
			
 
				 
			
 
				 	FPRINTF(stderr, "comparing results ...\n");
			
@@ -353,16 +388,20 @@ int main(int argc, char **argv)
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				-			if (i <= j) {
			
 
				+			if (i <= j)
			
 
				+			{
			
 
				 				FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
			
 
				 			}
			
 
				-			else {
			
 
				+			else
			
 
				+			{
			
 
				 				FPRINTF(stdout, ".\t");
			
 
				 			}
			
 
				 		}
			
 
				 		FPRINTF(stdout, "\n");
			
 
				 	}
			
 
				+	free(test_mat);
			
 
				 #endif
			
 
				 
			
 
				+	shutdown_system(&mat, pinned);
			
 
				 	return 0;
			
 
				 }
			
--- a/sched_ctx_hypervisor/examples/cholesky/cholesky_tile_tag.c
+++ b/sched_ctx_hypervisor/examples/cholesky/cholesky_tile_tag.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -19,7 +19,7 @@
 
				 
			
 
				 /* A [ y ] [ x ] */
			
 
				 float *A[NMAXBLOCKS][NMAXBLOCKS];
			
 
				-starpu_data_handle A_state[NMAXBLOCKS][NMAXBLOCKS];
			
 
				+starpu_data_handle_t A_state[NMAXBLOCKS][NMAXBLOCKS];
			
 
				 
			
 
				 /*
			
 
				  *	Some useful functions
			
@@ -39,12 +39,13 @@ static struct starpu_task *create_task(starpu_tag_t id)
 
				  *	Create the codelets
			
 
				  */
			
 
				 
			
 
				-static starpu_codelet cl11 =
			
 
				+static struct starpu_codelet cl11 =
			
 
				 {
			
 
				+	.modes = { STARPU_RW },
			
 
				 	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
			
 
				-	.cpu_func = chol_cpu_codelet_update_u11,
			
 
				+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	.cuda_func = chol_cublas_codelet_update_u11,
			
 
				+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				 #ifdef SPU_FUNC_POTRF
			
@@ -66,26 +67,27 @@ static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 
				 	task->cl = &cl11;
			
 
				 
			
 
				 	/* which sub-data is manipulated ? */
			
 
				-	task->buffers[0].handle = A_state[k][k];
			
 
				-	task->buffers[0].mode = STARPU_RW;
			
 
				+	task->handles[0] = A_state[k][k];
			
 
				 
			
 
				 	/* this is an important task */
			
 
				 	task->priority = STARPU_MAX_PRIO;
			
 
				 
			
 
				 	/* enforce dependencies ... */
			
 
				-	if (k > 0) {
			
 
				+	if (k > 0)
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
			
 
				 	}
			
 
				 
			
 
				 	return task;
			
 
				 }
			
 
				 
			
 
				-static starpu_codelet cl21 =
			
 
				+static struct starpu_codelet cl21 =
			
 
				 {
			
 
				+	.modes = { STARPU_R, STARPU_RW },
			
 
				 	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
			
 
				-	.cpu_func = chol_cpu_codelet_update_u21,
			
 
				+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	.cuda_func = chol_cublas_codelet_update_u21,
			
 
				+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				 #ifdef SPU_FUNC_STRSM
			
@@ -100,37 +102,42 @@ static starpu_codelet cl21 =
 
				 
			
 
				 static void create_task_21(unsigned k, unsigned j)
			
 
				 {
			
 
				+	int ret;
			
 
				+
			
 
				 	struct starpu_task *task = create_task(TAG21(k, j));
			
 
				 
			
 
				 	task->cl = &cl21;	
			
 
				 
			
 
				 	/* which sub-data is manipulated ? */
			
 
				-	task->buffers[0].handle = A_state[k][k]; 
			
 
				-	task->buffers[0].mode = STARPU_R;
			
 
				-	task->buffers[1].handle = A_state[j][k]; 
			
 
				-	task->buffers[1].mode = STARPU_RW;
			
 
				+	task->handles[0] = A_state[k][k];
			
 
				+	task->handles[1] = A_state[j][k];
			
 
				 
			
 
				-	if (j == k+1) {
			
 
				+	if (j == k+1)
			
 
				+	{
			
 
				 		task->priority = STARPU_MAX_PRIO;
			
 
				 	}
			
 
				 
			
 
				 	/* enforce dependencies ... */
			
 
				-	if (k > 0) {
			
 
				+	if (k > 0)
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
			
 
				 	}
			
 
				-	else {
			
 
				+	else
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
			
 
				 	}
			
 
				 
			
 
				-	starpu_task_submit(task);
			
 
				+	ret = starpu_task_submit(task);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 }
			
 
				 
			
 
				-static starpu_codelet cl22 =
			
 
				+static struct starpu_codelet cl22 =
			
 
				 {
			
 
				+	.modes = { STARPU_R, STARPU_R, STARPU_RW },
			
 
				 	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
			
 
				-	.cpu_func = chol_cpu_codelet_update_u22,
			
 
				+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	.cuda_func = chol_cublas_codelet_update_u22,
			
 
				+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				 #ifdef SPU_FUNC_SGEMM
			
@@ -145,6 +152,8 @@ static starpu_codelet cl22 =
 
				 
			
 
				 static void create_task_22(unsigned k, unsigned i, unsigned j)
			
 
				 {
			
 
				+	int ret;
			
 
				+
			
 
				 /*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
			
 
				 
			
 
				 	struct starpu_task *task = create_task(TAG22(k, i, j));
			
@@ -152,26 +161,27 @@ static void create_task_22(unsigned k, unsigned i, unsigned j)
 
				 	task->cl = &cl22;
			
 
				 
			
 
				 	/* which sub-data is manipulated ? */
			
 
				-	task->buffers[0].handle = A_state[i][k]; 
			
 
				-	task->buffers[0].mode = STARPU_R;
			
 
				-	task->buffers[1].handle = A_state[j][k]; 
			
 
				-	task->buffers[1].mode = STARPU_R;
			
 
				-	task->buffers[2].handle = A_state[j][i]; 
			
 
				-	task->buffers[2].mode = STARPU_RW;
			
 
				-
			
 
				-	if ( (i == k + 1) && (j == k +1) ) {
			
 
				+	task->handles[0] = A_state[i][k];
			
 
				+	task->handles[1] = A_state[j][k];
			
 
				+	task->handles[2] = A_state[j][i];
			
 
				+
			
 
				+	if ( (i == k + 1) && (j == k +1) )
			
 
				+	{
			
 
				 		task->priority = STARPU_MAX_PRIO;
			
 
				 	}
			
 
				 
			
 
				 	/* enforce dependencies ... */
			
 
				-	if (k > 0) {
			
 
				+	if (k > 0)
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
			
 
				 	}
			
 
				-	else {
			
 
				+	else
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
			
 
				 	}
			
 
				 
			
 
				-	starpu_task_submit(task);
			
 
				+	ret = starpu_task_submit(task);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 }
			
 
				 
			
 
				 
			
@@ -183,6 +193,8 @@ static void create_task_22(unsigned k, unsigned i, unsigned j)
 
				 
			
 
				 static void cholesky_no_stride(void)
			
 
				 {
			
 
				+	int ret;
			
 
				+
			
 
				 	struct timeval start;
			
 
				 	struct timeval end;
			
 
				 
			
@@ -195,11 +207,14 @@ static void cholesky_no_stride(void)
 
				 	{
			
 
				 		struct starpu_task *task = create_task_11(k, nblocks);
			
 
				 		/* we defer the launch of the first task */
			
 
				-		if (k == 0) {
			
 
				+		if (k == 0)
			
 
				+		{
			
 
				 			entry_task = task;
			
 
				 		}
			
 
				-		else {
			
 
				-			starpu_task_submit(task);
			
 
				+		else
			
 
				+		{
			
 
				+			ret = starpu_task_submit(task);
			
 
				+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 		}
			
 
				 		
			
 
				 		for (j = k+1; j<nblocks; j++)
			
@@ -216,7 +231,8 @@ static void cholesky_no_stride(void)
 
				 
			
 
				 	/* schedule the codelet */
			
 
				 	gettimeofday(&start, NULL);
			
 
				-	starpu_task_submit(entry_task);
			
 
				+	ret = starpu_task_submit(entry_task);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 
			
 
				 	/* stall the application until the end of computations */
			
 
				 	starpu_tag_wait(TAG11(nblocks-1));
			
@@ -235,13 +251,17 @@ int main(int argc, char **argv)
 
				 {
			
 
				 	unsigned x, y;
			
 
				 	unsigned i, j;
			
 
				+	int ret;
			
 
				 
			
 
				 	parse_args(argc, argv);
			
 
				 	assert(nblocks <= NMAXBLOCKS);
			
 
				 
			
 
				 	FPRINTF(stderr, "BLOCK SIZE = %d\n", size / nblocks);
			
 
				 
			
 
				-	starpu_init(NULL);
			
 
				+	ret = starpu_init(NULL);
			
 
				+	if (ret == -ENODEV)
			
 
				+		return 77;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				 	/* Disable sequential consistency */
			
 
				 	starpu_data_set_default_sequential_consistency_flag(0);
			
@@ -251,17 +271,8 @@ int main(int argc, char **argv)
 
				 	for (y = 0; y < nblocks; y++)
			
 
				 	for (x = 0; x < nblocks; x++)
			
 
				 	{
			
 
				-		if (x <= y) {
			
 
				-			A[y][x] = malloc(BLOCKSIZE*BLOCKSIZE*sizeof(float));
			
 
				-			assert(A[y][x]);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-
			
 
				-	for (y = 0; y < nblocks; y++)
			
 
				-	for (x = 0; x < nblocks; x++)
			
 
				-	{
			
 
				-		if (x <= y) {
			
 
				+		if (x <= y)
			
 
				+		{
			
 
				 #ifdef STARPU_HAVE_POSIX_MEMALIGN
			
 
				 			posix_memalign((void **)&A[y][x], 128, BLOCKSIZE*BLOCKSIZE*sizeof(float));
			
 
				 #else
			
@@ -277,7 +288,8 @@ int main(int argc, char **argv)
 
				 	 * */
			
 
				 	for (y = 0; y < nblocks; y++)
			
 
				 	for (x = 0; x < nblocks; x++)
			
 
				-	if (x <= y) {
			
 
				+	if (x <= y)
			
 
				+	{
			
 
				 		for (i = 0; i < BLOCKSIZE; i++)
			
 
				 		for (j = 0; j < BLOCKSIZE; j++)
			
 
				 		{
			
@@ -290,12 +302,11 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-
			
 
				-
			
 
				 	for (y = 0; y < nblocks; y++)
			
 
				 	for (x = 0; x < nblocks; x++)
			
 
				 	{
			
 
				-		if (x <= y) {
			
 
				+		if (x <= y)
			
 
				+		{
			
 
				 			starpu_matrix_data_register(&A_state[y][x], 0, (uintptr_t)A[y][x], 
			
 
				 				BLOCKSIZE, BLOCKSIZE, BLOCKSIZE, sizeof(float));
			
 
				 		}
			
@@ -303,6 +314,16 @@ int main(int argc, char **argv)
 
				 
			
 
				 	cholesky_no_stride();
			
 
				 
			
 
				+	for (y = 0; y < nblocks; y++)
			
 
				+	for (x = 0; x < nblocks; x++)
			
 
				+	{
			
 
				+		if (x <= y)
			
 
				+		{
			
 
				+			starpu_data_unregister(A_state[y][x]);
			
 
				+			free(A[y][x]);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				 	starpu_helper_cublas_shutdown();
			
 
				 
			
 
				 	starpu_shutdown();