Andra Hugo 13 年 前
コミット
0ca1520d7f

+ 45 - 26
sched_ctx_hypervisor/examples/cholesky/cholesky.h

@@ -30,6 +30,7 @@
 
 #include <common/blas.h>
 #include <starpu.h>
+#include <starpu_bound.h>
 
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define NMAXBLOCKS	32
@@ -61,6 +62,7 @@ static unsigned nbigblocks = 8;
 static unsigned pinned = 0;
 static unsigned noprio = 0;
 static unsigned check = 0;
+static unsigned bound = 0;
 static unsigned with_ctxs = 0;
 static unsigned with_noctxs = 0;
 static unsigned chole1 = 0;
@@ -76,64 +78,81 @@ void chol_cublas_codelet_update_u21(void *descr[], void *_args);
 void chol_cublas_codelet_update_u22(void *descr[], void *_args);
 #endif
 
-extern struct starpu_perfmodel_t chol_model_11;
-extern struct starpu_perfmodel_t chol_model_21;
-extern struct starpu_perfmodel_t chol_model_22;
+extern struct starpu_perfmodel chol_model_11;
+extern struct starpu_perfmodel chol_model_21;
+extern struct starpu_perfmodel chol_model_22;
 
 static void __attribute__((unused)) parse_args(int argc, char **argv)
 {
 	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-with_ctxs") == 0) {
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-with_ctxs") == 0) 
+		{
 			with_ctxs = 1;
 			break;
 		}
-		if (strcmp(argv[i], "-with_noctxs") == 0) {
+		if (strcmp(argv[i], "-with_noctxs") == 0) 
+		{
 			with_noctxs = 1;
 			break;
 		}
 		
-		if (strcmp(argv[i], "-chole1") == 0) {
+		if (strcmp(argv[i], "-chole1") == 0) 
+		{
 			chole1 = 1;
 			break;
 		}
 
-		if (strcmp(argv[i], "-chole2") == 0) {
+		if (strcmp(argv[i], "-chole2") == 0) 
+		{
 			chole2 = 1;
 			break;
 		}
 
-		if (strcmp(argv[i], "-size") == 0) {
-			char *argptr;
+		if (strcmp(argv[i], "-size") == 0)
+		{
+		        char *argptr;
 			size = strtol(argv[++i], &argptr, 10);
 		}
-		
-		if (strcmp(argv[i], "-nblocks") == 0) {
-			char *argptr;
+
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
+		        char *argptr;
 			nblocks = strtol(argv[++i], &argptr, 10);
 		}
-		
-		if (strcmp(argv[i], "-nbigblocks") == 0) {
-			char *argptr;
+
+		if (strcmp(argv[i], "-nbigblocks") == 0)
+		{
+		        char *argptr;
 			nbigblocks = strtol(argv[++i], &argptr, 10);
 		}
-		
-		if (strcmp(argv[i], "-pin") == 0) {
+
+		if (strcmp(argv[i], "-pin") == 0)
+		{
 			pinned = 1;
 		}
-		
-		if (strcmp(argv[i], "-no-prio") == 0) {
+
+		if (strcmp(argv[i], "-no-prio") == 0)
+		{
 			noprio = 1;
 		}
-		
-		if (strcmp(argv[i], "-check") == 0) {
+
+		if (strcmp(argv[i], "-bound") == 0)
+		{
+			bound = 1;
+		}
+
+		if (strcmp(argv[i], "-check") == 0)
+		{
 			check = 1;
 		}
-		
-		if (strcmp(argv[i], "-h") == 0) {
+
+		if (strcmp(argv[i], "-h") == 0)
+		{
 			printf("usage : %s [-pin] [-size size] [-nblocks nblocks] [-check]\n", argv[0]);
-		}	
-	}	
+		}
+	}
 }
 
 #endif /* __DW_CHOLESKY_H__ */

+ 106 - 64
sched_ctx_hypervisor/examples/cholesky/cholesky_grain_tag.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -36,91 +36,100 @@ static struct starpu_task *create_task(starpu_tag_t id)
  *	Create the codelets
  */
 
-static starpu_codelet cl11 =
+static struct starpu_codelet cl11 =
 {
+	.modes = { STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u11,
+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u11,
+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
 #endif
 	.nbuffers = 1,
 	.model = &chol_model_11
 };
 
-static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k, unsigned reclevel)
+static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned k, unsigned reclevel)
 {
 /*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
 
 	struct starpu_task *task = create_task(TAG11_AUX(k, reclevel));
-	
+
 	task->cl = &cl11;
 
 	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k);
-	task->buffers[0].mode = STARPU_RW;
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
 
 	/* this is an important task */
 	task->priority = STARPU_MAX_PRIO;
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG11_AUX(k, reclevel), 1, TAG22_AUX(k-1, k, k, reclevel));
 	}
 
 	return task;
 }
 
-static starpu_codelet cl21 =
+static struct starpu_codelet cl21 =
 {
+	.modes = { STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u21,
+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u21,
+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
 #endif
 	.nbuffers = 2,
 	.model = &chol_model_21
 };
 
-static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j, unsigned reclevel)
+static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, unsigned reclevel)
 {
+	int ret;
+
 	struct starpu_task *task = create_task(TAG21_AUX(k, j, reclevel));
 
-	task->cl = &cl21;	
+	task->cl = &cl21;
 
 	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k); 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
-	task->buffers[1].mode = STARPU_RW;
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
 
-	if (j == k+1) {
+	if (j == k+1)
+	{
 		task->priority = STARPU_MAX_PRIO;
 	}
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 2, TAG11_AUX(k, reclevel), TAG22_AUX(k-1, k, j, reclevel));
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 1, TAG11_AUX(k, reclevel));
 	}
 
-	starpu_task_submit(task);
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 }
 
-static starpu_codelet cl22 =
+static struct starpu_codelet cl22 =
 {
+	.modes = { STARPU_R, STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u22,
+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u22,
+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
 #endif
 	.nbuffers = 3,
 	.model = &chol_model_22
 };
 
-static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
+static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
 {
+	int ret;
+
 /*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j)); */
 
 	struct starpu_task *task = create_task(TAG22_AUX(k, i, j, reclevel));
@@ -128,44 +137,47 @@ static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, uns
 	task->cl = &cl22;
 
 	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, i); 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
-	task->buffers[1].mode = STARPU_R;
-	task->buffers[2].handle = starpu_data_get_sub_data(dataA, 2, i, j); 
-	task->buffers[2].mode = STARPU_RW;
-
-	if ( (i == k + 1) && (j == k +1) ) {
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
+	task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j);
+
+	if ( (i == k + 1) && (j == k +1) )
+	{
 		task->priority = STARPU_MAX_PRIO;
 	}
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 3, TAG22_AUX(k-1, i, j, reclevel), TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 2, TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
 	}
 
-	starpu_task_submit(task);
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 }
 
 
 
 /*
- *	code to bootstrap the factorization 
+ *	code to bootstrap the factorization
  *	and construct the DAG
  */
 
 static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned reclevel)
 {
+	int ret;
+
 	/* create a new codelet */
 	struct starpu_task *entry_task = NULL;
 
 	/* create all the DAG nodes */
 	unsigned i,j,k;
 
-	starpu_data_handle dataA;
+	starpu_data_handle_t dataA;
 
 	/* monitor and partition the A matrix into blocks :
 	 * one block is now determined by 2 unsigned (i,j) */
@@ -173,12 +185,14 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 
-	struct starpu_data_filter f = {
+	struct starpu_data_filter f =
+	{
 		.filter_func = starpu_vertical_block_filter_func,
 		.nchildren = nblocks
 	};
 
-	struct starpu_data_filter f2 = {
+	struct starpu_data_filter f2 =
+	{
 		.filter_func = starpu_block_filter_func,
 		.nchildren = nblocks
 	};
@@ -189,13 +203,16 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 	{
 		struct starpu_task *task = create_task_11(dataA, k, reclevel);
 		/* we defer the launch of the first task */
-		if (k == 0) {
+		if (k == 0)
+		{
 			entry_task = task;
 		}
-		else {
-			starpu_task_submit(task);
+		else
+		{
+			ret = starpu_task_submit(task);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
-		
+
 		for (j = k+1; j<nblocks; j++)
 		{
 			create_task_21(dataA, k, j, reclevel);
@@ -209,7 +226,7 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 	}
 
 	/* schedule the codelet */
-	int ret = starpu_task_submit(entry_task);
+	ret = starpu_task_submit(entry_task);
 	if (STARPU_UNLIKELY(ret == -ENODEV))
 	{
 		FPRINTF(stderr, "No worker may execute this task\n");
@@ -221,9 +238,11 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 		/* stall the application until the end of computations */
 		starpu_tag_wait(TAG11_AUX(nblocks-1, reclevel));
 		starpu_data_unpartition(dataA, 0);
+		starpu_data_unregister(dataA);
 		return;
 	}
-	else {
+	else
+	{
 		STARPU_ASSERT(reclevel == 0);
 		unsigned ndeps_tags = (nblocks - nbigblocks)*(nblocks - nbigblocks);
 
@@ -253,20 +272,26 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
 static void initialize_system(float **A, unsigned dim, unsigned pinned)
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		exit(77);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	starpu_helper_cublas_init();
 
 	if (pinned)
 	{
 		starpu_malloc((void **)A, dim*dim*sizeof(float));
-	} 
-	else {
+	}
+	else
+	{
 		*A = malloc(dim*dim*sizeof(float));
 	}
 }
 
-void cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks)
+void cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned pinned)
 {
 	struct timeval start;
 	struct timeval end;
@@ -284,8 +309,20 @@ void cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, u
 	double flop = (1.0f*size*size*size)/3.0f;
 	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 
-	starpu_helper_cublas_shutdown();
+}
 
+static void shutdown_system(float **matA, unsigned pinned)
+{
+	if (pinned)
+	{
+	     starpu_free(*matA);
+	}
+	else
+	{
+	     free(*matA);
+	}
+
+	starpu_helper_cublas_shutdown();
 	starpu_shutdown();
 }
 
@@ -299,8 +336,6 @@ int main(int argc, char **argv)
 	parse_args(argc, argv);
 
 	float *mat;
-
-	mat = malloc(size*size*sizeof(float));
 	initialize_system(&mat, size, pinned);
 
 	unsigned i,j;
@@ -321,10 +356,12 @@ int main(int argc, char **argv)
 	{
 		for (i = 0; i < size; i++)
 		{
-			if (i <= j) {
+			if (i <= j)
+			{
 				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
-			else {
+			else
+			{
 				FPRINTF(stdout, ".\t");
 			}
 		}
@@ -332,8 +369,7 @@ int main(int argc, char **argv)
 	}
 #endif
 
-
-	cholesky_grain(mat, size, size, nblocks, nbigblocks);
+	cholesky_grain(mat, size, size, nblocks, nbigblocks, pinned);
 
 #ifdef CHECK_OUTPUT
 	FPRINTF(stdout, "Results :\n");
@@ -342,10 +378,12 @@ int main(int argc, char **argv)
 	{
 		for (i = 0; i < size; i++)
 		{
-			if (i <= j) {
+			if (i <= j)
+			{
 				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
-			else {
+			else
+			{
 				FPRINTF(stdout, ".\t");
 				mat[j+i*size] = 0.0f; /* debug */
 			}
@@ -357,7 +395,7 @@ int main(int argc, char **argv)
 	float *test_mat = malloc(size*size*sizeof(float));
 	STARPU_ASSERT(test_mat);
 
-	SSYRK("L", "N", size, size, 1.0f, 
+	SSYRK("L", "N", size, size, 1.0f,
 				mat, size, 0.0f, test_mat, size);
 
 	FPRINTF(stderr, "comparing results ...\n");
@@ -365,16 +403,20 @@ int main(int argc, char **argv)
 	{
 		for (i = 0; i < size; i++)
 		{
-			if (i <= j) {
+			if (i <= j)
+			{
                                 FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
 			}
-			else {
+			else
+			{
 				FPRINTF(stdout, ".\t");
 			}
 		}
 		FPRINTF(stdout, "\n");
 	}
+	free(test_mat);
 #endif
 
+	shutdown_system(&mat, pinned);
 	return 0;
 }

+ 89 - 62
sched_ctx_hypervisor/examples/cholesky/cholesky_implicit.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,45 +17,48 @@
  */
 
 #include "cholesky.h"
-#include "../sched_ctx_utils/sched_ctx_utils.h"
+
 /*
  *	Create the codelets
  */
 
-static starpu_codelet cl11 =
+static struct starpu_codelet cl11 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.type = STARPU_SEQ,
-	.cpu_func = chol_cpu_codelet_update_u11,
+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u11,
+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
 #endif
 	.nbuffers = 1,
+	.modes = {STARPU_RW},
 	.model = &chol_model_11
 };
 
-static starpu_codelet cl21 =
+static struct starpu_codelet cl21 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.type = STARPU_SEQ,
-	.cpu_func = chol_cpu_codelet_update_u21,
+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u21,
+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
 #endif
 	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
 	.model = &chol_model_21
 };
 
-static starpu_codelet cl22 =
+static struct starpu_codelet cl22 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.type = STARPU_SEQ,
 	.max_parallelism = INT_MAX,
-	.cpu_func = chol_cpu_codelet_update_u22,
+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u22,
+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
 #endif
 	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
 	.model = &chol_model_22
 };
 
@@ -69,9 +72,9 @@ static void callback_turn_spmd_on(void *arg __attribute__ ((unused)))
 	cl22.type = STARPU_SPMD;
 }
 
-int hypervisor_tag = 1;
-static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
+static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 {
+	int ret;
 	struct timeval start;
 	struct timeval end;
 
@@ -81,19 +84,22 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
 	gettimeofday(&start, NULL);
 
+	if (bound)
+		starpu_bound_start(0, 0);
 	/* create all the DAG nodes */
 	for (k = 0; k < nblocks; k++)
 	{
-                starpu_data_handle sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
+                starpu_data_handle_t sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
 		if(k == 0 && with_ctxs)
 		{
-			starpu_insert_task(&cl11,
+			 ret = starpu_insert_task(&cl11,
 					   STARPU_PRIORITY, prio_level,
 					   STARPU_RW, sdatakk,
 					   STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
 					   STARPU_HYPERVISOR_TAG, hypervisor_tag,
 					   0);
 			set_hypervisor_conf(START_BENCH, hypervisor_tag++);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 		}
 		else
 			starpu_insert_task(&cl11,
@@ -104,49 +110,53 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
 		for (j = k+1; j<nblocks; j++)
 		{
-                        starpu_data_handle sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
+                        starpu_data_handle_t sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
 
-                        starpu_insert_task(&cl21,
-                                           STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
-                                           STARPU_R, sdatakk,
-                                           STARPU_RW, sdatakj,
-                                           0);
+                        ret = starpu_insert_task(&cl21,
+						 STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
+						 STARPU_R, sdatakk,
+						 STARPU_RW, sdatakj,
+						 0);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 			for (i = k+1; i<nblocks; i++)
 			{
 				if (i <= j)
                                 {
-					starpu_data_handle sdataki = starpu_data_get_sub_data(dataA, 2, k, i);
-					starpu_data_handle sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
+					starpu_data_handle_t sdataki = starpu_data_get_sub_data(dataA, 2, k, i);
+					starpu_data_handle_t sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
 
 					if(k == (nblocks-2) && j == (nblocks-1) &&
 					   i == (k + 1) && with_ctxs)
 					{
-						starpu_insert_task(&cl22,
+						ret = starpu_insert_task(&cl22,
 								   STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
 								   STARPU_R, sdataki,
 								   STARPU_R, sdatakj,
 								   STARPU_RW, sdataij,
 								   STARPU_HYPERVISOR_TAG, hypervisor_tag,
 								   0);
-						
+						STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 						set_hypervisor_conf(END_BENCH, hypervisor_tag++);
 					}
 					
 					else
-						starpu_insert_task(&cl22,
+						ret = starpu_insert_task(&cl22,
 								   STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
 								   STARPU_R, sdataki,
 								   STARPU_R, sdatakj,
 								   STARPU_RW, sdataij,
 								   0);
+						STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 					
-                                }
+                   }
 			}
 		}
 	}
 
 	starpu_task_wait_for_all();
+	if (bound)
+		starpu_bound_stop();
 
 	starpu_data_unpartition(dataA, 0);
 
@@ -166,23 +176,31 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 		FPRINTF(stdout, "%2.2f\n", timing/1000);
 	
 		FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+		if (bound)
+		{
+			double res;
+			starpu_bound_compute(&res, NULL, 0);
+			FPRINTF(stderr, "Theoretical GFlops: %2.2f\n", (flop/res/1000000.0f));
+		}
 	}
 }
 
 static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 {
-	starpu_data_handle dataA;
+	starpu_data_handle_t dataA;
 
 	/* monitor and partition the A matrix into blocks :
 	 * one block is now determined by 2 unsigned (i,j) */
 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
 
-	struct starpu_data_filter f = {
+	struct starpu_data_filter f =
+	{
 		.filter_func = starpu_vertical_block_filter_func,
 		.nchildren = nblocks
 	};
 
-	struct starpu_data_filter f2 = {
+	struct starpu_data_filter f2 =
+	{
 		.filter_func = starpu_block_filter_func,
 		.nchildren = nblocks
 	};
@@ -190,11 +208,25 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
 	_cholesky(dataA, nblocks);
+
+	starpu_data_unregister(dataA);
 }
 
-static void execute_cholesky(float *mat, unsigned size, unsigned nblocks)
+static void execute_cholesky(unsigned size, unsigned nblocks)
 {
+	float *mat;
+	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
+
 	unsigned i,j;
+	for (i = 0; i < size; i++)
+	{
+		for (j = 0; j < size; j++)
+		{
+			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
+		}
+	}
+
 /* #define PRINT_OUTPUT */
 #ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Input :\n");
@@ -203,16 +235,19 @@ static void execute_cholesky(float *mat, unsigned size, unsigned nblocks)
 	{
 		for (i = 0; i < size; i++)
 		{
-			if (i <= j) {
+			if (i <= j)
+			{
 				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
-			else {
+			else
+			{
 				FPRINTF(stdout, ".\t");
 			}
 		}
 		FPRINTF(stdout, "\n");
 	}
 #endif
+
 	cholesky(mat, size, size, nblocks);
 
 #ifdef PRINT_OUTPUT
@@ -221,10 +256,12 @@ static void execute_cholesky(float *mat, unsigned size, unsigned nblocks)
 	{
 		for (i = 0; i < size; i++)
 		{
-			if (i <= j) {
+			if (i <= j)
+			{
 				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
-			else {
+			else
+			{
 				FPRINTF(stdout, ".\t");
 				mat[j+i*size] = 0.0f; /* debug */
 			}
@@ -240,50 +277,56 @@ static void execute_cholesky(float *mat, unsigned size, unsigned nblocks)
 		{
 			for (i = 0; i < size; i++)
 			{
-				if (i > j) {
+				if (i > j)
+				{
 					mat[j+i*size] = 0.0f; /* debug */
 				}
 			}
 		}
 		float *test_mat = malloc(size*size*sizeof(float));
 		STARPU_ASSERT(test_mat);
-	
+
 		SSYRK("L", "N", size, size, 1.0f,
 					mat, size, 0.0f, test_mat, size);
-	
+
 		FPRINTF(stderr, "comparing results ...\n");
 #ifdef PRINT_OUTPUT
 		for (j = 0; j < size; j++)
 		{
 			for (i = 0; i < size; i++)
 			{
-				if (i <= j) {
+				if (i <= j)
+				{
 					FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
 				}
-				else {
+				else
+				{
 					FPRINTF(stdout, ".\t");
 				}
 			}
 			FPRINTF(stdout, "\n");
 		}
 #endif
-	
+
 		for (j = 0; j < size; j++)
 		{
 			for (i = 0; i < size; i++)
 			{
-				if (i <= j) {
+				if (i <= j)
+				{
 	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
 	                                float err = abs(test_mat[j +i*size] - orig);
-	                                if (err > 0.00001) {
+	                                if (err > 0.00001)
+					{
 	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", i, j, test_mat[j +i*size], orig, err);
 	                                        assert(0);
 	                                }
 	                        }
 			}
 	        }
+		free(test_mat);
 	}
-
+	starpu_free(mat);
 }
 
 int main(int argc, char **argv)
@@ -314,23 +357,7 @@ int main(int argc, char **argv)
 	else if(chole2)
 		start_2ndbench(execute_cholesky);
 	else
-	{
-		float *mat;
-		starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
-		
-		unsigned i,j;
-		for (i = 0; i < size; i++)
-		{
-			for (j = 0; j < size; j++)
-			{
-				mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-				/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
-			}
-		}
-
-		execute_cholesky(mat, size, nblocks);
-	}
-
+		execute_cholesky(size, nblocks);
 
 	starpu_helper_cublas_shutdown();
 	starpu_shutdown();

+ 10 - 6
sched_ctx_hypervisor/examples/cholesky/cholesky_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,7 +17,7 @@
 
 #include <starpu_config.h>
 #include "cholesky.h"
-//#include "../common/blas.h"
+#include "../common/blas.h"
 #ifdef STARPU_USE_CUDA
 #include <starpu_cuda.h>
 #ifdef STARPU_HAVE_MAGMA
@@ -55,7 +55,8 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __at
 			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
 				right, ld12, 1.0f, center, ld22);
 		}
-		else {
+		else
+		{
 			/* Parallel CPU kernel */
 			int rank = starpu_combined_worker_get_rank();
 
@@ -113,7 +114,8 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, __attrib
 	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
 	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
 
-	switch (s) {
+	switch (s)
+	{
 		case 0:
 			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			break;
@@ -157,7 +159,8 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 
 	unsigned z;
 
-	switch (s) {
+	switch (s)
+	{
 		case 0:
 
 			/*
@@ -188,7 +191,8 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 			int ret;
 			int info;
 			ret = magma_spotrf_gpu('L', nx, sub11, ld, &info);
-			if (ret != MAGMA_SUCCESS) {
+			if (ret != MAGMA_SUCCESS)
+			{
 				fprintf(stderr, "Error in Magma: %d\n", ret);
 				STARPU_ABORT();
 			}

+ 34 - 27
sched_ctx_hypervisor/examples/cholesky/cholesky_models.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -17,8 +17,8 @@
  */
 
 /*
- * As a convention, in that file, descr[0] is represented by A,
- * 				  descr[1] is B ...
+ * As a convention, in that file, buffers[0] is represented by A,
+ * 				  buffers[1] is B ...
  */
 
 /*
@@ -26,6 +26,7 @@
  */
 
 #include <starpu.h>
+#include "cholesky.h"
 
 /* #define USE_PERTURBATION	1 */
 
@@ -35,11 +36,11 @@
 #define PERTURBATE(a)	(a)
 #endif
 
-static double cpu_chol_task_11_cost(starpu_buffer_descr *descr)
+static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
-	n = starpu_matrix_get_nx(descr[0].handle);
+	n = starpu_matrix_get_nx(task->handles[0]);
 
 	double cost = (((double)(n)*n*n)/1000.0f*0.894/0.79176);
 
@@ -50,11 +51,11 @@ static double cpu_chol_task_11_cost(starpu_buffer_descr *descr)
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_11_cost(starpu_buffer_descr *descr)
+static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
-	n = starpu_matrix_get_nx(descr[0].handle);
+	n = starpu_matrix_get_nx(task->handles[0]);
 
 	double cost = (((double)(n)*n*n)/50.0f/10.75/5.088633/0.9883);
 
@@ -65,11 +66,11 @@ static double cuda_chol_task_11_cost(starpu_buffer_descr *descr)
 	return PERTURBATE(cost);
 }
 
-static double cpu_chol_task_21_cost(starpu_buffer_descr *descr)
+static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
-	n = starpu_matrix_get_nx(descr[0].handle);
+	n = starpu_matrix_get_nx(task->handles[0]);
 
 	double cost = (((double)(n)*n*n)/7706.674/0.95/0.9965);
 
@@ -80,11 +81,11 @@ static double cpu_chol_task_21_cost(starpu_buffer_descr *descr)
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_21_cost(starpu_buffer_descr *descr)
+static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
-	n = starpu_matrix_get_nx(descr[0].handle);
+	n = starpu_matrix_get_nx(task->handles[0]);
 
 	double cost = (((double)(n)*n*n)/50.0f/10.75/87.29520);
 
@@ -95,11 +96,11 @@ static double cuda_chol_task_21_cost(starpu_buffer_descr *descr)
 	return PERTURBATE(cost);
 }
 
-static double cpu_chol_task_22_cost(starpu_buffer_descr *descr)
+static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
-	n = starpu_matrix_get_nx(descr[0].handle);
+	n = starpu_matrix_get_nx(task->handles[0]);
 
 	double cost = (((double)(n)*n*n)/50.0f/10.75/8.0760);
 
@@ -110,11 +111,11 @@ static double cpu_chol_task_22_cost(starpu_buffer_descr *descr)
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
+static double cuda_chol_task_22_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
-	n = starpu_matrix_get_nx(descr[0].handle);
+	n = starpu_matrix_get_nx(task->handles[0]);
 
 	double cost = (((double)(n)*n*n)/50.0f/10.75/76.30666);
 
@@ -125,28 +126,34 @@ static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
 	return PERTURBATE(cost);
 }
 
-struct starpu_perfmodel_t chol_model_11 = {
-	.per_arch = {
-		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_11_cost },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_11_cost }
+struct starpu_perfmodel chol_model_11 =
+{
+	.per_arch =
+	{
+		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_11_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_11_cost }
 	},
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "chol_model_11"
 };
 
-struct starpu_perfmodel_t chol_model_21 = {
-	.per_arch = {
-		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_21_cost },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_21_cost }
+struct starpu_perfmodel chol_model_21 =
+{
+	.per_arch =
+	{
+		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_21_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_21_cost }
 	},
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "chol_model_21"
 };
 
-struct starpu_perfmodel_t chol_model_22 = {
-	.per_arch = {
-		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_22_cost },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_22_cost }
+struct starpu_perfmodel chol_model_22 =
+{
+	.per_arch =
+	{
+		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_22_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_22_cost }
 	},
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "chol_model_22"

+ 103 - 64
sched_ctx_hypervisor/examples/cholesky/cholesky_tag.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -36,96 +36,101 @@ static struct starpu_task *create_task(starpu_tag_t id)
  *	Create the codelets
  */
 
-static starpu_codelet cl11 =
+static struct starpu_codelet cl11 =
 {
+	.modes = { STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u11,
+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u11,
+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
 #endif
 	.nbuffers = 1,
 	.model = &chol_model_11
 };
 
-static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k)
+static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned k)
 {
 /*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
 
 	struct starpu_task *task = create_task(TAG11(k));
-	
+
 	task->cl = &cl11;
 
 	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k);
-	task->buffers[0].mode = STARPU_RW;
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
 
 	/* this is an important task */
 	if (!noprio)
 		task->priority = STARPU_MAX_PRIO;
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
 	}
 
 	return task;
 }
 
-static starpu_codelet cl21 =
+static struct starpu_codelet cl21 =
 {
+	.modes = { STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u21,
+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u21,
+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
 #endif
 	.nbuffers = 2,
 	.model = &chol_model_21
 };
 
-static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j)
+static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
 {
 	struct starpu_task *task = create_task(TAG21(k, j));
 
-	task->cl = &cl21;	
+	task->cl = &cl21;
 
 	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k); 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
-	task->buffers[1].mode = STARPU_RW;
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
 
-	if (!noprio && (j == k+1)) {
+	if (!noprio && (j == k+1))
+	{
 		task->priority = STARPU_MAX_PRIO;
 	}
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
 	}
 
 	int ret = starpu_task_submit(task);
-        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+        if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
                 FPRINTF(stderr, "No worker may execute this task\n");
                 exit(0);
         }
 
 }
 
-static starpu_codelet cl22 =
+static struct starpu_codelet cl22 =
 {
+	.modes = { STARPU_R, STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u22,
+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u22,
+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
 #endif
 	.nbuffers = 3,
 	.model = &chol_model_22
 };
 
-static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
+static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j)
 {
 /*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
 
@@ -134,27 +139,28 @@ static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, uns
 	task->cl = &cl22;
 
 	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, i); 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
-	task->buffers[1].mode = STARPU_R;
-	task->buffers[2].handle = starpu_data_get_sub_data(dataA, 2, i, j); 
-	task->buffers[2].mode = STARPU_RW;
-
-	if (!noprio && (i == k + 1) && (j == k +1) ) {
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
+	task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j);
+
+	if (!noprio && (i == k + 1) && (j == k +1) )
+	{
 		task->priority = STARPU_MAX_PRIO;
 	}
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
 	}
 
 	int ret = starpu_task_submit(task);
-        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+        if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
                 FPRINTF(stderr, "No worker may execute this task\n");
                 exit(0);
         }
@@ -163,11 +169,11 @@ static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, uns
 
 
 /*
- *	code to bootstrap the factorization 
+ *	code to bootstrap the factorization
  *	and construct the DAG
  */
 
-static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
+static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 {
 	struct timeval start;
 	struct timeval end;
@@ -183,18 +189,21 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 	{
 		struct starpu_task *task = create_task_11(dataA, k);
 		/* we defer the launch of the first task */
-		if (k == 0) {
+		if (k == 0)
+		{
 			entry_task = task;
 		}
-		else {
+		else
+		{
 			int ret = starpu_task_submit(task);
-                        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+                        if (STARPU_UNLIKELY(ret == -ENODEV))
+			{
                                 FPRINTF(stderr, "No worker may execute this task\n");
                                 exit(0);
                         }
 
 		}
-		
+
 		for (j = k+1; j<nblocks; j++)
 		{
 			create_task_21(dataA, k, j);
@@ -209,7 +218,8 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
 	/* schedule the codelet */
 	int ret = starpu_task_submit(entry_task);
-        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+        if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
                 FPRINTF(stderr, "No worker may execute this task\n");
                 exit(0);
         }
@@ -233,24 +243,31 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 }
 
-static void initialize_system(float **A, unsigned dim, unsigned pinned)
+static int initialize_system(float **A, unsigned dim, unsigned pinned)
 {
-	starpu_init(NULL);
-	
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
 	starpu_helper_cublas_init();
 
 	if (pinned)
 	{
 		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
-	} 
-	else {
+	}
+	else
+	{
 		*A = malloc(dim*dim*sizeof(float));
 	}
+	return 0;
 }
 
 static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 {
-	starpu_data_handle dataA;
+	starpu_data_handle_t dataA;
 
 	/* monitor and partition the A matrix into blocks :
 	 * one block is now determined by 2 unsigned (i,j) */
@@ -258,12 +275,14 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 
-	struct starpu_data_filter f = {
+	struct starpu_data_filter f =
+	{
 		.filter_func = starpu_vertical_block_filter_func,
 		.nchildren = nblocks
 	};
 
-	struct starpu_data_filter f2 = {
+	struct starpu_data_filter f2 =
+	{
 		.filter_func = starpu_block_filter_func,
 		.nchildren = nblocks
 	};
@@ -272,8 +291,21 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
 	_cholesky(dataA, nblocks);
 
-	starpu_helper_cublas_shutdown();
+	starpu_data_unregister(dataA);
+}
+
+static void shutdown_system(float **matA, unsigned pinned)
+{
+	if (pinned)
+	{
+		starpu_free(*matA);
+	}
+	else
+	{
+		free(*matA);
+	}
 
+	starpu_helper_cublas_shutdown();
 	starpu_shutdown();
 }
 
@@ -287,9 +319,8 @@ int main(int argc, char **argv)
 	parse_args(argc, argv);
 
 	float *mat;
-
-	mat = malloc(size*size*sizeof(float));
-	initialize_system(&mat, size, pinned);
+	int ret = initialize_system(&mat, size, pinned);
+	if (ret) return ret;
 
 	unsigned i,j;
 	for (i = 0; i < size; i++)
@@ -309,10 +340,12 @@ int main(int argc, char **argv)
 	{
 		for (i = 0; i < size; i++)
 		{
-			if (i <= j) {
+			if (i <= j)
+			{
 				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
-			else {
+			else
+			{
 				FPRINTF(stdout, ".\t");
 			}
 		}
@@ -330,10 +363,12 @@ int main(int argc, char **argv)
 	{
 		for (i = 0; i < size; i++)
 		{
-			if (i <= j) {
+			if (i <= j)
+			{
 				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
-			else {
+			else
+			{
 				FPRINTF(stdout, ".\t");
 				mat[j+i*size] = 0.0f; /* debug */
 			}
@@ -345,7 +380,7 @@ int main(int argc, char **argv)
 	float *test_mat = malloc(size*size*sizeof(float));
 	STARPU_ASSERT(test_mat);
 
-	SSYRK("L", "N", size, size, 1.0f, 
+	SSYRK("L", "N", size, size, 1.0f,
 				mat, size, 0.0f, test_mat, size);
 
 	FPRINTF(stderr, "comparing results ...\n");
@@ -353,16 +388,20 @@ int main(int argc, char **argv)
 	{
 		for (i = 0; i < size; i++)
 		{
-			if (i <= j) {
+			if (i <= j)
+			{
 				FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
 			}
-			else {
+			else
+			{
 				FPRINTF(stdout, ".\t");
 			}
 		}
 		FPRINTF(stdout, "\n");
 	}
+	free(test_mat);
 #endif
 
+	shutdown_system(&mat, pinned);
 	return 0;
 }

+ 75 - 54
sched_ctx_hypervisor/examples/cholesky/cholesky_tile_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,7 +19,7 @@
 
 /* A [ y ] [ x ] */
 float *A[NMAXBLOCKS][NMAXBLOCKS];
-starpu_data_handle A_state[NMAXBLOCKS][NMAXBLOCKS];
+starpu_data_handle_t A_state[NMAXBLOCKS][NMAXBLOCKS];
 
 /*
  *	Some useful functions
@@ -39,12 +39,13 @@ static struct starpu_task *create_task(starpu_tag_t id)
  *	Create the codelets
  */
 
-static starpu_codelet cl11 =
+static struct starpu_codelet cl11 =
 {
+	.modes = { STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
-	.cpu_func = chol_cpu_codelet_update_u11,
+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u11,
+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
 #endif
 #ifdef STARPU_USE_GORDON
 #ifdef SPU_FUNC_POTRF
@@ -66,26 +67,27 @@ static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 	task->cl = &cl11;
 
 	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = A_state[k][k];
-	task->buffers[0].mode = STARPU_RW;
+	task->handles[0] = A_state[k][k];
 
 	/* this is an important task */
 	task->priority = STARPU_MAX_PRIO;
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
 	}
 
 	return task;
 }
 
-static starpu_codelet cl21 =
+static struct starpu_codelet cl21 =
 {
+	.modes = { STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
-	.cpu_func = chol_cpu_codelet_update_u21,
+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u21,
+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
 #endif
 #ifdef STARPU_USE_GORDON
 #ifdef SPU_FUNC_STRSM
@@ -100,37 +102,42 @@ static starpu_codelet cl21 =
 
 static void create_task_21(unsigned k, unsigned j)
 {
+	int ret;
+
 	struct starpu_task *task = create_task(TAG21(k, j));
 
 	task->cl = &cl21;	
 
 	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = A_state[k][k]; 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = A_state[j][k]; 
-	task->buffers[1].mode = STARPU_RW;
+	task->handles[0] = A_state[k][k];
+	task->handles[1] = A_state[j][k];
 
-	if (j == k+1) {
+	if (j == k+1)
+	{
 		task->priority = STARPU_MAX_PRIO;
 	}
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
 	}
 
-	starpu_task_submit(task);
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 }
 
-static starpu_codelet cl22 =
+static struct starpu_codelet cl22 =
 {
+	.modes = { STARPU_R, STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
-	.cpu_func = chol_cpu_codelet_update_u22,
+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u22,
+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
 #endif
 #ifdef STARPU_USE_GORDON
 #ifdef SPU_FUNC_SGEMM
@@ -145,6 +152,8 @@ static starpu_codelet cl22 =
 
 static void create_task_22(unsigned k, unsigned i, unsigned j)
 {
+	int ret;
+
 /*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
 
 	struct starpu_task *task = create_task(TAG22(k, i, j));
@@ -152,26 +161,27 @@ static void create_task_22(unsigned k, unsigned i, unsigned j)
 	task->cl = &cl22;
 
 	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = A_state[i][k]; 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = A_state[j][k]; 
-	task->buffers[1].mode = STARPU_R;
-	task->buffers[2].handle = A_state[j][i]; 
-	task->buffers[2].mode = STARPU_RW;
-
-	if ( (i == k + 1) && (j == k +1) ) {
+	task->handles[0] = A_state[i][k];
+	task->handles[1] = A_state[j][k];
+	task->handles[2] = A_state[j][i];
+
+	if ( (i == k + 1) && (j == k +1) )
+	{
 		task->priority = STARPU_MAX_PRIO;
 	}
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
 	}
 
-	starpu_task_submit(task);
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 }
 
 
@@ -183,6 +193,8 @@ static void create_task_22(unsigned k, unsigned i, unsigned j)
 
 static void cholesky_no_stride(void)
 {
+	int ret;
+
 	struct timeval start;
 	struct timeval end;
 
@@ -195,11 +207,14 @@ static void cholesky_no_stride(void)
 	{
 		struct starpu_task *task = create_task_11(k, nblocks);
 		/* we defer the launch of the first task */
-		if (k == 0) {
+		if (k == 0)
+		{
 			entry_task = task;
 		}
-		else {
-			starpu_task_submit(task);
+		else
+		{
+			ret = starpu_task_submit(task);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
 		
 		for (j = k+1; j<nblocks; j++)
@@ -216,7 +231,8 @@ static void cholesky_no_stride(void)
 
 	/* schedule the codelet */
 	gettimeofday(&start, NULL);
-	starpu_task_submit(entry_task);
+	ret = starpu_task_submit(entry_task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 	/* stall the application until the end of computations */
 	starpu_tag_wait(TAG11(nblocks-1));
@@ -235,13 +251,17 @@ int main(int argc, char **argv)
 {
 	unsigned x, y;
 	unsigned i, j;
+	int ret;
 
 	parse_args(argc, argv);
 	assert(nblocks <= NMAXBLOCKS);
 
 	FPRINTF(stderr, "BLOCK SIZE = %d\n", size / nblocks);
 
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* Disable sequential consistency */
 	starpu_data_set_default_sequential_consistency_flag(0);
@@ -251,17 +271,8 @@ int main(int argc, char **argv)
 	for (y = 0; y < nblocks; y++)
 	for (x = 0; x < nblocks; x++)
 	{
-		if (x <= y) {
-			A[y][x] = malloc(BLOCKSIZE*BLOCKSIZE*sizeof(float));
-			assert(A[y][x]);
-		}
-	}
-
-
-	for (y = 0; y < nblocks; y++)
-	for (x = 0; x < nblocks; x++)
-	{
-		if (x <= y) {
+		if (x <= y)
+		{
 #ifdef STARPU_HAVE_POSIX_MEMALIGN
 			posix_memalign((void **)&A[y][x], 128, BLOCKSIZE*BLOCKSIZE*sizeof(float));
 #else
@@ -277,7 +288,8 @@ int main(int argc, char **argv)
 	 * */
 	for (y = 0; y < nblocks; y++)
 	for (x = 0; x < nblocks; x++)
-	if (x <= y) {
+	if (x <= y)
+	{
 		for (i = 0; i < BLOCKSIZE; i++)
 		for (j = 0; j < BLOCKSIZE; j++)
 		{
@@ -290,12 +302,11 @@ int main(int argc, char **argv)
 		}
 	}
 
-
-
 	for (y = 0; y < nblocks; y++)
 	for (x = 0; x < nblocks; x++)
 	{
-		if (x <= y) {
+		if (x <= y)
+		{
 			starpu_matrix_data_register(&A_state[y][x], 0, (uintptr_t)A[y][x], 
 				BLOCKSIZE, BLOCKSIZE, BLOCKSIZE, sizeof(float));
 		}
@@ -303,6 +314,16 @@ int main(int argc, char **argv)
 
 	cholesky_no_stride();
 
+	for (y = 0; y < nblocks; y++)
+	for (x = 0; x < nblocks; x++)
+	{
+		if (x <= y)
+		{
+			starpu_data_unregister(A_state[y][x]);
+			free(A[y][x]);
+		}
+	}
+
 	starpu_helper_cublas_shutdown();
 
 	starpu_shutdown();