
Hypervisor tagged tasks + bug fixing

Andra Hugo 14 years ago
parent
commit
bfd7f5a773

+ 5 - 0
include/starpu_scheduler.h

@@ -130,10 +130,15 @@ struct worker_collection {
 struct starpu_sched_ctx_hypervisor_criteria {
 	void (*idle_time_cb)(unsigned sched_ctx, int worker, double idle_time);
 	void (*working_time_cb)(unsigned sched_ctx, double working_time);
+	void (*pushed_task_cb)(unsigned sched_ctx, int worker);
+	void (*poped_task_cb)(unsigned sched_ctx, int worker);
+	void (*post_exec_hook_cb)(unsigned sched_ctx, int taskid);
 };
 
 #ifdef STARPU_BUILD_SCHED_CTX_HYPERVISOR
 unsigned starpu_create_sched_ctx_with_criteria(const char *policy_name, int *workerids_ctx, int nworkers_ctx, const char *sched_name, struct starpu_sched_ctx_hypervisor_criteria *criteria);
+void starpu_call_poped_task_cb(int workerid);
+void starpu_call_pushed_task_cb(int workerid);
 #endif //STARPU_BUILD_SCHED_CTX_HYPERVISOR
 
 unsigned starpu_create_sched_ctx(const char *policy_name, int *workerids_ctx, int nworkers_ctx, const char *sched_name);
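The three new callbacks give a resizing hypervisor visibility into task push, pop and post-execution events, in addition to the existing idle/working-time notifications, and starpu_call_pushed_task_cb()/starpu_call_poped_task_cb() are the entry points a scheduling policy uses to trigger them. A minimal sketch of how the pieces could fit together, assuming STARPU_BUILD_SCHED_CTX_HYPERVISOR is defined; the callback bodies and the policy-side helper are illustrative only (in this commit the real criteria structure is produced by sched_ctx_hypervisor_init(), as the example code further down shows):

#ifdef STARPU_BUILD_SCHED_CTX_HYPERVISOR
/* Hypothetical hypervisor-side handlers updated by the callbacks. */
static void my_pushed_task_cb(unsigned sched_ctx, int worker)
{
	/* e.g. record that one more task was queued on this worker */
}

static void my_poped_task_cb(unsigned sched_ctx, int worker)
{
	/* e.g. record that a task left this worker's queue */
}

static void my_post_exec_hook_cb(unsigned sched_ctx, int taskid)
{
	/* e.g. check whether taskid is a checkpoint and resize the contexts */
}

static struct starpu_sched_ctx_hypervisor_criteria my_criteria = {
	.idle_time_cb      = NULL,
	.working_time_cb   = NULL,
	.pushed_task_cb    = my_pushed_task_cb,
	.poped_task_cb     = my_poped_task_cb,
	.post_exec_hook_cb = my_post_exec_hook_cb,
};

/* A context created with these criteria receives the notifications:
 * unsigned ctx = starpu_create_sched_ctx_with_criteria("heft", workerids,
 *                                                      nworkers, "ctx", &my_criteria);
 * and a scheduling policy would notify the hypervisor from its push/pop paths: */
static void notify_hypervisor(int workerid, int pushed)
{
	if (pushed)
		starpu_call_pushed_task_cb(workerid);
	else
		starpu_call_poped_task_cb(workerid);
}
#endif /* STARPU_BUILD_SCHED_CTX_HYPERVISOR */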

+ 4 - 1
include/starpu_task.h

@@ -182,6 +182,8 @@ struct starpu_task {
 	/* flag to differentiate tasks needed by starpu management purposes 
 	 from the ones provided by the appl*/
 	unsigned control_task;
+
+	int checkpoint;
 };
 
 /* It is possible to initialize statically allocated tasks with this value.
@@ -207,7 +209,8 @@ struct starpu_task {
 	.predicted = -1.0,				\
 	.starpu_private = NULL,				\
 	.sched_ctx = 0,					\
-	.control_task = 0				\
+	.control_task = 0,				\
+		.checkpoint = 0				\
 };
 
 /*

+ 1 - 1
include/starpu_util.h

@@ -239,7 +239,7 @@ int starpu_data_cpy(starpu_data_handle dst_handle, starpu_data_handle src_handle
 #define STARPU_PRIORITY		(1<<7)	/* Priority associated to the task */
 #define STARPU_EXECUTE_ON_NODE	(1<<8)	/* Used by MPI to define which task is going to execute the codelet */
 #define STARPU_EXECUTE_ON_DATA	(1<<9)	/* Used by MPI to define which task is going to execute the codelet */
-#define STARPU_CTX		(1<<10)	/* Used to define which ctx will execute the codelet */
+#define STARPU_CHECKPOINT	(1<<10)	/* Used to mark a task as a checkpoint after whose execution some code will be executed */
 
 /* Wrapper to create a task. */
 int starpu_insert_task(starpu_codelet *cl, ...);
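STARPU_CHECKPOINT replaces STARPU_CTX at bit 10 and is passed to starpu_insert_task() as a (flag, int) pair, which presumably fills the new checkpoint field of the generated task; the Cholesky example added in this commit uses it to mark the tasks at which the hypervisor should reconfigure itself. A condensed sketch of that usage, assuming a codelet cl11, a handle sdatakk and the set_hypervisor_conf()/START_BENCH helpers from the example code further down:

static int checkpoint_id = 1;

starpu_insert_task(&cl11,
		   STARPU_PRIORITY, STARPU_MAX_PRIO,
		   STARPU_RW, sdatakk,
		   STARPU_CHECKPOINT, checkpoint_id,
		   0);

/* tell the hypervisor which configuration to apply at this checkpoint */
set_hypervisor_conf(START_BENCH, checkpoint_id++);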

+ 139 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky.h

@@ -0,0 +1,139 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_CHOLESKY_H__
+#define __DW_CHOLESKY_H__
+
+#include <limits.h>
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+#ifdef STARPU_USE_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cublas.h>
+#endif
+
+#include <common/blas.h>
+#include <starpu.h>
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+#define NMAXBLOCKS	32
+
+#define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
+#define TAG21(k,j)	((starpu_tag_t)(((3ULL<<60) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG22(k,i,j)	((starpu_tag_t)(((4ULL<<60) | ((unsigned long long)(k)<<32) 	\
+					| ((unsigned long long)(i)<<16)	\
+					| (unsigned long long)(j))))
+
+#define TAG11_AUX(k, prefix)	((starpu_tag_t)( (((unsigned long long)(prefix))<<60)  |  (1ULL<<56) | (unsigned long long)(k)))
+#define TAG21_AUX(k,j, prefix)	((starpu_tag_t)( (((unsigned long long)(prefix))<<60)  			\
+					|  ((3ULL<<56) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG22_AUX(k,i,j, prefix)    ((starpu_tag_t)(  (((unsigned long long)(prefix))<<60)	\
+					|  ((4ULL<<56) | ((unsigned long long)(k)<<32)  	\
+					| ((unsigned long long)(i)<<16) 			\
+					| (unsigned long long)(j))))
+
+#define BLOCKSIZE	(size/nblocks)
+
+#define BLAS3_FLOP(n1,n2,n3)    \
+        (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
+
+static unsigned size = 4*1024;
+static unsigned nblocks = 16;
+static unsigned nbigblocks = 8;
+static unsigned pinned = 0;
+static unsigned noprio = 0;
+static unsigned check = 0;
+static unsigned with_ctxs = 0;
+static unsigned with_noctxs = 0;
+static unsigned chole1 = 0;
+static unsigned chole2 = 0;
+
+void chol_cpu_codelet_update_u11(void **, void *);
+void chol_cpu_codelet_update_u21(void **, void *);
+void chol_cpu_codelet_update_u22(void **, void *);
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u11(void *descr[], void *_args);
+void chol_cublas_codelet_update_u21(void *descr[], void *_args);
+void chol_cublas_codelet_update_u22(void *descr[], void *_args);
+#endif
+
+extern struct starpu_perfmodel_t chol_model_11;
+extern struct starpu_perfmodel_t chol_model_21;
+extern struct starpu_perfmodel_t chol_model_22;
+
+static void __attribute__((unused)) parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-with_ctxs") == 0) {
+			with_ctxs = 1;
+			break;
+		}
+		if (strcmp(argv[i], "-with_noctxs") == 0) {
+			with_noctxs = 1;
+			break;
+		}
+		
+		if (strcmp(argv[i], "-chole1") == 0) {
+			chole1 = 1;
+			break;
+		}
+
+		if (strcmp(argv[i], "-chole2") == 0) {
+			chole2 = 1;
+			break;
+		}
+
+		if (strcmp(argv[i], "-size") == 0) {
+			char *argptr;
+			size = strtol(argv[++i], &argptr, 10);
+		}
+		
+		if (strcmp(argv[i], "-nblocks") == 0) {
+			char *argptr;
+			nblocks = strtol(argv[++i], &argptr, 10);
+		}
+		
+		if (strcmp(argv[i], "-nbigblocks") == 0) {
+			char *argptr;
+			nbigblocks = strtol(argv[++i], &argptr, 10);
+		}
+		
+		if (strcmp(argv[i], "-pin") == 0) {
+			pinned = 1;
+		}
+		
+		if (strcmp(argv[i], "-no-prio") == 0) {
+			noprio = 1;
+		}
+		
+		if (strcmp(argv[i], "-check") == 0) {
+			check = 1;
+		}
+		
+		if (strcmp(argv[i], "-h") == 0) {
+			printf("usage : %s [-pin] [-size size] [-nblocks nblocks] [-check]\n", argv[0]);
+		}	
+	}	
+}
+
+#endif /* __DW_CHOLESKY_H__ */

+ 380 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky_grain_tag.c

@@ -0,0 +1,380 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "cholesky.h"
+
+/*
+ *	Some useful functions
+ */
+
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *task = starpu_task_create();
+		task->cl_arg = NULL;
+		task->use_tag = 1;
+		task->tag_id = id;
+
+	return task;
+}
+
+/*
+ *	Create the codelets
+ */
+
+static starpu_codelet cl11 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_func = chol_cpu_codelet_update_u11,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u11,
+#endif
+	.nbuffers = 1,
+	.model = &chol_model_11
+};
+
+static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k, unsigned reclevel)
+{
+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
+
+	struct starpu_task *task = create_task(TAG11_AUX(k, reclevel));
+	
+	task->cl = &cl11;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k);
+	task->buffers[0].mode = STARPU_RW;
+
+	/* this is an important task */
+	task->priority = STARPU_MAX_PRIO;
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG11_AUX(k, reclevel), 1, TAG22_AUX(k-1, k, k, reclevel));
+	}
+
+	return task;
+}
+
+static starpu_codelet cl21 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_func = chol_cpu_codelet_update_u21,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u21,
+#endif
+	.nbuffers = 2,
+	.model = &chol_model_21
+};
+
+static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j, unsigned reclevel)
+{
+	struct starpu_task *task = create_task(TAG21_AUX(k, j, reclevel));
+
+	task->cl = &cl21;	
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k); 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
+	task->buffers[1].mode = STARPU_RW;
+
+	if (j == k+1) {
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 2, TAG11_AUX(k, reclevel), TAG22_AUX(k-1, k, j, reclevel));
+	}
+	else {
+		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 1, TAG11_AUX(k, reclevel));
+	}
+
+	starpu_task_submit(task);
+}
+
+static starpu_codelet cl22 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_func = chol_cpu_codelet_update_u22,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u22,
+#endif
+	.nbuffers = 3,
+	.model = &chol_model_22
+};
+
+static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
+{
+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j)); */
+
+	struct starpu_task *task = create_task(TAG22_AUX(k, i, j, reclevel));
+
+	task->cl = &cl22;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, i); 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
+	task->buffers[1].mode = STARPU_R;
+	task->buffers[2].handle = starpu_data_get_sub_data(dataA, 2, i, j); 
+	task->buffers[2].mode = STARPU_RW;
+
+	if ( (i == k + 1) && (j == k +1) ) {
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 3, TAG22_AUX(k-1, i, j, reclevel), TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
+	}
+	else {
+		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 2, TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
+	}
+
+	starpu_task_submit(task);
+}
+
+
+
+/*
+ *	code to bootstrap the factorization 
+ *	and construct the DAG
+ */
+
+static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned reclevel)
+{
+	/* create a new codelet */
+	struct starpu_task *entry_task = NULL;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	starpu_data_handle dataA;
+
+	/* monitor and partition the A matrix into blocks :
+	 * one block is now determined by 2 unsigned (i,j) */
+	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
+
+	starpu_data_set_sequential_consistency_flag(dataA, 0);
+
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	starpu_data_map_filters(dataA, 2, &f, &f2);
+
+	for (k = 0; k < nbigblocks; k++)
+	{
+		struct starpu_task *task = create_task_11(dataA, k, reclevel);
+		/* we defer the launch of the first task */
+		if (k == 0) {
+			entry_task = task;
+		}
+		else {
+			starpu_task_submit(task);
+		}
+		
+		for (j = k+1; j<nblocks; j++)
+		{
+			create_task_21(dataA, k, j, reclevel);
+
+			for (i = k+1; i<nblocks; i++)
+			{
+				if (i <= j)
+					create_task_22(dataA, k, i, j, reclevel);
+			}
+		}
+	}
+
+	/* schedule the codelet */
+	int ret = starpu_task_submit(entry_task);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+		FPRINTF(stderr, "No worker may execute this task\n");
+		exit(-1);
+	}
+
+	if (nblocks == nbigblocks)
+	{
+		/* stall the application until the end of computations */
+		starpu_tag_wait(TAG11_AUX(nblocks-1, reclevel));
+		starpu_data_unpartition(dataA, 0);
+		return;
+	}
+	else {
+		STARPU_ASSERT(reclevel == 0);
+		unsigned ndeps_tags = (nblocks - nbigblocks)*(nblocks - nbigblocks);
+
+		starpu_tag_t *tag_array = malloc(ndeps_tags*sizeof(starpu_tag_t));
+		STARPU_ASSERT(tag_array);
+
+		unsigned ind = 0;
+		for (i = nbigblocks; i < nblocks; i++)
+		for (j = nbigblocks; j < nblocks; j++)
+		{
+			if (i <= j)
+				tag_array[ind++] = TAG22_AUX(nbigblocks - 1, i, j, reclevel);
+		}
+
+		starpu_tag_wait_array(ind, tag_array);
+
+		free(tag_array);
+
+		starpu_data_unpartition(dataA, 0);
+		starpu_data_unregister(dataA);
+
+		float *newmatA = &matA[nbigblocks*(size/nblocks)*(ld+1)];
+
+		cholesky_grain_rec(newmatA, size/nblocks*(nblocks - nbigblocks), ld, (nblocks - nbigblocks)*2, (nblocks - nbigblocks)*2, reclevel+1);
+	}
+}
+
+static void initialize_system(float **A, unsigned dim, unsigned pinned)
+{
+	starpu_init(NULL);
+
+	starpu_helper_cublas_init();
+
+	if (pinned)
+	{
+		starpu_malloc((void **)A, dim*dim*sizeof(float));
+	} 
+	else {
+		*A = malloc(dim*dim*sizeof(float));
+	}
+}
+
+void cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks)
+{
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	cholesky_grain_rec(matA, size, ld, nblocks, nbigblocks, 0);
+
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
+
+	double flop = (1.0f*size*size*size)/3.0f;
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+
+	starpu_helper_cublas_shutdown();
+
+	starpu_shutdown();
+}
+
+int main(int argc, char **argv)
+{
+	/* create a simple definite positive symetric matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	parse_args(argc, argv);
+
+	float *mat;
+
+	mat = malloc(size*size*sizeof(float));
+	initialize_system(&mat, size, pinned);
+
+	unsigned i,j;
+	for (i = 0; i < size; i++)
+	{
+		for (j = 0; j < size; j++)
+		{
+			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
+		}
+	}
+
+
+#ifdef CHECK_OUTPUT
+	FPRINTF(stdout, "Input :\n");
+
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+			}
+			else {
+				FPRINTF(stdout, ".\t");
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+
+	cholesky_grain(mat, size, size, nblocks, nbigblocks);
+
+#ifdef CHECK_OUTPUT
+	FPRINTF(stdout, "Results :\n");
+
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+			}
+			else {
+				FPRINTF(stdout, ".\t");
+				mat[j+i*size] = 0.0f; /* debug */
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+
+	FPRINTF(stderr, "compute explicit LLt ...\n");
+	float *test_mat = malloc(size*size*sizeof(float));
+	STARPU_ASSERT(test_mat);
+
+	SSYRK("L", "N", size, size, 1.0f, 
+				mat, size, 0.0f, test_mat, size);
+
+	FPRINTF(stderr, "comparing results ...\n");
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+                                FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
+			}
+			else {
+				FPRINTF(stdout, ".\t");
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+	return 0;
+}

+ 337 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky_implicit.c

@@ -0,0 +1,337 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "cholesky.h"
+#include "../sched_ctx_utils/sched_ctx_utils.h"
+/*
+ *	Create the codelets
+ */
+
+static starpu_codelet cl11 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.type = STARPU_SEQ,
+	.cpu_func = chol_cpu_codelet_update_u11,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u11,
+#endif
+	.nbuffers = 1,
+	.model = &chol_model_11
+};
+
+static starpu_codelet cl21 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.type = STARPU_SEQ,
+	.cpu_func = chol_cpu_codelet_update_u21,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u21,
+#endif
+	.nbuffers = 2,
+	.model = &chol_model_21
+};
+
+static starpu_codelet cl22 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.type = STARPU_SEQ,
+	.max_parallelism = INT_MAX,
+	.cpu_func = chol_cpu_codelet_update_u22,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u22,
+#endif
+	.nbuffers = 3,
+	.model = &chol_model_22
+};
+
+/*
+ *	code to bootstrap the factorization
+ *	and construct the DAG
+ */
+
+static void callback_turn_spmd_on(void *arg __attribute__ ((unused)))
+{
+	cl22.type = STARPU_SPMD;
+}
+
+int checkpoints = 1;
+static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
+{
+	struct timeval start;
+	struct timeval end;
+
+	unsigned i,j,k;
+
+	int prio_level = noprio?STARPU_DEFAULT_PRIO:STARPU_MAX_PRIO;
+
+	gettimeofday(&start, NULL);
+
+	/* create all the DAG nodes */
+	for (k = 0; k < nblocks; k++)
+	{
+                starpu_data_handle sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
+		if(k == 0 && with_ctxs)
+		{
+			starpu_insert_task(&cl11,
+					   STARPU_PRIORITY, prio_level,
+					   STARPU_RW, sdatakk,
+					   STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
+					   STARPU_CHECKPOINT, checkpoints,
+					   0);
+			set_hypervisor_conf(START_BENCH, checkpoints++);
+		}
+		else
+			starpu_insert_task(&cl11,
+					   STARPU_PRIORITY, prio_level,
+					   STARPU_RW, sdatakk,
+					   STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
+					   0);
+
+		for (j = k+1; j<nblocks; j++)
+		{
+                        starpu_data_handle sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
+
+                        starpu_insert_task(&cl21,
+                                           STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
+                                           STARPU_R, sdatakk,
+                                           STARPU_RW, sdatakj,
+                                           0);
+
+			for (i = k+1; i<nblocks; i++)
+			{
+				if (i <= j)
+                                {
+					starpu_data_handle sdataki = starpu_data_get_sub_data(dataA, 2, k, i);
+					starpu_data_handle sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
+
+					if(k == (nblocks-1) && j == (nblocks-1) &&
+					   i == (nblocks-1) && with_ctxs)
+					{
+						starpu_insert_task(&cl22,
+								   STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
+								   STARPU_R, sdataki,
+								   STARPU_R, sdatakj,
+								   STARPU_RW, sdataij,
+								   STARPU_CHECKPOINT, checkpoints,
+								   0);
+						
+						set_hypervisor_conf(END_BENCH, checkpoints++);
+					}
+					
+					else
+						starpu_insert_task(&cl22,
+								   STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
+								   STARPU_R, sdataki,
+								   STARPU_R, sdatakj,
+								   STARPU_RW, sdataij,
+								   0);
+					
+                                }
+			}
+		}
+	}
+
+	starpu_task_wait_for_all();
+
+	starpu_data_unpartition(dataA, 0);
+
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+	unsigned long n = starpu_matrix_get_nx(dataA);
+
+	double flop = (1.0f*n*n*n)/3.0f;
+
+	if(with_ctxs || with_noctxs || chole1 || chole2)
+		update_sched_ctx_timing_results((flop/timing/1000.0f), (timing/1000000.0f));
+	else
+	{
+		FPRINTF(stderr, "Computation took (in ms)\n");
+		FPRINTF(stdout, "%2.2f\n", timing/1000);
+	
+		FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	}
+}
+
+static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
+{
+	starpu_data_handle dataA;
+
+	/* monitor and partition the A matrix into blocks :
+	 * one block is now determined by 2 unsigned (i,j) */
+	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
+
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	starpu_data_map_filters(dataA, 2, &f, &f2);
+
+	_cholesky(dataA, nblocks);
+}
+
+static void execute_cholesky(unsigned size, unsigned nblocks)
+{
+	float *mat;
+	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
+
+	unsigned i,j;
+	for (i = 0; i < size; i++)
+	{
+		for (j = 0; j < size; j++)
+		{
+			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
+		}
+	}
+
+/* #define PRINT_OUTPUT */
+#ifdef PRINT_OUTPUT
+	FPRINTF(stdout, "Input :\n");
+
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+			}
+			else {
+				FPRINTF(stdout, ".\t");
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+	cholesky(mat, size, size, nblocks);
+
+#ifdef PRINT_OUTPUT
+	FPRINTF(stdout, "Results :\n");
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+			}
+			else {
+				FPRINTF(stdout, ".\t");
+				mat[j+i*size] = 0.0f; /* debug */
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+	if (check)
+	{
+		FPRINTF(stderr, "compute explicit LLt ...\n");
+		for (j = 0; j < size; j++)
+		{
+			for (i = 0; i < size; i++)
+			{
+				if (i > j) {
+					mat[j+i*size] = 0.0f; /* debug */
+				}
+			}
+		}
+		float *test_mat = malloc(size*size*sizeof(float));
+		STARPU_ASSERT(test_mat);
+	
+		SSYRK("L", "N", size, size, 1.0f,
+					mat, size, 0.0f, test_mat, size);
+	
+		FPRINTF(stderr, "comparing results ...\n");
+#ifdef PRINT_OUTPUT
+		for (j = 0; j < size; j++)
+		{
+			for (i = 0; i < size; i++)
+			{
+				if (i <= j) {
+					FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
+				}
+				else {
+					FPRINTF(stdout, ".\t");
+				}
+			}
+			FPRINTF(stdout, "\n");
+		}
+#endif
+	
+		for (j = 0; j < size; j++)
+		{
+			for (i = 0; i < size; i++)
+			{
+				if (i <= j) {
+	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+	                                float err = abs(test_mat[j +i*size] - orig);
+	                                if (err > 0.00001) {
+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", i, j, test_mat[j +i*size], orig, err);
+	                                        assert(0);
+	                                }
+	                        }
+			}
+	        }
+	}
+
+}
+
+int main(int argc, char **argv)
+{
+	/* create a simple definite positive symetric matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	parse_args(argc, argv);
+
+	if(with_ctxs || with_noctxs || chole1 || chole2)
+		parse_args_ctx(argc, argv);
+
+	starpu_init(NULL);
+
+	starpu_helper_cublas_init();
+
+	if(with_ctxs)
+	{
+		construct_contexts(execute_cholesky);
+		start_2benchs(execute_cholesky);
+	}
+	else if(with_noctxs)
+		start_2benchs(execute_cholesky);
+	else if(chole1)
+		start_1stbench(execute_cholesky);
+	else if(chole2)
+		start_2ndbench(execute_cholesky);
+	else
+		execute_cholesky(size, nblocks);
+
+	starpu_helper_cublas_shutdown();
+	starpu_shutdown();
+
+	if(with_ctxs)
+		end_contexts();
+	return 0;
+}

+ 247 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky_kernels.c

@@ -0,0 +1,247 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_config.h>
+#include "cholesky.h"
+//#include "../common/blas.h"
+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#ifdef STARPU_HAVE_MAGMA
+#include "magma.h"
+#include "magma_lapack.h"
+#endif
+#endif
+
+/*
+ *   U22 
+ */
+
+static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)
+{
+	/* printf("22\n"); */
+	float *left 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	float *right 	= (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+	float *center 	= (float *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	unsigned dx = STARPU_MATRIX_GET_NY(descr[2]);
+	unsigned dy = STARPU_MATRIX_GET_NX(descr[2]);
+	unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);
+
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);
+	unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);
+
+	if (s == 0)
+	{
+		int worker_size = starpu_combined_worker_get_size();
+
+		if (worker_size == 1)
+		{
+			/* Sequential CPU kernel */
+			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
+				right, ld12, 1.0f, center, ld22);
+		}
+		else {
+			/* Parallel CPU kernel */
+			int rank = starpu_combined_worker_get_rank();
+
+			int block_size = (dx + worker_size - 1)/worker_size;
+			int new_dx = STARPU_MIN(dx, block_size*(rank+1)) - block_size*rank;
+			
+			float *new_left = &left[block_size*rank];
+			float *new_center = &center[block_size*rank];
+
+			SGEMM("N", "T", dy, new_dx, dz, -1.0f, new_left, ld21, 
+				right, ld12, 1.0f, new_center, ld22);
+		}
+	}
+	else
+	{
+		/* CUDA kernel */
+#ifdef STARPU_USE_CUDA
+		cublasSgemm('n', 't', dy, dx, dz, 
+				-1.0f, left, ld21, right, ld12, 
+				 1.0f, center, ld22);
+		cudaStreamSynchronize(starpu_cuda_get_local_stream());
+#endif
+
+	}
+}
+
+void chol_cpu_codelet_update_u22(void *descr[], void *_args)
+{
+	chol_common_cpu_codelet_update_u22(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u22(void *descr[], void *_args)
+{
+	chol_common_cpu_codelet_update_u22(descr, 1, _args);
+}
+#endif /* STARPU_USE_CUDA */
+
+/* 
+ * U21
+ */
+
+static inline void chol_common_codelet_update_u21(void *descr[], int s, __attribute__((unused)) void *_args)
+{
+/*	printf("21\n"); */
+	float *sub11;
+	float *sub21;
+
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	sub21 = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+
+	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
+
+	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
+	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
+
+	switch (s) {
+		case 0:
+			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+void chol_cpu_codelet_update_u21(void *descr[], void *_args)
+{
+	 chol_common_codelet_update_u21(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u21(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u21(descr, 1, _args);
+}
+#endif 
+
+/*
+ *	U11
+ */
+
+static inline void chol_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args) 
+{
+/*	printf("11\n"); */
+	float *sub11;
+
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); 
+
+	unsigned nx = STARPU_MATRIX_GET_NY(descr[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(descr[0]);
+
+	unsigned z;
+
+	switch (s) {
+		case 0:
+
+			/*
+			 *	- alpha 11 <- lambda 11 = sqrt(alpha11)
+			 *	- alpha 21 <- l 21	= alpha 21 / lambda 11
+			 *	- A22 <- A22 - l21 trans(l21)
+			 */
+
+			for (z = 0; z < nx; z++)
+			{
+				float lambda11;
+				lambda11 = sqrt(sub11[z+z*ld]);
+				sub11[z+z*ld] = lambda11;
+
+				STARPU_ASSERT(lambda11 != 0.0f);
+		
+				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+		
+				SSYR("L", nx - z - 1, -1.0f, 
+							&sub11[(z+1)+z*ld], 1,
+							&sub11[(z+1)+(z+1)*ld], ld);
+			}
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+#ifdef STARPU_HAVE_MAGMA
+			{
+			int ret;
+			int info;
+			ret = magma_spotrf_gpu('L', nx, sub11, ld, &info);
+			if (ret != MAGMA_SUCCESS) {
+				fprintf(stderr, "Error in Magma: %d\n", ret);
+				STARPU_ABORT();
+			}
+			cudaError_t cures = cudaThreadSynchronize();
+			STARPU_ASSERT(!cures);
+			}
+#else
+			{
+
+			float *lambda11;
+			cudaHostAlloc((void **)&lambda11, sizeof(float), 0);
+
+			for (z = 0; z < nx; z++)
+			{
+				
+				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+				STARPU_ASSERT(*lambda11 != 0.0f);
+				
+				*lambda11 = sqrt(*lambda11);
+
+/*				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float)); */
+				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+
+				cublasSscal(nx - z - 1, 1.0f/(*lambda11), &sub11[(z+1)+z*ld], 1);
+
+				cublasSsyr('U', nx - z - 1, -1.0f,
+							&sub11[(z+1)+z*ld], 1,
+							&sub11[(z+1)+(z+1)*ld], ld);
+			}
+
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaFreeHost(lambda11);
+			}
+#endif
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+
+void chol_cpu_codelet_update_u11(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u11(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u11(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u11(descr, 1, _args);
+}
+#endif/* STARPU_USE_CUDA */

+ 153 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky_models.c

@@ -0,0 +1,153 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * As a convention, in that file, descr[0] is represented by A,
+ * 				  descr[1] is B ...
+ */
+
+/*
+ *	Number of flops of Gemm 
+ */
+
+#include <starpu.h>
+
+/* #define USE_PERTURBATION	1 */
+
+#ifdef USE_PERTURBATION
+#define PERTURBATE(a)	((starpu_drand48()*2.0f*(AMPL) + 1.0f - (AMPL))*(a))
+#else
+#define PERTURBATE(a)	(a)
+#endif
+
+static double cpu_chol_task_11_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n;
+
+	n = starpu_matrix_get_nx(descr[0].handle);
+
+	double cost = (((double)(n)*n*n)/1000.0f*0.894/0.79176);
+
+#ifdef STARPU_MODEL_DEBUG
+	FPRINTF(stdout, "cpu_chol_task_11_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+static double cuda_chol_task_11_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n;
+
+	n = starpu_matrix_get_nx(descr[0].handle);
+
+	double cost = (((double)(n)*n*n)/50.0f/10.75/5.088633/0.9883);
+
+#ifdef STARPU_MODEL_DEBUG
+	FPRINTF(stdout, "cuda_chol_task_11_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+static double cpu_chol_task_21_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n;
+
+	n = starpu_matrix_get_nx(descr[0].handle);
+
+	double cost = (((double)(n)*n*n)/7706.674/0.95/0.9965);
+
+#ifdef STARPU_MODEL_DEBUG
+	FPRINTF(stdout, "cpu_chol_task_21_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+static double cuda_chol_task_21_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n;
+
+	n = starpu_matrix_get_nx(descr[0].handle);
+
+	double cost = (((double)(n)*n*n)/50.0f/10.75/87.29520);
+
+#ifdef STARPU_MODEL_DEBUG
+	FPRINTF(stdout, "cuda_chol_task_21_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+static double cpu_chol_task_22_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n;
+
+	n = starpu_matrix_get_nx(descr[0].handle);
+
+	double cost = (((double)(n)*n*n)/50.0f/10.75/8.0760);
+
+#ifdef STARPU_MODEL_DEBUG
+	FPRINTF(stdout, "cpu_chol_task_22_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n;
+
+	n = starpu_matrix_get_nx(descr[0].handle);
+
+	double cost = (((double)(n)*n*n)/50.0f/10.75/76.30666);
+
+#ifdef STARPU_MODEL_DEBUG
+	FPRINTF(stdout, "cuda_chol_task_22_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+struct starpu_perfmodel_t chol_model_11 = {
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_11_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_11_cost }
+	},
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_11"
+};
+
+struct starpu_perfmodel_t chol_model_21 = {
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_21_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_21_cost }
+	},
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_21"
+};
+
+struct starpu_perfmodel_t chol_model_22 = {
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_22_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_22_cost }
+	},
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_22"
+};

+ 368 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky_tag.c

@@ -0,0 +1,368 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "cholesky.h"
+
+/*
+ *	Some useful functions
+ */
+
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *task = starpu_task_create();
+		task->cl_arg = NULL;
+		task->use_tag = 1;
+		task->tag_id = id;
+
+	return task;
+}
+
+/*
+ *	Create the codelets
+ */
+
+static starpu_codelet cl11 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_func = chol_cpu_codelet_update_u11,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u11,
+#endif
+	.nbuffers = 1,
+	.model = &chol_model_11
+};
+
+static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k)
+{
+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
+
+	struct starpu_task *task = create_task(TAG11(k));
+	
+	task->cl = &cl11;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k);
+	task->buffers[0].mode = STARPU_RW;
+
+	/* this is an important task */
+	if (!noprio)
+		task->priority = STARPU_MAX_PRIO;
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
+	}
+
+	return task;
+}
+
+static starpu_codelet cl21 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_func = chol_cpu_codelet_update_u21,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u21,
+#endif
+	.nbuffers = 2,
+	.model = &chol_model_21
+};
+
+static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j)
+{
+	struct starpu_task *task = create_task(TAG21(k, j));
+
+	task->cl = &cl21;	
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k); 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
+	task->buffers[1].mode = STARPU_RW;
+
+	if (!noprio && (j == k+1)) {
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
+	}
+	else {
+		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
+	}
+
+	int ret = starpu_task_submit(task);
+        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+                FPRINTF(stderr, "No worker may execute this task\n");
+                exit(0);
+        }
+
+}
+
+static starpu_codelet cl22 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_func = chol_cpu_codelet_update_u22,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u22,
+#endif
+	.nbuffers = 3,
+	.model = &chol_model_22
+};
+
+static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
+{
+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
+
+	struct starpu_task *task = create_task(TAG22(k, i, j));
+
+	task->cl = &cl22;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, i); 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
+	task->buffers[1].mode = STARPU_R;
+	task->buffers[2].handle = starpu_data_get_sub_data(dataA, 2, i, j); 
+	task->buffers[2].mode = STARPU_RW;
+
+	if (!noprio && (i == k + 1) && (j == k +1) ) {
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
+	}
+	else {
+		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
+	}
+
+	int ret = starpu_task_submit(task);
+        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+                FPRINTF(stderr, "No worker may execute this task\n");
+                exit(0);
+        }
+}
+
+
+
+/*
+ *	code to bootstrap the factorization 
+ *	and construct the DAG
+ */
+
+static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
+{
+	struct timeval start;
+	struct timeval end;
+
+	struct starpu_task *entry_task = NULL;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	gettimeofday(&start, NULL);
+
+	for (k = 0; k < nblocks; k++)
+	{
+		struct starpu_task *task = create_task_11(dataA, k);
+		/* we defer the launch of the first task */
+		if (k == 0) {
+			entry_task = task;
+		}
+		else {
+			int ret = starpu_task_submit(task);
+                        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+                                FPRINTF(stderr, "No worker may execute this task\n");
+                                exit(0);
+                        }
+
+		}
+		
+		for (j = k+1; j<nblocks; j++)
+		{
+			create_task_21(dataA, k, j);
+
+			for (i = k+1; i<nblocks; i++)
+			{
+				if (i <= j)
+					create_task_22(dataA, k, i, j);
+			}
+		}
+	}
+
+	/* schedule the codelet */
+	int ret = starpu_task_submit(entry_task);
+        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+                FPRINTF(stderr, "No worker may execute this task\n");
+                exit(0);
+        }
+
+
+	/* stall the application until the end of computations */
+	starpu_tag_wait(TAG11(nblocks-1));
+
+	starpu_data_unpartition(dataA, 0);
+
+	gettimeofday(&end, NULL);
+
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
+
+	unsigned n = starpu_matrix_get_nx(dataA);
+
+	double flop = (1.0f*n*n*n)/3.0f;
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+}
+
+static void initialize_system(float **A, unsigned dim, unsigned pinned)
+{
+	starpu_init(NULL);
+	
+	starpu_helper_cublas_init();
+
+	if (pinned)
+	{
+		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
+	} 
+	else {
+		*A = malloc(dim*dim*sizeof(float));
+	}
+}
+
+static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
+{
+	starpu_data_handle dataA;
+
+	/* monitor and partition the A matrix into blocks :
+	 * one block is now determined by 2 unsigned (i,j) */
+	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
+
+	starpu_data_set_sequential_consistency_flag(dataA, 0);
+
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	starpu_data_map_filters(dataA, 2, &f, &f2);
+
+	_cholesky(dataA, nblocks);
+
+	starpu_helper_cublas_shutdown();
+
+	starpu_shutdown();
+}
+
+int main(int argc, char **argv)
+{
+	/* create a simple definite positive symetric matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	parse_args(argc, argv);
+
+	float *mat;
+
+	mat = malloc(size*size*sizeof(float));
+	initialize_system(&mat, size, pinned);
+
+	unsigned i,j;
+	for (i = 0; i < size; i++)
+	{
+		for (j = 0; j < size; j++)
+		{
+			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
+		}
+	}
+
+
+#ifdef CHECK_OUTPUT
+	FPRINTF(stdout, "Input :\n");
+
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+			}
+			else {
+				FPRINTF(stdout, ".\t");
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+
+	cholesky(mat, size, size, nblocks);
+
+#ifdef CHECK_OUTPUT
+	FPRINTF(stdout, "Results :\n");
+
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+			}
+			else {
+				FPRINTF(stdout, ".\t");
+				mat[j+i*size] = 0.0f; /* debug */
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+
+	FPRINTF(stderr, "compute explicit LLt ...\n");
+	float *test_mat = malloc(size*size*sizeof(float));
+	STARPU_ASSERT(test_mat);
+
+	SSYRK("L", "N", size, size, 1.0f, 
+				mat, size, 0.0f, test_mat, size);
+
+	FPRINTF(stderr, "comparing results ...\n");
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
+			}
+			else {
+				FPRINTF(stdout, ".\t");
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+	return 0;
+}

+ 312 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky_tile_tag.c

@@ -0,0 +1,312 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "cholesky.h"
+
+/* A [ y ] [ x ] */
+float *A[NMAXBLOCKS][NMAXBLOCKS];
+starpu_data_handle A_state[NMAXBLOCKS][NMAXBLOCKS];
+
+/*
+ *	Some useful functions
+ */
+
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *task = starpu_task_create();
+		task->cl_arg = NULL;
+		task->use_tag = 1;
+		task->tag_id = id;
+
+	return task;
+}
+
+/*
+ *	Create the codelets
+ */
+
+static starpu_codelet cl11 =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+	.cpu_func = chol_cpu_codelet_update_u11,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u11,
+#endif
+#ifdef STARPU_USE_GORDON
+#ifdef SPU_FUNC_POTRF
+	.gordon_func = SPU_FUNC_POTRF,
+#else
+#warning SPU_FUNC_POTRF is not available
+#endif
+#endif
+	.nbuffers = 1,
+	.model = &chol_model_11
+};
+
+static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
+{
+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
+
+	struct starpu_task *task = create_task(TAG11(k));
+	
+	task->cl = &cl11;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = A_state[k][k];
+	task->buffers[0].mode = STARPU_RW;
+
+	/* this is an important task */
+	task->priority = STARPU_MAX_PRIO;
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
+	}
+
+	return task;
+}
+
+static starpu_codelet cl21 =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+	.cpu_func = chol_cpu_codelet_update_u21,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u21,
+#endif
+#ifdef STARPU_USE_GORDON
+#ifdef SPU_FUNC_STRSM
+	.gordon_func = SPU_FUNC_STRSM,
+#else
+#warning SPU_FUNC_STRSM is not available
+#endif
+#endif
+	.nbuffers = 2,
+	.model = &chol_model_21
+};
+
+static void create_task_21(unsigned k, unsigned j)
+{
+	struct starpu_task *task = create_task(TAG21(k, j));
+
+	task->cl = &cl21;	
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = A_state[k][k]; 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = A_state[j][k]; 
+	task->buffers[1].mode = STARPU_RW;
+
+	if (j == k+1) {
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
+	}
+	else {
+		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
+	}
+
+	starpu_task_submit(task);
+}
+
+static starpu_codelet cl22 =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+	.cpu_func = chol_cpu_codelet_update_u22,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u22,
+#endif
+#ifdef STARPU_USE_GORDON
+#ifdef SPU_FUNC_SGEMM
+	.gordon_func = SPU_FUNC_SGEMM,
+#else
+#warning SPU_FUNC_SGEMM is not available
+#endif
+#endif
+	.nbuffers = 3,
+	.model = &chol_model_22
+};
+
+static void create_task_22(unsigned k, unsigned i, unsigned j)
+{
+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
+
+	struct starpu_task *task = create_task(TAG22(k, i, j));
+
+	task->cl = &cl22;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = A_state[i][k]; 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = A_state[j][k]; 
+	task->buffers[1].mode = STARPU_R;
+	task->buffers[2].handle = A_state[j][i]; 
+	task->buffers[2].mode = STARPU_RW;
+
+	if ( (i == k + 1) && (j == k +1) ) {
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
+	}
+	else {
+		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
+	}
+
+	starpu_task_submit(task);
+}
+
+
+
+/*
+ *	code to bootstrap the factorization 
+ *	and construct the DAG
+ */
+
+static void cholesky_no_stride(void)
+{
+	struct timeval start;
+	struct timeval end;
+
+	struct starpu_task *entry_task = NULL;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	for (k = 0; k < nblocks; k++)
+	{
+		struct starpu_task *task = create_task_11(k, nblocks);
+		/* we defer the launch of the first task */
+		if (k == 0) {
+			entry_task = task;
+		}
+		else {
+			starpu_task_submit(task);
+		}
+		
+		for (j = k+1; j<nblocks; j++)
+		{
+			create_task_21(k, j);
+
+			for (i = k+1; i<nblocks; i++)
+			{
+				if (i <= j)
+					create_task_22(k, i, j);
+			}
+		}
+	}
+
+	/* schedule the codelet */
+	gettimeofday(&start, NULL);
+	starpu_task_submit(entry_task);
+
+	/* stall the application until the end of computations */
+	starpu_tag_wait(TAG11(nblocks-1));
+
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
+
+	double flop = (1.0f*size*size*size)/3.0f;
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+}
+
+int main(int argc, char **argv)
+{
+	unsigned x, y;
+	unsigned i, j;
+
+	parse_args(argc, argv);
+	assert(nblocks <= NMAXBLOCKS);
+
+	FPRINTF(stderr, "BLOCK SIZE = %d\n", size / nblocks);
+
+	starpu_init(NULL);
+
+	/* Disable sequential consistency */
+	starpu_data_set_default_sequential_consistency_flag(0);
+
+	starpu_helper_cublas_init();
+
+	for (y = 0; y < nblocks; y++)
+	for (x = 0; x < nblocks; x++)
+	{
+		if (x <= y) {
+			A[y][x] = malloc(BLOCKSIZE*BLOCKSIZE*sizeof(float));
+			assert(A[y][x]);
+		}
+	}
+
+
+	for (y = 0; y < nblocks; y++)
+	for (x = 0; x < nblocks; x++)
+	{
+		if (x <= y) {
+#ifdef STARPU_HAVE_POSIX_MEMALIGN
+			posix_memalign((void **)&A[y][x], 128, BLOCKSIZE*BLOCKSIZE*sizeof(float));
+#else
+			A[y][x] = malloc(BLOCKSIZE*BLOCKSIZE*sizeof(float));
+#endif
+			assert(A[y][x]);
+		}
+	}
+
+	/* create a simple definite positive symetric matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1) ( + n In to make is stable ) 
+	 * */
+	for (y = 0; y < nblocks; y++)
+	for (x = 0; x < nblocks; x++)
+	if (x <= y) {
+		for (i = 0; i < BLOCKSIZE; i++)
+		for (j = 0; j < BLOCKSIZE; j++)
+		{
+			A[y][x][i*BLOCKSIZE + j] =
+				(float)(1.0f/((float) (1.0+(x*BLOCKSIZE+i)+(y*BLOCKSIZE+j))));
+
+			/* make it a little more numerically stable ... ;) */
+			if ((x == y) && (i == j))
+				A[y][x][i*BLOCKSIZE + j] += (float)(2*size);
+		}
+	}
+
+
+
+	for (y = 0; y < nblocks; y++)
+	for (x = 0; x < nblocks; x++)
+	{
+		if (x <= y) {
+			starpu_matrix_data_register(&A_state[y][x], 0, (uintptr_t)A[y][x], 
+				BLOCKSIZE, BLOCKSIZE, BLOCKSIZE, sizeof(float));
+		}
+	}
+
+	cholesky_no_stride();
+
+	starpu_helper_cublas_shutdown();
+
+	starpu_shutdown();
+	return 0;
+}
+
+

File diff suppressed because it is too large
+ 1 - 1
sched_ctx_hypervisor/examples/cholesky_2ctxs/cholesky_2ctxs


+ 355 - 0
sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c

@@ -0,0 +1,355 @@
+#include "sched_ctx_utils.h"
+#include <starpu.h>
+#include "sched_ctx_hypervisor.h"
+
+unsigned size1;
+unsigned size2;
+unsigned nblocks1;
+unsigned nblocks2;
+unsigned cpu1;
+unsigned cpu2;
+unsigned gpu;
+unsigned gpu1;
+unsigned gpu2;
+
+typedef struct {
+	unsigned id;
+	unsigned ctx;
+	int the_other_ctx;
+	int *procs;
+	int nprocs;
+	void (*bench)(unsigned, unsigned);
+	unsigned size;
+	unsigned nblocks;
+} params;
+
+typedef struct {
+	double flops;
+	double avg_timing;
+} retvals;
+
+#define NSAMPLES 3
+int first = 1;
+pthread_mutex_t mut;
+retvals rv[2];
+params p1, p2;
+
+struct sched_ctx_hypervisor_reply reply1[NSAMPLES*2*2];
+struct sched_ctx_hypervisor_reply reply2[NSAMPLES*2*2];
+
+pthread_key_t key;
+
+void init()
+{
+	size1 = 4*1024;
+	size2 = 4*1024;
+	nblocks1 = 16;
+	nblocks2 = 16;
+	cpu1 = 0;
+	cpu2 = 0;
+	gpu = 0;
+	gpu1 = 0;
+	gpu2 = 0;
+
+	rv[0].flops = 0.0;
+	rv[1].flops = 0.0;
+	rv[1].avg_timing = 0.0;
+	rv[1].avg_timing = 0.0;
+
+	p1.ctx = 0;
+	p2.ctx = 0;
+
+	p1.id = 0;
+	p2.id = 1;
+	pthread_key_create(&key, NULL);
+}
+
+void update_sched_ctx_timing_results(double flops, double avg_timing)
+{
+	unsigned *id = pthread_getspecific(key);
+	rv[*id].flops += flops;
+	rv[*id].avg_timing += avg_timing;
+}
+
+void* start_bench(void *val){
+	params *p = (params*)val;
+	int i;
+
+	pthread_setspecific(key, &p->id);
+
+	if(p->ctx != 0)
+		starpu_set_sched_ctx(&p->ctx);
+
+	for(i = 0; i < NSAMPLES; i++)
+		p->bench(p->size, p->nblocks);
+
+	if(p->ctx != 0)
+	{
+		pthread_mutex_lock(&mut);
+		if(first){
+			sched_ctx_hypervisor_ignore_ctx(p->ctx);
+			starpu_delete_sched_ctx(p->ctx, p->the_other_ctx);
+		}
+		
+		first = 0;
+		pthread_mutex_unlock(&mut);
+	}
+
+	rv[p->id].flops /= NSAMPLES;
+	rv[p->id].avg_timing /= NSAMPLES;
+	
+}
+
+void start_2benchs(void (*bench)(unsigned, unsigned))
+{
+	p1.bench = bench;
+	p1.size = size1;
+	p1.nblocks = nblocks1;
+	
+	p2.bench = bench;
+	p2.size = size2;
+	p2.nblocks = nblocks2;
+	
+	pthread_t tid[2];
+	pthread_mutex_init(&mut, NULL);
+
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	pthread_create(&tid[0], NULL, (void*)start_bench, (void*)&p1);
+	pthread_create(&tid[1], NULL, (void*)start_bench, (void*)&p2);
+ 
+	pthread_join(tid[0], NULL);
+	pthread_join(tid[1], NULL);
+
+	gettimeofday(&end, NULL);
+
+	pthread_mutex_destroy(&mut);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	timing /= 1000000;
+
+	printf("%2.2f %2.2f ", rv[0].flops, rv[1].flops);
+	printf("%2.2f %2.2f %2.2f\n", rv[0].avg_timing, rv[1].avg_timing, timing);
+
+}
+
+void start_1stbench(void (*bench)(unsigned, unsigned))
+{
+	p1.bench = bench;
+	p1.size = size1;
+	p1.nblocks = nblocks1;
+	
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	start_bench((void*)&p1);
+
+	gettimeofday(&end, NULL);
+
+	pthread_mutex_destroy(&mut);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	timing /= 1000000;
+
+	printf("%2.2f ", rv[0].flops);
+	printf("%2.2f %2.2f\n", rv[0].avg_timing, timing);
+}
+
+void start_2ndbench(void (*bench)(unsigned, unsigned))
+{
+	p2.bench = bench;
+	p2.size = size2;
+	p2.nblocks = nblocks2;
+	
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	start_bench((void*)&p2);
+
+	gettimeofday(&end, NULL);
+
+	pthread_mutex_destroy(&mut);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	timing /= 1000000;
+
+	printf("%2.2f ", rv[1].flops);
+	printf("%2.2f %2.2f\n", rv[1].avg_timing, timing);
+}
+
+void construct_contexts(void (*bench)(unsigned, unsigned))
+{
+	struct starpu_sched_ctx_hypervisor_criteria *criteria = sched_ctx_hypervisor_init(SIMPLE_POLICY);
+	int nprocs1 = cpu1 + gpu + gpu1;
+	int nprocs2 = cpu2 + gpu + gpu2;
+	unsigned n_all_gpus = gpu + gpu1 + gpu2;
+
+
+	int i;
+	int k = 0;
+
+	p1.procs = (int*)malloc(nprocs1*sizeof(int));
+
+	for(i = 0; i < gpu; i++)
+		p1.procs[k++] = i;
+
+	for(i = gpu; i < gpu + gpu1; i++)
+		p1.procs[k++] = i;
+
+
+	for(i = n_all_gpus; i < n_all_gpus + cpu1; i++)
+		p1.procs[k++] = i;
+
+
+	p1.ctx = starpu_create_sched_ctx_with_criteria("heft", p1.procs, nprocs1, "sched_ctx1", criteria);
+	p2.the_other_ctx = (int)p1.ctx;
+	p1.nprocs = nprocs1;
+	sched_ctx_hypervisor_handle_ctx(p1.ctx);
+	
+	sched_ctx_hypervisor_ioctl(p1.ctx,
+				   HYPERVISOR_MAX_IDLE, p1.procs, p1.nprocs, 100000.0,
+				   HYPERVISOR_MAX_IDLE, p1.procs, gpu+gpu1, 100000000.0,
+				   HYPERVISOR_MIN_WORKING, p1.procs, p1.nprocs, 200.0,
+//				   HYPERVISOR_PRIORITY, p1.procs, p1.nprocs, 1,
+				   HYPERVISOR_PRIORITY, p1.procs, gpu+gpu1, 2,
+				   HYPERVISOR_MIN_PROCS, 1,
+				   HYPERVISOR_MAX_PROCS, 11,
+				   HYPERVISOR_GRANULARITY, 4,
+				   HYPERVISOR_FIXED_PROCS, p1.procs, gpu,
+				   HYPERVISOR_MIN_TASKS, 10000,
+				   HYPERVISOR_NEW_WORKERS_MAX_IDLE, 1000000.0,
+				   NULL);
+
+	k = 0;
+	p2.procs = (int*)malloc(nprocs2*sizeof(int));
+
+	for(i = 0; i < gpu; i++)
+		p2.procs[k++] = i;
+
+	for(i = gpu + gpu1; i < gpu + gpu1 + gpu2; i++)
+		p2.procs[k++] = i;
+
+	for(i = n_all_gpus  + cpu1; i < n_all_gpus + cpu1 + cpu2; i++)
+		p2.procs[k++] = i;
+
+	p2.ctx = starpu_create_sched_ctx_with_criteria("heft", p2.procs, nprocs2, "sched_ctx2", criteria);
+	p1.the_other_ctx = (int)p2.ctx;
+	p2.nprocs = nprocs2;
+	sched_ctx_hypervisor_handle_ctx(p2.ctx);
+	
+	sched_ctx_hypervisor_ioctl(p2.ctx,
+				   HYPERVISOR_MAX_IDLE, p2.procs, p2.nprocs, 100000.0,
+				   HYPERVISOR_MAX_IDLE, p2.procs, gpu+gpu2, 10000000000.0,
+				   HYPERVISOR_MIN_WORKING, p2.procs, p2.nprocs, 200.0,
+//				   HYPERVISOR_PRIORITY, p2.procs, p2.nprocs, 1,
+				   HYPERVISOR_PRIORITY, p2.procs, gpu+gpu2, 2,
+				   HYPERVISOR_MIN_PROCS, 1,
+				   HYPERVISOR_MAX_PROCS, 11,
+				   HYPERVISOR_GRANULARITY, 3,
+				   HYPERVISOR_FIXED_PROCS, p2.procs, gpu,
+				   HYPERVISOR_MIN_TASKS, 10000,
+				   HYPERVISOR_NEW_WORKERS_MAX_IDLE, 100000.0,
+				   NULL);
+}
+
+void set_hypervisor_conf(int event, int checkpoint)
+{
+	unsigned *id = pthread_getspecific(key);
+	pthread_mutex_lock(&mut);
+	int reset_conf = first;
+	pthread_mutex_unlock(&mut);
+
+
+	if(*id == 1 && reset_conf)
+	{
+		double  max_idle_time_big = 0, max_idle_time_small;
+		if(event == START_BENCH)
+		{
+			max_idle_time_big = 1000.0;
+			max_idle_time_small = 1000000.0;
+		}
+		else
+		{
+			max_idle_time_big = 10000000.0;
+			max_idle_time_small = 1000.0;
+
+		}
+ 		
+
+		sched_ctx_hypervisor_advise(p2.ctx, p2.procs, p2.nprocs, &reply2[checkpoint]);
+		if(reply2[checkpoint].procs)
+			sched_ctx_hypervisor_ioctl(p2.ctx,
+						   HYPERVISOR_MAX_IDLE, reply2[checkpoint].procs, reply2[checkpoint].nprocs, max_idle_time_small,
+						   HYPERVISOR_TIME_TO_APPLY, checkpoint,
+						   HYPERVISOR_GRANULARITY, 1,
+						   NULL);
+	}
+}
+
+void end_contexts()
+{
+	free(p1.procs);
+	free(p2.procs);
+	sched_ctx_hypervisor_shutdown();
+}
+
+void parse_args_ctx(int argc, char **argv)
+{
+	init();
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-size1") == 0) {
+			char *argptr;
+			size1 = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks1") == 0) {
+			char *argptr;
+			nblocks1 = strtol(argv[++i], &argptr, 10);
+		}
+		
+		if (strcmp(argv[i], "-size2") == 0) {
+			char *argptr;
+			size2 = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks2") == 0) {
+			char *argptr;
+			nblocks2 = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-cpu1") == 0) {
+			char *argptr;
+			cpu1 = strtol(argv[++i], &argptr, 10);
+		}    
+
+		if (strcmp(argv[i], "-cpu2") == 0) {
+			char *argptr;
+			cpu2 = strtol(argv[++i], &argptr, 10);
+		}    
+
+		if (strcmp(argv[i], "-gpu") == 0) {
+			char *argptr;
+			gpu = strtol(argv[++i], &argptr, 10);
+		}    
+
+		if (strcmp(argv[i], "-gpu1") == 0) {
+			char *argptr;
+			gpu1 = strtol(argv[++i], &argptr, 10);
+		}    
+
+		if (strcmp(argv[i], "-gpu2") == 0) {
+			char *argptr;
+			gpu2 = strtol(argv[++i], &argptr, 10);
+		}    
+	}
+}
+

+ 16 - 0
sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.h

@@ -0,0 +1,16 @@
+#include <limits.h>
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+#include <stdlib.h>
+
+#define START_BENCH 0
+#define END_BENCH 1
+
+void parse_args_ctx(int argc, char **argv);
+void update_sched_ctx_timing_results(double gflops, double timing);
+void construct_contexts(void (*bench)(unsigned size, unsigned nblocks));
+void end_contexts(void);
+void start_2benchs(void (*bench)(unsigned size, unsigned nblocks));
+void start_1stbench(void (*bench)(unsigned size, unsigned nblocks));
+void start_2ndbench(void (*bench)(unsigned size, unsigned nblocks));

+ 16 - 1
sched_ctx_hypervisor/include/sched_ctx_hypervisor.h

@@ -1,5 +1,7 @@
 #include <starpu.h>
 #include <../common/config.h>
+#include <../common/htable32.h>
+#include <pthread.h>
 
 /* ioctl properties*/
 #define HYPERVISOR_MAX_IDLE 1
@@ -9,6 +11,15 @@
 #define HYPERVISOR_MAX_PROCS 5
 #define HYPERVISOR_GRANULARITY 6
 #define HYPERVISOR_FIXED_PROCS 7
+#define HYPERVISOR_MIN_TASKS 8
+#define HYPERVISOR_NEW_WORKERS_MAX_IDLE 9
+#define HYPERVISOR_TIME_TO_APPLY 10
+
+struct sched_ctx_hypervisor_reply{
+	int procs[STARPU_NMAXWORKERS];
+	int nprocs;
+};
+pthread_mutex_t act_hypervisor_mutex;
 
 struct starpu_sched_ctx_hypervisor_criteria* sched_ctx_hypervisor_init(int type);
 
@@ -26,6 +37,10 @@ void* sched_ctx_hypervisor_get_data(unsigned sched_ctx);
 
 void sched_ctx_hypervisor_ioctl(unsigned sched_ctx, ...);
 
+void sched_ctx_hypervisor_advise(unsigned sched_ctx, int *workers, int nworkers, struct sched_ctx_hypervisor_reply *reply);
+
+struct sched_ctx_hypervisor_reply* sched_ctx_hypervisor_request(unsigned sched_ctx, int *workers, int nworkers);
+
 /* hypervisor policies */
 #define SIMPLE_POLICY 1
 
@@ -34,6 +49,6 @@ struct hypervisor_policy {
 	void (*deinit)(void);
 	void (*add_sched_ctx)(unsigned sched_ctx);
 	void(*remove_sched_ctx)(unsigned sched_ctx);
-	void (*ioctl)(unsigned sched_ctx, va_list varg_list);
+	void* (*ioctl)(unsigned sched_ctx, va_list varg_list, unsigned later);
 	void (*manage_idle_time)(unsigned req_sched_ctx, int *sched_ctxs, unsigned nsched_ctxs, int worker, double idle_time);
 };

+ 144 - 10
sched_ctx_hypervisor/src/sched_ctx_hypervisor.c

@@ -5,6 +5,9 @@ struct starpu_sched_ctx_hypervisor_criteria* criteria = NULL;
 extern struct hypervisor_policy simple_policy;
 
 static void idle_time_cb(unsigned sched_ctx, int worker, double idle_time);
+static void pushed_task_cb(unsigned sched_ctx, int worker);
+static void poped_task_cb(unsigned sched_ctx, int worker);
+static void post_exec_hook_cb(unsigned sched_ctx, int taskid);
 
 static void _load_hypervisor_policy(int type)
 {
@@ -19,46 +22,55 @@ static void _load_hypervisor_policy(int type)
 		hypervisor.policy.manage_idle_time = simple_policy.manage_idle_time;
 		break;
 	}
-		
 }
 
 struct starpu_sched_ctx_hypervisor_criteria* sched_ctx_hypervisor_init(int type)
 {
-	hypervisor.resize = 1;
 	hypervisor.nsched_ctxs = 0;
-
+	hypervisor.resize = 0;
+	pthread_mutex_init(&act_hypervisor_mutex, NULL);
 	int i;
 	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
 	{
+		hypervisor.configurations[i] = NULL;
 		hypervisor.sched_ctxs[i] = STARPU_NMAX_SCHED_CTXS;
 		hypervisor.sched_ctx_w[i].sched_ctx = STARPU_NMAX_SCHED_CTXS;
 		hypervisor.sched_ctx_w[i].data = NULL;
 		int j;
 		for(j = 0; j < STARPU_NMAXWORKERS; j++)
+		{
 			hypervisor.sched_ctx_w[i].current_idle_time[j] = 0.0;
+			hypervisor.sched_ctx_w[i].tasks[j] = 0;
+			hypervisor.sched_ctx_w[i].poped_tasks[j] = 0;
+		}
 	}
 
-
 	_load_hypervisor_policy(type);
 
 	hypervisor.policy.init();
 	criteria = (struct starpu_sched_ctx_hypervisor_criteria*)malloc(sizeof(struct starpu_sched_ctx_hypervisor_criteria));
 	criteria->idle_time_cb = idle_time_cb;
+	criteria->pushed_task_cb = pushed_task_cb;
+	criteria->poped_task_cb = poped_task_cb;
+	criteria->post_exec_hook_cb = post_exec_hook_cb;
 	return criteria;
 }
 
 void sched_ctx_hypervisor_shutdown(void)
 {
-	hypervisor.policy.deinit();
 	hypervisor.resize = 0;
+	hypervisor.policy.deinit();
 	free(criteria);
+	pthread_mutex_destroy(&act_hypervisor_mutex);
 }
 
 void sched_ctx_hypervisor_handle_ctx(unsigned sched_ctx)
 {	
+	hypervisor.configurations[sched_ctx] = (struct starpu_htbl32_node_s*)malloc(sizeof(struct starpu_htbl32_node_s));
 	hypervisor.policy.add_sched_ctx(sched_ctx);
 	hypervisor.sched_ctx_w[sched_ctx].sched_ctx = sched_ctx;
 	hypervisor.sched_ctxs[hypervisor.nsched_ctxs++] = sched_ctx;
+
 }
 
 static int _get_first_free_sched_ctx(int *sched_ctxs, unsigned nsched_ctxs)
@@ -109,11 +121,18 @@ void sched_ctx_hypervisor_ignore_ctx(unsigned sched_ctx)
 	hypervisor.nsched_ctxs--;
 	hypervisor.sched_ctx_w[sched_ctx].sched_ctx = STARPU_NMAX_SCHED_CTXS;
 	hypervisor.policy.remove_sched_ctx(sched_ctx);
+	free(hypervisor.configurations[sched_ctx]);
 }
 
 void sched_ctx_hypervisor_set_data(unsigned sched_ctx, void *data)
 {
+	pthread_mutex_lock(&act_hypervisor_mutex);
+
+	if(hypervisor.sched_ctx_w[sched_ctx].data != NULL)
+		free(hypervisor.sched_ctx_w[sched_ctx].data);
+
 	hypervisor.sched_ctx_w[sched_ctx].data = data;
+	pthread_mutex_unlock(&act_hypervisor_mutex);
 	return;
 }
 
@@ -127,16 +146,74 @@ void sched_ctx_hypervisor_ioctl(unsigned sched_ctx, ...)
 	va_list varg_list;
 	va_start(varg_list, sched_ctx);
 
-	hypervisor.policy.ioctl(sched_ctx, varg_list);
+	int arg_type;
+	int stop = 0;
+	int task_checkpoint = -1;
+
+	while ((arg_type = va_arg(varg_list, int)) != 0) 
+	{
+		switch(arg_type)
+		{
+		case HYPERVISOR_TIME_TO_APPLY:
+			task_checkpoint = va_arg(varg_list, int);
+			stop = 1;
+			break;
+
+		case HYPERVISOR_MIN_TASKS:
+			hypervisor.min_tasks = va_arg(varg_list, int);
+			break;
+		}
+		if(stop) break;
+	}
+
+	va_end(varg_list);
+	va_start(varg_list, sched_ctx);
+
+	/* hypervisor configuration to be considered later */
+	void *data = hypervisor.policy.ioctl(sched_ctx, varg_list, (task_checkpoint > 0));
+	if(data != NULL)
+		_starpu_htbl_insert_32(&hypervisor.configurations[sched_ctx], (uint32_t)task_checkpoint, data);
+
 	return;
 }
 
 static void _sched_ctx_hypervisor_resize(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int* workers_to_move, unsigned nworkers_to_move)
 {
-	printf("resize ctx %d with %d\n", sender_sched_ctx, workers_to_move[0]);
+	int i;
+	printf("resize ctx %d with", sender_sched_ctx);
+	for(i = 0; i < nworkers_to_move; i++)
+		printf(" %d", workers_to_move[i]);
+	printf("\n");
+
 	starpu_remove_workers_from_sched_ctx(workers_to_move, nworkers_to_move, sender_sched_ctx);
 	starpu_add_workers_to_sched_ctx(workers_to_move, nworkers_to_move, receiver_sched_ctx);
+
+	return;
+}
+
+static int get_ntasks( int *tasks)
+{
+	int ntasks = 0;
+	int j;
+	for(j = 0; j < STARPU_NMAXWORKERS; j++)
+	{
+		ntasks += tasks[j];
+	}
+	return ntasks;
+}
+
+static unsigned check_tasks_of_sched_ctx(unsigned sched_ctx)
+{
+	int ntasks = get_ntasks(hypervisor.sched_ctx_w[sched_ctx].tasks);
 	
+	return ntasks > hypervisor.min_tasks;
+}
+
+void sched_ctx_hypervisor_resize(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int* workers_to_move, unsigned nworkers_to_move)
+{
+	if(hypervisor.resize)
+		_sched_ctx_hypervisor_resize(sender_sched_ctx, receiver_sched_ctx, workers_to_move, nworkers_to_move);
+
 	int i;
 	for(i = 0; i < nworkers_to_move; i++)
 		hypervisor.sched_ctx_w[sender_sched_ctx].current_idle_time[workers_to_move[i]] = 0.0;
@@ -144,14 +221,46 @@ static void _sched_ctx_hypervisor_resize(unsigned sender_sched_ctx, unsigned rec
 	return;
 }
 
-void sched_ctx_hypervisor_resize(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int* workers_to_move, unsigned nworkers_to_move)
+void sched_ctx_hypervisor_advise(unsigned sched_ctx, int *workerids, int nworkers, struct sched_ctx_hypervisor_reply *reply)
 {
-	if(hypervisor.resize)
-		_sched_ctx_hypervisor_resize(sender_sched_ctx, receiver_sched_ctx, workers_to_move, nworkers_to_move);
+	pthread_mutex_lock(&act_hypervisor_mutex);
+	
+	if(hypervisor.sched_ctx_w[sched_ctx].sched_ctx != STARPU_NMAX_SCHED_CTXS)
+	{
+		starpu_add_workers_to_sched_ctx(workerids, nworkers, sched_ctx);
+		
+		struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx);
+		
+		int i = 0;
+		
+		if(workers->init_cursor)
+			workers->init_cursor(workers);
+		
+		while(workers->has_next(workers))
+			reply->procs[i++] = workers->get_next(workers);
+		
+		if(workers->init_cursor)
+			workers->deinit_cursor(workers);
+		
+		reply->nprocs = i;
+		sched_ctx_hypervisor_ioctl(sched_ctx, 
+					   HYPERVISOR_PRIORITY, workerids, nworkers, 1,
+					   NULL);
+		
+	}
 
+	pthread_mutex_unlock(&act_hypervisor_mutex);
 	return;
 }
 
+struct sched_ctx_hypervisor_reply* sched_ctx_hypervisor_request(unsigned sched_ctx, int *workers, int nworkers)
+{
+	if(hypervisor.sched_ctx_w[sched_ctx].sched_ctx != STARPU_NMAX_SCHED_CTXS)
+	{
+	}
+	return NULL;
+}
+
 static void idle_time_cb(unsigned sched_ctx, int worker, double idle_time)
 {
 	hypervisor.sched_ctx_w[sched_ctx].current_idle_time[worker] += idle_time;
@@ -168,3 +277,28 @@ static void working_time_cb(unsigned sched_ctx, int worker, double working_time,
 	return;
 }
 
+
+static void pushed_task_cb(unsigned sched_ctx, int worker)
+{	
+	hypervisor.sched_ctx_w[sched_ctx].tasks[worker]++;
+       
+	int ntasks = get_ntasks(hypervisor.sched_ctx_w[sched_ctx].tasks);
+	
+	hypervisor.resize = (ntasks > hypervisor.min_tasks);
+}
+
+static void poped_task_cb(unsigned sched_ctx, int worker)
+{
+	hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker]++;
+	int npoped_tasks = get_ntasks(hypervisor.sched_ctx_w[sched_ctx].poped_tasks);
+       	int ntasks = get_ntasks(hypervisor.sched_ctx_w[sched_ctx].tasks);
+	hypervisor.resize = ((ntasks - npoped_tasks) > 0);
+}
+
+static void post_exec_hook_cb(unsigned sched_ctx, int task_checkpoint)
+{
+	STARPU_ASSERT(task_checkpoint > 0);
+	void *data = _starpu_htbl_search_32(hypervisor.configurations[sched_ctx], (uint32_t)task_checkpoint);
+	if(data != NULL)	
+		sched_ctx_hypervisor_set_data(sched_ctx, data);
+}

+ 4 - 0
sched_ctx_hypervisor/src/sched_ctx_hypervisor_intern.h

@@ -4,6 +4,8 @@ struct sched_ctx_wrapper {
 	unsigned sched_ctx;
 	void *data;
 	double current_idle_time[STARPU_NMAXWORKERS];
+	int tasks[STARPU_NMAXWORKERS];
+	int poped_tasks[STARPU_NMAXWORKERS];
 };
 
 struct sched_ctx_hypervisor {
@@ -11,7 +13,9 @@ struct sched_ctx_hypervisor {
 	int sched_ctxs[STARPU_NMAX_SCHED_CTXS];
 	unsigned nsched_ctxs;
 	unsigned resize;
+	int min_tasks;
 	struct hypervisor_policy policy;
+	struct starpu_htbl32_node_s *configurations[STARPU_NMAX_SCHED_CTXS];
 };
 
 struct sched_ctx_hypervisor hypervisor;

+ 1 - 1
src/core/jobs.c

@@ -191,7 +191,7 @@ void _starpu_handle_job_termination(starpu_job_t j, unsigned job_is_already_lock
 	}
 
 	/* control task should not execute post_exec_hook */
-	if(task->cl != NULL)
+	if(task->cl != NULL && !task->control_task)
 		_starpu_sched_post_exec_hook(task);
 
 	STARPU_TRACE_TASK_DONE(j);

+ 82 - 87
src/core/sched_ctx.c

@@ -22,119 +22,79 @@ extern struct worker_collection worker_list;
 
 pthread_key_t sched_ctx_key;
 
-struct sched_ctx_info {
-	unsigned sched_ctx_id;
-	struct starpu_sched_ctx *sched_ctx;
-	struct starpu_worker_s *worker;
-};
-
 static unsigned _starpu_get_first_free_sched_ctx(struct starpu_machine_config_s *config);
 static unsigned _starpu_worker_get_first_free_sched_ctx(struct starpu_worker_s *worker);
 
 static unsigned _starpu_worker_get_sched_ctx_id(struct starpu_worker_s *worker, unsigned sched_ctx_id);
 
-static void change_worker_sched_ctx( struct starpu_worker_s *worker, struct starpu_sched_ctx *sched_ctx, unsigned sched_ctx_id)
+static void change_worker_sched_ctx(unsigned sched_ctx_id)
 {
-	if(sched_ctx != NULL)
+	int workerid = starpu_worker_get_id();
+	struct starpu_worker_s *worker = _starpu_get_worker_struct(workerid);
+
+	int worker_sched_ctx_id = _starpu_worker_get_sched_ctx_id(worker, sched_ctx_id);
+	/* if the ctx is not in the worker's list it means the update concerns the addition of ctxs*/
+	if(worker_sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
 	{
+		worker_sched_ctx_id = _starpu_worker_get_first_free_sched_ctx(worker);
+		struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 		/* add context to worker */
-		worker->sched_ctx[sched_ctx_id] = sched_ctx;
+		worker->sched_ctx[worker_sched_ctx_id] = sched_ctx;
 		worker->nsched_ctxs++;
 	}
-	else
+	else 
 	{
 		/* remove context from worker */
-		worker->sched_ctx[sched_ctx_id]->sched_mutex[worker->workerid] = NULL;
-		worker->sched_ctx[sched_ctx_id]->sched_cond[worker->workerid] = NULL;
-		worker->sched_ctx[sched_ctx_id] = NULL;
+		if(worker->sched_ctx[worker_sched_ctx_id]->sched_policy)
+			worker->sched_ctx[worker_sched_ctx_id]->sched_policy->remove_workers(sched_ctx_id, &worker->workerid, 1);
+		worker->sched_ctx[worker_sched_ctx_id] = NULL;
 		worker->nsched_ctxs--;
 	}
-
 }
 
 static void update_workers_func(void *buffers[] __attribute__ ((unused)), void *_args)
 {
-	struct sched_ctx_info *sched_ctx_info_args = (struct sched_ctx_info*)_args;
-	struct starpu_worker_s *worker = sched_ctx_info_args->worker;
-	struct starpu_sched_ctx *current_sched_ctx = sched_ctx_info_args->sched_ctx;
-	unsigned sched_ctx_id = sched_ctx_info_args->sched_ctx_id;
-	change_worker_sched_ctx(worker, current_sched_ctx, sched_ctx_id);
+	int sched_ctx_id = (int)(uintptr_t)_args;
+	change_worker_sched_ctx(sched_ctx_id);
 }
 
 struct starpu_codelet_t sched_ctx_info_cl = {
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cuda_func = update_workers_func,
 	.cpu_func = update_workers_func,
-		.opencl_func = update_workers_func,
+	.opencl_func = update_workers_func,
 	.nbuffers = 0
 };
 
-static void _starpu_update_workers(int *workerids, int nworkers, 
-				   int sched_ctx_id, struct starpu_sched_ctx *sched_ctx)
+static void _starpu_update_workers(int *workerids, int nworkers, int sched_ctx_id)
 {
-	struct starpu_task *tasks[nworkers];
-
 	int i, ret;
 	struct starpu_worker_s *worker[nworkers];
-	struct sched_ctx_info sched_info_args[nworkers];
  	struct starpu_worker_s *curr_worker = _starpu_get_local_worker_key();
 
-	int worker_sched_ctx_id = -1;
 	for(i = 0; i < nworkers; i++)
 	{
 		worker[i] = _starpu_get_worker_struct(workerids[i]);
-		worker_sched_ctx_id = sched_ctx_id == -1  ? 
-			_starpu_worker_get_first_free_sched_ctx(worker[i]) : 
-			_starpu_worker_get_sched_ctx_id(worker[i], sched_ctx_id);
 
 		/* if the current thread requires the resize there is no need
 		   to send itself a message in order to change its
 		   sched_ctx info */
 		if(curr_worker && curr_worker == worker[i])
-		{
-			change_worker_sched_ctx(curr_worker, sched_ctx, worker_sched_ctx_id);
-			tasks[i] = NULL;
-		}
+			change_worker_sched_ctx(sched_ctx_id);
 		else
-		{
-			
-			sched_info_args[i].sched_ctx_id = worker_sched_ctx_id;
-			
-			sched_info_args[i].sched_ctx = sched_ctx;
-			sched_info_args[i].worker = worker[i];
-			
-			tasks[i] = starpu_task_create();
-			tasks[i]->cl = &sched_ctx_info_cl;
-			tasks[i]->cl_arg = &sched_info_args[i];
-			tasks[i]->execute_on_a_specific_worker = 1;
-			tasks[i]->workerid = workerids[i];
-			tasks[i]->detach = 0;
-			tasks[i]->destroy = 0;
-			
-			_starpu_exclude_task_from_dag(tasks[i]);
-			
-			ret = _starpu_task_submit_internal(tasks[i]);
-			if (ret == -ENODEV)
-			{
-				/* if the worker is not able to execute this tasks, we
-				 * don't insist as this means the worker is not
-				 * designated by the "where" bitmap */
-				starpu_task_destroy(tasks[i]);
-				tasks[i] = NULL;
-			}
+		{			
+			worker[i]->tasks[sched_ctx_id] = starpu_task_create();
+			worker[i]->tasks[sched_ctx_id]->cl = &sched_ctx_info_cl;
+			worker[i]->tasks[sched_ctx_id]->cl_arg = (void*)(uintptr_t)sched_ctx_id;
+			worker[i]->tasks[sched_ctx_id]->execute_on_a_specific_worker = 1;
+			worker[i]->tasks[sched_ctx_id]->workerid = workerids[i];
+			worker[i]->tasks[sched_ctx_id]->destroy = 1;
+
+			_starpu_exclude_task_from_dag(worker[i]->tasks[sched_ctx_id]);
 			
+			ret = _starpu_task_submit_internal(worker[i]->tasks[sched_ctx_id]);
 		}		
 	}
-
-	for (i = 0; i < nworkers; i++)
-	{
-		if (tasks[i])
-		{
-			ret = starpu_task_wait(tasks[i]);
-			STARPU_ASSERT(!ret);
-			starpu_task_destroy(tasks[i]);
-		}
-	}
 }
 
 
@@ -210,7 +170,7 @@ struct starpu_sched_ctx*  _starpu_create_sched_ctx(const char *policy_name, int
   
 	PTHREAD_MUTEX_INIT(&sched_ctx->changing_ctx_mutex, NULL);
 
-	sched_ctx->sched_policy = malloc(sizeof(struct starpu_sched_policy_s));
+	sched_ctx->sched_policy = (struct starpu_sched_policy_s*)malloc(sizeof(struct starpu_sched_policy_s));
 	sched_ctx->is_initial_sched = is_initial_sched;
 	sched_ctx->name = sched_name;
 
@@ -253,7 +213,7 @@ unsigned starpu_create_sched_ctx(const char *policy_name, int *workerids,
 {
 	struct starpu_sched_ctx *sched_ctx = _starpu_create_sched_ctx(policy_name, workerids, nworkers_ctx, 0, sched_name);
 
-	_starpu_update_workers(sched_ctx->workers->workerids, sched_ctx->workers->nworkers, -1, sched_ctx);
+	_starpu_update_workers(sched_ctx->workers->workerids, sched_ctx->workers->nworkers, sched_ctx->id);
 	return sched_ctx->id;
 }
 
@@ -273,16 +233,18 @@ unsigned starpu_create_sched_ctx_with_criteria(const char *policy_name, int *wor
 /* free all structures for the context */
 static void free_sched_ctx_mem(struct starpu_sched_ctx *sched_ctx)
 {
-	/* just for debug in order to seg fault if we use these structures after del */
-	sched_ctx->sched_policy = NULL;
-	sched_ctx->sched_mutex = NULL;
-	sched_ctx->sched_cond = NULL;
 	sched_ctx->workers->deinit(sched_ctx->workers);
 
 	free(sched_ctx->workers);
 	free(sched_ctx->sched_policy);
 	free(sched_ctx->sched_mutex);
 	free(sched_ctx->sched_cond);
+
+	sched_ctx->workers = NULL;
+	sched_ctx->sched_policy = NULL;
+	sched_ctx->sched_mutex = NULL;
+	sched_ctx->sched_cond = NULL;
+
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
 	config->topology.nsched_ctxs--;
 	sched_ctx->id = STARPU_NMAX_SCHED_CTXS;
@@ -295,7 +257,7 @@ void starpu_delete_sched_ctx(unsigned sched_ctx_id, unsigned inheritor_sched_ctx
 	struct starpu_sched_ctx *inheritor_sched_ctx = _starpu_get_sched_ctx_struct(inheritor_sched_ctx_id);
 
 	PTHREAD_MUTEX_LOCK(&sched_ctx->changing_ctx_mutex);
-	_starpu_update_workers(sched_ctx->workers->workerids, sched_ctx->workers->nworkers, sched_ctx->id, NULL);
+	_starpu_update_workers(sched_ctx->workers->workerids, sched_ctx->workers->nworkers, sched_ctx->id);
 	PTHREAD_MUTEX_UNLOCK(&sched_ctx->changing_ctx_mutex);
 
 	/* if both of them have all the resources this is pointless */
@@ -303,7 +265,7 @@ void starpu_delete_sched_ctx(unsigned sched_ctx_id, unsigned inheritor_sched_ctx
 	struct starpu_machine_config_s *config = (struct starpu_machine_config_s *)_starpu_get_machine_config();
 	int nworkers = config->topology.nworkers;
 
-	if(!(sched_ctx->workers->nworkers == nworkers && sched_ctx->workers->nworkers == inheritor_sched_ctx->workers->nworkers))
+	if(!(sched_ctx->workers->nworkers == nworkers && sched_ctx->workers->nworkers == inheritor_sched_ctx->workers->nworkers) && sched_ctx->workers->nworkers > 0)
 		starpu_add_workers_to_sched_ctx(sched_ctx->workers->workerids, sched_ctx->workers->nworkers, inheritor_sched_ctx_id);
 	
 	if(!starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx_id))
@@ -361,7 +323,7 @@ void starpu_add_workers_to_sched_ctx(int *workers_to_add, int nworkers_to_add,
 	PTHREAD_MUTEX_UNLOCK(&sched_ctx->changing_ctx_mutex);
 
 	if(n_added_workers > 0)
-		_starpu_update_workers(added_workers, n_added_workers, -1, sched_ctx);
+		_starpu_update_workers(added_workers, n_added_workers, sched_ctx->id);
        
 	return;
 }
@@ -380,11 +342,7 @@ void starpu_remove_workers_from_sched_ctx(int *workers_to_remove, int nworkers_t
 	_starpu_remove_workers_from_sched_ctx(sched_ctx, workers_to_remove, nworkers_to_remove, removed_workers, &n_removed_workers);
 	PTHREAD_MUTEX_UNLOCK(&sched_ctx->changing_ctx_mutex);	
 	if(n_removed_workers > 0)
-	{
-		_starpu_update_workers(removed_workers, n_removed_workers, sched_ctx->id, NULL);
-		sched_ctx->sched_policy->remove_workers(sched_ctx_id, removed_workers, n_removed_workers);
-	} 
-
+		_starpu_update_workers(removed_workers, n_removed_workers, sched_ctx->id);
        
 	return;
 }
@@ -439,11 +397,18 @@ static unsigned _starpu_worker_get_first_free_sched_ctx(struct starpu_worker_s *
 
 static unsigned _starpu_worker_get_sched_ctx_id(struct starpu_worker_s *worker, unsigned sched_ctx_id)
 {
+	unsigned to_be_deleted = STARPU_NMAX_SCHED_CTXS;
 	unsigned i;
 	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
-		if(worker->sched_ctx[i] != NULL && worker->sched_ctx[i]->id == sched_ctx_id)
-			return i;
-	STARPU_ASSERT(0);
+		if(worker->sched_ctx[i] != NULL)
+		{
+			if(worker->sched_ctx[i]->id == sched_ctx_id)
+				return i;
+			else if(worker->sched_ctx[i]->id == STARPU_NMAX_SCHED_CTXS)
+				to_be_deleted = i;
+		}
+
+	/* a little bit of a hack, be careful */
+	if(to_be_deleted != STARPU_NMAX_SCHED_CTXS)
+		return to_be_deleted;
 	return STARPU_NMAX_SCHED_CTXS;
 }
 
@@ -611,7 +576,6 @@ void starpu_create_worker_collection_for_sched_ctx(unsigned sched_ctx_id, int wo
 		break;
 	}
 
-
 	return;
 }
 
@@ -633,3 +597,34 @@ unsigned starpu_get_nworkers_of_sched_ctx(unsigned sched_ctx_id)
 	return sched_ctx->workers->nworkers;
 
 }
+
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+void starpu_call_poped_task_cb(int workerid)
+{
+	struct starpu_worker_s *worker =  _starpu_get_worker_struct(workerid);
+	unsigned i;
+	struct starpu_sched_ctx *sched_ctx = NULL;
+	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+	{
+		sched_ctx = worker->sched_ctx[i];
+		if(sched_ctx != NULL && sched_ctx->id != 0 && sched_ctx->id != STARPU_NMAX_SCHED_CTXS
+		   && sched_ctx->criteria != NULL)
+			sched_ctx->criteria->poped_task_cb(sched_ctx->id, worker->workerid);
+	}
+	
+}
+
+void starpu_call_pushed_task_cb(int workerid)
+{
+	struct starpu_worker_s *worker =  _starpu_get_worker_struct(workerid);
+	unsigned i;
+	struct starpu_sched_ctx *sched_ctx = NULL;
+	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+	{
+		sched_ctx = worker->sched_ctx[i];
+		if(sched_ctx != NULL && sched_ctx->id != 0  && sched_ctx->criteria != NULL)
+			sched_ctx->criteria->pushed_task_cb(sched_ctx->id, workerid);
+	}
+
+}
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR

+ 10 - 8
src/core/sched_policy.c

@@ -367,7 +367,7 @@ struct starpu_task *_starpu_pop_task(struct starpu_worker_s *worker)
 			}
 		}
 	  }
-	
+
 	/* Note that we may get a NULL task in case the scheduler was unlocked
 	 * for some reason. */
 	if (profiling && task)
@@ -397,13 +397,8 @@ struct starpu_task *_starpu_pop_task(struct starpu_worker_s *worker)
 		for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
 		{
 			sched_ctx = worker->sched_ctx[i];
-			if(sched_ctx != NULL && sched_ctx->id != 0)
-			{
-				if(sched_ctx != NULL && sched_ctx->criteria != NULL)
-				{
-					sched_ctx->criteria->idle_time_cb(sched_ctx->id, worker->workerid, 1.0);
-				}
-			}
+			if(sched_ctx != NULL && sched_ctx->id != 0 && sched_ctx->criteria != NULL)
+				sched_ctx->criteria->idle_time_cb(sched_ctx->id, worker->workerid, 1.0);
 		}
 	}
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
@@ -422,6 +417,13 @@ struct starpu_task *_starpu_pop_every_task(struct starpu_sched_ctx *sched_ctx)
 void _starpu_sched_post_exec_hook(struct starpu_task *task)
 {
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
+
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	if(task->checkpoint > 0 && sched_ctx != NULL && 
+	   sched_ctx->id != 0 && sched_ctx->criteria != NULL)
+		sched_ctx->criteria->post_exec_hook_cb(sched_ctx->id, task->checkpoint);
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
+
 	if (sched_ctx->sched_policy->post_exec_hook)
 		sched_ctx->sched_policy->post_exec_hook(task);
 }

+ 2 - 0
src/core/task.c

@@ -83,6 +83,8 @@ void starpu_task_init(struct starpu_task *task)
 	task->sched_ctx = _starpu_get_initial_sched_ctx()->id;
 	
 	task->control_task = 0;
+
+	task->checkpoint = 0;
 }
 
 /* Free all the ressources allocated for a task, without deallocating the task

+ 1 - 1
src/core/workers.h

@@ -82,8 +82,8 @@ struct starpu_worker_s {
 
 	struct starpu_sched_ctx **sched_ctx;
 	unsigned nsched_ctxs; /* the no of contexts a worker belongs to*/
-
 	struct _starpu_barrier_counter_t tasks_barrier; /* wait for the tasks submitted */
+	struct starpu_task *tasks[STARPU_NMAX_SCHED_CTXS];
        
 	unsigned has_prev_init; /* had already been inited in another ctx */
 #ifdef __GLIBC__

+ 15 - 2
src/sched_policies/heft.c

@@ -139,6 +139,11 @@ static void heft_post_exec_hook(struct starpu_task *task)
 	/* Once we have executed the task, we can update the predicted amount
 	 * of work. */
 	PTHREAD_MUTEX_LOCK(sched_mutex);
+
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	starpu_call_poped_task_cb(workerid);
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
+
 	exp_len[workerid] -= model;
 	exp_start[workerid] = starpu_timing_now() + model;
 	exp_end[workerid] = exp_start[workerid] + exp_len[workerid];
@@ -162,6 +167,10 @@ static void heft_push_task_notify(struct starpu_task *task, int workerid)
 	/* Update the predictions */
 	PTHREAD_MUTEX_LOCK(sched_mutex);
 
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	starpu_call_pushed_task_cb(workerid);
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
+
 	/* Sometimes workers didn't take the tasks as early as we expected */
 	exp_start[workerid] = STARPU_MAX(exp_start[workerid], starpu_timing_now());
 	exp_end[workerid] = STARPU_MAX(exp_start[workerid], starpu_timing_now());
@@ -184,11 +193,17 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	/* make sure someone could execute that task! */
 	STARPU_ASSERT(best_workerid != -1);
 
+	_starpu_increment_nsubmitted_tasks_of_worker(best_workerid);
+
 	pthread_mutex_t *sched_mutex;
 	pthread_cond_t *sched_cond;
 	starpu_worker_get_sched_condition(sched_ctx_id, best_workerid, &sched_mutex, &sched_cond);
 
 	PTHREAD_MUTEX_LOCK(sched_mutex);
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	starpu_call_pushed_task_cb(best_workerid);
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
+
 	exp_end[best_workerid] += predicted;
 	exp_len[best_workerid] += predicted;
 	ntasks[best_workerid]++;
@@ -357,7 +372,6 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	/* If there is no prediction available for that task with that arch we
 	 * want to speed-up calibration time so we force this measurement */
 	if (forced_best != -1){
-		_starpu_increment_nsubmitted_tasks_of_worker(forced_best);
 		return push_task_on_best_worker(task, forced_best, 0.0, prio, sched_ctx_id);
 	}
 	
@@ -433,7 +447,6 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	if(workers->init_cursor)
 		workers->deinit_cursor(workers);
 
-	_starpu_increment_nsubmitted_tasks_of_worker(best);
 	return push_task_on_best_worker(task, best, model_best, prio, sched_ctx_id);
 }
 

+ 4 - 0
src/util/starpu_insert_task_utils.c

@@ -193,6 +193,10 @@ int _starpu_insert_task_create_and_submit(char *arg_buffer, starpu_codelet *cl,
 		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
 			va_arg(varg_list, starpu_data_handle);
 		}
+		else if (arg_type==STARPU_CHECKPOINT) {
+			int checkpoint = va_arg(varg_list, int);
+			(*task)->checkpoint = checkpoint;
+		}
 	}
 
 	va_end(varg_list);
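
With the STARPU_CHECKPOINT argument handled above, an application can tag a task at submission time; when that task completes, _starpu_sched_post_exec_hook() forwards the checkpoint id to the hypervisor, which applies any configuration stored for it via HYPERVISOR_TIME_TO_APPLY. A minimal, hypothetical call (my_codelet and my_handle are placeholders, not part of this commit):

/* tag the task with checkpoint id 2: on completion,
 * post_exec_hook_cb(sched_ctx, 2) is called and the deferred
 * hypervisor configuration registered for checkpoint 2 is applied */
starpu_insert_task(&my_codelet,
		   STARPU_RW, my_handle,
		   STARPU_CHECKPOINT, 2,
		   0);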