
Hypervisor tagged tasks + bug fixing

Andra Hugo 14 years ago
parent
commit
bfd7f5a773

+ 5 - 0
include/starpu_scheduler.h

@@ -130,10 +130,15 @@ struct worker_collection {
 struct starpu_sched_ctx_hypervisor_criteria {
 	void (*idle_time_cb)(unsigned sched_ctx, int worker, double idle_time);
 	void (*working_time_cb)(unsigned sched_ctx, double working_time);
+	void (*pushed_task_cb)(unsigned sched_ctx, int worker);
+	void (*poped_task_cb)(unsigned sched_ctx, int worker);
+	void (*post_exec_hook_cb)(unsigned sched_ctx, int taskid);
 };
 
 #ifdef STARPU_BUILD_SCHED_CTX_HYPERVISOR
 unsigned starpu_create_sched_ctx_with_criteria(const char *policy_name, int *workerids_ctx, int nworkers_ctx, const char *sched_name, struct starpu_sched_ctx_hypervisor_criteria *criteria);
+void starpu_call_poped_task_cb(int workerid);
+void starpu_call_pushed_task_cb(int workerid);
 #endif //STARPU_BUILD_SCHED_CTX_HYPERVISOR
 
 unsigned starpu_create_sched_ctx(const char *policy_name, int *workerids_ctx, int nworkers_ctx, const char *sched_name);
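The three new callbacks give a resizing hypervisor visibility into task push, pop and post-execution events, in addition to the existing idle/working-time notifications, and starpu_call_pushed_task_cb()/starpu_call_poped_task_cb() are the entry points a scheduling policy uses to trigger them. A minimal sketch of how the pieces could fit together, assuming STARPU_BUILD_SCHED_CTX_HYPERVISOR is defined; the callback bodies and the policy-side helper are illustrative only (in this commit the real criteria structure is produced by sched_ctx_hypervisor_init(), as the example code further down shows):

#ifdef STARPU_BUILD_SCHED_CTX_HYPERVISOR
/* Hypothetical hypervisor-side handlers updated by the callbacks. */
static void my_pushed_task_cb(unsigned sched_ctx, int worker)
{
	/* e.g. record that one more task was queued on this worker */
}

static void my_poped_task_cb(unsigned sched_ctx, int worker)
{
	/* e.g. record that a task left this worker's queue */
}

static void my_post_exec_hook_cb(unsigned sched_ctx, int taskid)
{
	/* e.g. check whether taskid is a checkpoint and resize the contexts */
}

static struct starpu_sched_ctx_hypervisor_criteria my_criteria = {
	.idle_time_cb      = NULL,
	.working_time_cb   = NULL,
	.pushed_task_cb    = my_pushed_task_cb,
	.poped_task_cb     = my_poped_task_cb,
	.post_exec_hook_cb = my_post_exec_hook_cb,
};

/* A context created with these criteria receives the notifications:
 * unsigned ctx = starpu_create_sched_ctx_with_criteria("heft", workerids,
 *                                                      nworkers, "ctx", &my_criteria);
 * and a scheduling policy would notify the hypervisor from its push/pop paths: */
static void notify_hypervisor(int workerid, int pushed)
{
	if (pushed)
		starpu_call_pushed_task_cb(workerid);
	else
		starpu_call_poped_task_cb(workerid);
}
#endif /* STARPU_BUILD_SCHED_CTX_HYPERVISOR */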

+ 4 - 1
include/starpu_task.h

@@ -182,6 +182,8 @@ struct starpu_task {
 	/* flag to differentiate tasks needed by starpu management purposes 
 	 from the ones provided by the appl*/
 	unsigned control_task;
+
+	int checkpoint;
 };
 
 /* It is possible to initialize statically allocated tasks with this value.
@@ -207,7 +209,8 @@ struct starpu_task {
 	.predicted = -1.0,				\
 	.starpu_private = NULL,				\
 	.sched_ctx = 0,					\
-	.control_task = 0				\
+	.control_task = 0,				\
+		.checkpoint = 0				\
 };
 
 /*

+ 1 - 1
include/starpu_util.h

@@ -239,7 +239,7 @@ int starpu_data_cpy(starpu_data_handle dst_handle, starpu_data_handle src_handle
 #define STARPU_PRIORITY		(1<<7)	/* Priority associated to the task */
 #define STARPU_EXECUTE_ON_NODE	(1<<8)	/* Used by MPI to define which task is going to execute the codelet */
 #define STARPU_EXECUTE_ON_DATA	(1<<9)	/* Used by MPI to define which task is going to execute the codelet */
-#define STARPU_CTX		(1<<10)	/* Used to define which ctx will execute the codelet */
+#define STARPU_CHECKPOINT	(1<<10)	/* Used to mark a task as a checkpoint after whose execution some code will be executed */
 
 /* Wrapper to create a task. */
 int starpu_insert_task(starpu_codelet *cl, ...);
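STARPU_CHECKPOINT replaces STARPU_CTX at bit 10 and is passed to starpu_insert_task() as a (flag, int) pair, which presumably fills the new checkpoint field of the generated task; the Cholesky example added in this commit uses it to mark the tasks at which the hypervisor should reconfigure itself. A condensed sketch of that usage, assuming a codelet cl11, a handle sdatakk and the set_hypervisor_conf()/START_BENCH helpers from the example code further down:

static int checkpoint_id = 1;

starpu_insert_task(&cl11,
		   STARPU_PRIORITY, STARPU_MAX_PRIO,
		   STARPU_RW, sdatakk,
		   STARPU_CHECKPOINT, checkpoint_id,
		   0);

/* tell the hypervisor which configuration to apply at this checkpoint */
set_hypervisor_conf(START_BENCH, checkpoint_id++);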

+ 139 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky.h

@@ -0,0 +1,139 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_CHOLESKY_H__
+#define __DW_CHOLESKY_H__
+
+#include <limits.h>
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+#ifdef STARPU_USE_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cublas.h>
+#endif
+
+#include <common/blas.h>
+#include <starpu.h>
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+#define NMAXBLOCKS	32
+
+#define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
+#define TAG21(k,j)	((starpu_tag_t)(((3ULL<<60) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG22(k,i,j)	((starpu_tag_t)(((4ULL<<60) | ((unsigned long long)(k)<<32) 	\
+					| ((unsigned long long)(i)<<16)	\
+					| (unsigned long long)(j))))
+
+#define TAG11_AUX(k, prefix)	((starpu_tag_t)( (((unsigned long long)(prefix))<<60)  |  (1ULL<<56) | (unsigned long long)(k)))
+#define TAG21_AUX(k,j, prefix)	((starpu_tag_t)( (((unsigned long long)(prefix))<<60)  			\
+					|  ((3ULL<<56) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG22_AUX(k,i,j, prefix)    ((starpu_tag_t)(  (((unsigned long long)(prefix))<<60)	\
+					|  ((4ULL<<56) | ((unsigned long long)(k)<<32)  	\
+					| ((unsigned long long)(i)<<16) 			\
+					| (unsigned long long)(j))))
+
+#define BLOCKSIZE	(size/nblocks)
+
+#define BLAS3_FLOP(n1,n2,n3)    \
+        (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
+
+static unsigned size = 4*1024;
+static unsigned nblocks = 16;
+static unsigned nbigblocks = 8;
+static unsigned pinned = 0;
+static unsigned noprio = 0;
+static unsigned check = 0;
+static unsigned with_ctxs = 0;
+static unsigned with_noctxs = 0;
+static unsigned chole1 = 0;
+static unsigned chole2 = 0;
+
+void chol_cpu_codelet_update_u11(void **, void *);
+void chol_cpu_codelet_update_u21(void **, void *);
+void chol_cpu_codelet_update_u22(void **, void *);
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u11(void *descr[], void *_args);
+void chol_cublas_codelet_update_u21(void *descr[], void *_args);
+void chol_cublas_codelet_update_u22(void *descr[], void *_args);
+#endif
+
+extern struct starpu_perfmodel_t chol_model_11;
+extern struct starpu_perfmodel_t chol_model_21;
+extern struct starpu_perfmodel_t chol_model_22;
+
+static void __attribute__((unused)) parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-with_ctxs") == 0) {
+			with_ctxs = 1;
+			break;
+		}
+		if (strcmp(argv[i], "-with_noctxs") == 0) {
+			with_noctxs = 1;
+			break;
+		}
+		
+		if (strcmp(argv[i], "-chole1") == 0) {
+			chole1 = 1;
+			break;
+		}
+
+		if (strcmp(argv[i], "-chole2") == 0) {
+			chole2 = 1;
+			break;
+		}
+
+		if (strcmp(argv[i], "-size") == 0) {
+			char *argptr;
+			size = strtol(argv[++i], &argptr, 10);
+		}
+		
+		if (strcmp(argv[i], "-nblocks") == 0) {
+			char *argptr;
+			nblocks = strtol(argv[++i], &argptr, 10);
+		}
+		
+		if (strcmp(argv[i], "-nbigblocks") == 0) {
+			char *argptr;
+			nbigblocks = strtol(argv[++i], &argptr, 10);
+		}
+		
+		if (strcmp(argv[i], "-pin") == 0) {
+			pinned = 1;
+		}
+		
+		if (strcmp(argv[i], "-no-prio") == 0) {
+			noprio = 1;
+		}
+		
+		if (strcmp(argv[i], "-check") == 0) {
+			check = 1;
+		}
+		
+		if (strcmp(argv[i], "-h") == 0) {
+			printf("usage : %s [-pin] [-size size] [-nblocks nblocks] [-check]\n", argv[0]);
+		}	
+	}	
+}
+
+#endif /* __DW_CHOLESKY_H__ */

+ 380 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky_grain_tag.c

@@ -0,0 +1,380 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "cholesky.h"
+
+/*
+ *	Some useful functions
+ */
+
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *task = starpu_task_create();
+		task->cl_arg = NULL;
+		task->use_tag = 1;
+		task->tag_id = id;
+
+	return task;
+}
+
+/*
+ *	Create the codelets
+ */
+
+static starpu_codelet cl11 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_func = chol_cpu_codelet_update_u11,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u11,
+#endif
+	.nbuffers = 1,
+	.model = &chol_model_11
+};
+
+static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k, unsigned reclevel)
+{
+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
+
+	struct starpu_task *task = create_task(TAG11_AUX(k, reclevel));
+	
+	task->cl = &cl11;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k);
+	task->buffers[0].mode = STARPU_RW;
+
+	/* this is an important task */
+	task->priority = STARPU_MAX_PRIO;
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG11_AUX(k, reclevel), 1, TAG22_AUX(k-1, k, k, reclevel));
+	}
+
+	return task;
+}
+
+static starpu_codelet cl21 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_func = chol_cpu_codelet_update_u21,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u21,
+#endif
+	.nbuffers = 2,
+	.model = &chol_model_21
+};
+
+static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j, unsigned reclevel)
+{
+	struct starpu_task *task = create_task(TAG21_AUX(k, j, reclevel));
+
+	task->cl = &cl21;	
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k); 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
+	task->buffers[1].mode = STARPU_RW;
+
+	if (j == k+1) {
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 2, TAG11_AUX(k, reclevel), TAG22_AUX(k-1, k, j, reclevel));
+	}
+	else {
+		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 1, TAG11_AUX(k, reclevel));
+	}
+
+	starpu_task_submit(task);
+}
+
+static starpu_codelet cl22 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_func = chol_cpu_codelet_update_u22,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u22,
+#endif
+	.nbuffers = 3,
+	.model = &chol_model_22
+};
+
+static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
+{
+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j)); */
+
+	struct starpu_task *task = create_task(TAG22_AUX(k, i, j, reclevel));
+
+	task->cl = &cl22;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, i); 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
+	task->buffers[1].mode = STARPU_R;
+	task->buffers[2].handle = starpu_data_get_sub_data(dataA, 2, i, j); 
+	task->buffers[2].mode = STARPU_RW;
+
+	if ( (i == k + 1) && (j == k +1) ) {
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 3, TAG22_AUX(k-1, i, j, reclevel), TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
+	}
+	else {
+		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 2, TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
+	}
+
+	starpu_task_submit(task);
+}
+
+
+
+/*
+ *	code to bootstrap the factorization 
+ *	and construct the DAG
+ */
+
+static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned reclevel)
+{
+	/* create a new codelet */
+	struct starpu_task *entry_task = NULL;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	starpu_data_handle dataA;
+
+	/* monitor and partition the A matrix into blocks :
+	 * one block is now determined by 2 unsigned (i,j) */
+	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
+
+	starpu_data_set_sequential_consistency_flag(dataA, 0);
+
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	starpu_data_map_filters(dataA, 2, &f, &f2);
+
+	for (k = 0; k < nbigblocks; k++)
+	{
+		struct starpu_task *task = create_task_11(dataA, k, reclevel);
+		/* we defer the launch of the first task */
+		if (k == 0) {
+			entry_task = task;
+		}
+		else {
+			starpu_task_submit(task);
+		}
+		
+		for (j = k+1; j<nblocks; j++)
+		{
+			create_task_21(dataA, k, j, reclevel);
+
+			for (i = k+1; i<nblocks; i++)
+			{
+				if (i <= j)
+					create_task_22(dataA, k, i, j, reclevel);
+			}
+		}
+	}
+
+	/* schedule the codelet */
+	int ret = starpu_task_submit(entry_task);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+		FPRINTF(stderr, "No worker may execute this task\n");
+		exit(-1);
+	}
+
+	if (nblocks == nbigblocks)
+	{
+		/* stall the application until the end of computations */
+		starpu_tag_wait(TAG11_AUX(nblocks-1, reclevel));
+		starpu_data_unpartition(dataA, 0);
+		return;
+	}
+	else {
+		STARPU_ASSERT(reclevel == 0);
+		unsigned ndeps_tags = (nblocks - nbigblocks)*(nblocks - nbigblocks);
+
+		starpu_tag_t *tag_array = malloc(ndeps_tags*sizeof(starpu_tag_t));
+		STARPU_ASSERT(tag_array);
+
+		unsigned ind = 0;
+		for (i = nbigblocks; i < nblocks; i++)
+		for (j = nbigblocks; j < nblocks; j++)
+		{
+			if (i <= j)
+				tag_array[ind++] = TAG22_AUX(nbigblocks - 1, i, j, reclevel);
+		}
+
+		starpu_tag_wait_array(ind, tag_array);
+
+		free(tag_array);
+
+		starpu_data_unpartition(dataA, 0);
+		starpu_data_unregister(dataA);
+
+		float *newmatA = &matA[nbigblocks*(size/nblocks)*(ld+1)];
+
+		cholesky_grain_rec(newmatA, size/nblocks*(nblocks - nbigblocks), ld, (nblocks - nbigblocks)*2, (nblocks - nbigblocks)*2, reclevel+1);
+	}
+}
+
+static void initialize_system(float **A, unsigned dim, unsigned pinned)
+{
+	starpu_init(NULL);
+
+	starpu_helper_cublas_init();
+
+	if (pinned)
+	{
+		starpu_malloc((void **)A, dim*dim*sizeof(float));
+	} 
+	else {
+		*A = malloc(dim*dim*sizeof(float));
+	}
+}
+
+void cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks)
+{
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	cholesky_grain_rec(matA, size, ld, nblocks, nbigblocks, 0);
+
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
+
+	double flop = (1.0f*size*size*size)/3.0f;
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+
+	starpu_helper_cublas_shutdown();
+
+	starpu_shutdown();
+}
+
+int main(int argc, char **argv)
+{
+	/* create a simple definite positive symetric matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	parse_args(argc, argv);
+
+	float *mat;
+
+	mat = malloc(size*size*sizeof(float));
+	initialize_system(&mat, size, pinned);
+
+	unsigned i,j;
+	for (i = 0; i < size; i++)
+	{
+		for (j = 0; j < size; j++)
+		{
+			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
+		}
+	}
+
+
+#ifdef CHECK_OUTPUT
+	FPRINTF(stdout, "Input :\n");
+
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+			}
+			else {
+				FPRINTF(stdout, ".\t");
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+
+	cholesky_grain(mat, size, size, nblocks, nbigblocks);
+
+#ifdef CHECK_OUTPUT
+	FPRINTF(stdout, "Results :\n");
+
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+			}
+			else {
+				FPRINTF(stdout, ".\t");
+				mat[j+i*size] = 0.0f; /* debug */
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+
+	FPRINTF(stderr, "compute explicit LLt ...\n");
+	float *test_mat = malloc(size*size*sizeof(float));
+	STARPU_ASSERT(test_mat);
+
+	SSYRK("L", "N", size, size, 1.0f, 
+				mat, size, 0.0f, test_mat, size);
+
+	FPRINTF(stderr, "comparing results ...\n");
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+                                FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
+			}
+			else {
+				FPRINTF(stdout, ".\t");
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+	return 0;
+}

+ 337 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky_implicit.c

@@ -0,0 +1,337 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "cholesky.h"
+#include "../sched_ctx_utils/sched_ctx_utils.h"
+/*
+ *	Create the codelets
+ */
+
+static starpu_codelet cl11 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.type = STARPU_SEQ,
+	.cpu_func = chol_cpu_codelet_update_u11,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u11,
+#endif
+	.nbuffers = 1,
+	.model = &chol_model_11
+};
+
+static starpu_codelet cl21 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.type = STARPU_SEQ,
+	.cpu_func = chol_cpu_codelet_update_u21,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u21,
+#endif
+	.nbuffers = 2,
+	.model = &chol_model_21
+};
+
+static starpu_codelet cl22 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.type = STARPU_SEQ,
+	.max_parallelism = INT_MAX,
+	.cpu_func = chol_cpu_codelet_update_u22,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u22,
+#endif
+	.nbuffers = 3,
+	.model = &chol_model_22
+};
+
+/*
+ *	code to bootstrap the factorization
+ *	and construct the DAG
+ */
+
+static void callback_turn_spmd_on(void *arg __attribute__ ((unused)))
+{
+	cl22.type = STARPU_SPMD;
+}
+
+int checkpoints = 1;
+static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
+{
+	struct timeval start;
+	struct timeval end;
+
+	unsigned i,j,k;
+
+	int prio_level = noprio?STARPU_DEFAULT_PRIO:STARPU_MAX_PRIO;
+
+	gettimeofday(&start, NULL);
+
+	/* create all the DAG nodes */
+	for (k = 0; k < nblocks; k++)
+	{
+                starpu_data_handle sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
+		if(k == 0 && with_ctxs)
+		{
+			starpu_insert_task(&cl11,
+					   STARPU_PRIORITY, prio_level,
+					   STARPU_RW, sdatakk,
+					   STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
+					   STARPU_CHECKPOINT, checkpoints,
+					   0);
+			set_hypervisor_conf(START_BENCH, checkpoints++);
+		}
+		else
+			starpu_insert_task(&cl11,
+					   STARPU_PRIORITY, prio_level,
+					   STARPU_RW, sdatakk,
+					   STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
+					   0);
+
+		for (j = k+1; j<nblocks; j++)
+		{
+                        starpu_data_handle sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
+
+                        starpu_insert_task(&cl21,
+                                           STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
+                                           STARPU_R, sdatakk,
+                                           STARPU_RW, sdatakj,
+                                           0);
+
+			for (i = k+1; i<nblocks; i++)
+			{
+				if (i <= j)
+                                {
+					starpu_data_handle sdataki = starpu_data_get_sub_data(dataA, 2, k, i);
+					starpu_data_handle sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
+
+					if(k == (nblocks-1) && j == (nblocks-1) &&
+					   i == (nblocks-1) && with_ctxs)
+					{
+						starpu_insert_task(&cl22,
+								   STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
+								   STARPU_R, sdataki,
+								   STARPU_R, sdatakj,
+								   STARPU_RW, sdataij,
+								   STARPU_CHECKPOINT, checkpoints,
+								   0);
+						
+						set_hypervisor_conf(END_BENCH, checkpoints++);
+					}
+					
+					else
+						starpu_insert_task(&cl22,
+								   STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
+								   STARPU_R, sdataki,
+								   STARPU_R, sdatakj,
+								   STARPU_RW, sdataij,
+								   0);
+					
+                                }
+			}
+		}
+	}
+
+	starpu_task_wait_for_all();
+
+	starpu_data_unpartition(dataA, 0);
+
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+	unsigned long n = starpu_matrix_get_nx(dataA);
+
+	double flop = (1.0f*n*n*n)/3.0f;
+
+	if(with_ctxs || with_noctxs || chole1 || chole2)
+		update_sched_ctx_timing_results((flop/timing/1000.0f), (timing/1000000.0f));
+	else
+	{
+		FPRINTF(stderr, "Computation took (in ms)\n");
+		FPRINTF(stdout, "%2.2f\n", timing/1000);
+	
+		FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	}
+}
+
+static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
+{
+	starpu_data_handle dataA;
+
+	/* monitor and partition the A matrix into blocks :
+	 * one block is now determined by 2 unsigned (i,j) */
+	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
+
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	starpu_data_map_filters(dataA, 2, &f, &f2);
+
+	_cholesky(dataA, nblocks);
+}
+
+static void execute_cholesky(unsigned size, unsigned nblocks)
+{
+	float *mat;
+	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
+
+	unsigned i,j;
+	for (i = 0; i < size; i++)
+	{
+		for (j = 0; j < size; j++)
+		{
+			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
+		}
+	}
+
+/* #define PRINT_OUTPUT */
+#ifdef PRINT_OUTPUT
+	FPRINTF(stdout, "Input :\n");
+
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+			}
+			else {
+				FPRINTF(stdout, ".\t");
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+	cholesky(mat, size, size, nblocks);
+
+#ifdef PRINT_OUTPUT
+	FPRINTF(stdout, "Results :\n");
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+			}
+			else {
+				FPRINTF(stdout, ".\t");
+				mat[j+i*size] = 0.0f; /* debug */
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+	if (check)
+	{
+		FPRINTF(stderr, "compute explicit LLt ...\n");
+		for (j = 0; j < size; j++)
+		{
+			for (i = 0; i < size; i++)
+			{
+				if (i > j) {
+					mat[j+i*size] = 0.0f; /* debug */
+				}
+			}
+		}
+		float *test_mat = malloc(size*size*sizeof(float));
+		STARPU_ASSERT(test_mat);
+	
+		SSYRK("L", "N", size, size, 1.0f,
+					mat, size, 0.0f, test_mat, size);
+	
+		FPRINTF(stderr, "comparing results ...\n");
+#ifdef PRINT_OUTPUT
+		for (j = 0; j < size; j++)
+		{
+			for (i = 0; i < size; i++)
+			{
+				if (i <= j) {
+					FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
+				}
+				else {
+					FPRINTF(stdout, ".\t");
+				}
+			}
+			FPRINTF(stdout, "\n");
+		}
+#endif
+	
+		for (j = 0; j < size; j++)
+		{
+			for (i = 0; i < size; i++)
+			{
+				if (i <= j) {
+	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+	                                float err = abs(test_mat[j +i*size] - orig);
+	                                if (err > 0.00001) {
+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", i, j, test_mat[j +i*size], orig, err);
+	                                        assert(0);
+	                                }
+	                        }
+			}
+	        }
+	}
+
+}
+
+int main(int argc, char **argv)
+{
+	/* create a simple definite positive symetric matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	parse_args(argc, argv);
+
+	if(with_ctxs || with_noctxs || chole1 || chole2)
+		parse_args_ctx(argc, argv);
+
+	starpu_init(NULL);
+
+	starpu_helper_cublas_init();
+
+	if(with_ctxs)
+	{
+		construct_contexts(execute_cholesky);
+		start_2benchs(execute_cholesky);
+	}
+	else if(with_noctxs)
+		start_2benchs(execute_cholesky);
+	else if(chole1)
+		start_1stbench(execute_cholesky);
+	else if(chole2)
+		start_2ndbench(execute_cholesky);
+	else
+		execute_cholesky(size, nblocks);
+
+	starpu_helper_cublas_shutdown();
+	starpu_shutdown();
+
+	if(with_ctxs)
+		end_contexts();
+	return 0;
+}

+ 247 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky_kernels.c

@@ -0,0 +1,247 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_config.h>
+#include "cholesky.h"
+//#include "../common/blas.h"
+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#ifdef STARPU_HAVE_MAGMA
+#include "magma.h"
+#include "magma_lapack.h"
+#endif
+#endif
+
+/*
+ *   U22 
+ */
+
+static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)
+{
+	/* printf("22\n"); */
+	float *left 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	float *right 	= (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+	float *center 	= (float *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	unsigned dx = STARPU_MATRIX_GET_NY(descr[2]);
+	unsigned dy = STARPU_MATRIX_GET_NX(descr[2]);
+	unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);
+
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);
+	unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);
+
+	if (s == 0)
+	{
+		int worker_size = starpu_combined_worker_get_size();
+
+		if (worker_size == 1)
+		{
+			/* Sequential CPU kernel */
+			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
+				right, ld12, 1.0f, center, ld22);
+		}
+		else {
+			/* Parallel CPU kernel */
+			int rank = starpu_combined_worker_get_rank();
+
+			int block_size = (dx + worker_size - 1)/worker_size;
+			int new_dx = STARPU_MIN(dx, block_size*(rank+1)) - block_size*rank;
+			
+			float *new_left = &left[block_size*rank];
+			float *new_center = &center[block_size*rank];
+
+			SGEMM("N", "T", dy, new_dx, dz, -1.0f, new_left, ld21, 
+				right, ld12, 1.0f, new_center, ld22);
+		}
+	}
+	else
+	{
+		/* CUDA kernel */
+#ifdef STARPU_USE_CUDA
+		cublasSgemm('n', 't', dy, dx, dz, 
+				-1.0f, left, ld21, right, ld12, 
+				 1.0f, center, ld22);
+		cudaStreamSynchronize(starpu_cuda_get_local_stream());
+#endif
+
+	}
+}
+
+void chol_cpu_codelet_update_u22(void *descr[], void *_args)
+{
+	chol_common_cpu_codelet_update_u22(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u22(void *descr[], void *_args)
+{
+	chol_common_cpu_codelet_update_u22(descr, 1, _args);
+}
+#endif /* STARPU_USE_CUDA */
+
+/* 
+ * U21
+ */
+
+static inline void chol_common_codelet_update_u21(void *descr[], int s, __attribute__((unused)) void *_args)
+{
+/*	printf("21\n"); */
+	float *sub11;
+	float *sub21;
+
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	sub21 = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+
+	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
+
+	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
+	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
+
+	switch (s) {
+		case 0:
+			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+void chol_cpu_codelet_update_u21(void *descr[], void *_args)
+{
+	 chol_common_codelet_update_u21(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u21(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u21(descr, 1, _args);
+}
+#endif 
+
+/*
+ *	U11
+ */
+
+static inline void chol_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args) 
+{
+/*	printf("11\n"); */
+	float *sub11;
+
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); 
+
+	unsigned nx = STARPU_MATRIX_GET_NY(descr[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(descr[0]);
+
+	unsigned z;
+
+	switch (s) {
+		case 0:
+
+			/*
+			 *	- alpha 11 <- lambda 11 = sqrt(alpha11)
+			 *	- alpha 21 <- l 21	= alpha 21 / lambda 11
+			 *	- A22 <- A22 - l21 trans(l21)
+			 */
+
+			for (z = 0; z < nx; z++)
+			{
+				float lambda11;
+				lambda11 = sqrt(sub11[z+z*ld]);
+				sub11[z+z*ld] = lambda11;
+
+				STARPU_ASSERT(lambda11 != 0.0f);
+		
+				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+		
+				SSYR("L", nx - z - 1, -1.0f, 
+							&sub11[(z+1)+z*ld], 1,
+							&sub11[(z+1)+(z+1)*ld], ld);
+			}
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+#ifdef STARPU_HAVE_MAGMA
+			{
+			int ret;
+			int info;
+			ret = magma_spotrf_gpu('L', nx, sub11, ld, &info);
+			if (ret != MAGMA_SUCCESS) {
+				fprintf(stderr, "Error in Magma: %d\n", ret);
+				STARPU_ABORT();
+			}
+			cudaError_t cures = cudaThreadSynchronize();
+			STARPU_ASSERT(!cures);
+			}
+#else
+			{
+
+			float *lambda11;
+			cudaHostAlloc((void **)&lambda11, sizeof(float), 0);
+
+			for (z = 0; z < nx; z++)
+			{
+				
+				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+				STARPU_ASSERT(*lambda11 != 0.0f);
+				
+				*lambda11 = sqrt(*lambda11);
+
+/*				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float)); */
+				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+
+				cublasSscal(nx - z - 1, 1.0f/(*lambda11), &sub11[(z+1)+z*ld], 1);
+
+				cublasSsyr('U', nx - z - 1, -1.0f,
+							&sub11[(z+1)+z*ld], 1,
+							&sub11[(z+1)+(z+1)*ld], ld);
+			}
+
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaFreeHost(lambda11);
+			}
+#endif
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+
+void chol_cpu_codelet_update_u11(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u11(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u11(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u11(descr, 1, _args);
+}
+#endif/* STARPU_USE_CUDA */

+ 153 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky_models.c

@@ -0,0 +1,153 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * As a convention, in that file, descr[0] is represented by A,
+ * 				  descr[1] is B ...
+ */
+
+/*
+ *	Number of flops of Gemm 
+ */
+
+#include <starpu.h>
+
+/* #define USE_PERTURBATION	1 */
+
+#ifdef USE_PERTURBATION
+#define PERTURBATE(a)	((starpu_drand48()*2.0f*(AMPL) + 1.0f - (AMPL))*(a))
+#else
+#define PERTURBATE(a)	(a)
+#endif
+
+static double cpu_chol_task_11_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n;
+
+	n = starpu_matrix_get_nx(descr[0].handle);
+
+	double cost = (((double)(n)*n*n)/1000.0f*0.894/0.79176);
+
+#ifdef STARPU_MODEL_DEBUG
+	FPRINTF(stdout, "cpu_chol_task_11_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+static double cuda_chol_task_11_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n;
+
+	n = starpu_matrix_get_nx(descr[0].handle);
+
+	double cost = (((double)(n)*n*n)/50.0f/10.75/5.088633/0.9883);
+
+#ifdef STARPU_MODEL_DEBUG
+	FPRINTF(stdout, "cuda_chol_task_11_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+static double cpu_chol_task_21_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n;
+
+	n = starpu_matrix_get_nx(descr[0].handle);
+
+	double cost = (((double)(n)*n*n)/7706.674/0.95/0.9965);
+
+#ifdef STARPU_MODEL_DEBUG
+	FPRINTF(stdout, "cpu_chol_task_21_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+static double cuda_chol_task_21_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n;
+
+	n = starpu_matrix_get_nx(descr[0].handle);
+
+	double cost = (((double)(n)*n*n)/50.0f/10.75/87.29520);
+
+#ifdef STARPU_MODEL_DEBUG
+	FPRINTF(stdout, "cuda_chol_task_21_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+static double cpu_chol_task_22_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n;
+
+	n = starpu_matrix_get_nx(descr[0].handle);
+
+	double cost = (((double)(n)*n*n)/50.0f/10.75/8.0760);
+
+#ifdef STARPU_MODEL_DEBUG
+	FPRINTF(stdout, "cpu_chol_task_22_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n;
+
+	n = starpu_matrix_get_nx(descr[0].handle);
+
+	double cost = (((double)(n)*n*n)/50.0f/10.75/76.30666);
+
+#ifdef STARPU_MODEL_DEBUG
+	FPRINTF(stdout, "cuda_chol_task_22_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+struct starpu_perfmodel_t chol_model_11 = {
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_11_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_11_cost }
+	},
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_11"
+};
+
+struct starpu_perfmodel_t chol_model_21 = {
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_21_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_21_cost }
+	},
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_21"
+};
+
+struct starpu_perfmodel_t chol_model_22 = {
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_22_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_22_cost }
+	},
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_22"
+};

+ 368 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky_tag.c

@@ -0,0 +1,368 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "cholesky.h"
+
+/*
+ *	Some useful functions
+ */
+
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *task = starpu_task_create();
+		task->cl_arg = NULL;
+		task->use_tag = 1;
+		task->tag_id = id;
+
+	return task;
+}
+
+/*
+ *	Create the codelets
+ */
+
+static starpu_codelet cl11 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_func = chol_cpu_codelet_update_u11,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u11,
+#endif
+	.nbuffers = 1,
+	.model = &chol_model_11
+};
+
+static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k)
+{
+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
+
+	struct starpu_task *task = create_task(TAG11(k));
+	
+	task->cl = &cl11;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k);
+	task->buffers[0].mode = STARPU_RW;
+
+	/* this is an important task */
+	if (!noprio)
+		task->priority = STARPU_MAX_PRIO;
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
+	}
+
+	return task;
+}
+
+static starpu_codelet cl21 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_func = chol_cpu_codelet_update_u21,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u21,
+#endif
+	.nbuffers = 2,
+	.model = &chol_model_21
+};
+
+static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j)
+{
+	struct starpu_task *task = create_task(TAG21(k, j));
+
+	task->cl = &cl21;	
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k); 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
+	task->buffers[1].mode = STARPU_RW;
+
+	if (!noprio && (j == k+1)) {
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
+	}
+	else {
+		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
+	}
+
+	int ret = starpu_task_submit(task);
+        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+                FPRINTF(stderr, "No worker may execute this task\n");
+                exit(0);
+        }
+
+}
+
+static starpu_codelet cl22 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_func = chol_cpu_codelet_update_u22,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u22,
+#endif
+	.nbuffers = 3,
+	.model = &chol_model_22
+};
+
+static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
+{
+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
+
+	struct starpu_task *task = create_task(TAG22(k, i, j));
+
+	task->cl = &cl22;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, i); 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
+	task->buffers[1].mode = STARPU_R;
+	task->buffers[2].handle = starpu_data_get_sub_data(dataA, 2, i, j); 
+	task->buffers[2].mode = STARPU_RW;
+
+	if (!noprio && (i == k + 1) && (j == k +1) ) {
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
+	}
+	else {
+		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
+	}
+
+	int ret = starpu_task_submit(task);
+        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+                FPRINTF(stderr, "No worker may execute this task\n");
+                exit(0);
+        }
+}
+
+
+
+/*
+ *	code to bootstrap the factorization 
+ *	and construct the DAG
+ */
+
+static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
+{
+	struct timeval start;
+	struct timeval end;
+
+	struct starpu_task *entry_task = NULL;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	gettimeofday(&start, NULL);
+
+	for (k = 0; k < nblocks; k++)
+	{
+		struct starpu_task *task = create_task_11(dataA, k);
+		/* we defer the launch of the first task */
+		if (k == 0) {
+			entry_task = task;
+		}
+		else {
+			int ret = starpu_task_submit(task);
+                        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+                                FPRINTF(stderr, "No worker may execute this task\n");
+                                exit(0);
+                        }
+
+		}
+		
+		for (j = k+1; j<nblocks; j++)
+		{
+			create_task_21(dataA, k, j);
+
+			for (i = k+1; i<nblocks; i++)
+			{
+				if (i <= j)
+					create_task_22(dataA, k, i, j);
+			}
+		}
+	}
+
+	/* schedule the codelet */
+	int ret = starpu_task_submit(entry_task);
+        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+                FPRINTF(stderr, "No worker may execute this task\n");
+                exit(0);
+        }
+
+
+	/* stall the application until the end of computations */
+	starpu_tag_wait(TAG11(nblocks-1));
+
+	starpu_data_unpartition(dataA, 0);
+
+	gettimeofday(&end, NULL);
+
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
+
+	unsigned n = starpu_matrix_get_nx(dataA);
+
+	double flop = (1.0f*n*n*n)/3.0f;
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+}
+
+static void initialize_system(float **A, unsigned dim, unsigned pinned)
+{
+	starpu_init(NULL);
+	
+	starpu_helper_cublas_init();
+
+	if (pinned)
+	{
+		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
+	} 
+	else {
+		*A = malloc(dim*dim*sizeof(float));
+	}
+}
+
+static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
+{
+	starpu_data_handle dataA;
+
+	/* monitor and partition the A matrix into blocks :
+	 * one block is now determined by 2 unsigned (i,j) */
+	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
+
+	starpu_data_set_sequential_consistency_flag(dataA, 0);
+
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	starpu_data_map_filters(dataA, 2, &f, &f2);
+
+	_cholesky(dataA, nblocks);
+
+	starpu_helper_cublas_shutdown();
+
+	starpu_shutdown();
+}
+
+int main(int argc, char **argv)
+{
+	/* create a simple definite positive symetric matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	parse_args(argc, argv);
+
+	float *mat;
+
+	mat = malloc(size*size*sizeof(float));
+	initialize_system(&mat, size, pinned);
+
+	unsigned i,j;
+	for (i = 0; i < size; i++)
+	{
+		for (j = 0; j < size; j++)
+		{
+			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
+		}
+	}
+
+
+#ifdef CHECK_OUTPUT
+	FPRINTF(stdout, "Input :\n");
+
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+			}
+			else {
+				FPRINTF(stdout, ".\t");
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+
+	cholesky(mat, size, size, nblocks);
+
+#ifdef CHECK_OUTPUT
+	FPRINTF(stdout, "Results :\n");
+
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+			}
+			else {
+				FPRINTF(stdout, ".\t");
+				mat[j+i*size] = 0.0f; /* debug */
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+
+	FPRINTF(stderr, "compute explicit LLt ...\n");
+	float *test_mat = malloc(size*size*sizeof(float));
+	STARPU_ASSERT(test_mat);
+
+	SSYRK("L", "N", size, size, 1.0f, 
+				mat, size, 0.0f, test_mat, size);
+
+	FPRINTF(stderr, "comparing results ...\n");
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
+			}
+			else {
+				FPRINTF(stdout, ".\t");
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+	return 0;
+}

+ 312 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky_tile_tag.c

@@ -0,0 +1,312 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "cholesky.h"
+
+/* A [ y ] [ x ] */
+float *A[NMAXBLOCKS][NMAXBLOCKS];
+starpu_data_handle A_state[NMAXBLOCKS][NMAXBLOCKS];
+
+/*
+ *	Some useful functions
+ */
+
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *task = starpu_task_create();
+		task->cl_arg = NULL;
+		task->use_tag = 1;
+		task->tag_id = id;
+
+	return task;
+}
+
+/*
+ *	Create the codelets
+ */
+
+static starpu_codelet cl11 =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+	.cpu_func = chol_cpu_codelet_update_u11,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u11,
+#endif
+#ifdef STARPU_USE_GORDON
+#ifdef SPU_FUNC_POTRF
+	.gordon_func = SPU_FUNC_POTRF,
+#else
+#warning SPU_FUNC_POTRF is not available
+#endif
+#endif
+	.nbuffers = 1,
+	.model = &chol_model_11
+};
+
+static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
+{
+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
+
+	struct starpu_task *task = create_task(TAG11(k));
+	
+	task->cl = &cl11;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = A_state[k][k];
+	task->buffers[0].mode = STARPU_RW;
+
+	/* this is an important task */
+	task->priority = STARPU_MAX_PRIO;
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
+	}
+
+	return task;
+}
+
+static starpu_codelet cl21 =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+	.cpu_func = chol_cpu_codelet_update_u21,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u21,
+#endif
+#ifdef STARPU_USE_GORDON
+#ifdef SPU_FUNC_STRSM
+	.gordon_func = SPU_FUNC_STRSM,
+#else
+#warning SPU_FUNC_STRSM is not available
+#endif
+#endif
+	.nbuffers = 2,
+	.model = &chol_model_21
+};
+
+static void create_task_21(unsigned k, unsigned j)
+{
+	struct starpu_task *task = create_task(TAG21(k, j));
+
+	task->cl = &cl21;	
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = A_state[k][k]; 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = A_state[j][k]; 
+	task->buffers[1].mode = STARPU_RW;
+
+	if (j == k+1) {
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
+	}
+	else {
+		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
+	}
+
+	starpu_task_submit(task);
+}
+
+static starpu_codelet cl22 =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+	.cpu_func = chol_cpu_codelet_update_u22,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u22,
+#endif
+#ifdef STARPU_USE_GORDON
+#ifdef SPU_FUNC_SGEMM
+	.gordon_func = SPU_FUNC_SGEMM,
+#else
+#warning SPU_FUNC_SGEMM is not available
+#endif
+#endif
+	.nbuffers = 3,
+	.model = &chol_model_22
+};
+
+static void create_task_22(unsigned k, unsigned i, unsigned j)
+{
+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
+
+	struct starpu_task *task = create_task(TAG22(k, i, j));
+
+	task->cl = &cl22;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = A_state[i][k]; 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = A_state[j][k]; 
+	task->buffers[1].mode = STARPU_R;
+	task->buffers[2].handle = A_state[j][i]; 
+	task->buffers[2].mode = STARPU_RW;
+
+	if ( (i == k + 1) && (j == k +1) ) {
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
+	}
+	else {
+		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
+	}
+
+	starpu_task_submit(task);
+}
+
+
+
+/*
+ *	code to bootstrap the factorization 
+ *	and construct the DAG
+ */
+
+static void cholesky_no_stride(void)
+{
+	struct timeval start;
+	struct timeval end;
+
+	struct starpu_task *entry_task = NULL;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	for (k = 0; k < nblocks; k++)
+	{
+		struct starpu_task *task = create_task_11(k, nblocks);
+		/* we defer the launch of the first task */
+		if (k == 0) {
+			entry_task = task;
+		}
+		else {
+			starpu_task_submit(task);
+		}
+		
+		for (j = k+1; j<nblocks; j++)
+		{
+			create_task_21(k, j);
+
+			for (i = k+1; i<nblocks; i++)
+			{
+				if (i <= j)
+					create_task_22(k, i, j);
+			}
+		}
+	}
+
+	/* schedule the codelet */
+	gettimeofday(&start, NULL);
+	starpu_task_submit(entry_task);
+
+	/* stall the application until the end of computations */
+	starpu_tag_wait(TAG11(nblocks-1));
+
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
+
+	double flop = (1.0f*size*size*size)/3.0f;
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+}
+
+int main(int argc, char **argv)
+{
+	unsigned x, y;
+	unsigned i, j;
+
+	parse_args(argc, argv);
+	assert(nblocks <= NMAXBLOCKS);
+
+	FPRINTF(stderr, "BLOCK SIZE = %d\n", size / nblocks);
+
+	starpu_init(NULL);
+
+	/* Disable sequential consistency */
+	starpu_data_set_default_sequential_consistency_flag(0);
+
+	starpu_helper_cublas_init();
+
+	for (y = 0; y < nblocks; y++)
+	for (x = 0; x < nblocks; x++)
+	{
+		if (x <= y) {
+			A[y][x] = malloc(BLOCKSIZE*BLOCKSIZE*sizeof(float));
+			assert(A[y][x]);
+		}
+	}
+
+
+	for (y = 0; y < nblocks; y++)
+	for (x = 0; x < nblocks; x++)
+	{
+		if (x <= y) {
+#ifdef STARPU_HAVE_POSIX_MEMALIGN
+			posix_memalign((void **)&A[y][x], 128, BLOCKSIZE*BLOCKSIZE*sizeof(float));
+#else
+			A[y][x] = malloc(BLOCKSIZE*BLOCKSIZE*sizeof(float));
+#endif
+			assert(A[y][x]);
+		}
+	}
+
+	/* create a simple definite positive symetric matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1) ( + n In to make is stable ) 
+	 * */
+	for (y = 0; y < nblocks; y++)
+	for (x = 0; x < nblocks; x++)
+	if (x <= y) {
+		for (i = 0; i < BLOCKSIZE; i++)
+		for (j = 0; j < BLOCKSIZE; j++)
+		{
+			A[y][x][i*BLOCKSIZE + j] =
+				(float)(1.0f/((float) (1.0+(x*BLOCKSIZE+i)+(y*BLOCKSIZE+j))));
+
+			/* make it a little more numerically stable ... ;) */
+			if ((x == y) && (i == j))
+				A[y][x][i*BLOCKSIZE + j] += (float)(2*size);
+		}
+	}
+
+
+
+	for (y = 0; y < nblocks; y++)
+	for (x = 0; x < nblocks; x++)
+	{
+		if (x <= y) {
+			starpu_matrix_data_register(&A_state[y][x], 0, (uintptr_t)A[y][x], 
+				BLOCKSIZE, BLOCKSIZE, BLOCKSIZE, sizeof(float));
+		}
+	}
+
+	cholesky_no_stride();
+
+	starpu_helper_cublas_shutdown();
+
+	starpu_shutdown();
+	return 0;
+}
+
+

File diff suppressed because it is too large
+ 1 - 1
sched_ctx_hypervisor/examples/cholesky_2ctxs/cholesky_2ctxs


+ 355 - 0
sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c

@@ -0,0 +1,355 @@
+#include "sched_ctx_utils.h"
+#include <starpu.h>
+#include "sched_ctx_hypervisor.h"
+
+unsigned size1;
+unsigned size2;
+unsigned nblocks1;
+unsigned nblocks2;
+unsigned cpu1;
+unsigned cpu2;
+unsigned gpu;
+unsigned gpu1;
+unsigned gpu2;
+
+typedef struct {
+	unsigned id;
+	unsigned ctx;
+	int the_other_ctx;
+	int *procs;
+	int nprocs;
+	void (*bench)(unsigned, unsigned);
+	unsigned size;
+	unsigned nblocks;
+} params;
+
+typedef struct {
+	double flops;
+	double avg_timing;
+} retvals;
+
+#define NSAMPLES 3
+int first = 1;
+pthread_mutex_t mut;
+retvals rv[2];
+params p1, p2;
+
+struct sched_ctx_hypervisor_reply reply1[NSAMPLES*2*2];
+struct sched_ctx_hypervisor_reply reply2[NSAMPLES*2*2];
+
+pthread_key_t key;
+
+void init()
+{
+	size1 = 4*1024;
+	size2 = 4*1024;
+	nblocks1 = 16;
+	nblocks2 = 16;
+	cpu1 = 0;
+	cpu2 = 0;
+	gpu = 0;
+	gpu1 = 0;
+	gpu2 = 0;
+
+	rv[0].flops = 0.0;
+	rv[1].flops = 0.0;
+	rv[1].avg_timing = 0.0;
+	rv[1].avg_timing = 0.0;
+
+	p1.ctx = 0;
+	p2.ctx = 0;
+
+	p1.id = 0;
+	p2.id = 1;
+	pthread_key_create(&key, NULL);
+}
+
+void update_sched_ctx_timing_results(double flops, double avg_timing)
+{
+	unsigned *id = pthread_getspecific(key);
+	rv[*id].flops += flops;
+	rv[*id].avg_timing += avg_timing;
+}
+
+void* start_bench(void *val){
+	params *p = (params*)val;
+	int i;
+
+	pthread_setspecific(key, &p->id);
+
+	if(p->ctx != 0)
+		starpu_set_sched_ctx(&p->ctx);
+
+	for(i = 0; i < NSAMPLES; i++)
+		p->bench(p->size, p->nblocks);
+
+	if(p->ctx != 0)
+	{
+		pthread_mutex_lock(&mut);
+		if(first){
+			sched_ctx_hypervisor_ignore_ctx(p->ctx);
+			starpu_delete_sched_ctx(p->ctx, p->the_other_ctx);
+		}
+		
+		first = 0;
+		pthread_mutex_unlock(&mut);
+	}
+
+	rv[p->id].flops /= NSAMPLES;
+	rv[p->id].avg_timing /= NSAMPLES;
+	
+}
+
+void start_2benchs(void (*bench)(unsigned, unsigned))
+{
+	p1.bench = bench;
+	p1.size = size1;
+	p1.nblocks = nblocks1;
+	
+	p2.bench = bench;
+	p2.size = size2;
+	p2.nblocks = nblocks2;
+	
+	pthread_t tid[2];
+	pthread_mutex_init(&mut, NULL);
+
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	pthread_create(&tid[0], NULL, (void*)start_bench, (void*)&p1);
+	pthread_create(&tid[1], NULL, (void*)start_bench, (void*)&p2);
+ 
+	pthread_join(tid[0], NULL);
+	pthread_join(tid[1], NULL);
+
+	gettimeofday(&end, NULL);
+
+	pthread_mutex_destroy(&mut);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	timing /= 1000000;
+
+	printf("%2.2f %2.2f ", rv[0].flops, rv[1].flops);
+	printf("%2.2f %2.2f %2.2f\n", rv[0].avg_timing, rv[1].avg_timing, timing);
+
+}
+
+void start_1stbench(void (*bench)(unsigned, unsigned))
+{
+	p1.bench = bench;
+	p1.size = size1;
+	p1.nblocks = nblocks1;
+	
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	start_bench((void*)&p1);
+
+	gettimeofday(&end, NULL);
+
+	pthread_mutex_destroy(&mut);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	timing /= 1000000;
+
+	printf("%2.2f ", rv[0].flops);
+	printf("%2.2f %2.2f\n", rv[0].avg_timing, timing);
+}
+
+void start_2ndbench(void (*bench)(unsigned, unsigned))
+{
+	p2.bench = bench;
+	p2.size = size2;
+	p2.nblocks = nblocks2;
+	
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	start_bench((void*)&p2);
+
+	gettimeofday(&end, NULL);
+
+	pthread_mutex_destroy(&mut);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	timing /= 1000000;
+
+	printf("%2.2f ", rv[1].flops);
+	printf("%2.2f %2.2f\n", rv[1].avg_timing, timing);
+}
+
+void construct_contexts(void (*bench)(unsigned, unsigned))
+{
+	struct starpu_sched_ctx_hypervisor_criteria *criteria = sched_ctx_hypervisor_init(SIMPLE_POLICY);
+	int nprocs1 = cpu1 + gpu + gpu1;
+	int nprocs2 = cpu2 + gpu + gpu2;
+	unsigned n_all_gpus = gpu + gpu1 + gpu2;
+
+
+	int i;
+	int k = 0;
+
+	p1.procs = (int*)malloc(nprocs1*sizeof(int));
+
+	for(i = 0; i < gpu; i++)
+		p1.procs[k++] = i;
+
+	for(i = gpu; i < gpu + gpu1; i++)
+		p1.procs[k++] = i;
+
+
+	for(i = n_all_gpus; i < n_all_gpus + cpu1; i++)
+		p1.procs[k++] = i;
+
+
+	p1.ctx = starpu_create_sched_ctx_with_criteria("heft", p1.procs, nprocs1, "sched_ctx1", criteria);
+	p2.the_other_ctx = (int)p1.ctx;
+	p1.nprocs = nprocs1;
+	sched_ctx_hypervisor_handle_ctx(p1.ctx);
+	
+	sched_ctx_hypervisor_ioctl(p1.ctx,
+				   HYPERVISOR_MAX_IDLE, p1.procs, p1.nprocs, 100000.0,
+				   HYPERVISOR_MAX_IDLE, p1.procs, gpu+gpu1, 100000000.0,
+				   HYPERVISOR_MIN_WORKING, p1.procs, p1.nprocs, 200.0,
+//				   HYPERVISOR_PRIORITY, p1.procs, p1.nprocs, 1,
+				   HYPERVISOR_PRIORITY, p1.procs, gpu+gpu1, 2,
+				   HYPERVISOR_MIN_PROCS, 1,
+				   HYPERVISOR_MAX_PROCS, 11,
+				   HYPERVISOR_GRANULARITY, 4,
+				   HYPERVISOR_FIXED_PROCS, p1.procs, gpu,
+				   HYPERVISOR_MIN_TASKS, 10000,
+				   HYPERVISOR_NEW_WORKERS_MAX_IDLE, 1000000.0,
+				   NULL);
+
+	k = 0;
+	p2.procs = (int*)malloc(nprocs2*sizeof(int));
+
+	for(i = 0; i < gpu; i++)
+		p2.procs[k++] = i;
+
+	for(i = gpu + gpu1; i < gpu + gpu1 + gpu2; i++)
+		p2.procs[k++] = i;
+
+	for(i = n_all_gpus  + cpu1; i < n_all_gpus + cpu1 + cpu2; i++)
+		p2.procs[k++] = i;
+
+	p2.ctx = starpu_create_sched_ctx_with_criteria("heft", p2.procs, nprocs2, "sched_ctx2", criteria);
+	p1.the_other_ctx = (int)p2.ctx;
+	p2.nprocs = nprocs2;
+	sched_ctx_hypervisor_handle_ctx(p2.ctx);
+	
+	sched_ctx_hypervisor_ioctl(p2.ctx,
+				   HYPERVISOR_MAX_IDLE, p2.procs, p2.nprocs, 100000.0,
+				   HYPERVISOR_MAX_IDLE, p2.procs, gpu+gpu2, 10000000000.0,
+				   HYPERVISOR_MIN_WORKING, p2.procs, p2.nprocs, 200.0,
+//				   HYPERVISOR_PRIORITY, p2.procs, p2.nprocs, 1,
+				   HYPERVISOR_PRIORITY, p2.procs, gpu+gpu2, 2,
+				   HYPERVISOR_MIN_PROCS, 1,
+				   HYPERVISOR_MAX_PROCS, 11,
+				   HYPERVISOR_GRANULARITY, 3,
+				   HYPERVISOR_FIXED_PROCS, p2.procs, gpu,
+				   HYPERVISOR_MIN_TASKS, 10000,
+				   HYPERVISOR_NEW_WORKERS_MAX_IDLE, 100000.0,
+				   NULL);
+}
+
+void set_hypervisor_conf(int event, int checkpoint)
+{
+	unsigned *id = pthread_getspecific(key);
+	pthread_mutex_lock(&mut);
+	int reset_conf = first;
+	pthread_mutex_unlock(&mut);
+
+
+	if(*id == 1 && reset_conf)
+	{
+		double  max_idle_time_big = 0, max_idle_time_small;
+		if(event == START_BENCH)
+		{
+			max_idle_time_big = 1000.0;
+			max_idle_time_small = 1000000.0;
+		}
+		else
+		{
+			max_idle_time_big = 10000000.0;
+			max_idle_time_small = 1000.0;
+
+		}
+ 		
+
+		sched_ctx_hypervisor_advise(p2.ctx, p2.procs, p2.nprocs, &reply2[checkpoint]);
+		if(reply2[checkpoint].procs)
+			sched_ctx_hypervisor_ioctl(p2.ctx,
+						   HYPERVISOR_MAX_IDLE, reply2[checkpoint].procs, reply2[checkpoint].nprocs, max_idle_time_small,
+						   HYPERVISOR_TIME_TO_APPLY, checkpoint,
+						   HYPERVISOR_GRANULARITY, 1,
+						   NULL);
+	}
+}
+
+void end_contexts()
+{
+	free(p1.procs);
+	free(p2.procs);
+	sched_ctx_hypervisor_shutdown();
+}
+
+void parse_args_ctx(int argc, char **argv)
+{
+	init();
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-size1") == 0) {
+			char *argptr;
+			size1 = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks1") == 0) {
+			char *argptr;
+			nblocks1 = strtol(argv[++i], &argptr, 10);
+		}
+		
+		if (strcmp(argv[i], "-size2") == 0) {
+			char *argptr;
+			size2 = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks2") == 0) {
+			char *argptr;
+			nblocks2 = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-cpu1") == 0) {
+			char *argptr;
+			cpu1 = strtol(argv[++i], &argptr, 10);
+		}    
+
+		if (strcmp(argv[i], "-cpu2") == 0) {
+			char *argptr;
+			cpu2 = strtol(argv[++i], &argptr, 10);
+		}    
+
+		if (strcmp(argv[i], "-gpu") == 0) {
+			char *argptr;
+			gpu = strtol(argv[++i], &argptr, 10);
+		}    
+
+		if (strcmp(argv[i], "-gpu1") == 0) {
+			char *argptr;
+			gpu1 = strtol(argv[++i], &argptr, 10);
+		}    
+
+		if (strcmp(argv[i], "-gpu2") == 0) {
+			char *argptr;
+			gpu2 = strtol(argv[++i], &argptr, 10);
+		}    
+	}
+}
+

+ 16 - 0
sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.h

@@ -0,0 +1,16 @@
+#include <limits.h>
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+#include <stdlib.h>
+
+#define START_BENCH 0
+#define END_BENCH 1
+
+void parse_args_ctx(int argc, char **argv);
+void update_sched_ctx_timing_results(double gflops, double timing);
+void construct_contexts(void (*bench)(unsigned size, unsigned nblocks));
+void end_contexts(void);
+void start_2benchs(void (*bench)(unsigned size, unsigned nblocks));
+void start_1stbench(void (*bench)(unsigned size, unsigned nblocks));
+void start_2ndbench(void (*bench)(unsigned size, unsigned nblocks));

+ 16 - 1
sched_ctx_hypervisor/include/sched_ctx_hypervisor.h

@@ -1,5 +1,7 @@
 #include <starpu.h>
 #include <../common/config.h>
+#include <../common/htable32.h>
+#include <pthread.h>
 
 /* ioctl properties*/
 #define HYPERVISOR_MAX_IDLE 1
@@ -9,6 +11,15 @@
 #define HYPERVISOR_MAX_PROCS 5
 #define HYPERVISOR_GRANULARITY 6
 #define HYPERVISOR_FIXED_PROCS 7
+#define HYPERVISOR_MIN_TASKS 8
+#define HYPERVISOR_NEW_WORKERS_MAX_IDLE 9
+#define HYPERVISOR_TIME_TO_APPLY 10
+
+struct sched_ctx_hypervisor_reply{
+	int procs[STARPU_NMAXWORKERS];
+	int nprocs;
+};
+pthread_mutex_t act_hypervisor_mutex;
 
 struct starpu_sched_ctx_hypervisor_criteria* sched_ctx_hypervisor_init(int type);
 
@@ -26,6 +37,10 @@ void* sched_ctx_hypervisor_get_data(unsigned sched_ctx);
 
 void sched_ctx_hypervisor_ioctl(unsigned sched_ctx, ...);
 
+void sched_ctx_hypervisor_advise(unsigned sched_ctx, int *workers, int nworkers, struct sched_ctx_hypervisor_reply *reply);
+
+struct sched_ctx_hypervisor_reply* sched_ctx_hypervisor_request(unsigned sched_ctx, int *workers, int nworkers);
+
 /* hypervisor policies */
 #define SIMPLE_POLICY 1
 
@@ -34,6 +49,6 @@ struct hypervisor_policy {
 	void (*deinit)(void);
 	void (*add_sched_ctx)(unsigned sched_ctx);
 	void(*remove_sched_ctx)(unsigned sched_ctx);
-	void (*ioctl)(unsigned sched_ctx, va_list varg_list);
+	void* (*ioctl)(unsigned sched_ctx, va_list varg_list, unsigned later);
 	void (*manage_idle_time)(unsigned req_sched_ctx, int *sched_ctxs, unsigned nsched_ctxs, int worker, double idle_time);
 };

+ 144 - 10
sched_ctx_hypervisor/src/sched_ctx_hypervisor.c

@@ -5,6 +5,9 @@ struct starpu_sched_ctx_hypervisor_criteria* criteria = NULL;
 extern struct hypervisor_policy simple_policy;
 
 static void idle_time_cb(unsigned sched_ctx, int worker, double idle_time);
+static void pushed_task_cb(unsigned sched_ctx, int worker);
+static void poped_task_cb(unsigned sched_ctx, int worker);
+static void post_exec_hook_cb(unsigned sched_ctx, int taskid);
 
 static void _load_hypervisor_policy(int type)
 {
@@ -19,46 +22,55 @@ static void _load_hypervisor_policy(int type)
 		hypervisor.policy.manage_idle_time = simple_policy.manage_idle_time;
 		break;
 	}
-		
 }
 
 struct starpu_sched_ctx_hypervisor_criteria* sched_ctx_hypervisor_init(int type)
 {
-	hypervisor.resize = 1;
 	hypervisor.nsched_ctxs = 0;
-
+	hypervisor.resize = 0;
+	pthread_mutex_init(&act_hypervisor_mutex, NULL);
 	int i;
 	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
 	{
+		hypervisor.configurations[i] = NULL;
 		hypervisor.sched_ctxs[i] = STARPU_NMAX_SCHED_CTXS;
 		hypervisor.sched_ctx_w[i].sched_ctx = STARPU_NMAX_SCHED_CTXS;
 		hypervisor.sched_ctx_w[i].data = NULL;
 		int j;
 		for(j = 0; j < STARPU_NMAXWORKERS; j++)
+		{
 			hypervisor.sched_ctx_w[i].current_idle_time[j] = 0.0;
+			hypervisor.sched_ctx_w[i].tasks[j] = 0;
+			hypervisor.sched_ctx_w[i].poped_tasks[j] = 0;
+		}
 	}
 
-
 	_load_hypervisor_policy(type);
 
 	hypervisor.policy.init();
 	criteria = (struct starpu_sched_ctx_hypervisor_criteria*)malloc(sizeof(struct starpu_sched_ctx_hypervisor_criteria));
 	criteria->idle_time_cb = idle_time_cb;
+	criteria->pushed_task_cb = pushed_task_cb;
+	criteria->poped_task_cb = poped_task_cb;
+	criteria->post_exec_hook_cb = post_exec_hook_cb;
 	return criteria;
 }
 
 void sched_ctx_hypervisor_shutdown(void)
 {
-	hypervisor.policy.deinit();
 	hypervisor.resize = 0;
+	hypervisor.policy.deinit();
 	free(criteria);
+	pthread_mutex_destroy(&act_hypervisor_mutex);
 }
 
 void sched_ctx_hypervisor_handle_ctx(unsigned sched_ctx)
 {	
+	hypervisor.configurations[sched_ctx] = (struct starpu_htbl32_node_s*)malloc(sizeof(struct starpu_htbl32_node_s));
 	hypervisor.policy.add_sched_ctx(sched_ctx);
 	hypervisor.sched_ctx_w[sched_ctx].sched_ctx = sched_ctx;
 	hypervisor.sched_ctxs[hypervisor.nsched_ctxs++] = sched_ctx;
+
 }
 
 static int _get_first_free_sched_ctx(int *sched_ctxs, unsigned nsched_ctxs)
@@ -109,11 +121,18 @@ void sched_ctx_hypervisor_ignore_ctx(unsigned sched_ctx)
 	hypervisor.nsched_ctxs--;
 	hypervisor.sched_ctx_w[sched_ctx].sched_ctx = STARPU_NMAX_SCHED_CTXS;
 	hypervisor.policy.remove_sched_ctx(sched_ctx);
+	free(hypervisor.configurations[sched_ctx]);
 }
 
 void sched_ctx_hypervisor_set_data(unsigned sched_ctx, void *data)
 {
+	pthread_mutex_lock(&act_hypervisor_mutex);
+
+	if(hypervisor.sched_ctx_w[sched_ctx].data != NULL)
+		free(hypervisor.sched_ctx_w[sched_ctx].data);
+
 	hypervisor.sched_ctx_w[sched_ctx].data = data;
+	pthread_mutex_unlock(&act_hypervisor_mutex);
 	return;
 }
 
@@ -127,16 +146,74 @@ void sched_ctx_hypervisor_ioctl(unsigned sched_ctx, ...)
 	va_list varg_list;
 	va_start(varg_list, sched_ctx);
 
-	hypervisor.policy.ioctl(sched_ctx, varg_list);
+	int arg_type;
+	int stop = 0;
+	int task_checkpoint = -1;
+
+	while ((arg_type = va_arg(varg_list, int)) != 0) 
+	{
+		switch(arg_type)
+		{
+		case HYPERVISOR_TIME_TO_APPLY:
+			task_checkpoint = va_arg(varg_list, int);
+			stop = 1;
+			break;
+
+		case HYPERVISOR_MIN_TASKS:
+			hypervisor.min_tasks = va_arg(varg_list, int);
+			break;
+		}
+		if(stop) break;
+	}
+
+	va_end(varg_list);
+	va_start(varg_list, sched_ctx);
+
+	/* hypervisor configuration to be considered later */
+	void *data = hypervisor.policy.ioctl(sched_ctx, varg_list, (task_checkpoint > 0));
+	if(data != NULL)
+		_starpu_htbl_insert_32(&hypervisor.configurations[sched_ctx], (uint32_t)task_checkpoint, data);
+
 	return;
 }
 
 static void _sched_ctx_hypervisor_resize(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int* workers_to_move, unsigned nworkers_to_move)
 {
-	printf("resize ctx %d with %d\n", sender_sched_ctx, workers_to_move[0]);
+	int i;
+	printf("resize ctx %d with", sender_sched_ctx);
+	for(i = 0; i < nworkers_to_move; i++)
+		printf(" %d", workers_to_move[i]);
+	printf("\n");
+
 	starpu_remove_workers_from_sched_ctx(workers_to_move, nworkers_to_move, sender_sched_ctx);
 	starpu_add_workers_to_sched_ctx(workers_to_move, nworkers_to_move, receiver_sched_ctx);
+
+	return;
+}
+
+static int get_ntasks( int *tasks)
+{
+	int ntasks = 0;
+	int j;
+	for(j = 0; j < STARPU_NMAXWORKERS; j++)
+	{
+		ntasks += tasks[j];
+	}
+	return ntasks;
+}
+
+static unsigned check_tasks_of_sched_ctx(unsigned sched_ctx)
+{
+	int ntasks = get_ntasks(hypervisor.sched_ctx_w[sched_ctx].tasks);
 	
+	return ntasks > hypervisor.min_tasks;
+}
+
+void sched_ctx_hypervisor_resize(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int* workers_to_move, unsigned nworkers_to_move)
+{
+	if(hypervisor.resize)
+		_sched_ctx_hypervisor_resize(sender_sched_ctx, receiver_sched_ctx, workers_to_move, nworkers_to_move);
+
 	int i;
 	for(i = 0; i < nworkers_to_move; i++)
 		hypervisor.sched_ctx_w[sender_sched_ctx].current_idle_time[workers_to_move[i]] = 0.0;
@@ -144,14 +221,46 @@ static void _sched_ctx_hypervisor_resize(unsigned sender_sched_ctx, unsigned rec
 	return;
 }
 
-void sched_ctx_hypervisor_resize(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int* workers_to_move, unsigned nworkers_to_move)
+void sched_ctx_hypervisor_advise(unsigned sched_ctx, int *workerids, int nworkers, struct sched_ctx_hypervisor_reply *reply)
 {
-	if(hypervisor.resize)
-		_sched_ctx_hypervisor_resize(sender_sched_ctx, receiver_sched_ctx, workers_to_move, nworkers_to_move);
+	pthread_mutex_lock(&act_hypervisor_mutex);
+	
+	if(hypervisor.sched_ctx_w[sched_ctx].sched_ctx != STARPU_NMAX_SCHED_CTXS)
+	{
+		starpu_add_workers_to_sched_ctx(workerids, nworkers, sched_ctx);
+		
+		struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx);
+		
+		int i = 0;
+		
+		if(workers->init_cursor)
+			workers->init_cursor(workers);
+		
+		while(workers->has_next(workers))
+			reply->procs[i++] = workers->get_next(workers);
+		
+		if(workers->init_cursor)
+			workers->deinit_cursor(workers);
+		
+		reply->nprocs = i;
+		sched_ctx_hypervisor_ioctl(sched_ctx, 
+					   HYPERVISOR_PRIORITY, workerids, nworkers, 1,
+					   NULL);
+		
+	}
 
+	pthread_mutex_unlock(&act_hypervisor_mutex);
 	return;
 }
 
+struct sched_ctx_hypervisor_reply* sched_ctx_hypervisor_request(unsigned sched_ctx, int *workers, int nworkers)
+{
+	if(hypervisor.sched_ctx_w[sched_ctx].sched_ctx != STARPU_NMAX_SCHED_CTXS)
+	{
+	}
+	return NULL;
+}
+
 static void idle_time_cb(unsigned sched_ctx, int worker, double idle_time)
 {
 	hypervisor.sched_ctx_w[sched_ctx].current_idle_time[worker] += idle_time;
@@ -168,3 +277,28 @@ static void working_time_cb(unsigned sched_ctx, int worker, double working_time,
 	return;
 }
 
+
+static void pushed_task_cb(unsigned sched_ctx, int worker)
+{	
+	hypervisor.sched_ctx_w[sched_ctx].tasks[worker]++;
+       
+	int ntasks = get_ntasks(hypervisor.sched_ctx_w[sched_ctx].tasks);
+	
+	hypervisor.resize = (ntasks > hypervisor.min_tasks);
+}
+
+static void poped_task_cb(unsigned sched_ctx, int worker)
+{
+	hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker]++;
+	int npoped_tasks = get_ntasks(hypervisor.sched_ctx_w[sched_ctx].poped_tasks);
+       	int ntasks = get_ntasks(hypervisor.sched_ctx_w[sched_ctx].tasks);
+	hypervisor.resize = ((ntasks - npoped_tasks) > 0);
+}
+
+static void post_exec_hook_cb(unsigned sched_ctx, int task_checkpoint)
+{
+	STARPU_ASSERT(task_checkpoint > 0);
+	void *data = _starpu_htbl_search_32(hypervisor.configurations[sched_ctx], (uint32_t)task_checkpoint);
+	if(data != NULL)	
+		sched_ctx_hypervisor_set_data(sched_ctx, data);
+}

+ 4 - 0
sched_ctx_hypervisor/src/sched_ctx_hypervisor_intern.h

@@ -4,6 +4,8 @@ struct sched_ctx_wrapper {
 	unsigned sched_ctx;
 	void *data;
 	double current_idle_time[STARPU_NMAXWORKERS];
+	int tasks[STARPU_NMAXWORKERS];
+	int poped_tasks[STARPU_NMAXWORKERS];
 };
 
 struct sched_ctx_hypervisor {
@@ -11,7 +13,9 @@ struct sched_ctx_hypervisor {
 	int sched_ctxs[STARPU_NMAX_SCHED_CTXS];
 	unsigned nsched_ctxs;
 	unsigned resize;
+	int min_tasks;
 	struct hypervisor_policy policy;
+	struct starpu_htbl32_node_s *configurations[STARPU_NMAX_SCHED_CTXS];
 };
 
 struct sched_ctx_hypervisor hypervisor;

+ 1 - 1
src/core/jobs.c

@@ -191,7 +191,7 @@ void _starpu_handle_job_termination(starpu_job_t j, unsigned job_is_already_lock
 	}
 
 	/* control task should not execute post_exec_hook */
-	if(task->cl != NULL)
+	if(task->cl != NULL && !task->control_task)
 		_starpu_sched_post_exec_hook(task);
 
 	STARPU_TRACE_TASK_DONE(j);

+ 82 - 87
src/core/sched_ctx.c

@@ -22,119 +22,79 @@ extern struct worker_collection worker_list;
 
 pthread_key_t sched_ctx_key;
 
-struct sched_ctx_info {
-	unsigned sched_ctx_id;
-	struct starpu_sched_ctx *sched_ctx;
-	struct starpu_worker_s *worker;
-};
-
 static unsigned _starpu_get_first_free_sched_ctx(struct starpu_machine_config_s *config);
 static unsigned _starpu_worker_get_first_free_sched_ctx(struct starpu_worker_s *worker);
 
 static unsigned _starpu_worker_get_sched_ctx_id(struct starpu_worker_s *worker, unsigned sched_ctx_id);
 
-static void change_worker_sched_ctx( struct starpu_worker_s *worker, struct starpu_sched_ctx *sched_ctx, unsigned sched_ctx_id)
+static void change_worker_sched_ctx(unsigned sched_ctx_id)
 {
-	if(sched_ctx != NULL)
+	int workerid = starpu_worker_get_id();
+	struct starpu_worker_s *worker = _starpu_get_worker_struct(workerid);
+
+	int worker_sched_ctx_id = _starpu_worker_get_sched_ctx_id(worker, sched_ctx_id);
+	/* if the ctx is not in the worker's list it means the update concerns the addition of ctxs*/
+	if(worker_sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
 	{
+		worker_sched_ctx_id = _starpu_worker_get_first_free_sched_ctx(worker);
+		struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 		/* add context to worker */
-		worker->sched_ctx[sched_ctx_id] = sched_ctx;
+		worker->sched_ctx[worker_sched_ctx_id] = sched_ctx;
 		worker->nsched_ctxs++;
 	}
-	else
+	else 
 	{
 		/* remove context from worker */
-		worker->sched_ctx[sched_ctx_id]->sched_mutex[worker->workerid] = NULL;
-		worker->sched_ctx[sched_ctx_id]->sched_cond[worker->workerid] = NULL;
-		worker->sched_ctx[sched_ctx_id] = NULL;
+		if(worker->sched_ctx[worker_sched_ctx_id]->sched_policy)
+			worker->sched_ctx[worker_sched_ctx_id]->sched_policy->remove_workers(sched_ctx_id, &worker->workerid, 1);
+		worker->sched_ctx[worker_sched_ctx_id] = NULL;
 		worker->nsched_ctxs--;
 	}
-
 }
 
 static void update_workers_func(void *buffers[] __attribute__ ((unused)), void *_args)
 {
-	struct sched_ctx_info *sched_ctx_info_args = (struct sched_ctx_info*)_args;
-	struct starpu_worker_s *worker = sched_ctx_info_args->worker;
-	struct starpu_sched_ctx *current_sched_ctx = sched_ctx_info_args->sched_ctx;
-	unsigned sched_ctx_id = sched_ctx_info_args->sched_ctx_id;
-	change_worker_sched_ctx(worker, current_sched_ctx, sched_ctx_id);
+	int sched_ctx_id = (int)(uintptr_t)_args;
+	change_worker_sched_ctx(sched_ctx_id);
 }
 
 struct starpu_codelet_t sched_ctx_info_cl = {
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cuda_func = update_workers_func,
 	.cpu_func = update_workers_func,
-		.opencl_func = update_workers_func,
+	.opencl_func = update_workers_func,
 	.nbuffers = 0
 };
 
-static void _starpu_update_workers(int *workerids, int nworkers, 
-				   int sched_ctx_id, struct starpu_sched_ctx *sched_ctx)
+static void _starpu_update_workers(int *workerids, int nworkers, int sched_ctx_id)
 {
-	struct starpu_task *tasks[nworkers];
-
 	int i, ret;
 	struct starpu_worker_s *worker[nworkers];
-	struct sched_ctx_info sched_info_args[nworkers];
  	struct starpu_worker_s *curr_worker = _starpu_get_local_worker_key();
 
-	int worker_sched_ctx_id = -1;
 	for(i = 0; i < nworkers; i++)
 	{
 		worker[i] = _starpu_get_worker_struct(workerids[i]);
-		worker_sched_ctx_id = sched_ctx_id == -1  ? 
-			_starpu_worker_get_first_free_sched_ctx(worker[i]) : 
-			_starpu_worker_get_sched_ctx_id(worker[i], sched_ctx_id);
 
 		/* if the current thread requires the resize there is no need
 		   to send itself a message in order to change its
 		   sched_ctx info */
 		if(curr_worker && curr_worker == worker[i])
-		{
-			change_worker_sched_ctx(curr_worker, sched_ctx, worker_sched_ctx_id);
-			tasks[i] = NULL;
-		}
+			change_worker_sched_ctx(sched_ctx_id);
 		else
-		{
-			
-			sched_info_args[i].sched_ctx_id = worker_sched_ctx_id;
-			
-			sched_info_args[i].sched_ctx = sched_ctx;
-			sched_info_args[i].worker = worker[i];
-			
-			tasks[i] = starpu_task_create();
-			tasks[i]->cl = &sched_ctx_info_cl;
-			tasks[i]->cl_arg = &sched_info_args[i];
-			tasks[i]->execute_on_a_specific_worker = 1;
-			tasks[i]->workerid = workerids[i];
-			tasks[i]->detach = 0;
-			tasks[i]->destroy = 0;
-			
-			_starpu_exclude_task_from_dag(tasks[i]);
-			
-			ret = _starpu_task_submit_internal(tasks[i]);
-			if (ret == -ENODEV)
-			{
-				/* if the worker is not able to execute this tasks, we
-				 * don't insist as this means the worker is not
-				 * designated by the "where" bitmap */
-				starpu_task_destroy(tasks[i]);
-				tasks[i] = NULL;
-			}
+		{			
+			worker[i]->tasks[sched_ctx_id] = starpu_task_create();
+			worker[i]->tasks[sched_ctx_id]->cl = &sched_ctx_info_cl;
+			worker[i]->tasks[sched_ctx_id]->cl_arg = (void*)(uintptr_t)sched_ctx_id;
+			worker[i]->tasks[sched_ctx_id]->execute_on_a_specific_worker = 1;
+			worker[i]->tasks[sched_ctx_id]->workerid = workerids[i];
+			worker[i]->tasks[sched_ctx_id]->destroy = 1;
+
+			_starpu_exclude_task_from_dag(worker[i]->tasks[sched_ctx_id]);
 			
+			ret = _starpu_task_submit_internal(worker[i]->tasks[sched_ctx_id]);
 		}		
 	}
-
-	for (i = 0; i < nworkers; i++)
-	{
-		if (tasks[i])
-		{
-			ret = starpu_task_wait(tasks[i]);
-			STARPU_ASSERT(!ret);
-			starpu_task_destroy(tasks[i]);
-		}
-	}
 }
 
 
@@ -210,7 +170,7 @@ struct starpu_sched_ctx*  _starpu_create_sched_ctx(const char *policy_name, int
   
 	PTHREAD_MUTEX_INIT(&sched_ctx->changing_ctx_mutex, NULL);
 
-	sched_ctx->sched_policy = malloc(sizeof(struct starpu_sched_policy_s));
+	sched_ctx->sched_policy = (struct starpu_sched_policy_s*)malloc(sizeof(struct starpu_sched_policy_s));
 	sched_ctx->is_initial_sched = is_initial_sched;
 	sched_ctx->name = sched_name;
 
@@ -253,7 +213,7 @@ unsigned starpu_create_sched_ctx(const char *policy_name, int *workerids,
 {
 	struct starpu_sched_ctx *sched_ctx = _starpu_create_sched_ctx(policy_name, workerids, nworkers_ctx, 0, sched_name);
 
-	_starpu_update_workers(sched_ctx->workers->workerids, sched_ctx->workers->nworkers, -1, sched_ctx);
+	_starpu_update_workers(sched_ctx->workers->workerids, sched_ctx->workers->nworkers, sched_ctx->id);
 	return sched_ctx->id;
 }
 
@@ -273,16 +233,18 @@ unsigned starpu_create_sched_ctx_with_criteria(const char *policy_name, int *wor
 /* free all structures for the context */
 static void free_sched_ctx_mem(struct starpu_sched_ctx *sched_ctx)
 {
-	/* just for debug in order to seg fault if we use these structures after del */
-	sched_ctx->sched_policy = NULL;
-	sched_ctx->sched_mutex = NULL;
-	sched_ctx->sched_cond = NULL;
 	sched_ctx->workers->deinit(sched_ctx->workers);
 
 	free(sched_ctx->workers);
 	free(sched_ctx->sched_policy);
 	free(sched_ctx->sched_mutex);
 	free(sched_ctx->sched_cond);
+
+	sched_ctx->workers = NULL;
+	sched_ctx->sched_policy = NULL;
+	sched_ctx->sched_mutex = NULL;
+	sched_ctx->sched_cond = NULL;
+
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
 	config->topology.nsched_ctxs--;
 	sched_ctx->id = STARPU_NMAX_SCHED_CTXS;
@@ -295,7 +257,7 @@ void starpu_delete_sched_ctx(unsigned sched_ctx_id, unsigned inheritor_sched_ctx
 	struct starpu_sched_ctx *inheritor_sched_ctx = _starpu_get_sched_ctx_struct(inheritor_sched_ctx_id);
 
 	PTHREAD_MUTEX_LOCK(&sched_ctx->changing_ctx_mutex);
-	_starpu_update_workers(sched_ctx->workers->workerids, sched_ctx->workers->nworkers, sched_ctx->id, NULL);
+	_starpu_update_workers(sched_ctx->workers->workerids, sched_ctx->workers->nworkers, sched_ctx->id);
 	PTHREAD_MUTEX_UNLOCK(&sched_ctx->changing_ctx_mutex);
 
 	/* if both of them have all the resources this is pointless */
@@ -303,7 +265,7 @@ void starpu_delete_sched_ctx(unsigned sched_ctx_id, unsigned inheritor_sched_ctx
 	struct starpu_machine_config_s *config = (struct starpu_machine_config_s *)_starpu_get_machine_config();
 	int nworkers = config->topology.nworkers;
 
-	if(!(sched_ctx->workers->nworkers == nworkers && sched_ctx->workers->nworkers == inheritor_sched_ctx->workers->nworkers))
+	if(!(sched_ctx->workers->nworkers == nworkers && sched_ctx->workers->nworkers == inheritor_sched_ctx->workers->nworkers) && sched_ctx->workers->nworkers > 0)
 		starpu_add_workers_to_sched_ctx(sched_ctx->workers->workerids, sched_ctx->workers->nworkers, inheritor_sched_ctx_id);
 	
 	if(!starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx_id))
@@ -361,7 +323,7 @@ void starpu_add_workers_to_sched_ctx(int *workers_to_add, int nworkers_to_add,
 	PTHREAD_MUTEX_UNLOCK(&sched_ctx->changing_ctx_mutex);
 
 	if(n_added_workers > 0)
-		_starpu_update_workers(added_workers, n_added_workers, -1, sched_ctx);
+		_starpu_update_workers(added_workers, n_added_workers, sched_ctx->id);
        
 	return;
 }
@@ -380,11 +342,7 @@ void starpu_remove_workers_from_sched_ctx(int *workers_to_remove, int nworkers_t
 	_starpu_remove_workers_from_sched_ctx(sched_ctx, workers_to_remove, nworkers_to_remove, removed_workers, &n_removed_workers);
 	PTHREAD_MUTEX_UNLOCK(&sched_ctx->changing_ctx_mutex);	
 	if(n_removed_workers > 0)
-	{
-		_starpu_update_workers(removed_workers, n_removed_workers, sched_ctx->id, NULL);
-		sched_ctx->sched_policy->remove_workers(sched_ctx_id, removed_workers, n_removed_workers);
-	} 
-
+		_starpu_update_workers(removed_workers, n_removed_workers, sched_ctx->id);
        
 	return;
 }
@@ -439,11 +397,18 @@ static unsigned _starpu_worker_get_first_free_sched_ctx(struct starpu_worker_s *
 
 static unsigned _starpu_worker_get_sched_ctx_id(struct starpu_worker_s *worker, unsigned sched_ctx_id)
 {
+	unsigned to_be_deleted = STARPU_NMAX_SCHED_CTXS;
 	unsigned i;
 	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
-		if(worker->sched_ctx[i] != NULL && worker->sched_ctx[i]->id == sched_ctx_id)
-			return i;
-	STARPU_ASSERT(0);
+		if(worker->sched_ctx[i] != NULL)
+		{
+			if(worker->sched_ctx[i]->id == sched_ctx_id)
+				return i;
+			else if(worker->sched_ctx[i]->id == STARPU_NMAX_SCHED_CTXS)
+				to_be_deleted = i;
+		}
+
+	/* a little bit of a hack, be careful */
+	if(to_be_deleted != STARPU_NMAX_SCHED_CTXS)
+		return to_be_deleted;
 	return STARPU_NMAX_SCHED_CTXS;
 }
 
@@ -611,7 +576,6 @@ void starpu_create_worker_collection_for_sched_ctx(unsigned sched_ctx_id, int wo
 		break;
 	}
 
-
 	return;
 }
 
@@ -633,3 +597,34 @@ unsigned starpu_get_nworkers_of_sched_ctx(unsigned sched_ctx_id)
 	return sched_ctx->workers->nworkers;
 
 }
+
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+void starpu_call_poped_task_cb(int workerid)
+{
+	struct starpu_worker_s *worker =  _starpu_get_worker_struct(workerid);
+	unsigned i;
+	struct starpu_sched_ctx *sched_ctx = NULL;
+	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+	{
+		sched_ctx = worker->sched_ctx[i];
+		if(sched_ctx != NULL && sched_ctx->id != 0 && sched_ctx->id != STARPU_NMAX_SCHED_CTXS
+		   && sched_ctx->criteria != NULL)
+			sched_ctx->criteria->poped_task_cb(sched_ctx->id, worker->workerid);
+	}
+	
+}
+
+void starpu_call_pushed_task_cb(int workerid)
+{
+	struct starpu_worker_s *worker =  _starpu_get_worker_struct(workerid);
+	unsigned i;
+	struct starpu_sched_ctx *sched_ctx = NULL;
+	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+	{
+		sched_ctx = worker->sched_ctx[i];
+		if(sched_ctx != NULL && sched_ctx->id != 0  && sched_ctx->criteria != NULL)
+			sched_ctx->criteria->pushed_task_cb(sched_ctx->id, workerid);
+	}
+
+}
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR

+ 10 - 8
src/core/sched_policy.c

@@ -367,7 +367,7 @@ struct starpu_task *_starpu_pop_task(struct starpu_worker_s *worker)
 			}
 		}
 	  }
-	
+
 	/* Note that we may get a NULL task in case the scheduler was unlocked
 	 * for some reason. */
 	if (profiling && task)
@@ -397,13 +397,8 @@ struct starpu_task *_starpu_pop_task(struct starpu_worker_s *worker)
 		for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
 		{
 			sched_ctx = worker->sched_ctx[i];
-			if(sched_ctx != NULL && sched_ctx->id != 0)
-			{
-				if(sched_ctx != NULL && sched_ctx->criteria != NULL)
-				{
-					sched_ctx->criteria->idle_time_cb(sched_ctx->id, worker->workerid, 1.0);
-				}
-			}
+			if(sched_ctx != NULL && sched_ctx->id != 0 && sched_ctx->criteria != NULL)
+				sched_ctx->criteria->idle_time_cb(sched_ctx->id, worker->workerid, 1.0);
 		}
 	}
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
@@ -422,6 +417,13 @@ struct starpu_task *_starpu_pop_every_task(struct starpu_sched_ctx *sched_ctx)
 void _starpu_sched_post_exec_hook(struct starpu_task *task)
 {
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
+
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	if(task->checkpoint > 0 && sched_ctx != NULL && 
+	   sched_ctx->id != 0 && sched_ctx->criteria != NULL)
+		sched_ctx->criteria->post_exec_hook_cb(sched_ctx->id, task->checkpoint);
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
+
 	if (sched_ctx->sched_policy->post_exec_hook)
 		sched_ctx->sched_policy->post_exec_hook(task);
 }

+ 2 - 0
src/core/task.c

@@ -83,6 +83,8 @@ void starpu_task_init(struct starpu_task *task)
 	task->sched_ctx = _starpu_get_initial_sched_ctx()->id;
 	
 	task->control_task = 0;
+
+	task->checkpoint = 0;
 }
 
 /* Free all the ressources allocated for a task, without deallocating the task

+ 1 - 1
src/core/workers.h

@@ -82,8 +82,8 @@ struct starpu_worker_s {
 
 	struct starpu_sched_ctx **sched_ctx;
 	unsigned nsched_ctxs; /* the no of contexts a worker belongs to*/
-
 	struct _starpu_barrier_counter_t tasks_barrier; /* wait for the tasks submitted */
+	struct starpu_task *tasks[STARPU_NMAX_SCHED_CTXS];
        
 	unsigned has_prev_init; /* had already been inited in another ctx */
 #ifdef __GLIBC__

+ 15 - 2
src/sched_policies/heft.c

@@ -139,6 +139,11 @@ static void heft_post_exec_hook(struct starpu_task *task)
 	/* Once we have executed the task, we can update the predicted amount
 	 * of work. */
 	PTHREAD_MUTEX_LOCK(sched_mutex);
+
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	starpu_call_poped_task_cb(workerid);
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
+
 	exp_len[workerid] -= model;
 	exp_start[workerid] = starpu_timing_now() + model;
 	exp_end[workerid] = exp_start[workerid] + exp_len[workerid];
@@ -162,6 +167,10 @@ static void heft_push_task_notify(struct starpu_task *task, int workerid)
 	/* Update the predictions */
 	PTHREAD_MUTEX_LOCK(sched_mutex);
 
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	starpu_call_pushed_task_cb(workerid);
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
+
 	/* Sometimes workers didn't take the tasks as early as we expected */
 	exp_start[workerid] = STARPU_MAX(exp_start[workerid], starpu_timing_now());
 	exp_end[workerid] = STARPU_MAX(exp_start[workerid], starpu_timing_now());
@@ -184,11 +193,17 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	/* make sure someone could execute that task! */
 	STARPU_ASSERT(best_workerid != -1);
 
+	_starpu_increment_nsubmitted_tasks_of_worker(best_workerid);
+
 	pthread_mutex_t *sched_mutex;
 	pthread_cond_t *sched_cond;
 	starpu_worker_get_sched_condition(sched_ctx_id, best_workerid, &sched_mutex, &sched_cond);
 
 	PTHREAD_MUTEX_LOCK(sched_mutex);
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	starpu_call_pushed_task_cb(best_workerid);
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
+
 	exp_end[best_workerid] += predicted;
 	exp_len[best_workerid] += predicted;
 	ntasks[best_workerid]++;
@@ -357,7 +372,6 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	/* If there is no prediction available for that task with that arch we
 	 * want to speed-up calibration time so we force this measurement */
 	if (forced_best != -1){
-		_starpu_increment_nsubmitted_tasks_of_worker(forced_best);
 		return push_task_on_best_worker(task, forced_best, 0.0, prio, sched_ctx_id);
 	}
 	
@@ -433,7 +447,6 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	if(workers->init_cursor)
 		workers->deinit_cursor(workers);
 
-	_starpu_increment_nsubmitted_tasks_of_worker(best);
 	return push_task_on_best_worker(task, best, model_best, prio, sched_ctx_id);
 }
 

+ 4 - 0
src/util/starpu_insert_task_utils.c

@@ -193,6 +193,10 @@ int _starpu_insert_task_create_and_submit(char *arg_buffer, starpu_codelet *cl,
 		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
 			va_arg(varg_list, starpu_data_handle);
 		}
+		else if (arg_type==STARPU_CHECKPOINT) {
+			int checkpoint = va_arg(varg_list, int);
+			(*task)->checkpoint = checkpoint;
+		}
 	}
 
 	va_end(varg_list);
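
With the STARPU_CHECKPOINT argument handled above, an application can tag a task at submission time; when that task completes, _starpu_sched_post_exec_hook() forwards the checkpoint id to the hypervisor, which applies any configuration stored for it via HYPERVISOR_TIME_TO_APPLY. A minimal, hypothetical call (my_codelet and my_handle are placeholders, not part of this commit):

/* tag the task with checkpoint id 2: on completion,
 * post_exec_hook_cb(sched_ctx, 2) is called and the deferred
 * hypervisor configuration registered for checkpoint 2 is applied */
starpu_insert_task(&my_codelet,
		   STARPU_RW, my_handle,
		   STARPU_CHECKPOINT, 2,
		   0);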