лет назад: 11 · a4d31c80da
--- a/ChangeLog
+++ b/ChangeLog
@@ -63,6 +63,9 @@ New features:
 
				     modes field to the task structure, which permit to define codelets taking a
			
 
				     variable number of data.
			
 
				   * Add support for implementing OpenMP runtimes on top of StarPU
			
 
				+  * New performance model format to better represent parallel tasks.
			
 
				+    Used to provide estimations for the execution times of the
			
 
				+    parallel tasks on scheduling contexts or combined workers.
			
 
				 
			
 
				 Small features:
			
 
				   * Tasks can now have a name (via the field const char *name of
			
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -255,7 +255,8 @@ STARPU_EXAMPLES +=				\
 
				 	openmp/vector_scal_omp			\
			
 
				 	sched_ctx/sched_ctx_without_sched_policy\
			
 
				 	sched_ctx/nested_sched_ctxs		\
			
 
				-	sched_ctx/sched_ctx_without_sched_policy
			
 
				+	sched_ctx/sched_ctx_without_sched_policy\
			
 
				+	sched_ctx/sched_ctx_without_sched_policy_awake 
			
 
				 
			
 
				 if STARPU_LONG_CHECK
			
 
				 STARPU_EXAMPLES +=				\
			
--- a/examples/cholesky/cholesky_models.c
+++ b/examples/cholesky/cholesky_models.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010-2011  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -128,13 +128,25 @@ double cuda_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_
 
				 }
			
 
				 
			
 
				 void initialize_chol_model(struct starpu_perfmodel* model, char * symbol,
			
 
				-		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned),
			
 
				-		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
			
 
				+			   double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned),
			
 
				+			   double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
			
 
				 {
			
 
				+	struct starpu_perfmodel_per_arch *per_arch;
			
 
				+
			
 
				 	model->symbol = symbol;
			
 
				 	model->type = STARPU_HISTORY_BASED;
			
 
				-	starpu_perfmodel_init(model);
			
 
				-	model->per_arch[STARPU_CPU_WORKER][0][0][0].cost_function = cpu_cost_function;
			
 
				+
			
 
				+	starpu_perfmodel_init(NULL, model);
			
 
				+
			
 
				+	per_arch = starpu_perfmodel_get_model_per_devices(model, 0, STARPU_CPU_WORKER, 0, 1, -1);
			
 
				+        per_arch->cost_function = cpu_cost_function;
			
 
				+	// We could also call directly:
			
 
				+	// starpu_perfmodel_set_per_devices_cost_function(model, 0, cpu_cost_function, STARPU_CPU_WORKER, 0, 1, -1);
			
 
				+
			
 
				 	if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) != 0)
			
 
				-		model->per_arch[STARPU_CUDA_WORKER][0][0][0].cost_function = cuda_cost_function;
			
 
				+	{
			
 
				+	     	per_arch = starpu_perfmodel_get_model_per_devices(model, 0, STARPU_CUDA_WORKER, 0, 1, -1);
			
 
				+		per_arch->cost_function = cuda_cost_function;
			
 
				+
			
 
				+	}
			
 
				 }
			
--- a/examples/heat/lu_kernels_model.c
+++ b/examples/heat/lu_kernels_model.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010-2011  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -215,15 +215,19 @@ double task_22_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch*
 
				 }
			
 
				 
			
 
				 void initialize_lu_kernels_model(struct starpu_perfmodel* model, char * symbol,
			
 
				-		double (*cost_function)(struct starpu_task *, unsigned),
			
 
				-		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned),
			
 
				-		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
			
 
				+				 double (*cost_function)(struct starpu_task *, unsigned),
			
 
				+				 double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned),
			
 
				+				 double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
			
 
				 {
			
 
				 	model->symbol = symbol;
			
 
				 	model->type = STARPU_HISTORY_BASED;
			
 
				-	starpu_perfmodel_init(model);
			
 
				-	model->cost_function = cost_function;
			
 
				-	model->per_arch[STARPU_CPU_WORKER][0][0][0].cost_function = cpu_cost_function;
			
 
				+
			
 
				+	starpu_perfmodel_init(NULL, model);
			
 
				+
			
 
				+	starpu_perfmodel_set_per_devices_cost_function(model, 0, cpu_cost_function, STARPU_CPU_WORKER, 0, 1, -1);
			
 
				+
			
 
				 	if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) != 0)
			
 
				-		model->per_arch[STARPU_CUDA_WORKER][0][0][0].cost_function = cuda_cost_function;
			
 
				+	{
			
 
				+		starpu_perfmodel_set_per_devices_cost_function(model, 0, cuda_cost_function, STARPU_CUDA_WORKER, 0, 1, -1);
			
 
				+	}
			
 
				 }
			
--- a/examples/sched_ctx/nested_sched_ctxs.c
+++ b/examples/sched_ctx/nested_sched_ctxs.c
@@ -161,6 +161,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 		task->cl = &sched_ctx_codelet;
			
 
				 		task->cl_arg = sched_ctx1;
			
 
				+		task->possibly_parallel = 1;
			
 
				 
			
 
				 		/*submit tasks to context*/
			
 
				 		ret = starpu_task_submit_to_ctx(task,sched_ctx1);
			
@@ -174,6 +175,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 		task->cl = &sched_ctx_codelet;
			
 
				 		task->cl_arg = sched_ctx2;
			
 
				+		task->possibly_parallel = 1;
			
 
				 
			
 
				 		/*submit tasks to context*/
			
 
				 		ret = starpu_task_submit_to_ctx(task,sched_ctx2);
			
--- a/examples/sched_ctx/sched_ctx.c
+++ b/examples/sched_ctx/sched_ctx.c
@@ -93,7 +93,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	/*create contexts however you want*/
			
 
				 	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "eager", 0);
			
 
				-	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", STARPU_SCHED_CTX_POLICY_NAME, "eager",  0);
			
 
				+	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", STARPU_SCHED_CTX_POLICY_NAME, "eager", 0);
			
 
				 
			
 
				 	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
			
 
				 	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
			
@@ -152,6 +152,7 @@ int main(int argc, char **argv)
 
				 	/* wait for all tasks at the end*/
			
 
				 	starpu_task_wait_for_all();
			
 
				 
			
 
				+	starpu_sched_ctx_add_workers(procs1, nprocs1, sched_ctx2);
			
 
				 	starpu_sched_ctx_delete(sched_ctx1);
			
 
				 	starpu_sched_ctx_delete(sched_ctx2);
			
 
				 	printf("tasks executed %d out of %d\n", tasks_executed, ntasks/2);
			
--- a/examples/sched_ctx/sched_ctx_without_sched_policy.c
+++ b/examples/sched_ctx/sched_ctx_without_sched_policy.c
@@ -59,7 +59,8 @@ static void sched_ctx_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
 
				 static struct starpu_codelet sched_ctx_codelet =
			
 
				 {
			
 
				 	.cpu_funcs = {sched_ctx_func, NULL},
			
 
				-	.cuda_funcs = { NULL},
			
 
				+#warning FIXME: cuda_funcs should not need to be defined
			
 
				+	.cuda_funcs = {sched_ctx_func, NULL},
			
 
				 	.opencl_funcs = {NULL},
			
 
				 	.model = NULL,
			
 
				 	.nbuffers = 0,
			
@@ -83,8 +84,14 @@ int main(int argc, char **argv)
 
				 	starpu_pthread_mutex_init(&mut, NULL);
			
 
				 	int nprocs1 = 1;
			
 
				 	int nprocs2 = 1;
			
 
				-	int *procs1, *procs2;
			
 
				+	int ncuda = 0;
			
 
				+	int *procs1, *procs2, *procscuda;
			
 
				 
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	ncuda = starpu_cuda_worker_get_count();
			
 
				+	procscuda = (int*)malloc(ncuda*sizeof(int));
			
 
				+	starpu_worker_get_ids_by_type(STARPU_CUDA_WORKER, procscuda, ncuda);
			
 
				+#endif
			
 
				 #ifdef STARPU_USE_CPU
			
 
				 	ncpus = starpu_cpu_worker_get_count();
			
 
				 	procs1 = (int*)malloc(ncpus*sizeof(int));
			
@@ -108,6 +115,10 @@ int main(int argc, char **argv)
 
				 #endif
			
 
				 
			
 
				 	if (ncpus == 0) goto enodev;
			
 
				+	if (ncuda > 0 && nprocs1 > 1)
			
 
				+	{
			
 
				+		procs1[nprocs1-1] = procscuda[0];
			
 
				+	}
			
 
				 
			
 
				 	/*create contexts however you want*/
			
 
				 	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", 0);
			
--- a/examples/sched_ctx/sched_ctx_without_sched_policy_awake.c
+++ b/examples/sched_ctx/sched_ctx_without_sched_policy_awake.c
@@ -0,0 +1,171 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010-2013  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <omp.h>
			
 
				+
			
 
				+#ifdef STARPU_QUICK_CHECK
			
 
				+#define NTASKS 64
			
 
				+#else
			
 
				+#define NTASKS 100
			
 
				+#endif
			
 
				+
			
 
				+
			
 
				+starpu_pthread_mutex_t mut;
			
 
				+
			
 
				+int tasks_executed[2][STARPU_NMAXWORKERS];
			
 
				+int parallel_code(int sched_ctx)
			
 
				+{
			
 
				+	int i;
			
 
				+	int t = 0;
			
 
				+	int workerid = starpu_worker_get_id();
			
 
				+	for(i = 0; i < NTASKS; i++)
			
 
				+		t++;
			
 
				+	tasks_executed[sched_ctx-1][workerid] = t;
			
 
				+//	printf("executed %d tasks on worker %d of sched_ctx %d \n", t, workerid, sched_ctx);
			
 
				+
			
 
				+	return t;
			
 
				+}
			
 
				+/* Codelet body: arg is not a real pointer, it carries a scheduling context id in the pointer value. */
			
 
				+static void sched_ctx_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
			
 
				+{
			
 
				+	unsigned sched_ctx = (unsigned)(uintptr_t)arg; /* go through uintptr_t: a direct pointer-to-unsigned cast truncates on LP64 */
			
 
				+	parallel_code(sched_ctx);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+static struct starpu_codelet sched_ctx_codelet =
			
 
				+{
			
 
				+	.cpu_funcs = {sched_ctx_func, NULL},
			
 
				+	.cuda_funcs = { NULL},
			
 
				+	.opencl_funcs = {NULL},
			
 
				+	.model = NULL,
			
 
				+	.nbuffers = 0,
			
 
				+	.name = "sched_ctx"
			
 
				+};
			
 
				+
			
 
				+/* Split the CPU workers into two policy-less contexts created with STARPU_SCHED_CTX_AWAKE_WORKERS and submit NTASKS tasks to each; returns 77 when no CPU worker is available. */
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	int i;
			
 
				+	for(i = 0; i < STARPU_NMAXWORKERS; i++)
			
 
				+	{
			
 
				+		tasks_executed[0][i] = 0;
			
 
				+		tasks_executed[1][i] = 0;
			
 
				+	}
			
 
				+	int ntasks = NTASKS;
			
 
				+	int ret, j, k;
			
 
				+	unsigned ncpus = 0;
			
 
				+
			
 
				+	ret = starpu_init(NULL);
			
 
				+	if (ret == -ENODEV)
			
 
				+		return 77;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+
			
 
				+	starpu_pthread_mutex_init(&mut, NULL);
			
 
				+	int nprocs1 = 1;
			
 
				+	int nprocs2 = 1;
			
 
				+	int *procs1 = NULL, *procs2 = NULL; /* NULL so that the free() calls at enodev are always valid */
			
 
				+
			
 
				+#ifdef STARPU_USE_CPU
			
 
				+	ncpus = starpu_cpu_worker_get_count();
			
 
				+	procs1 = (int*)malloc(ncpus*sizeof(int));
			
 
				+	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
			
 
				+
			
 
				+	if(ncpus > 1)
			
 
				+	{
			
 
				+		nprocs1 = ncpus/2;
			
 
				+		nprocs2 =  ncpus-nprocs1;
			
 
				+		k = 0;
			
 
				+		procs2 = (int*)malloc(nprocs2*sizeof(int));
			
 
				+		for(j = nprocs1; j < nprocs1+nprocs2; j++)
			
 
				+			procs2[k++] = procs1[j];
			
 
				+	}
			
 
				+	else if (ncpus == 1) /* with 0 CPU workers procs1 is empty: procs1[0] must not be read */
			
 
				+	{
			
 
				+		procs2 = (int*)malloc(nprocs2*sizeof(int));
			
 
				+		procs2[0] = procs1[0];
			
 
				+
			
 
				+	}
			
 
				+#endif
			
 
				+
			
 
				+	if (ncpus == 0) goto enodev;
			
 
				+
			
 
				+	/*create contexts however you want*/
			
 
				+	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_AWAKE_WORKERS, 0);
			
 
				+	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", STARPU_SCHED_CTX_AWAKE_WORKERS, 0);
			
 
				+
			
 
				+
			
 
				+	for (i = 0; i < ntasks; i++)
			
 
				+	{
			
 
				+		struct starpu_task *task = starpu_task_create();
			
 
				+
			
 
				+		task->cl = &sched_ctx_codelet;
			
 
				+		task->cl_arg = (void*)(uintptr_t) sched_ctx1; /* context id passed by value inside the pointer */
			
 
				+
			
 
				+		/*submit tasks to context*/
			
 
				+		ret = starpu_task_submit_to_ctx(task,sched_ctx1);
			
 
				+
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+	}
			
 
				+
			
 
				+	for (i = 0; i < ntasks; i++)
			
 
				+	{
			
 
				+		struct starpu_task *task = starpu_task_create();
			
 
				+
			
 
				+		task->cl = &sched_ctx_codelet;
			
 
				+		task->cl_arg = (void*)(uintptr_t) sched_ctx2; /* context id passed by value inside the pointer */
			
 
				+
			
 
				+		/*submit tasks to context*/
			
 
				+		ret = starpu_task_submit_to_ctx(task,sched_ctx2);
			
 
				+
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+	/* tell starpu when you finished submitting tasks to this context
			
 
				+	   in order to allow moving resources from this context to the inheritor one
			
 
				+	   when its corresponding tasks finished executing */
			
 
				+
			
 
				+
			
 
				+
			
 
				+	/* wait for all tasks at the end*/
			
 
				+	starpu_task_wait_for_all();
			
 
				+
			
 
				+	starpu_sched_ctx_delete(sched_ctx1);
			
 
				+	starpu_sched_ctx_delete(sched_ctx2);
			
 
				+
			
 
				+	int tasks_per_ctx[2];
			
 
				+	tasks_per_ctx[0] = 0;
			
 
				+	tasks_per_ctx[1] = 0;
			
 
				+	for(i = 0; i < STARPU_NMAXWORKERS; i++)
			
 
				+	{
			
 
				+		tasks_per_ctx[0] += tasks_executed[0][i];
			
 
				+		tasks_per_ctx[1] += tasks_executed[1][i];
			
 
				+	}
			
 
				+
			
 
				+	printf("ctx%u: tasks starpu executed %d out of %d\n", sched_ctx1, tasks_per_ctx[0]/nprocs1, NTASKS);
			
 
				+	printf("ctx%u: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_per_ctx[1]/nprocs2, NTASKS);
			
 
				+
			
 
				+enodev:
			
 
				+#ifdef STARPU_USE_CPU
			
 
				+	free(procs1);
			
 
				+	free(procs2);
			
 
				+#endif
			
 
				+	starpu_shutdown();
			
 
				+	return ncpus == 0 ? 77 : 0;
			
 
				+}
			
--- a/include/starpu_fxt.h
+++ b/include/starpu_fxt.h
@@ -31,7 +31,7 @@ struct starpu_fxt_codelet_event
 
				 {
			
 
				 	char symbol[256];
			
 
				 	int workerid;
			
 
				-	struct starpu_perfmodel_arch arch;
			
 
				+	char perfmodel_archname[256];
			
 
				 	uint32_t hash;
			
 
				 	size_t size;
			
 
				 	float time;
			
--- a/include/starpu_perfmodel.h
+++ b/include/starpu_perfmodel.h
@@ -35,13 +35,20 @@ struct starpu_data_descr;
 
				 
			
 
				 #define STARPU_NARCH STARPU_ANY_WORKER
			
 
				 
			
 
				-struct starpu_perfmodel_arch
			
 
				+struct starpu_perfmodel_device
			
 
				 {
			
 
				 	enum starpu_worker_archtype type;
			
 
				 	int devid;	/* identifier of the precise device */
			
 
				-	int ncore;	/* number of execution in parallel, minus 1 */
			
 
				+	int ncores;	/* number of execution in parallel, minus 1 */	
			
 
				+};
			
 
				+
			
 
				+struct starpu_perfmodel_arch
			
 
				+{
			
 
				+	int ndevices;
			
 
				+	struct starpu_perfmodel_device *devices;
			
 
				 };
			
 
				 
			
 
				+
			
 
				 struct starpu_perfmodel_history_entry
			
 
				 {
			
 
				 	double mean;
			
@@ -91,10 +98,13 @@ struct starpu_perfmodel_history_table;
 
				 
			
 
				 #define starpu_per_arch_perfmodel starpu_perfmodel_per_arch STARPU_DEPRECATED
			
 
				 
			
 
				+typedef double (*starpu_perfmodel_per_arch_cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+typedef size_t (*starpu_perfmodel_per_arch_size_base)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+
			
 
				 struct starpu_perfmodel_per_arch
			
 
				 {
			
 
				-	double (*cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				-	size_t (*size_base)(struct starpu_task *, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+	starpu_perfmodel_per_arch_cost_function cost_function;
			
 
				+	starpu_perfmodel_per_arch_size_base size_base;
			
 
				 
			
 
				 	struct starpu_perfmodel_history_table *history;
			
 
				 	struct starpu_perfmodel_history_list *list;
			
@@ -114,6 +124,9 @@ enum starpu_perfmodel_type
 
				 	STARPU_NL_REGRESSION_BASED
			
 
				 };
			
 
				 
			
 
				+struct _starpu_perfmodel_state;
			
 
				+typedef struct _starpu_perfmodel_state* starpu_perfmodel_state_t;
			
 
				+
			
 
				 struct starpu_perfmodel
			
 
				 {
			
 
				 	enum starpu_perfmodel_type type;
			
@@ -123,23 +136,31 @@ struct starpu_perfmodel
 
				 	size_t (*size_base)(struct starpu_task *, unsigned nimpl);
			
 
				 	uint32_t (*footprint)(struct starpu_task *);
			
 
				 
			
 
				-	struct starpu_perfmodel_per_arch**** per_arch; /*STARPU_MAXIMPLEMENTATIONS*/
			
 
				-
			
 
				 	const char *symbol;
			
 
				 
			
 
				-	unsigned is_init;
			
 
				 	unsigned is_loaded;
			
 
				 	unsigned benchmarking;
			
 
				-	starpu_pthread_rwlock_t model_rwlock;
			
 
				+	unsigned is_init;
			
 
				+
			
 
				+	starpu_perfmodel_state_t state;
			
 
				 };
			
 
				 
			
 
				-void starpu_perfmodel_init(struct starpu_perfmodel *model);
			
 
				-void starpu_perfmodel_init_with_file(FILE*f, struct starpu_perfmodel *model);
			
 
				+void starpu_perfmodel_init(FILE *f, struct starpu_perfmodel *model);
			
 
				+//void starpu_perfmodel_init_with_file(FILE*f, struct starpu_perfmodel *model);
			
 
				 
			
 
				-struct starpu_perfmodel_arch *starpu_worker_get_perf_archtype(int workerid);
			
 
				+struct starpu_perfmodel_arch *starpu_worker_get_perf_archtype(int workerid, unsigned sched_ctx_id);
			
 
				 
			
 
				 int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model);
			
 
				 int starpu_perfmodel_unload_model(struct starpu_perfmodel *model);
			
 
				+int starpu_get_narch_combs(void); /* (void): a real prototype, so calls with arguments are rejected at compile time */
			
 
				+int starpu_perfmodel_arch_comb_add(int ndevices, struct starpu_perfmodel_device* devices);
			
 
				+int starpu_perfmodel_arch_comb_get(int ndevices, struct starpu_perfmodel_device *devices);
			
 
				+
			
 
				+struct starpu_perfmodel_per_arch *starpu_perfmodel_get_model_per_arch(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, unsigned impl);
			
 
				+struct starpu_perfmodel_per_arch *starpu_perfmodel_get_model_per_devices(struct starpu_perfmodel *model, int impl, ...);
			
 
				+
			
 
				+int starpu_perfmodel_set_per_devices_cost_function(struct starpu_perfmodel *model, int impl, starpu_perfmodel_per_arch_cost_function func, ...);
			
 
				+int starpu_perfmodel_set_per_devices_size_base(struct starpu_perfmodel *model, int impl, starpu_perfmodel_per_arch_size_base func, ...);
			
 
				 
			
 
				 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, char *path, size_t maxlen, unsigned nimpl);
			
 
				 char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype);
			
@@ -150,6 +171,8 @@ int starpu_perfmodel_list(FILE *output);
 
				 void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output);
			
 
				 int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char *parameter, uint32_t *footprint, FILE *output);
			
 
				 
			
 
				+int starpu_perfmodel_list_combs(FILE *output, struct starpu_perfmodel *model);
			
 
				+
			
 
				 void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned cpuid, unsigned nimpl, double measured);
			
 
				 void starpu_perfmodel_directory(FILE *output);
			
 
				 
			
--- a/include/starpu_sched_ctx.h
+++ b/include/starpu_sched_ctx.h
@@ -30,6 +30,7 @@ extern "C"
 
				 #define STARPU_SCHED_CTX_POLICY_MAX_PRIO	 (4<<16)
			
 
				 #define STARPU_SCHED_CTX_HIERARCHY_LEVEL         (5<<16)
			
 
				 #define STARPU_SCHED_CTX_NESTED                  (6<<16)
			
 
				+#define STARPU_SCHED_CTX_AWAKE_WORKERS           (7<<16)
			
 
				 
			
 
				 unsigned starpu_sched_ctx_create(int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name, ...);
			
 
				 
			
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -184,6 +184,7 @@ struct starpu_task
 
				 
			
 
				 	unsigned sched_ctx;
			
 
				 	int hypervisor_tag;
			
 
				+	unsigned possibly_parallel;
			
 
				 
			
 
				 	starpu_task_bundle_t bundle;
			
 
				 
			
@@ -232,7 +233,8 @@ struct starpu_task
 
				 	.dyn_handles = NULL,				\
			
 
				 	.dyn_interfaces = NULL,				\
			
 
				 	.dyn_modes = NULL,				\
			
 
				-	.name = NULL                        		\
			
 
				+	.name = NULL,                        		\
			
 
				+	.possibly_parallel = 0                        	\
			
 
				 }
			
 
				 
			
 
				 #define STARPU_TASK_GET_NBUFFERS(task) ((unsigned)((task)->cl->nbuffers == STARPU_VARIABLE_NBUFFERS ? ((task)->nbuffers) : ((task)->cl->nbuffers)))
			
--- a/include/starpu_task_util.h
+++ b/include/starpu_task_util.h
@@ -32,24 +32,25 @@ extern "C"
 
				 
			
 
				 void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg);
			
 
				 
			
 
				-#define STARPU_VALUE		 (1<<18)
			
 
				-#define STARPU_CALLBACK		 (2<<18)
			
 
				-#define STARPU_CALLBACK_WITH_ARG (3<<18)
			
 
				-#define STARPU_CALLBACK_ARG	 (4<<18)
			
 
				-#define STARPU_PRIORITY		 (5<<18)
			
 
				-#define STARPU_EXECUTE_ON_NODE	 (6<<18)
			
 
				-#define STARPU_EXECUTE_ON_DATA	 (7<<18)
			
 
				-#define STARPU_DATA_ARRAY        (8<<18)
			
 
				-#define STARPU_TAG               (9<<18)
			
 
				-#define STARPU_HYPERVISOR_TAG	 (10<<18)
			
 
				-#define STARPU_FLOPS	         (11<<18)
			
 
				-#define STARPU_SCHED_CTX	 (12<<18)
			
 
				-#define STARPU_PROLOGUE_CALLBACK   (13<<18)
			
 
				-#define STARPU_PROLOGUE_CALLBACK_ARG (14<<18)
			
 
				-#define STARPU_PROLOGUE_CALLBACK_POP   (15<<18)
			
 
				-#define STARPU_PROLOGUE_CALLBACK_POP_ARG (16<<18)
			
 
				-#define STARPU_EXECUTE_ON_WORKER (17<<18)
			
 
				-#define STARPU_TAG_ONLY          (18<<18)
			
 
				+#define STARPU_VALUE		 (1<<20)
			
 
				+#define STARPU_CALLBACK		 (2<<20)
			
 
				+#define STARPU_CALLBACK_WITH_ARG (3<<20)
			
 
				+#define STARPU_CALLBACK_ARG	 (4<<20)
			
 
				+#define STARPU_PRIORITY		 (5<<20)
			
 
				+#define STARPU_EXECUTE_ON_NODE	 (6<<20)
			
 
				+#define STARPU_EXECUTE_ON_DATA	 (7<<20)
			
 
				+#define STARPU_DATA_ARRAY        (8<<20)
			
 
				+#define STARPU_TAG               (9<<20)
			
 
				+#define STARPU_HYPERVISOR_TAG	 (10<<20)
			
 
				+#define STARPU_FLOPS	         (11<<20)
			
 
				+#define STARPU_SCHED_CTX	 (12<<20)
			
 
				+#define STARPU_PROLOGUE_CALLBACK   (13<<20)
			
 
				+#define STARPU_PROLOGUE_CALLBACK_ARG (14<<20)
			
 
				+#define STARPU_PROLOGUE_CALLBACK_POP   (15<<20)
			
 
				+#define STARPU_PROLOGUE_CALLBACK_POP_ARG (16<<20)
			
 
				+#define STARPU_EXECUTE_ON_WORKER (17<<20)
			
 
				+#define STARPU_TAG_ONLY          (18<<20)
			
 
				+#define STARPU_POSSIBLY_PARALLEL    (19<<20)
			
 
				 #define STARPU_WORKER_ORDER      (19<<18)
			
 
				 
			
 
				 struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...);
			
--- a/sc_hypervisor/examples/cholesky/cholesky_models.c
+++ b/sc_hypervisor/examples/cholesky/cholesky_models.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010-2011  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -26,6 +26,7 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_perfmodel.h>
			
 
				 #include "cholesky.h"
			
 
				 
			
 
				 /* #define USE_PERTURBATION	1 */
			
@@ -127,12 +128,25 @@ double cuda_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_
 
				 }
			
 
				 
			
 
				 void initialize_chol_model(struct starpu_perfmodel* model, char * symbol,
			
 
				-		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned),
			
 
				-		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
			
 
				+			   double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned),
			
 
				+			   double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
			
 
				 {
			
 
				+	struct starpu_perfmodel_per_arch *per_arch;
			
 
				+
			
 
				 	model->symbol = symbol;
			
 
				 	model->type = STARPU_HISTORY_BASED;
			
 
				-	starpu_perfmodel_init(model);
			
 
				-	model->per_arch[STARPU_CPU_WORKER][0][0][0].cost_function = cpu_cost_function;
			
 
				-	model->per_arch[STARPU_CUDA_WORKER][0][0][0].cost_function = cuda_cost_function;
			
 
				+
			
 
				+	starpu_perfmodel_init(NULL, model);
			
 
				+
			
 
				+	per_arch = starpu_perfmodel_get_model_per_devices(model, 0, STARPU_CPU_WORKER, 0, 1, -1);
			
 
				+        per_arch->cost_function = cpu_cost_function;
			
 
				+	// We could also call directly:
			
 
				+	// starpu_perfmodel_set_per_devices_cost_function(model, 0, cpu_cost_function, STARPU_CPU_WORKER, 0, 1, -1);
			
 
				+
			
 
				+	if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) != 0)
			
 
				+	{
			
 
				+	     	per_arch = starpu_perfmodel_get_model_per_devices(model, 0, STARPU_CUDA_WORKER, 0, 1, -1);
			
 
				+		per_arch->cost_function = cuda_cost_function;
			
 
				+
			
 
				+	}
			
 
				 }
			
--- a/sc_hypervisor/src/policies_utils/policy_tools.c
+++ b/sc_hypervisor/src/policies_utils/policy_tools.c
@@ -414,7 +414,7 @@ void sc_hypervisor_get_tasks_times(int nw, int nt, double times[nw][nt], int *wo
 
				                 for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
			
 
				                 {
			
 
				 			int worker = workers == NULL ? w : workers[w];
			
 
				-                        struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(worker);
			
 
				+                        struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(worker, STARPU_NMAX_SCHED_CTXS);
			
 
				                         double length = starpu_permodel_history_based_expected_perf(tp->cl->model, arch, tp->footprint);
			
 
				 
			
 
				                         if (isnan(length))
			
--- a/sc_hypervisor/src/sc_hypervisor.c
+++ b/sc_hypervisor/src/sc_hypervisor.c
@@ -382,9 +382,12 @@ void sc_hypervisor_unregister_ctx(unsigned sched_ctx)
 
				 	int *pus;
			
 
				 	unsigned npus = starpu_sched_ctx_get_workers_list(sched_ctx, &pus);
			
 
				 
			
 
				-	starpu_sched_ctx_set_priority(pus, npus, father, 1);
			
 
				-	starpu_sched_ctx_set_priority_on_level(pus, npus, father, 1);
			
 
				-	free(pus);
			
 
				+	if(npus)
			
 
				+	{
			
 
				+		starpu_sched_ctx_set_priority(pus, npus, father, 1);
			
 
				+		starpu_sched_ctx_set_priority_on_level(pus, npus, father, 1);
			
 
				+		free(pus);
			
 
				+	}
			
 
				 
			
 
				 	unsigned i;
			
 
				 	for(i = 0; i < hypervisor.nsched_ctxs; i++)
			
--- a/src/common/fxt.h
+++ b/src/common/fxt.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009-2014  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -414,7 +414,7 @@ do {									\
 
				 #define _STARPU_TRACE_WORKER_INIT_END(__workerid)				\
			
 
				 	FUT_DO_PROBE2(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid(), (__workerid));
			
 
				 
			
 
				-#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype, workerid)				\
			
 
				+#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, perf_arch, workerid)				\
			
 
				 do {									\
			
 
				         const char *model_name = _starpu_job_get_model_name((job));         \
			
 
				 	if (model_name)                                                 \
			
@@ -442,17 +442,19 @@ do {									\
 
				 				}					\
			
 
				 			}						\
			
 
				 		}							\
			
 
				-		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));	\
			
 
				-		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));\
			
 
				-		FUT_DO_PROBE6(_STARPU_FUT_CODELET_DETAILS, (job), ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->tag_id, workerid);	\
			
 
				+		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));	\
			
 
				+		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
			
 
				+		FUT_DO_PROBE7(_STARPU_FUT_CODELET_DETAILS, (job), ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->tag_id, workerid, ((job)->job_id)); \
			
 
				 	}								\
			
 
				 } while(0);
			
 
				 
			
 
				-#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, archtype, workerid)			\
			
 
				+#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, perf_arch, workerid)			\
			
 
				 do {									\
			
 
				-	const size_t job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));	\
			
 
				-	const uint32_t job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));\
			
 
				-	FUT_DO_PROBE7(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), (archtype)->type, (archtype)->devid, (archtype)->ncore, workerid);	\
			
 
				+	const size_t job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));	\
			
 
				+	const uint32_t job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
			
 
				+	char _archname[32]=""; \
			
 
				+	starpu_perfmodel_get_arch_name(perf_arch, _archname, 32, 0);	\
			
 
				+	_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), workerid, _archname); \
			
 
				 } while(0);
			
 
				 
			
 
				 #define _STARPU_TRACE_START_EXECUTING()				\
			
@@ -818,8 +820,8 @@ do {										\
 
				 #define _STARPU_TRACE_NEW_MEM_NODE(nodeid)	do {} while(0)
			
 
				 #define _STARPU_TRACE_WORKER_INIT_START(a,b,c)	do {} while(0)
			
 
				 #define _STARPU_TRACE_WORKER_INIT_END(workerid)	do {} while(0)
			
 
				-#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype, workerid)	do {} while(0)
			
 
				-#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, a, workerid)	do {} while(0)
			
 
				+#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, perf_arch, workerid)	do {} while(0)
			
 
				+#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, perf_arch, workerid)	do {} while(0)
			
 
				 #define _STARPU_TRACE_START_EXECUTING()	do {} while(0)
			
 
				 #define _STARPU_TRACE_END_EXECUTING()	do {} while(0)
			
 
				 #define _STARPU_TRACE_START_CALLBACK(job)	do {} while(0)
			
--- a/src/core/combined_workers.c
+++ b/src/core/combined_workers.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010-2014  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2014  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -102,10 +102,11 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 
				 		&config->combined_workers[combined_worker_id];
			
 
				 
			
 
				 	combined_worker->worker_size = nworkers;
			
 
				-
			
 
				-	combined_worker->perf_arch.type = config->workers[workerid_array[0]].perf_arch.type;
			
 
				-	combined_worker->perf_arch.devid = config->workers[workerid_array[0]].perf_arch.devid; 
			
 
				-	combined_worker->perf_arch.ncore = nworkers - 1;
			
 
				+	combined_worker->perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
			
 
				+	combined_worker->perf_arch.ndevices = 1;
			
 
				+	combined_worker->perf_arch.devices[0].type = config->workers[workerid_array[0]].perf_arch.devices[0].type;
			
 
				+	combined_worker->perf_arch.devices[0].devid = config->workers[workerid_array[0]].perf_arch.devices[0].devid; 
			
 
				+	combined_worker->perf_arch.devices[0].ncores = nworkers - 1;
			
 
				 	combined_worker->worker_mask = config->workers[workerid_array[0]].worker_mask;
			
 
				 	
			
 
				 #ifdef STARPU_USE_MP
			
--- a/src/core/detect_combined_workers.c
+++ b/src/core/detect_combined_workers.c
@@ -44,7 +44,7 @@ static void find_workers(hwloc_obj_t obj, int cpu_workers[STARPU_NMAXWORKERS], u
 
				 	for(worker = _starpu_worker_list_begin(workers); worker != _starpu_worker_list_end(workers); worker = _starpu_worker_list_next(worker))
			
 
				 	{
			
 
				 		/* is it a CPU worker? */
			
 
				-		if (worker->perf_arch.type == STARPU_CPU_WORKER && worker->perf_arch.ncore == 0)
			
 
				+		if (worker->perf_arch.devices[0].type == STARPU_CPU_WORKER && worker->perf_arch.devices[0].ncores == 0)
			
 
				 		{
			
 
				 			_STARPU_DEBUG("worker %d is part of it\n", worker->workerid);
			
 
				 			/* Add it to the combined worker */
			
@@ -178,7 +178,7 @@ static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers
 
				 	for (i = 0; i < nworkers; i++)
			
 
				 	{
			
 
				 		struct _starpu_worker *worker = _starpu_get_worker_struct(workerids[i]);
			
 
				-		if (worker->perf_arch.type == STARPU_CPU_WORKER && worker->perf_arch.ncore == 0)
			
 
				+		if (worker->perf_arch.devices[0].type == STARPU_CPU_WORKER && worker->perf_arch.devices[0].ncores == 0)
			
 
				 		{
			
 
				 			hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->pu_depth, worker->bindid);
			
 
				 			obj = obj->parent;
			
--- a/src/core/jobs.c
+++ b/src/core/jobs.c
@@ -193,6 +193,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
				 {
			
 
				 	struct starpu_task *task = j->task;
			
 
				 	unsigned sched_ctx = task->sched_ctx;
			
 
				+	int workerid = starpu_worker_get_id();
			
 
				 	double flops = task->flops;
			
 
				 	const unsigned continuation =
			
 
				 #ifdef STARPU_OPENMP
			
@@ -219,12 +220,11 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
				 		 * the callback is not done yet. */
			
 
				 		j->terminated = 1;
			
 
				 	}
			
 
				-
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
			
 
				 
			
 
				+
			
 
				 #ifdef STARPU_USE_SC_HYPERVISOR
			
 
				 	size_t data_size = 0;
			
 
				-	int workerid = starpu_worker_get_id();
			
 
				 #endif //STARPU_USE_SC_HYPERVISOR
			
 
				 
			
 
				 	/* We release handle reference count */
			
@@ -259,6 +259,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
				 		 * implicit dependencies any more.  */
			
 
				 		_starpu_release_task_enforce_sequential_consistency(j);
			
 
				 	}
			
 
				+
			
 
				 	/* Task does not have a cl, but has explicit data dependencies, we need
			
 
				 	 * to tell them that we will not exist any more before notifying the
			
 
				 	 * tasks waiting for us
			
--- a/src/core/perfmodel/perfmodel.c
+++ b/src/core/perfmodel/perfmodel.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009-2014  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -39,7 +39,6 @@
 
				  *	2: models must be calibrated, existing models are overwritten.
			
 
				  */
			
 
				 static unsigned calibrate_flag = 0;
			
 
				-
			
 
				 void _starpu_set_calibrate_flag(unsigned val)
			
 
				 {
			
 
				 	calibrate_flag = val;
			
@@ -50,8 +49,15 @@ unsigned _starpu_get_calibrate_flag(void)
 
				 	return calibrate_flag;
			
 
				 }
			
 
				 
			
 
				-struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid)
			
 
				+struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid, unsigned sched_ctx_id)
			
 
				 {
			
 
				+	if(sched_ctx_id != STARPU_NMAX_SCHED_CTXS)
			
 
				+	{
			
 
				+		unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
			
 
				+		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
			
 
				+			return _starpu_sched_ctx_get_perf_archtype(child_sched_ctx);
			
 
				+	}
			
 
				+
			
 
				 	struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				 
			
 
				 	/* This workerid may either be a basic worker or a combined worker */
			
@@ -60,6 +66,7 @@ struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid)
 
				 	if (workerid < (int)config->topology.nworkers)
			
 
				 		return &config->workers[workerid].perf_arch;
			
 
				 
			
 
				+
			
 
				 	/* We have a combined worker */
			
 
				 	unsigned ncombinedworkers = config->topology.ncombinedworkers;
			
 
				 	STARPU_ASSERT(workerid < (int)(ncombinedworkers + nworkers));
			
@@ -72,10 +79,17 @@ struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid)
 
				 
			
 
				 static double per_arch_task_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, struct starpu_task *task, unsigned nimpl)
			
 
				 {
			
 
				+	int comb;
			
 
				 	double (*per_arch_cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				 
			
 
				-	per_arch_cost_function = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].cost_function;
			
 
				+	comb = starpu_perfmodel_arch_comb_get(arch->ndevices, arch->devices);
			
 
				+	if (comb == -1)
			
 
				+		return NAN;
			
 
				+	if (model->state->per_arch[comb] == NULL)
			
 
				+		// The model has not been executed on this combination
			
 
				+		return NAN;
			
 
				 
			
 
				+	per_arch_cost_function = model->state->per_arch[comb][nimpl].cost_function;
			
 
				 	STARPU_ASSERT_MSG(per_arch_cost_function, "STARPU_PER_ARCH needs per-arch cost_function to be defined");
			
 
				 
			
 
				 	return per_arch_cost_function(task, arch, nimpl);
			
@@ -87,26 +101,23 @@ static double per_arch_task_expected_perf(struct starpu_perfmodel *model, struct
 
				 
			
 
				 double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch* perf_arch)
			
 
				 {
			
 
				-	if (perf_arch->type == STARPU_CPU_WORKER)
			
 
				-	{
			
 
				-		return _STARPU_CPU_ALPHA * (perf_arch->ncore + 1);
			
 
				-	}
			
 
				-	else if (perf_arch->type == STARPU_CUDA_WORKER)
			
 
				-	{
			
 
				-		return _STARPU_CUDA_ALPHA;
			
 
				-	}
			
 
				-	else if (perf_arch->type == STARPU_OPENCL_WORKER)
			
 
				+	double speedup = 0;
			
 
				+	int dev;
			
 
				+	for(dev = 0; dev < perf_arch->ndevices; dev++)
			
 
				 	{
			
 
				-		return _STARPU_OPENCL_ALPHA;
			
 
				+		double coef = 0.0;
			
 
				+		if (perf_arch->devices[dev].type == STARPU_CPU_WORKER)
			
 
				+			coef = _STARPU_CPU_ALPHA;
			
 
				+		else if (perf_arch->devices[dev].type == STARPU_CUDA_WORKER)
			
 
				+			coef = _STARPU_CUDA_ALPHA;
			
 
				+		else if (perf_arch->devices[dev].type == STARPU_OPENCL_WORKER)
			
 
				+			coef = _STARPU_OPENCL_ALPHA;
			
 
				+		else if (perf_arch->devices[dev].type == STARPU_MIC_WORKER)
			
 
				+			coef =  _STARPU_MIC_ALPHA;
			
 
				+
			
 
				+		speedup += coef * (perf_arch->devices[dev].ncores + 1);
			
 
				 	}
			
 
				-	else if (perf_arch->type == STARPU_MIC_WORKER)
			
 
				-	{
			
 
				-		return _STARPU_MIC_ALPHA * (perf_arch->ncore + 1);
			
 
				-	}
			
 
				-	STARPU_ABORT();
			
 
				-
			
 
				-	/* Never reached ! */
			
 
				-	return NAN;
			
 
				+	return speedup == 0 ? NAN : speedup;
			
 
				 }
			
 
				 
			
 
				 static double common_task_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct starpu_task *task, unsigned nimpl)
			
@@ -124,13 +135,18 @@ static double common_task_expected_perf(struct starpu_perfmodel *model, struct s
 
				 	return (exp/alpha);
			
 
				 }
			
 
				 
			
 
				-void _starpu_load_perfmodel(struct starpu_perfmodel *model)
			
 
				+void _starpu_init_and_load_perfmodel(struct starpu_perfmodel *model)
			
 
				 {
			
 
				 	if (!model || model->is_loaded)
			
 
				 		return;
			
 
				 
			
 
				-	int load_model = _starpu_register_model(model);
			
 
				-	if (!load_model)
			
 
				+	starpu_perfmodel_init(NULL, model);
			
 
				+
			
 
				+	// Check if a symbol is defined before trying to load the model from a file
			
 
				+	if (!model->symbol)
			
 
				+		return;
			
 
				+
			
 
				+	if (model->is_loaded)
			
 
				 		return;
			
 
				 
			
 
				 	switch (model->type)
			
@@ -160,30 +176,22 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 
				 {
			
 
				 	if (model)
			
 
				 	{
			
 
				-		if (model->symbol)
			
 
				-			_starpu_load_perfmodel(model);
			
 
				+		_starpu_init_and_load_perfmodel(model);
			
 
				 
			
 
				 		struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
			
 
				 
			
 
				 		switch (model->type)
			
 
				 		{
			
 
				 			case STARPU_PER_ARCH:
			
 
				-
			
 
				 				return per_arch_task_expected_perf(model, arch, task, nimpl);
			
 
				 			case STARPU_COMMON:
			
 
				 				return common_task_expected_perf(model, arch, task, nimpl);
			
 
				-
			
 
				 			case STARPU_HISTORY_BASED:
			
 
				-
			
 
				 				return _starpu_history_based_job_expected_perf(model, arch, j, nimpl);
			
 
				 			case STARPU_REGRESSION_BASED:
			
 
				-
			
 
				 				return _starpu_regression_based_job_expected_perf(model, arch, j, nimpl);
			
 
				-
			
 
				 			case STARPU_NL_REGRESSION_BASED:
			
 
				-
			
 
				 				return _starpu_non_linear_regression_based_job_expected_perf(model, arch, j,nimpl);
			
 
				-
			
 
				 			default:
			
 
				 				STARPU_ABORT();
			
 
				 		}
			
@@ -207,6 +215,8 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 
				 					    struct starpu_perfmodel_arch* arch,
			
 
				 					    unsigned nimpl)
			
 
				 {
			
 
				+	if(arch->ndevices > 1)
			
 
				+		return -1.0;
			
 
				 	unsigned i;
			
 
				 	double sum = 0.0;
			
 
				 	enum starpu_node_kind node_kind;
			
@@ -220,8 +230,8 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 
				 		handle = STARPU_TASK_GET_HANDLE(task, i);
			
 
				 		if (!_starpu_data_is_multiformat_handle(handle))
			
 
				 			continue;
			
 
				-		
			
 
				-		switch(arch->type)
			
 
				+
			
 
				+		switch(arch->devices[0].type)
			
 
				 		{
			
 
				 			case STARPU_CPU_WORKER:
			
 
				 				node_kind = STARPU_CPU_RAM;
			
--- a/src/core/perfmodel/perfmodel.h
+++ b/src/core/perfmodel/perfmodel.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009-2013  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -38,7 +38,20 @@ extern "C"
 
				  * differents versions of StarPU having different performance model
			
 
				  * formats.
			
 
				  */
			
 
				-#define _STARPU_PERFMODEL_VERSION 43
			
 
				+#define _STARPU_PERFMODEL_VERSION 44
			
 
				+
			
 
				+struct _starpu_perfmodel_state
			
 
				+{
			
 
				+	struct starpu_perfmodel_per_arch** per_arch; /*STARPU_MAXIMPLEMENTATIONS*/
			
 
				+	int** per_arch_is_set; /*STARPU_MAXIMPLEMENTATIONS*/
			
 
				+
			
 
				+	starpu_pthread_rwlock_t model_rwlock;
			
 
				+	int *nimpls;
			
 
				+	int *nimpls_set;
			
 
				+	int ncombs;  /* The number of combinations currently used by the model */
			
 
				+	int ncombs_set; /* The number of combinations allocated in the array nimpls and ncombs */
			
 
				+	int *combs;
			
 
				+};
			
 
				 
			
 
				 struct _starpu_perfmodel_list
			
 
				 {
			
@@ -56,11 +69,10 @@ void _starpu_get_perf_model_dir_bus(char *path, size_t maxlen);
 
				 void _starpu_get_perf_model_dir_debug(char *path, size_t maxlen);
			
 
				 
			
 
				 double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
			
 
				-int _starpu_register_model(struct starpu_perfmodel *model);
			
 
				 void _starpu_load_per_arch_based_model(struct starpu_perfmodel *model);
			
 
				 void _starpu_load_common_based_model(struct starpu_perfmodel *model);
			
 
				 void _starpu_load_history_based_model(struct starpu_perfmodel *model, unsigned scan_history);
			
 
				-void _starpu_load_perfmodel(struct starpu_perfmodel *model);
			
 
				+void _starpu_init_and_load_perfmodel(struct starpu_perfmodel *model);
			
 
				 void _starpu_initialize_registered_performance_models(void);
			
 
				 void _starpu_deinitialize_registered_performance_models(void);
			
 
				 void _starpu_deinitialize_performance_model(struct starpu_perfmodel *model);
			
@@ -71,6 +83,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 
				 					struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
			
 
				 void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch,
			
 
				 				unsigned cpuid, double measured, unsigned nimpl);
			
 
				+int _starpu_perfmodel_create_comb_if_needed(struct starpu_perfmodel_arch* arch);
			
 
				 
			
 
				 void _starpu_create_sampling_directory_if_needed(void);
			
 
				 
			
@@ -86,13 +99,16 @@ int *_starpu_get_cuda_affinity_vector(unsigned gpuid);
 
				 int *_starpu_get_opencl_affinity_vector(unsigned gpuid);
			
 
				 #endif
			
 
				 
			
 
				-
			
 
				 void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double bandwidth_read, 
			
 
				 					    double latency_write, double latency_read, unsigned node);
			
 
				 
			
 
				 int _starpu_read_double(FILE *f, char *format, double *val);
			
 
				 void _starpu_simgrid_get_platform_path(char *path, size_t maxlen);
			
 
				 
			
 
				+struct starpu_perfmodel_arch * _starpu_arch_comb_get(int comb);
			
 
				+
			
 
				+void _starpu_perfmodel_realloc(struct starpu_perfmodel *model, int nb);
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 #endif
			
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
--- a/src/core/perfmodel/perfmodel_print.c
+++ b/src/core/perfmodel/perfmodel_print.c
@@ -19,6 +19,7 @@
 
				 #include <starpu.h>
			
 
				 #include <starpu_perfmodel.h>
			
 
				 #include <common/config.h>
			
 
				+#include "perfmodel.h"
			
 
				 
			
 
				 static
			
 
				 void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per_arch_model, char *parameter, uint32_t *footprint, FILE *output)
			
@@ -63,13 +64,16 @@ void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per
 
				 
			
 
				 void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output)
			
 
				 {
			
 
				-	struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
			
 
				+	int comb = starpu_perfmodel_arch_comb_get(arch->ndevices, arch->devices);
			
 
				+	STARPU_ASSERT(comb != -1);
			
 
				+
			
 
				+	struct starpu_perfmodel_per_arch *arch_model = &model->state->per_arch[comb][nimpl];
			
 
				 	char archname[32];
			
 
				 
			
 
				 	if (arch_model->regression.nsample || arch_model->regression.valid || arch_model->regression.nl_valid || arch_model->list)
			
 
				 	{
			
 
				 		starpu_perfmodel_get_arch_name(arch, archname, 32, nimpl);
			
 
				-		fprintf(output, "performance model for %s\n", archname);
			
 
				+		fprintf(output, "# performance model for %s\n", archname);
			
 
				 	}
			
 
				 
			
 
				 	if (parameter == NULL)
			
@@ -170,24 +174,13 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 
				 {
			
 
				 	if (arch == NULL)
			
 
				 	{
			
 
				-		/* display all architectures */
			
 
				-		unsigned archtype, devid, ncore, implid;
			
 
				-		struct starpu_perfmodel_arch perf_arch;
			
 
				-		for (archtype = 0; archtype < STARPU_NARCH; archtype++)
			
 
				+		int comb, impl;
			
 
				+		for(comb = 0; comb < starpu_get_narch_combs(); comb++)
			
 
				 		{
			
 
				-			perf_arch.type = archtype;
			
 
				-			for(devid = 0; model->per_arch[archtype][devid] != NULL; devid++)
			
 
				-			{
			
 
				-				perf_arch.devid = devid;
			
 
				-				for(ncore = 0; model->per_arch[archtype][devid][ncore] != NULL; ncore++)
			
 
				-				{
			
 
				-					perf_arch.ncore = ncore;
			
 
				-					for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				-					{ /* Display all codelets on each arch */
			
 
				-						starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
			
 
				-					}
			
 
				-				}
			
 
				-			}
			
 
				+			struct starpu_perfmodel_arch *arch_comb = _starpu_arch_comb_get(comb);
			
 
				+			int nimpls = model->state ? model->state->nimpls[comb] : 0;
			
 
				+			for(impl = 0; impl < nimpls; impl++)
			
 
				+				starpu_perfmodel_print(model, arch_comb, impl, parameter, footprint, output);
			
 
				 		}
			
 
				 	}
			
 
				 	else
			
@@ -196,11 +189,17 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 
				 		{
			
 
				 			int implid;
			
 
				 			struct starpu_perfmodel_arch perf_arch;
			
 
				-			perf_arch.type = STARPU_CPU_WORKER;
			
 
				-			perf_arch.devid = 0;
			
 
				-			perf_arch.ncore = 0;
			
 
				-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				+			perf_arch.ndevices = 1;
			
 
				+			perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
			
 
				+			perf_arch.devices[0].type = STARPU_CPU_WORKER;
			
 
				+			perf_arch.devices[0].devid = 0;
			
 
				+			perf_arch.devices[0].ncores = 1;
			
 
				+			int comb = starpu_perfmodel_arch_comb_get(perf_arch.ndevices, perf_arch.devices);
			
 
				+			STARPU_ASSERT(comb != -1);
			
 
				+			int nimpls = model->state->nimpls[comb];
			
 
				+			for (implid = 0; implid < nimpls; implid++)
			
 
				 				starpu_perfmodel_print(model, &perf_arch,implid, parameter, footprint, output); /* Display all codelets on cpu */
			
 
				+			free(perf_arch.devices);
			
 
				 			return 0;
			
 
				 		}
			
 
				 
			
@@ -216,28 +215,44 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 
				 
			
 
				 			int implid;
			
 
				 			struct starpu_perfmodel_arch perf_arch;
			
 
				-			perf_arch.type = STARPU_CPU_WORKER;
			
 
				-			perf_arch.devid = 0;
			
 
				-			perf_arch.ncore = k-1;
			
 
				-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				+			perf_arch.ndevices = 1;
			
 
				+			perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
			
 
				+			perf_arch.devices[0].type = STARPU_CPU_WORKER;
			
 
				+			perf_arch.devices[0].devid = 0;
			
 
				+			perf_arch.devices[0].ncores = k-1;
			
 
				+			int comb = starpu_perfmodel_arch_comb_get(perf_arch.ndevices, perf_arch.devices);
			
 
				+			STARPU_ASSERT(comb != -1);
			
 
				+			int nimpls = model->state->nimpls[comb];
			
 
				+
			
 
				+			for (implid = 0; implid < nimpls; implid++)
			
 
				 				starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
			
 
				+			free(perf_arch.devices);
			
 
				 			return 0;
			
 
				 		}
			
 
				 
			
 
				 		if (strcmp(arch, "cuda") == 0)
			
 
				 		{
			
 
				-			unsigned devid;
			
 
				 			int implid;
			
 
				 			struct starpu_perfmodel_arch perf_arch;
			
 
				-			perf_arch.type = STARPU_CUDA_WORKER;
			
 
				-			perf_arch.ncore = 0;
			
 
				 
			
 
				-			for (devid = 0; model->per_arch[STARPU_CUDA_WORKER] != NULL; devid++)
			
 
				+			perf_arch.ndevices = 1;
			
 
				+			perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
			
 
				+			perf_arch.devices[0].type = STARPU_CUDA_WORKER;
			
 
				+			perf_arch.devices[0].ncores = 1;
			
 
				+			int comb;
			
 
				+			for(comb = 0; comb < starpu_get_narch_combs(); comb++)
			
 
				 			{
			
 
				-				perf_arch.devid = devid;
			
 
				-				for (implid = 0; implid <STARPU_MAXIMPLEMENTATIONS; implid ++)
			
 
				-					starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
			
 
				+				struct starpu_perfmodel_arch *arch_comb = _starpu_arch_comb_get(comb);
			
 
				+				if(arch_comb->ndevices == 1 && arch_comb->devices[0].type == STARPU_CUDA_WORKER)
			
 
				+				{
			
 
				+					perf_arch.devices[0].devid = arch_comb->devices[0].devid;
			
 
				+					int nimpls = model->state->nimpls[comb];
			
 
				+
			
 
				+					for (implid = 0; implid < nimpls; implid++)
			
 
				+						starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
			
 
				+				}
			
 
				 			}
			
 
				+			free(perf_arch.devices);
			
 
				 			return 0;
			
 
				 		}
			
 
				 
			
@@ -248,11 +263,19 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 
				 		if (nmatched == 1)
			
 
				 		{
			
 
				 			struct starpu_perfmodel_arch perf_arch;
			
 
				-			perf_arch.type = STARPU_CUDA_WORKER;
			
 
				-			perf_arch.devid = gpuid;
			
 
				-			perf_arch.ncore = 0;
			
 
				+			perf_arch.ndevices = 1;
			
 
				+			perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
			
 
				+
			
 
				+			perf_arch.devices[0].type = STARPU_CUDA_WORKER;
			
 
				+			perf_arch.devices[0].devid = gpuid;
			
 
				+			perf_arch.devices[0].ncores = 1;
			
 
				+
			
 
				+			int comb = starpu_perfmodel_arch_comb_get(perf_arch.ndevices, perf_arch.devices);
			
 
				+			STARPU_ASSERT(comb != -1);
			
 
				+			int nimpls = model->state->nimpls[comb];
			
 
				+
			
 
				 			int implid;
			
 
				-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				+			for (implid = 0; implid < nimpls; implid++)
			
 
				 				starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
			
 
				 			return 0;
			
 
				 		}
			
--- a/src/core/sched_ctx.c
+++ b/src/core/sched_ctx.c
@@ -34,11 +34,13 @@ static size_t data_size[STARPU_NMAX_SCHED_CTXS][STARPU_NMAXWORKERS];
 
				 static unsigned _starpu_get_first_free_sched_ctx(struct _starpu_machine_config *config);
			
 
				 static void _starpu_sched_ctx_add_workers_to_master(unsigned sched_ctx_id, int *workerids, int nworkers, int new_master);
			
 
				 static void _starpu_sched_ctx_wake_these_workers_up(unsigned sched_ctx_id, int *workerids, int nworkers);
			
 
				+static int _starpu_sched_ctx_find_master(unsigned sched_ctx_id, int *workerids, int nworkers);
			
 
				+static void _starpu_sched_ctx_set_master(struct _starpu_sched_ctx *sched_ctx, int *workerids, int nworkers, int master);
			
 
				 
			
 
				 static void _starpu_worker_gets_into_ctx(unsigned sched_ctx_id, struct _starpu_worker *worker)
			
 
				 {
			
 
				 	unsigned ret_sched_ctx = _starpu_sched_ctx_list_get_sched_ctx(worker->sched_ctx_list, sched_ctx_id);
			
 
				-	/* the worker was planning to go away in another ctx but finally he changed his mind & 
			
 
				+	/* the worker was planning to go away in another ctx but finally he changed his mind &
			
 
				 	   he's staying */
			
 
				 	if (ret_sched_ctx == STARPU_NMAX_SCHED_CTXS)
			
 
				 	{
			
@@ -58,13 +60,16 @@ void _starpu_worker_gets_out_of_ctx(unsigned sched_ctx_id, struct _starpu_worker
 
				 	/* remove context from worker */
			
 
				 	if(ret_sched_ctx != STARPU_NMAX_SCHED_CTXS)
			
 
				 	{
			
 
				-		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				-		if(sched_ctx && sched_ctx->sched_policy && sched_ctx->sched_policy->remove_workers)
			
 
				-		{
			
 
				-			_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
			
 
				-			sched_ctx->sched_policy->remove_workers(sched_ctx_id, &worker->workerid, 1);
			
 
				-			_STARPU_TRACE_WORKER_SCHEDULING_POP;
			
 
				-		}
			
 
				+		/* don't remove scheduling data here, there might be tasks running and when post_exec
			
 
				+		   executes scheduling data is not there any more, do it when deleting context, then
			
 
				+		   we really won't need it anymore */
			
 
				+		/* struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id); */
			
 
				+		/* if(sched_ctx && sched_ctx->sched_policy && sched_ctx->sched_policy->remove_workers) */
			
 
				+		/* { */
			
 
				+		/* 	_STARPU_TRACE_WORKER_SCHEDULING_PUSH; */
			
 
				+		/* 	sched_ctx->sched_policy->remove_workers(sched_ctx_id, &worker->workerid, 1); */
			
 
				+		/* 	_STARPU_TRACE_WORKER_SCHEDULING_POP; */
			
 
				+		/* } */
			
 
				 		_starpu_sched_ctx_list_remove(&worker->sched_ctx_list, sched_ctx_id);
			
 
				 		worker->nsched_ctxs--;
			
 
				 	}
			
@@ -143,7 +148,10 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 
				 	int nworkers_to_add = nworkers == -1 ? (int)config->topology.nworkers : nworkers;
			
 
				 	int workers_to_add[nworkers_to_add];
			
 
				 
			
 
				-
			
 
				+	struct starpu_perfmodel_device devices[nworkers_to_add];
			
 
				+	int ndevices = 0;
			
 
				+	struct _starpu_worker *str_worker = NULL;
			
 
				+	int worker;
			
 
				 	int i = 0;
			
 
				 	for(i = 0; i < nworkers_to_add; i++)
			
 
				 	{
			
@@ -151,7 +159,7 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 
				 		/* if the function is called at the creation of the context it's no need to do this verif */
			
 
				 		if(added_workers)
			
 
				 		{
			
 
				-			int worker = workers->add(workers, (workerids == NULL ? i : workerids[i]));
			
 
				+			worker = workers->add(workers, (workerids == NULL ? i : workerids[i]));
			
 
				 			if(worker >= 0)
			
 
				 				added_workers[(*n_added_workers)++] = worker;
			
 
				 			else
			
@@ -169,22 +177,133 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 
				 		}
			
 
				 		else
			
 
				 		{
			
 
				-			int worker = (workerids == NULL ? i : workerids[i]);
			
 
				+			worker = (workerids == NULL ? i : workerids[i]);
			
 
				 			workers->add(workers, worker);
			
 
				 			workers_to_add[i] = worker;
			
 
				-			struct _starpu_worker *str_worker = _starpu_get_worker_struct(worker);
			
 
				+			str_worker = _starpu_get_worker_struct(worker);
			
 
				 			str_worker->tmp_sched_ctx = (int)sched_ctx->id;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	int *wa;
			
 
				+	int na;
			
 
				+	if(added_workers)
			
 
				+	{
			
 
				+		na = *n_added_workers;
			
 
				+		wa = added_workers;
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		na = nworkers_to_add;
			
 
				+		wa = workers_to_add;
			
 
				+	}
			
 
				+
			
 
				+	for(i = 0; i < na; i++)
			
 
				+	{
			
 
				+		worker = wa[i];
			
 
				+		str_worker = _starpu_get_worker_struct(worker);
			
 
				+		int dev1, dev2;
			
 
				+		unsigned found = 0;
			
 
				+		for(dev1 = 0; dev1 < str_worker->perf_arch.ndevices; dev1++)
			
 
				+		{
			
 
				+			for(dev2 = 0; dev2 < ndevices; dev2++)
			
 
				+			{
			
 
				+				if(devices[dev2].type == str_worker->perf_arch.devices[dev1].type &&
			
 
				+				   devices[dev2].devid == str_worker->perf_arch.devices[dev1].devid)
			
 
				+				{
			
 
				+					devices[dev2].ncores += str_worker->perf_arch.devices[dev1].ncores;
			
 
				+					found = 1;
			
 
				+					break;
			
 
				+				}
			
 
				+			}
			
 
				+			if(!found)
			
 
				+			{
			
 
				+				devices[ndevices].type = str_worker->perf_arch.devices[dev1].type;
			
 
				+				devices[ndevices].devid = str_worker->perf_arch.devices[dev1].devid;
			
 
				+				devices[ndevices].ncores = str_worker->perf_arch.devices[dev1].ncores;
			
 
				+				ndevices++;
			
 
				+			}
			
 
				+			else
			
 
				+				found = 0;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	if(ndevices > 0)
			
 
				+	{
			
 
				+
			
 
				+		if(sched_ctx->perf_arch.devices == NULL)
			
 
				+			sched_ctx->perf_arch.devices = (struct starpu_perfmodel_device*)malloc(ndevices*sizeof(struct starpu_perfmodel_device));
			
 
				+		else
			
 
				+		{
			
 
				+			int nfinal_devices = 0;
			
 
				+			int dev1, dev2;
			
 
				+			unsigned found = 0;
			
 
				+			for(dev1 = 0; dev1 < ndevices; dev1++)
			
 
				+			{
			
 
				+				for(dev2 = 0; dev2 < sched_ctx->perf_arch.ndevices; dev2++)
			
 
				+				{
			
 
				+					if(sched_ctx->perf_arch.devices[dev2].type == devices[dev1].type && sched_ctx->perf_arch.devices[dev2].devid == devices[dev1].devid)
			
 
				+						found = 1;
			
 
				+				}
			
 
				+				
			
 
				+				if(!found)
			
 
				+				{
			
 
				+					nfinal_devices++;
			
 
				+				}
			
 
				+				else
			
 
				+					found = 0;
			
 
				+				
			
 
				+			}
			
 
				+
			
 
				+
			
 
				+			int nsize =  (sched_ctx->perf_arch.ndevices+nfinal_devices);
			
 
				+			sched_ctx->perf_arch.devices  = (struct starpu_perfmodel_device*)realloc(sched_ctx->perf_arch.devices, nsize*sizeof(struct starpu_perfmodel_device));
			
 
				+			
			
 
				+		}
			
 
				+
			
 
				+		int dev1, dev2;
			
 
				+		unsigned found = 0;
			
 
				+		for(dev1 = 0; dev1 < ndevices; dev1++)
			
 
				+		{
			
 
				+			for(dev2 = 0; dev2 < sched_ctx->perf_arch.ndevices; dev2++)
			
 
				+			{
			
 
				+				if(sched_ctx->perf_arch.devices[dev2].type == devices[dev1].type && sched_ctx->perf_arch.devices[dev2].devid == devices[dev1].devid)
			
 
				+				{
			
 
				+					if(sched_ctx->perf_arch.devices[dev2].type == STARPU_CPU_WORKER)
			
 
				+						sched_ctx->perf_arch.devices[dev2].ncores += devices[dev1].ncores;
			
 
				+				     
			
 
				+					found = 1;
			
 
				+				}
			
 
				+			}
			
 
				+
			
 
				+			if(!found)
			
 
				+			{
			
 
				+				sched_ctx->perf_arch.devices[sched_ctx->perf_arch.ndevices].type = devices[dev1].type;
			
 
				+				sched_ctx->perf_arch.devices[sched_ctx->perf_arch.ndevices].devid = devices[dev1].devid;
			
 
				+				sched_ctx->perf_arch.devices[sched_ctx->perf_arch.ndevices].ncores = devices[dev1].ncores;
			
 
				+				sched_ctx->perf_arch.ndevices++;
			
 
				+			}
			
 
				+			else
			
 
				+				found = 0;
			
 
				 
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				 	if(!sched_ctx->sched_policy)
			
 
				 	{
			
 
				-		if(sched_ctx->main_master == -1)
			
 
				-			sched_ctx->main_master = starpu_sched_ctx_book_workers_for_task(sched_ctx->id, workerids, nworkers);
			
 
				+		if(!sched_ctx->awake_workers)
			
 
				+		{
			
 
				+			if(sched_ctx->main_master == -1)
			
 
				+				sched_ctx->main_master = starpu_sched_ctx_book_workers_for_task(sched_ctx->id, wa, na);
			
 
				+			else
			
 
				+			{
			
 
				+				_starpu_sched_ctx_add_workers_to_master(sched_ctx->id, wa, na, sched_ctx->main_master);
			
 
				+			}
			
 
				+		}
			
 
				 		else
			
 
				 		{
			
 
				-			_starpu_sched_ctx_add_workers_to_master(sched_ctx->id, workerids, nworkers, sched_ctx->main_master);
			
 
				+			sched_ctx->main_master = _starpu_sched_ctx_find_master(sched_ctx->id, wa, na);
			
 
				+			_starpu_sched_ctx_set_master(sched_ctx, wa, na, sched_ctx->main_master);
			
 
				 		}
			
 
				 	}
			
 
				 	else if(sched_ctx->sched_policy->add_workers)
			
@@ -196,7 +315,9 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 
				 				sched_ctx->sched_policy->add_workers(sched_ctx->id, added_workers, *n_added_workers);
			
 
				 		}
			
 
				 		else
			
 
				+		{
			
 
				 			sched_ctx->sched_policy->add_workers(sched_ctx->id, workers_to_add, nworkers_to_add);
			
 
				+		}
			
 
				 		_STARPU_TRACE_WORKER_SCHEDULING_POP;
			
 
				 	}
			
 
				 	return;
			
@@ -207,8 +328,10 @@ static void _starpu_remove_workers_from_sched_ctx(struct _starpu_sched_ctx *sche
 
				 {
			
 
				 	struct starpu_worker_collection *workers = sched_ctx->workers;
			
 
				 
			
 
				-	int i = 0;
			
 
				+	struct starpu_perfmodel_device devices[workers->nworkers];
			
 
				+	int ndevices = 0;
			
 
				 
			
 
				+	int i = 0;
			
 
				 	for(i = 0; i < nworkers; i++)
			
 
				 	{
			
 
				 		if(workers->nworkers > 0)
			
@@ -222,26 +345,80 @@ static void _starpu_remove_workers_from_sched_ctx(struct _starpu_sched_ctx *sche
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	int worker;
			
 
				+	unsigned found = 0;
			
 
				+	int dev;
			
 
				+	struct starpu_sched_ctx_iterator it;
			
 
				+	if(workers->init_iterator)
			
 
				+		workers->init_iterator(workers, &it);
			
 
				+
			
 
				+	while(workers->has_next(workers, &it))
			
 
				+	{
			
 
				+		worker = workers->get_next(workers, &it);
			
 
				+		struct _starpu_worker *str_worker = _starpu_get_worker_struct(worker);
			
 
				+		for(dev = 0; dev < str_worker->perf_arch.ndevices; dev++)
			
 
				+		{
			
 
				+			int dev2;
			
 
				+			for(dev2 = 0; dev2 < ndevices; dev2++) /* search the devices collected so far */
			
 
				+			{
			
 
				+				if(devices[dev2].type == str_worker->perf_arch.devices[dev].type &&
			
 
				+				   devices[dev2].devid == str_worker->perf_arch.devices[dev].devid)
			
 
				+				{
			
 
				+					if(devices[dev2].type == STARPU_CPU_WORKER)
			
 
				+						devices[dev2].ncores += str_worker->perf_arch.devices[dev].ncores;
			
 
				+					/* only a (type, devid) match means the device is already recorded */
			
 
				+					found = 1;
			
 
				+				}
			
 
				+			}
			
 
				+			if(!found)
			
 
				+			{
			
 
				+				devices[ndevices].type = str_worker->perf_arch.devices[dev].type;
			
 
				+				devices[ndevices].devid = str_worker->perf_arch.devices[dev].devid;
			
 
				+				devices[ndevices].ncores = str_worker->perf_arch.devices[dev].ncores;
			
 
				+				ndevices++;
			
 
				+			}
			
 
				+			else 
			
 
				+				found = 0;
			
 
				+		}
			
 
				+		found = 0;
			
 
				+		
			
 
				+	}
			
 
				+	sched_ctx->perf_arch.ndevices = ndevices;
			
 
				+	for(dev = 0; dev < ndevices; dev++)
			
 
				+	{
			
 
				+		sched_ctx->perf_arch.devices[dev].type = devices[dev].type;
			
 
				+		sched_ctx->perf_arch.devices[dev].devid = devices[dev].devid;
			
 
				+		sched_ctx->perf_arch.devices[dev].ncores = devices[dev].ncores;
			
 
				+	}
			
 
				+
			
 
				 	if(!sched_ctx->sched_policy)
			
 
				-		_starpu_sched_ctx_wake_these_workers_up(sched_ctx->id, removed_workers, *n_removed_workers);
			
 
				+	{
			
 
				+		if(!sched_ctx->awake_workers)
			
 
				+		{
			
 
				+			_starpu_sched_ctx_wake_these_workers_up(sched_ctx->id, removed_workers, *n_removed_workers);
			
 
				+		}
			
 
				+	}
			
 
				 
			
 
				 	return;
			
 
				 }
			
 
				 
			
 
				 static void _starpu_sched_ctx_free_scheduling_data(struct _starpu_sched_ctx *sched_ctx)
			
 
				 {
			
 
				-	int *workerids = NULL;
			
 
				-
			
 
				-	unsigned nworkers_ctx = starpu_sched_ctx_get_workers_list(sched_ctx->id, &workerids);
			
 
				-
			
 
				-	if(nworkers_ctx > 0 && sched_ctx->sched_policy->remove_workers)
			
 
				+	if(sched_ctx->sched_policy && sched_ctx->sched_policy->remove_workers) /* nothing to do without a policy or without its remove_workers hook */
			
 
				 	{
			
 
				-		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
			
 
				-		sched_ctx->sched_policy->remove_workers(sched_ctx->id, workerids, nworkers_ctx);
			
 
				-		_STARPU_TRACE_WORKER_SCHEDULING_POP;
			
 
				+		int *workerids = NULL;
			
 
				+		
			
 
				+		unsigned nworkers_ctx = starpu_sched_ctx_get_workers_list(sched_ctx->id, &workerids);
			
 
				+		
			
 
				+		if(nworkers_ctx > 0) /* call the hook only when the context still lists workers */
			
 
				+		{
			
 
				+			_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
			
 
				+			sched_ctx->sched_policy->remove_workers(sched_ctx->id, workerids, nworkers_ctx);
			
 
				+			_STARPU_TRACE_WORKER_SCHEDULING_POP;
			
 
				+		}
			
 
				+		
			
 
				+		free(workerids); /* the list obtained through starpu_sched_ctx_get_workers_list is released here */
			
 
				 	}
			
 
				-
			
 
				-	free(workerids);
			
 
				 	return;
			
 
				 
			
 
				 }
			
@@ -275,7 +452,7 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 
				 						   int nworkers_ctx, unsigned is_initial_sched,
			
 
				 						   const char *sched_ctx_name,
			
 
				 						   int min_prio_set, int min_prio,
			
 
				-						   int max_prio_set, int max_prio)
			
 
				+						   int max_prio_set, int max_prio, unsigned awake_workers)
			
 
				 {
			
 
				 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
			
 
				 
			
@@ -314,7 +491,9 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 
				 
			
 
				 	sched_ctx->ready_flops = 0.0;
			
 
				 	sched_ctx->main_master = -1;
			
 
				-	
			
 
				+	sched_ctx->perf_arch.devices = NULL;
			
 
				+	sched_ctx->perf_arch.ndevices = 0;
			
 
				+
			
 
				 	int w;
			
 
				 	for(w = 0; w < nworkers; w++)
			
 
				 	{
			
@@ -323,22 +502,24 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 
				 
			
 
				 		STARPU_PTHREAD_COND_INIT(&sched_ctx->parallel_sect_cond[w], NULL);
			
 
				 		STARPU_PTHREAD_MUTEX_INIT(&sched_ctx->parallel_sect_mutex[w], NULL);
			
 
				-		
			
 
				+
			
 
				 		sched_ctx->master[w] = -1;
			
 
				 		sched_ctx->parallel_sect[w] = 0;
			
 
				 		sched_ctx->sleeping[w] = 0;
			
 
				 	}
			
 
				 
			
 
				-	
			
 
				+
			
 
				         /*init the strategy structs and the worker_collection of the ressources of the context */
			
 
				 	if(policy)
			
 
				+	{
			
 
				 		_starpu_init_sched_policy(config, sched_ctx, policy);
			
 
				+		sched_ctx->awake_workers = 1;
			
 
				+	}
			
 
				 	else
			
 
				+	{
			
 
				+		sched_ctx->awake_workers = awake_workers;
			
 
				 		starpu_sched_ctx_create_worker_collection(sched_ctx->id, STARPU_WORKER_LIST);
			
 
				-	
			
 
				-        /* construct the collection of workers(list/tree/etc.) */
			
 
				-	sched_ctx->workers->init(sched_ctx->workers);
			
 
				-
			
 
				+	}
			
 
				 
			
 
				 	/* after having an worker_collection on the ressources add them */
			
 
				 	_starpu_add_workers_to_sched_ctx(sched_ctx, workerids, nworkers_ctx, NULL, NULL);
			
@@ -365,7 +546,7 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 
				 			worker->nsched_ctxs++;
			
 
				 		}
			
 
				 	}
			
 
				-	
			
 
				+
			
 
				 	return sched_ctx;
			
 
				 }
			
 
				 
			
@@ -504,7 +685,7 @@ unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const
 
				 	for(i = 0; i < nw; i++)
			
 
				 		printf("%d ", workers[i]);
			
 
				 	printf("\n");
			
 
				-	sched_ctx = _starpu_create_sched_ctx(selected_policy, workers, nw, 0, sched_ctx_name, 0, 0, 0, 0);
			
 
				+	sched_ctx = _starpu_create_sched_ctx(selected_policy, workers, nw, 0, sched_ctx_name, 0, 0, 0, 0, 1);
			
 
				 	sched_ctx->min_ncpus = min_ncpus;
			
 
				 	sched_ctx->max_ncpus = max_ncpus;
			
 
				 	sched_ctx->min_ngpus = min_ngpus;
			
@@ -533,6 +714,7 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 
				 	struct starpu_sched_policy *sched_policy = NULL;
			
 
				 	unsigned hierarchy_level = 0;
			
 
				 	unsigned nesting_sched_ctx = STARPU_NMAX_SCHED_CTXS;
			
 
				+	unsigned awake_workers = 0;
			
 
				 
			
 
				 	va_start(varg_list, sched_ctx_name);
			
 
				 	while ((arg_type = va_arg(varg_list, int)) != 0)
			
@@ -565,6 +747,10 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 
				 		{
			
 
				 			nesting_sched_ctx = va_arg(varg_list, unsigned);
			
 
				 		}
			
 
				+		else if (arg_type == STARPU_SCHED_CTX_AWAKE_WORKERS)
			
 
				+		{
			
 
				+			awake_workers = 1;
			
 
				+		}
			
 
				 		else
			
 
				 		{
			
 
				 			STARPU_ABORT_MSG("Unrecognized argument %d\n", arg_type);
			
@@ -574,7 +760,7 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 
				 	va_end(varg_list);
			
 
				 
			
 
				 	struct _starpu_sched_ctx *sched_ctx = NULL;
			
 
				-	sched_ctx = _starpu_create_sched_ctx(sched_policy, workerids, nworkers, 0, sched_ctx_name, min_prio_set, min_prio, max_prio_set, max_prio);
			
 
				+	sched_ctx = _starpu_create_sched_ctx(sched_policy, workerids, nworkers, 0, sched_ctx_name, min_prio_set, min_prio, max_prio_set, max_prio, awake_workers);
			
 
				 	sched_ctx->hierarchy_level = hierarchy_level;
			
 
				 	sched_ctx->nesting_sched_ctx = nesting_sched_ctx;
			
 
				 
			
@@ -617,7 +803,16 @@ static void _starpu_delete_sched_ctx(struct _starpu_sched_ctx *sched_ctx)
 
				 		free(sched_ctx->sched_policy);
			
 
				 		sched_ctx->sched_policy = NULL;
			
 
				 	}
			
 
				-	
			
 
				+	else
			
 
				+	{
			
 
				+		starpu_sched_ctx_delete_worker_collection(sched_ctx->id);
			
 
				+	}
			
 
				+
			
 
				+	if (sched_ctx->perf_arch.devices)
			
 
				+	{
			
 
				+		free(sched_ctx->perf_arch.devices);
			
 
				+		sched_ctx->perf_arch.devices = NULL;
			
 
				+	}
			
 
				 
			
 
				 	STARPU_PTHREAD_MUTEX_DESTROY(&sched_ctx->empty_ctx_mutex);
			
 
				 	sched_ctx->id = STARPU_NMAX_SCHED_CTXS;
			
@@ -653,13 +848,13 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 
				 
			
 
				 	int *workerids;
			
 
				 	unsigned nworkers_ctx = starpu_sched_ctx_get_workers_list(sched_ctx->id, &workerids);
			
 
				-	
			
 
				+
			
 
				 	/*if both of them have all the ressources is pointless*/
			
 
				 	/*trying to transfer ressources from one ctx to the other*/
			
 
				 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
			
 
				 	unsigned nworkers = config->topology.nworkers;
			
 
				 
			
 
				-	if(nworkers_ctx > 0 && inheritor_sched_ctx && inheritor_sched_ctx->id != STARPU_NMAX_SCHED_CTXS && 
			
 
				+	if(nworkers_ctx > 0 && inheritor_sched_ctx && inheritor_sched_ctx->id != STARPU_NMAX_SCHED_CTXS &&
			
 
				 	   !(nworkers_ctx == nworkers && nworkers_ctx == inheritor_sched_ctx->workers->nworkers))
			
 
				 	{
			
 
				 		starpu_sched_ctx_add_workers(workerids, nworkers_ctx, inheritor_sched_ctx_id);
			
@@ -674,10 +869,10 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 
				 		/*if btw the mutex release & the mutex lock the context has changed take care to free all
			
 
				 		  scheduling data before deleting the context */
			
 
				 		_starpu_update_workers_without_ctx(workerids, nworkers_ctx, sched_ctx_id, 1);
			
 
				-//		_starpu_sched_ctx_free_scheduling_data(sched_ctx);
			
 
				+		_starpu_sched_ctx_free_scheduling_data(sched_ctx);
			
 
				 		_starpu_delete_sched_ctx(sched_ctx);
			
 
				-
			
 
				 	}
			
 
				+
			
 
				 	STARPU_PTHREAD_RWLOCK_UNLOCK(&changing_ctx_mutex[sched_ctx_id]);
			
 
				 	/* workerids is malloc-ed in starpu_sched_ctx_get_workers_list, don't forget to free it when
			
 
				 	   you don't use it anymore */
			
@@ -734,7 +929,7 @@ void _starpu_fetch_tasks_from_empty_ctx_list(struct _starpu_sched_ctx *sched_ctx
 
				                 /* you're not suppose to get here if you deleted the context
			
 
				 		   so no point in having the mutex locked */
			
 
				 		STARPU_PTHREAD_RWLOCK_UNLOCK(&changing_ctx_mutex[sched_ctx->id]);
			
 
				-	
			
 
				+
			
 
				 	while(!starpu_task_list_empty(&sched_ctx->empty_ctx_tasks))
			
 
				 	{
			
 
				 		if(unlocked)
			
@@ -814,7 +1009,7 @@ void starpu_sched_ctx_add_workers(int *workers_to_add, int nworkers_to_add, unsi
 
				 	if(sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
			
 
				 	{
			
 
				 		_starpu_add_workers_to_sched_ctx(sched_ctx, workers_to_add, nworkers_to_add, added_workers, &n_added_workers);
			
 
				-		
			
 
				+
			
 
				 		if(n_added_workers > 0)
			
 
				 		{
			
 
				 			_starpu_update_workers_with_ctx(added_workers, n_added_workers, sched_ctx->id);
			
@@ -874,13 +1069,13 @@ int _starpu_nworkers_able_to_execute_task(struct starpu_task *task, struct _star
 
				 
			
 
				 	STARPU_PTHREAD_RWLOCK_WRLOCK(&changing_ctx_mutex[sched_ctx->id]);
			
 
				 	struct starpu_worker_collection *workers = sched_ctx->workers;
			
 
				-	
			
 
				+
			
 
				 	struct starpu_sched_ctx_iterator it;
			
 
				 
			
 
				 	workers->init_iterator(workers, &it);
			
 
				-	while(workers->has_next(workers, &it))
			
 
				+	while(workers->has_next_master(workers, &it))
			
 
				 	{
			
 
				-		worker = workers->get_next(workers, &it);
			
 
				+		worker = workers->get_next_master(workers, &it);
			
 
				 		STARPU_ASSERT_MSG(worker < STARPU_NMAXWORKERS, "worker id %d", worker);
			
 
				 		if (starpu_worker_can_execute_task_first_impl(worker, task, NULL))
			
 
				 			nworkers++;
			
@@ -951,7 +1146,7 @@ void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 
				 
			
 
				 				int *workerids = NULL;
			
 
				 				unsigned nworkers = starpu_sched_ctx_get_workers_list(sched_ctx->id, &workerids);
			
 
				-				
			
 
				+
			
 
				 				if(nworkers > 0)
			
 
				 				{
			
 
				 					starpu_sched_ctx_add_workers(workerids, nworkers, sched_ctx->inheritor);
			
@@ -1172,6 +1367,9 @@ struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsig
 
				 
			
 
				 	}
			
 
				 
			
 
				+        /* construct the collection of workers(list/tree/etc.) */
			
 
				+	sched_ctx->workers->init(sched_ctx->workers);
			
 
				+
			
 
				 	return sched_ctx->workers;
			
 
				 }
			
 
				 
			
@@ -1196,6 +1394,7 @@ unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerid
 
				 {
			
 
				 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				 	struct starpu_worker_collection *workers = sched_ctx->workers;
			
 
				+	if(!workers) { *workerids = NULL; return 0; } /* no collection: hand back a NULL list so a caller's free() stays safe */
			
 
				 	*workerids = (int*)malloc(workers->nworkers*sizeof(int));
			
 
				 	int worker;
			
 
				 	unsigned nworkers = 0;
			
@@ -1216,6 +1415,7 @@ void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id)
 
				 	sched_ctx->workers->deinit(sched_ctx->workers);
			
 
				 
			
 
				 	free(sched_ctx->workers);
			
 
				+	sched_ctx->workers = NULL;
			
 
				 }
			
 
				 
			
 
				 struct starpu_worker_collection* starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id)
			
@@ -1351,7 +1551,7 @@ unsigned starpu_sched_ctx_worker_get_id(unsigned sched_ctx_id)
 
				 			return workerid;
			
 
				 	return -1;
			
 
				 }
			
 
				-		 
			
 
				+
			
 
				 unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid)
			
 
				 {
			
 
				 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
			
@@ -1397,7 +1597,7 @@ void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id)
 
				 void _starpu_sched_ctx_post_exec_task_cb(int workerid, struct starpu_task *task, size_t data_size2, uint32_t footprint)
			
 
				 {
			
 
				 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
			
 
				-	if(sched_ctx != NULL && task->sched_ctx != _starpu_get_initial_sched_ctx()->id && 
			
 
				+	if(sched_ctx != NULL && task->sched_ctx != _starpu_get_initial_sched_ctx()->id &&
			
 
				 	   task->sched_ctx != STARPU_NMAX_SCHED_CTXS  && sched_ctx->perf_counters != NULL)
			
 
				 	{
			
 
				 		flops[task->sched_ctx][workerid] += task->flops;
			
@@ -1532,13 +1732,13 @@ unsigned _starpu_sched_ctx_last_worker_awake(struct _starpu_worker *worker)
 
				         for (l = worker->sched_ctx_list; l; l = l->next)
			
 
				         {
			
 
				 		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
			
 
				-		
			
 
				+
			
 
				 		unsigned last_worker_awake = 1;
			
 
				 		struct starpu_worker_collection *workers = sched_ctx->workers;
			
 
				 		struct starpu_sched_ctx_iterator it;
			
 
				 
			
 
				 		int workerid;
			
 
				-		
			
 
				+
			
 
				 		workers->init_iterator(workers, &it);
			
 
				 		while(workers->has_next(workers, &it))
			
 
				 		{
			
@@ -1575,7 +1775,7 @@ void starpu_sched_ctx_bind_current_thread_to_cpuid(unsigned cpuid STARPU_ATTRIBU
 
				 							  config->pu_depth, cpuid);
			
 
				 		hwloc_bitmap_t set = obj->cpuset;
			
 
				 		int ret;
			
 
				-		
			
 
				+
			
 
				 		hwloc_bitmap_singlify(set);
			
 
				 		ret = hwloc_set_cpubind (config->topology.hwtopology, set,
			
 
				 					 HWLOC_CPUBIND_THREAD);
			
@@ -1621,7 +1821,7 @@ unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned
 
				 	struct _starpu_sched_ctx_list *l = NULL;
			
 
				 	struct _starpu_sched_ctx *sched_ctx = NULL;
			
 
				 	for (l = worker->sched_ctx_list; l; l = l->next)
			
 
				-	{ 
			
 
				+	{
			
 
				 		 sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
			
 
				 		if(sched_ctx-> main_master == workerid && sched_ctx->nesting_sched_ctx == sched_ctx_id)
			
 
				 			return sched_ctx->id;
			
@@ -1630,6 +1830,18 @@ unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned
 
				 
			
 
				 }
			
 
				 
			
 
				+struct _starpu_sched_ctx *_starpu_sched_ctx_get_sched_ctx_for_worker_and_job(struct _starpu_worker *worker, struct _starpu_job *j)
			
 
				+{ /* Return the context from worker->sched_ctx_list whose id equals j->task->sched_ctx, or NULL if none does. */
			
 
				+	struct _starpu_sched_ctx_list *l = NULL;
			
 
				+	for (l = worker->sched_ctx_list; l; l = l->next) /* walk the worker's list of context ids */
			
 
				+	{
			
 
				+		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
			
 
				+		if (j->task->sched_ctx == sched_ctx->id)
			
 
				+			return sched_ctx; /* first entry matching the task's context */
			
 
				+	}
			
 
				+	return NULL; /* the task's context is not in this worker's list */
			
 
				+}
			
 
				+
			
 
				 void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double ready_flops)
			
 
				 {
			
 
				         _starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx_id);
			
@@ -1863,7 +2075,7 @@ static int _starpu_sched_ctx_find_master(unsigned sched_ctx_id, int *workerids,
 
				                 if (master > -1)
			
 
				 		{
			
 
				                         int already_seen = 0;
			
 
				-                        //Could create a function for this. Basically searching an element in an array.                                                                                                             
			
 
				+                        //Could create a function for this. Basically searching an element in an array.
			
 
				                         for (i = 0 ; i < npotential_masters; i++)
			
 
				                         {
			
 
				                                 if (potential_masters[i] == master)
			
@@ -1881,7 +2093,7 @@ static int _starpu_sched_ctx_find_master(unsigned sched_ctx_id, int *workerids,
 
				 
			
 
				         for (i = 0 ; i < npotential_masters ; i++) {
			
 
				 		int master_is_in_section = 0;
			
 
				-		//Could create a function for this. Basically searching an element in an array.                                                                                                                     
			
 
				+		//Could create a function for this. Basically searching an element in an array.
			
 
				 		for (w = 0 ; w < nworkers ; w++)
			
 
				 		{
			
 
				 			if (workerids[w] == potential_masters[i])
			
@@ -1917,7 +2129,7 @@ static void _starpu_sched_ctx_add_workers_to_master(unsigned sched_ctx_id, int *
 
				 	int nwake_up = 0;
			
 
				 	int put_to_sleep[nworkers];
			
 
				 	int wake_up[nworkers];
			
 
				-	
			
 
				+
			
 
				 	for(w = 0 ; w < nworkers ; w++)
			
 
				 	{
			
 
				 		int master = sched_ctx->master[workerids[w]];
			
@@ -1934,9 +2146,19 @@ static void _starpu_sched_ctx_add_workers_to_master(unsigned sched_ctx_id, int *
 
				 
			
 
				 }
			
 
				 
			
 
				+static void _starpu_sched_ctx_set_master(struct _starpu_sched_ctx *sched_ctx, int *workerids, int nworkers, int master)
			
 
				+{ /* Record `master` in sched_ctx->master[] for each of the nworkers ids in workerids, except for the master's own id. */
			
 
				+	int i;
			
 
				+	for(i = 0; i < nworkers; i++)
			
 
				+	{
			
 
				+		if(workerids[i] != master) /* the master's own master[] entry is left untouched */
			
 
				+			sched_ctx->master[workerids[i]] = master;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 int starpu_sched_ctx_book_workers_for_task(unsigned sched_ctx_id, int *workerids, int nworkers)
			
 
				-{ 
			
 
				-	int new_master = _starpu_sched_ctx_find_master(sched_ctx_id, workerids, nworkers);	
			
 
				+{
			
 
				+	int new_master = _starpu_sched_ctx_find_master(sched_ctx_id, workerids, nworkers); /* this id is passed to _starpu_sched_ctx_add_workers_to_master and then returned to the caller */
			
 
				 	_starpu_sched_ctx_add_workers_to_master(sched_ctx_id, workerids, nworkers, new_master);
			
 
				 	return new_master;
			
 
				 }
			
@@ -1947,12 +2169,20 @@ void starpu_sched_ctx_unbook_workers_for_task(unsigned sched_ctx_id, int master)
 
				 	_starpu_sched_ctx_wake_up_workers(sched_ctx_id, master);
			
 
				 }
			
 
				 
			
 
				+struct starpu_perfmodel_arch * _starpu_sched_ctx_get_perf_archtype(unsigned sched_ctx_id)
			
 
				+{ /* Return the address of the perf_arch member of the context identified by sched_ctx_id. */
			
 
				+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				+	return &sched_ctx->perf_arch; /* points into the context structure, not at a copy */
			
 
				+}
			
 
				+
			
 
				 int starpu_sched_ctx_get_worker_rank(unsigned sched_ctx_id)
			
 
				 {
			
 
				 	int idx = 0;
			
 
				 	int curr_workerid = starpu_worker_get_id();
			
 
				 	int worker;
			
 
				 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				+	if(sched_ctx->sched_policy || !sched_ctx->awake_workers)
			
 
				+		return -1;
			
 
				 	struct starpu_worker_collection *workers = sched_ctx->workers;
			
 
				 
			
 
				 	struct starpu_sched_ctx_iterator it;
			
--- a/src/core/sched_ctx.h
+++ b/src/core/sched_ctx.h
@@ -150,6 +150,13 @@ struct _starpu_sched_ctx
 
				 	/* ctx nesting the current ctx */
			
 
				 	unsigned nesting_sched_ctx;
			
 
				 
			
 
				+	/* perf model for the device comb of the ctx */
			
 
				+	struct starpu_perfmodel_arch perf_arch;
			
 
				+
			
 
				+	/* for ctxs without policy: flag to indicate that we want to get
			
 
				+	   the threads to sleep in order to replace them with other threads or leave
			
 
				+	   them awake & use them in the parallel code*/
			
 
				+	unsigned awake_workers;
			
 
				 };
			
 
				 
			
 
				 struct _starpu_machine_config;
			
@@ -160,7 +167,7 @@ void _starpu_init_all_sched_ctxs(struct _starpu_machine_config *config);
 
				 /* allocate all structures belonging to a context */
			
 
				 struct _starpu_sched_ctx*  _starpu_create_sched_ctx(struct starpu_sched_policy *policy, int *workerid, int nworkerids, unsigned is_init_sched, const char *sched_name,
			
 
				 						    int min_prio_set, int min_prio,
			
 
				-						    int max_prio_set, int max_prio);
			
 
				+						    int max_prio_set, int max_prio, unsigned awake_workers);
			
 
				 
			
 
				 /* delete all sched_ctx */
			
 
				 void _starpu_delete_all_sched_ctxs();
			
@@ -224,10 +231,14 @@ void _starpu_fetch_tasks_from_empty_ctx_list(struct _starpu_sched_ctx *sched_ctx
 
				 
			
 
				 unsigned _starpu_sched_ctx_allow_hypervisor(unsigned sched_ctx_id);
			
 
				 
			
 
				+struct starpu_perfmodel_arch * _starpu_sched_ctx_get_perf_archtype(unsigned sched_ctx);
			
 
				 #ifdef STARPU_USE_SC_HYPERVISOR
			
 
				 /* Notifies the hypervisor that a tasks was poped from the workers' list */
			
 
				 void _starpu_sched_ctx_post_exec_task_cb(int workerid, struct starpu_task *task, size_t data_size, uint32_t footprint);
			
 
				 
			
 
				 #endif //STARPU_USE_SC_HYPERVISOR
			
 
				 
			
 
				+/* if the worker is the master of a parallel context, and the job is meant to be executed on this parallel context, return a pointer to the context */
			
 
				+struct _starpu_sched_ctx *_starpu_sched_ctx_get_sched_ctx_for_worker_and_job(struct _starpu_worker *worker, struct _starpu_job *j);
			
 
				+
			
 
				 #endif // __SCHED_CONTEXT_H__
			
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -438,7 +438,7 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 
				 
			
 
				 	_starpu_profiling_set_task_push_start_time(task);
			
 
				 
			
 
				-	int ret;
			
 
				+	int ret = 0;
			
 
				 	if (STARPU_UNLIKELY(task->execute_on_a_specific_worker))
			
 
				 	{
			
 
				 		unsigned node = starpu_worker_get_memory_node(task->workerid);
			
@@ -469,7 +469,36 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 
				 
			
 
				 		if(!sched_ctx->sched_policy)
			
 
				 		{
			
 
				-			ret = _starpu_push_task_on_specific_worker(task, sched_ctx->main_master);
			
 
				+			if(!sched_ctx->awake_workers)
			
 
				+				ret = _starpu_push_task_on_specific_worker(task, sched_ctx->main_master);
			
 
				+			else
			
 
				+			{
			
 
				+				struct starpu_worker_collection *workers = sched_ctx->workers;
			
 
				+				
			
 
				+				struct _starpu_job *job = _starpu_get_job_associated_to_task(task);
			
 
				+				job->task_size = workers->nworkers;
			
 
				+				job->combined_workerid = -1; // workerid; its a ctx not combined worker
			
 
				+				job->active_task_alias_count = 0;
			
 
				+
			
 
				+				STARPU_PTHREAD_BARRIER_INIT(&job->before_work_barrier, NULL, workers->nworkers);
			
 
				+				STARPU_PTHREAD_BARRIER_INIT(&job->after_work_barrier, NULL, workers->nworkers);
			
 
				+				
			
 
				+				/* Note: we have to call that early, or else the task may have
			
 
				+				 * disappeared already */
			
 
				+				starpu_push_task_end(task);
			
 
				+
			
 
				+				unsigned workerid;
			
 
				+				struct starpu_sched_ctx_iterator it;
			
 
				+				if(workers->init_iterator)
			
 
				+					workers->init_iterator(workers, &it);
			
 
				+
			
 
				+				while(workers->has_next(workers, &it))
			
 
				+				{
			
 
				+					workerid = workers->get_next(workers, &it);
			
 
				+					struct starpu_task *alias = starpu_task_dup(task);
			
 
				+					ret |= _starpu_push_task_on_specific_worker(alias, workerid);
			
 
				+				}
			
 
				+			}
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
--- a/src/core/task.c
+++ b/src/core/task.c
@@ -288,10 +288,13 @@ int _starpu_submit_job(struct _starpu_job *j)
 
				 	   && sched_ctx->perf_counters != NULL)
			
 
				 	{
			
 
				 		struct starpu_perfmodel_arch arch;
			
 
				-		arch.type = STARPU_CPU_WORKER;
			
 
				-		arch.devid = 0;
			
 
				-		arch.ncore = 0;
			
 
				+		arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
			
 
				+		arch.ndevices = 1;
			
 
				+		arch.devices[0].type = STARPU_CPU_WORKER;
			
 
				+		arch.devices[0].devid = 0;
			
 
				+		arch.devices[0].ncores = 1;
			
 
				 		_starpu_compute_buffers_footprint(j->task->cl->model, &arch, 0, j);
			
 
				+		free(arch.devices);
			
 
				 		int i;
			
 
				 		size_t data_size = 0;
			
 
				 		if (j->task->cl)
			
@@ -543,11 +546,11 @@ int starpu_task_submit(struct starpu_task *task)
 
				 			_starpu_detect_implicit_data_deps(task);
			
 
				 		}
			
 
				 
			
 
				-		if (task->cl->model && task->cl->model->symbol)
			
 
				-			_starpu_load_perfmodel(task->cl->model);
			
 
				+		if (task->cl->model)
			
 
				+			_starpu_init_and_load_perfmodel(task->cl->model);
			
 
				 
			
 
				-		if (task->cl->power_model && task->cl->power_model->symbol)
			
 
				-			_starpu_load_perfmodel(task->cl->power_model);
			
 
				+		if (task->cl->power_model)
			
 
				+			_starpu_init_and_load_perfmodel(task->cl->power_model);
			
 
				 	}
			
 
				 
			
 
				 	if (bundle)
			
@@ -562,11 +565,11 @@ int starpu_task_submit(struct starpu_task *task)
 
				 
			
 
				 		while (entry)
			
 
				 		{
			
 
				-			if (entry->task->cl->model && entry->task->cl->model->symbol)
			
 
				-				_starpu_load_perfmodel(entry->task->cl->model);
			
 
				+			if (entry->task->cl->model)
			
 
				+				_starpu_init_and_load_perfmodel(entry->task->cl->model);
			
 
				 
			
 
				-			if (entry->task->cl->power_model && entry->task->cl->power_model->symbol)
			
 
				-				_starpu_load_perfmodel(entry->task->cl->power_model);
			
 
				+			if (entry->task->cl->power_model)
			
 
				+				_starpu_init_and_load_perfmodel(entry->task->cl->power_model);
			
 
				 
			
 
				 			entry = entry->next;
			
 
				 		}
			
@@ -626,10 +629,10 @@ int _starpu_task_submit_nodeps(struct starpu_task *task)
 
				 	if (task->cl)
			
 
				 	{
			
 
				 		if (task->cl->model)
			
 
				-			_starpu_load_perfmodel(task->cl->model);
			
 
				+			_starpu_init_and_load_perfmodel(task->cl->model);
			
 
				 
			
 
				 		if (task->cl->power_model)
			
 
				-			_starpu_load_perfmodel(task->cl->power_model);
			
 
				+			_starpu_init_and_load_perfmodel(task->cl->power_model);
			
 
				 	}
			
 
				 
			
 
				 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
			
@@ -687,10 +690,10 @@ int _starpu_task_submit_conversion_task(struct starpu_task *task,
 
				 
			
 
				 	/* We should factorize that */
			
 
				 	if (task->cl->model)
			
 
				-		_starpu_load_perfmodel(task->cl->model);
			
 
				+		_starpu_init_and_load_perfmodel(task->cl->model);
			
 
				 
			
 
				 	if (task->cl->power_model)
			
 
				-		_starpu_load_perfmodel(task->cl->power_model);
			
 
				+		_starpu_init_and_load_perfmodel(task->cl->power_model);
			
 
				 
			
 
				 	/* We retain handle reference count */
			
 
				 	unsigned i;
			
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -334,7 +334,6 @@ _starpu_init_mic_topology (struct _starpu_machine_config *config, long mic_idx)
 
				 	topology->nhwmiccores[mic_idx] = nbcores;
			
 
				 }
			
 
				 
			
 
				-
			
 
				 static int
			
 
				 _starpu_init_mic_node (struct _starpu_machine_config *config, int mic_idx,
			
 
				 		       COIENGINE *coi_handle, COIPROCESS *coi_process)
			
@@ -403,8 +402,6 @@ _starpu_init_mic_node (struct _starpu_machine_config *config, int mic_idx,
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-
			
 
				-
			
 
				 static void
			
 
				 _starpu_init_topology (struct _starpu_machine_config *config)
			
 
				 {
			
@@ -732,7 +729,6 @@ _starpu_init_mic_config (struct _starpu_machine_config *config,
 
				 	topology->nworkers += topology->nmiccores[mic_idx];
			
 
				     }
			
 
				 
			
 
				-
			
 
				 #ifdef STARPU_USE_MIC
			
 
				 static COIENGINE handles[2];
			
 
				 static COIPROCESS process[2];
			
@@ -870,12 +866,15 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 
				 		for (i = 0; i < nworker_per_cuda; i++)
			
 
				 		{
			
 
				 			int worker_idx = topology->nworkers + cudagpu * nworker_per_cuda + i;
			
 
				+
			
 
				 			config->workers[worker_idx].arch = STARPU_CUDA_WORKER;
			
 
				-			config->workers[worker_idx].perf_arch.type = STARPU_CUDA_WORKER;
			
 
				-			config->workers[worker_idx].perf_arch.devid = devid;
			
 
				+			config->workers[worker_idx].perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
			
 
				+			config->workers[worker_idx].perf_arch.ndevices = 1;
			
 
				+			config->workers[worker_idx].perf_arch.devices[0].type = STARPU_CUDA_WORKER;
			
 
				+			config->workers[worker_idx].perf_arch.devices[0].devid = devid;
			
 
				 			// TODO: fix perfmodels etc.
			
 
				 			//config->workers[worker_idx].perf_arch.ncore = nworker_per_cuda - 1;
			
 
				-			config->workers[worker_idx].perf_arch.ncore = 0;
			
 
				+			config->workers[worker_idx].perf_arch.devices[0].ncores = 1;
			
 
				 			config->workers[worker_idx].devid = devid;
			
 
				 			config->workers[worker_idx].subworkerid = i;
			
 
				 			config->workers[worker_idx].worker_mask = STARPU_CUDA;
			
@@ -948,9 +947,11 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 
				 			break;
			
 
				 		}
			
 
				 		config->workers[worker_idx].arch = STARPU_OPENCL_WORKER;
			
 
				-		config->workers[worker_idx].perf_arch.type = STARPU_OPENCL_WORKER;
			
 
				-		config->workers[worker_idx].perf_arch.devid = devid;
			
 
				-		config->workers[worker_idx].perf_arch.ncore = 0;
			
 
				+		config->workers[worker_idx].perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
			
 
				+		config->workers[worker_idx].perf_arch.ndevices = 1;
			
 
				+		config->workers[worker_idx].perf_arch.devices[0].type = STARPU_OPENCL_WORKER;
			
 
				+		config->workers[worker_idx].perf_arch.devices[0].devid = devid;
			
 
				+		config->workers[worker_idx].perf_arch.devices[0].ncores = 1;
			
 
				 		config->workers[worker_idx].subworkerid = 0;
			
 
				 		config->workers[worker_idx].devid = devid;
			
 
				 		config->workers[worker_idx].worker_mask = STARPU_OPENCL;
			
@@ -1010,9 +1011,12 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 
				 	{
			
 
				 		config->workers[topology->nworkers + sccdev].arch = STARPU_SCC_WORKER;
			
 
				 		int devid = _starpu_get_next_scc_deviceid(config);
			
 
				-		config->workers[topology->nworkers + sccdev].perf_arch.type = STARPU_SCC_WORKER;
			
 
				-		config->workers[topology->nworkers + sccdev].perf_arch.devid = sccdev;
			
 
				-		config->workers[topology->nworkers + sccdev].perf_arch.ncore = 0;
			
 
				+		config->workers[topology->nworkers + sccdev].perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
			
 
				+		config->workers[topology->nworkers + sccdev].perf_arch.ndevices = 1;
			
 
				+
			
 
				+		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].type = STARPU_SCC_WORKER;
			
 
				+		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].devid = sccdev;
			
 
				+		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].ncores = 1;
			
 
				 		config->workers[topology->nworkers + sccdev].subworkerid = 0;
			
 
				 		config->workers[topology->nworkers + sccdev].devid = devid;
			
 
				 		config->workers[topology->nworkers + sccdev].worker_mask = STARPU_SCC;
			
@@ -1076,9 +1080,11 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 
				 	{
			
 
				 		int worker_idx = topology->nworkers + cpu;
			
 
				 		config->workers[worker_idx].arch = STARPU_CPU_WORKER;
			
 
				-		config->workers[worker_idx].perf_arch.type = STARPU_CPU_WORKER;
			
 
				-		config->workers[worker_idx].perf_arch.devid = 0;
			
 
				-		config->workers[worker_idx].perf_arch.ncore = 0;
			
 
				+		config->workers[worker_idx].perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
			
 
				+		config->workers[worker_idx].perf_arch.ndevices = 1;
			
 
				+		config->workers[worker_idx].perf_arch.devices[0].type = STARPU_CPU_WORKER;
			
 
				+		config->workers[worker_idx].perf_arch.devices[0].devid = 0;
			
 
				+		config->workers[worker_idx].perf_arch.devices[0].ncores = 1;
			
 
				 		config->workers[worker_idx].subworkerid = 0;
			
 
				 		config->workers[worker_idx].devid = cpu;
			
 
				 		config->workers[worker_idx].worker_mask = STARPU_CPU;
			
@@ -1096,8 +1102,6 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-
			
 
				-
			
 
				 void
			
 
				 _starpu_bind_thread_on_cpu (
			
 
				 	struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED,
			
@@ -1168,7 +1172,6 @@ _starpu_bind_thread_on_cpu (
 
				 #endif
			
 
				 }
			
 
				 
			
 
				-
			
 
				 void
			
 
				 _starpu_bind_thread_on_cpus (
			
 
				 	struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED,
			
@@ -1211,7 +1214,6 @@ _starpu_bind_thread_on_cpus (
 
				 #endif
			
 
				 }
			
 
				 
			
 
				-
			
 
				 static void
			
 
				 _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_config STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
@@ -1467,7 +1469,6 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 	}
			
 
				 }
			
 
				 
			
 
				-
			
 
				 int
			
 
				 _starpu_build_topology (struct _starpu_machine_config *config, int no_mp_config)
			
 
				 {
			
@@ -1544,8 +1545,9 @@ _starpu_destroy_topology (
 
				 	unsigned worker;
			
 
				 	for (worker = 0; worker < config->topology.nworkers; worker++)
			
 
				 	{
			
 
				-#ifdef STARPU_HAVE_HWLOC
			
 
				 		struct _starpu_worker *workerarg = &config->workers[worker];
			
 
				+		free(workerarg->perf_arch.devices);
			
 
				+#ifdef STARPU_HAVE_HWLOC
			
 
				 		hwloc_bitmap_free(workerarg->hwloc_cpu_set);
			
 
				 		if (workerarg->bindid != -1)
			
 
				 		{
			
@@ -1561,6 +1563,13 @@ _starpu_destroy_topology (
 
				 #endif
			
 
				 	}
			
 
				 
			
 
				+	unsigned combined_worker_id;
			
 
				+	for(combined_worker_id=0 ; combined_worker_id < config->topology.ncombinedworkers ; combined_worker_id++)
			
 
				+	{
			
 
				+		struct _starpu_combined_worker *combined_worker = &config->combined_workers[combined_worker_id];
			
 
				+		free(combined_worker->perf_arch.devices);
			
 
				+	}
			
 
				+
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				 	hwloc_topology_destroy(config->topology.hwtopology);
			
 
				 #endif
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -287,7 +287,15 @@ static inline int _starpu_can_use_nth_implementation(enum starpu_worker_archtype
 
				 int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
			
 
				 {
			
 
				 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
			
 
				-	if(sched_ctx->parallel_sect[workerid]) return 0;
			
 
				+
			
 
				+	/* if the task can't be parallel don't submit it to a ctx */
			
 
				+	unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx->id);
			
 
				+        if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
			
 
				+		if(!task->possibly_parallel) return 0;
			
 
				+
			
 
				+	/* if the worker is blocked in a parallel ctx don't submit tasks on it */
			
 
				+	if(sched_ctx->parallel_sect[workerid] ) return 0;
			
 
				+
			
 
				 	/* TODO: check that the task operand sizes will fit on that device */
			
 
				 	return (task->cl->where & config.workers[workerid].worker_mask) &&
			
 
				 		_starpu_can_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl) &&
			
@@ -1192,7 +1200,7 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
				 	if (!is_a_sink)
			
 
				 	{
			
 
				 		struct starpu_sched_policy *selected_policy = _starpu_select_sched_policy(&config, config.conf->sched_policy_name);
			
 
				-		_starpu_create_sched_ctx(selected_policy, NULL, -1, 1, "init", 0, 0, 0, 0);
			
 
				+		_starpu_create_sched_ctx(selected_policy, NULL, -1, 1, "init", 0, 0, 0, 0, 1);
			
 
				 	}
			
 
				 
			
 
				 	_starpu_initialize_registered_performance_models();
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -26,6 +26,7 @@
 
				 #include <math.h>
			
 
				 #include <core/task.h>
			
 
				 #include <starpu_scheduler.h>
			
 
				+#include <core/workers.h>
			
 
				 
			
 
				 static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node);
			
 
				 int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
			
--- a/src/datawizard/footprint.c
+++ b/src/datawizard/footprint.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010-2011, 2013-2014  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -38,7 +38,7 @@ uint32_t starpu_task_data_footprint(struct starpu_task *task)
 
				 	return footprint;
			
 
				 }
			
 
				 
			
 
				-uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned nimpl, struct _starpu_job *j)
			
 
				+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, struct _starpu_job *j)
			
 
				 {
			
 
				 	if (j->footprint_is_computed)
			
 
				 		return j->footprint;
			
@@ -47,27 +47,30 @@ uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struc
 
				 
			
 
				 	struct starpu_task *task = j->task;
			
 
				 
			
 
				-	if (model != NULL && model->footprint != NULL)
			
 
				+	if (model)
			
 
				 	{
			
 
				-		footprint = model->footprint(task);
			
 
				-	}
			
 
				-	else if (model != NULL && model->per_arch &&
			
 
				-			model->per_arch[arch->type] != NULL &&
			
 
				-			model->per_arch[arch->type][arch->devid] != NULL &&
			
 
				-			model->per_arch[arch->type][arch->devid][arch->ncore] != NULL &&
			
 
				-			model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base)
			
 
				-	{
			
 
				-		size_t size = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base(task, arch, nimpl);
			
 
				-		footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
			
 
				-	}
			
 
				-	else if (model && model->size_base)
			
 
				-	{
			
 
				-		size_t size = model->size_base(task, nimpl);
			
 
				-		footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		footprint = starpu_task_data_footprint(task);
			
 
				+		if (model->footprint)
			
 
				+		{
			
 
				+			footprint = model->footprint(task);
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			struct starpu_perfmodel_per_arch *per_arch = starpu_perfmodel_get_model_per_arch(model, arch, nimpl);
			
 
				+			if (per_arch != NULL && per_arch->size_base)
			
 
				+			{
			
 
				+				size_t size = per_arch->size_base(task, arch, nimpl);
			
 
				+				footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
			
 
				+			}
			
 
				+			else if (model->size_base)
			
 
				+			{
			
 
				+				size_t size = model->size_base(task, nimpl);
			
 
				+				footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
			
 
				+			}
			
 
				+			else
			
 
				+			{
			
 
				+				footprint = starpu_task_data_footprint(task);
			
 
				+			}
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	j->footprint = footprint;
			
--- a/src/debug/traces/starpu_fxt.c
+++ b/src/debug/traces/starpu_fxt.c
@@ -341,7 +341,7 @@ static void thread_pop_state(double time, const char *prefix, long unsigned int
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
			
 
				-static void worker_set_detailed_state(double time, const char *prefix, long unsigned int workerid, const char *name, unsigned long size, const char *parameters, unsigned long footprint, unsigned long long tag)
			
 
				+static void worker_set_detailed_state(double time, const char *prefix, long unsigned int workerid, const char *name, unsigned long size, const char *parameters, unsigned long footprint, unsigned long long tag, unsigned long job_id)
			
 
				 {
			
 
				 #ifdef STARPU_HAVE_POTI
			
 
				 	char container[STARPU_POTI_STR_LEN];
			
@@ -349,7 +349,7 @@ static void worker_set_detailed_state(double time, const char *prefix, long unsi
 
				 	/* TODO: set detailed state */
			
 
				 	poti_SetState(time, container, "WS", name);
			
 
				 #else
			
 
				-	fprintf(out_paje_file, "20	%.9f	%sw%lu	WS	%s	%lu	%s	%08lx	%016llx\n", time, prefix, workerid, name, size, parameters, footprint, tag);
			
 
				+	fprintf(out_paje_file, "20	%.9f	%sw%lu	WS	%s	%lu	%s	%08lx	%016llx	%lu\n", time, prefix, workerid, name, size, parameters, footprint, tag, job_id);
			
 
				 #endif
			
 
				 }
			
 
				 #endif
			
@@ -422,6 +422,8 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
				 
			
 
				 	char *kindstr = "";
			
 
				 	struct starpu_perfmodel_arch arch;
			
 
				+	arch.ndevices = 1;
			
 
				+	arch.devices = (struct starpu_perfmodel_device *)malloc(sizeof(struct starpu_perfmodel_device));
			
 
				 
			
 
				 	switch (ev->param[0])
			
 
				 	{
			
@@ -432,37 +434,37 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
				 		case _STARPU_FUT_CPU_KEY:
			
 
				 			set_next_cpu_worker_color(workerid);
			
 
				 			kindstr = "CPU";
			
 
				-			arch.type = STARPU_CPU_WORKER;
			
 
				-			arch.devid = 0;
			
 
				-			arch.ncore = 0;
			
 
				+			arch.devices[0].type = STARPU_CPU_WORKER;
			
 
				+			arch.devices[0].devid = 0;
			
 
				+			arch.devices[0].ncores = 1;
			
 
				 			break;
			
 
				 		case _STARPU_FUT_CUDA_KEY:
			
 
				 			set_next_cuda_worker_color(workerid);
			
 
				 			kindstr = "CUDA";
			
 
				-			arch.type = STARPU_CUDA_WORKER;
			
 
				-			arch.devid = devid;
			
 
				-			arch.ncore = 0;
			
 
				+			arch.devices[0].type = STARPU_CUDA_WORKER;
			
 
				+			arch.devices[0].devid = devid;
			
 
				+			arch.devices[0].ncores = 1;
			
 
				 			break;
			
 
				 		case _STARPU_FUT_OPENCL_KEY:
			
 
				 			set_next_opencl_worker_color(workerid);
			
 
				 			kindstr = "OPENCL";
			
 
				-			arch.type = STARPU_OPENCL_WORKER;
			
 
				-			arch.devid = devid;
			
 
				-			arch.ncore = 0;
			
 
				+			arch.devices[0].type = STARPU_OPENCL_WORKER;
			
 
				+			arch.devices[0].devid = devid;
			
 
				+			arch.devices[0].ncores = 1;
			
 
				 			break;
			
 
				 		case _STARPU_FUT_MIC_KEY:
			
 
				 			set_next_mic_worker_color(workerid);
			
 
				 			kindstr = "mic";
			
 
				-			arch.type = STARPU_MIC_WORKER;
			
 
				-			arch.devid = devid;
			
 
				-			arch.ncore = 0;
			
 
				+			arch.devices[0].type = STARPU_MIC_WORKER;
			
 
				+			arch.devices[0].devid = devid;
			
 
				+			arch.devices[0].ncores = 1;
			
 
				 			break;
			
 
				 		case _STARPU_FUT_SCC_KEY:
			
 
				 			set_next_scc_worker_color(workerid);
			
 
				 			kindstr = "scc";
			
 
				-			arch.type = STARPU_SCC_WORKER;
			
 
				-			arch.devid = devid;
			
 
				-			arch.ncore = 0;
			
 
				+			arch.devices[0].type = STARPU_SCC_WORKER;
			
 
				+			arch.devices[0].devid = devid;
			
 
				+			arch.devices[0].ncores = 1;
			
 
				 			break;
			
 
				 		default:
			
 
				 			STARPU_ABORT();
			
@@ -498,7 +500,7 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
				 		thread_set_state(get_event_time_stamp(ev, options), prefix, threadid, "I");
			
 
				 
			
 
				 	if (activity_file)
			
 
				-	fprintf(activity_file, "name\t%d\t%s %d\n", workerid, kindstr, devid);
			
 
				+		fprintf(activity_file, "name\t%d\t%s %d\n", workerid, kindstr, devid);
			
 
				 
			
 
				 	snprintf(options->worker_names[workerid], 256, "%s %d", kindstr, devid);
			
 
				 	options->worker_archtypes[workerid] = arch;
			
@@ -749,6 +751,7 @@ static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 
				 {
			
 
				 #ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
			
 
				 	int worker = ev->param[5];
			
 
				+	unsigned long job_id = ev->param[6];
			
 
				 
			
 
				 	unsigned sched_ctx = ev->param[1];
			
 
				 	if (worker < 0) return;
			
@@ -760,12 +763,15 @@ static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 
				 		int i;
			
 
				 		char parameters[256];
			
 
				 		size_t eaten = 0;
			
 
				+		if (!last_codelet_parameter[worker])
			
 
				+			eaten += snprintf(parameters + eaten, sizeof(parameters) - eaten, "nodata");
			
 
				+		else
			
 
				 		for (i = 0; i < last_codelet_parameter[worker] && i < MAX_PARAMETERS; i++)
			
 
				 		{
			
 
				 			eaten += snprintf(parameters + eaten, sizeof(parameters) - eaten, "%s%s", i?"_":"", last_codelet_parameter_description[worker][i]);
			
 
				 		}
			
 
				 
			
 
				-		worker_set_detailed_state(last_codelet_start[worker], prefix, ev->param[5], last_codelet_symbol[worker], ev->param[2], parameters, ev->param[3], ev->param[4]);
			
 
				+		worker_set_detailed_state(last_codelet_start[worker], prefix, worker, last_codelet_symbol[worker], ev->param[2], parameters, ev->param[3], ev->param[4], job_id);
			
 
				 		if (sched_ctx != 0)
			
 
				 		{
			
 
				 #ifdef STARPU_HAVE_POTI
			
@@ -775,7 +781,7 @@ static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 
				 			worker_container_alias(container, STARPU_POTI_STR_LEN, prefix, ev->param[5]);
			
 
				 			poti_SetState(last_codelet_start[worker], container, ctx, last_codelet_symbol[worker]);
			
 
				 #else
			
 
				-			fprintf(out_paje_file, "20	%.9f	%sw%"PRIu64"	Ctx%d	%s	%08lx	%lu	%016llx\n", last_codelet_start[worker], prefix, ev->param[2], sched_ctx, last_codelet_symbol[worker], (unsigned long) ev->param[2], (unsigned long) ev->param[3], (unsigned long long) ev->param[4]);
			
 
				+			fprintf(out_paje_file, "20	%.9f	%sw%"PRIu64"	Ctx%d	%s	%lu	%s	%08lx	%016llx	%lu\n", last_codelet_start[worker], prefix, ev->param[2], sched_ctx, last_codelet_symbol[worker], (unsigned long) ev->param[2], parameters, (unsigned long) ev->param[3], (unsigned long long) ev->param[4], job_id);
			
 
				 #endif
			
 
				 		}
			
 
				 	}
			
@@ -787,7 +793,7 @@ static struct starpu_fxt_codelet_event *dumped_codelets;
 
				 
			
 
				 static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
			
 
				 {
			
 
				-	int worker = ev->param[6];
			
 
				+	int worker = ev->param[3];
			
 
				 	if (worker < 0) return;
			
 
				 
			
 
				 	char *prefix = options->file_prefix;
			
@@ -798,15 +804,15 @@ static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 
				 	uint32_t codelet_hash = ev->param[2];
			
 
				 
			
 
				 	if (out_paje_file)
			
 
				-		worker_set_state(end_codelet_time, prefix, ev->param[6], "I");
			
 
				+		worker_set_state(end_codelet_time, prefix, worker, "I");
			
 
				 
			
 
				 	double codelet_length = (end_codelet_time - last_codelet_start[worker]);
			
 
				 
			
 
				 	update_accumulated_time(worker, 0.0, codelet_length, end_codelet_time, 0);
			
 
				 
			
 
				 	if (distrib_time)
			
 
				-	fprintf(distrib_time, "%s\t%s%d\t%ld\t%"PRIx32"\t%.9f\n", last_codelet_symbol[worker],
			
 
				-			prefix, worker, (unsigned long) codelet_size, codelet_hash, codelet_length);
			
 
				+	     fprintf(distrib_time, "%s\t%s%d\t%ld\t%"PRIx32"\t%.9f\n", last_codelet_symbol[worker],
			
 
				+		     prefix, worker, (unsigned long) codelet_size, codelet_hash, codelet_length);
			
 
				 
			
 
				 	if (options->dumped_codelets)
			
 
				 	{
			
@@ -815,10 +821,7 @@ static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 
				 
			
 
				 		snprintf(dumped_codelets[dumped_codelets_count - 1].symbol, 256, "%s", last_codelet_symbol[worker]);
			
 
				 		dumped_codelets[dumped_codelets_count - 1].workerid = worker;
			
 
				-		dumped_codelets[dumped_codelets_count - 1].arch.type = ev->param[3];
			
 
				-		dumped_codelets[dumped_codelets_count - 1].arch.devid = ev->param[4];
			
 
				-		dumped_codelets[dumped_codelets_count - 1].arch.ncore = ev->param[5];
			
 
				-
			
 
				+		snprintf(dumped_codelets[dumped_codelets_count - 1].perfmodel_archname, 256, "%s", (char *)&ev->param[4]);
			
 
				 		dumped_codelets[dumped_codelets_count - 1].size = codelet_size;
			
 
				 		dumped_codelets[dumped_codelets_count - 1].hash = codelet_hash;
			
 
				 		dumped_codelets[dumped_codelets_count - 1].time = codelet_length;
			
@@ -1055,10 +1058,10 @@ static void handle_work_stealing(struct fxt_ev_64 *ev, struct starpu_fxt_options
 
				 	unsigned src = ev->param[1];
			
 
				 	unsigned size = 0;
			
 
				 	unsigned comid = 0;
			
 
				-	
			
 
				+
			
 
				 	char *prefix = options->file_prefix;
			
 
				 
			
 
				-	
			
 
				+
			
 
				 	if (out_paje_file)
			
 
				 	{
			
 
				 		double time = get_event_time_stamp(ev, options);
			
@@ -1603,12 +1606,8 @@ void _starpu_fxt_display_bandwidth(struct starpu_fxt_options *options)
 
				 	}
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- *	Public functions
			
 
				- */
			
 
				-
			
 
				 static
			
 
				-void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *options)
			
 
				+void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *options)
			
 
				 {
			
 
				 	/* Open the trace file */
			
 
				 	int fd_in;
			
@@ -1795,6 +1794,9 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 
				 				     handle_data_copy();
			
 
				 				break;
			
 
				 
			
 
				+			case _STARPU_FUT_DATA_LOAD:
			
 
				+			     	break;
			
 
				+
			
 
				 			case _STARPU_FUT_START_DRIVER_COPY:
			
 
				 				if (!options->no_bus)
			
 
				 					handle_start_driver_copy(&ev, options);
			
@@ -2062,6 +2064,16 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 
				 				handle_hypervisor_end(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				+			/* We can safely ignore FUT internal events */
			
 
				+			case FUT_SETUP_CODE:
			
 
				+			case FUT_CALIBRATE0_CODE:
			
 
				+			case FUT_CALIBRATE1_CODE:
			
 
				+			case FUT_CALIBRATE2_CODE:
			
 
				+			case FUT_KEYCHANGE_CODE:
			
 
				+			case FUT_NEW_LWP_CODE:
			
 
				+			case FUT_GCC_INSTRUMENT_ENTRY_CODE:
			
 
				+				break;
			
 
				+
			
 
				 			default:
			
 
				 #ifdef STARPU_VERBOSE
			
 
				 				fprintf(stderr, "unknown event.. %x at time %llx WITH OFFSET %llx\n",
			
@@ -2094,7 +2106,7 @@ void starpu_fxt_options_init(struct starpu_fxt_options *options)
 
				 }
			
 
				 
			
 
				 static
			
 
				-void starpu_fxt_distrib_file_init(struct starpu_fxt_options *options)
			
 
				+void _starpu_fxt_distrib_file_init(struct starpu_fxt_options *options)
			
 
				 {
			
 
				 	dumped_codelets_count = 0;
			
 
				 	dumped_codelets = NULL;
			
@@ -2110,7 +2122,7 @@ void starpu_fxt_distrib_file_init(struct starpu_fxt_options *options)
 
				 }
			
 
				 
			
 
				 static
			
 
				-void starpu_fxt_distrib_file_close(struct starpu_fxt_options *options)
			
 
				+void _starpu_fxt_distrib_file_close(struct starpu_fxt_options *options)
			
 
				 {
			
 
				 	if (distrib_time)
			
 
				 		fclose(distrib_time);
			
@@ -2123,7 +2135,7 @@ void starpu_fxt_distrib_file_close(struct starpu_fxt_options *options)
 
				 }
			
 
				 
			
 
				 static
			
 
				-void starpu_fxt_activity_file_init(struct starpu_fxt_options *options)
			
 
				+void _starpu_fxt_activity_file_init(struct starpu_fxt_options *options)
			
 
				 {
			
 
				 	if (options->activity_path)
			
 
				 		activity_file = fopen(options->activity_path, "w+");
			
@@ -2132,14 +2144,14 @@ void starpu_fxt_activity_file_init(struct starpu_fxt_options *options)
 
				 }
			
 
				 
			
 
				 static
			
 
				-void starpu_fxt_activity_file_close(void)
			
 
				+void _starpu_fxt_activity_file_close(void)
			
 
				 {
			
 
				 	if (activity_file)
			
 
				 		fclose(activity_file);
			
 
				 }
			
 
				 
			
 
				 static
			
 
				-void starpu_fxt_paje_file_init(struct starpu_fxt_options *options)
			
 
				+void _starpu_fxt_paje_file_init(struct starpu_fxt_options *options)
			
 
				 {
			
 
				 	/* create a new file */
			
 
				 	if (options->out_paje_path)
			
@@ -2164,13 +2176,14 @@ void starpu_fxt_paje_file_init(struct starpu_fxt_options *options)
 
				 }
			
 
				 
			
 
				 static
			
 
				-void starpu_fxt_paje_file_close(void)
			
 
				+void _starpu_fxt_paje_file_close(void)
			
 
				 {
			
 
				 	if (out_paje_file)
			
 
				 		fclose(out_paje_file);
			
 
				 }
			
 
				 
			
 
				-static uint64_t starpu_fxt_find_start_time(char *filename_in)
			
 
				+static
			
 
				+uint64_t _starpu_fxt_find_start_time(char *filename_in)
			
 
				 {
			
 
				 	/* Open the trace file */
			
 
				 	int fd_in;
			
@@ -2209,24 +2222,24 @@ static uint64_t starpu_fxt_find_start_time(char *filename_in)
 
				 void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
			
 
				 {
			
 
				 	_starpu_fxt_dag_init(options->dag_path);
			
 
				-	starpu_fxt_distrib_file_init(options);
			
 
				-	starpu_fxt_activity_file_init(options);
			
 
				+	_starpu_fxt_distrib_file_init(options);
			
 
				+	_starpu_fxt_activity_file_init(options);
			
 
				 
			
 
				-	starpu_fxt_paje_file_init(options);
			
 
				+	_starpu_fxt_paje_file_init(options);
			
 
				 
			
 
				 	if (options->ninputfiles == 0)
			
 
				 	{
			
 
				-	     return;
			
 
				+		return;
			
 
				 	}
			
 
				 	else if (options->ninputfiles == 1)
			
 
				 	{
			
 
				 		/* we usually only have a single trace */
			
 
				-		uint64_t file_start_time = starpu_fxt_find_start_time(options->filenames[0]);
			
 
				+		uint64_t file_start_time = _starpu_fxt_find_start_time(options->filenames[0]);
			
 
				 		options->file_prefix = "";
			
 
				 		options->file_offset = file_start_time;
			
 
				 		options->file_rank = -1;
			
 
				 
			
 
				-		starpu_fxt_parse_new_file(options->filenames[0], options);
			
 
				+		_starpu_fxt_parse_new_file(options->filenames[0], options);
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
@@ -2259,7 +2272,7 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 
				 		/* Compute all start_k */
			
 
				 		for (inputfile = 0; inputfile < options->ninputfiles; inputfile++)
			
 
				 		{
			
 
				-			uint64_t file_start = starpu_fxt_find_start_time(options->filenames[inputfile]);
			
 
				+			uint64_t file_start = _starpu_fxt_find_start_time(options->filenames[inputfile]);
			
 
				 			start_k[inputfile] = file_start;
			
 
				 		}
			
 
				 
			
@@ -2267,9 +2280,9 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 
				 		for (inputfile = 0; inputfile < options->ninputfiles; inputfile++)
			
 
				 		{
			
 
				 			int ret = _starpu_fxt_mpi_find_sync_point(options->filenames[inputfile],
			
 
				-						&sync_k[inputfile],
			
 
				-						&unique_keys[inputfile],
			
 
				-						&rank_k[inputfile]);
			
 
				+								  &sync_k[inputfile],
			
 
				+								  &unique_keys[inputfile],
			
 
				+								  &rank_k[inputfile]);
			
 
				 			if (ret == -1)
			
 
				 			{
			
 
				 				/* There was no sync point, we assume there is no offset */
			
@@ -2326,7 +2339,7 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 
				 			options->file_offset = offsets[inputfile];
			
 
				 			options->file_rank = filerank;
			
 
				 
			
 
				-			starpu_fxt_parse_new_file(options->filenames[inputfile], options);
			
 
				+			_starpu_fxt_parse_new_file(options->filenames[inputfile], options);
			
 
				 		}
			
 
				 
			
 
				 		/* display the MPI transfers if possible */
			
@@ -2337,9 +2350,9 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 
				 	_starpu_fxt_display_bandwidth(options);
			
 
				 
			
 
				 	/* close the different files */
			
 
				-	starpu_fxt_paje_file_close();
			
 
				-	starpu_fxt_activity_file_close();
			
 
				-	starpu_fxt_distrib_file_close(options);
			
 
				+	_starpu_fxt_paje_file_close();
			
 
				+	_starpu_fxt_activity_file_close();
			
 
				+	_starpu_fxt_distrib_file_close(options);
			
 
				 
			
 
				 	_starpu_fxt_dag_terminate();
			
 
				 
			
--- a/src/debug/traces/starpu_paje.c
+++ b/src/debug/traces/starpu_paje.c
@@ -140,6 +140,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 
				 	fprintf(file, "%%	Params	string\n");
			
 
				 	fprintf(file, "%%	Footprint	string\n");
			
 
				 	fprintf(file, "%%	Tag	string\n");
			
 
				+	fprintf(file, "%%	JobId	string\n");
			
 
				 	fprintf(file, "%%EndEventDef\n");
			
 
				 #endif
			
 
				 #endif
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -240,13 +240,23 @@ int _starpu_cpu_driver_run_once(struct _starpu_worker *cpu_worker)
 
				 		rank = j->active_task_alias_count++;
			
 
				 		STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
			
 
				 
			
 
				-		struct _starpu_combined_worker *combined_worker;
			
 
				-		combined_worker = _starpu_get_combined_worker_struct(j->combined_workerid);
			
 
				+		if(j->combined_workerid != -1)
			
 
				+		{
			
 
				+			struct _starpu_combined_worker *combined_worker;
			
 
				+			combined_worker = _starpu_get_combined_worker_struct(j->combined_workerid);
			
 
				+			
			
 
				+			cpu_worker->combined_workerid = j->combined_workerid;
			
 
				+			cpu_worker->worker_size = combined_worker->worker_size;
			
 
				+			cpu_worker->current_rank = rank;
			
 
				+			perf_arch = &combined_worker->perf_arch;
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(cpu_worker, j);
			
 
				+			STARPU_ASSERT_MSG(sched_ctx != NULL, "there should be a worker %d in the ctx of this job \n", cpu_worker->workerid);
			
 
				 
			
 
				-		cpu_worker->combined_workerid = j->combined_workerid;
			
 
				-		cpu_worker->worker_size = combined_worker->worker_size;
			
 
				-		cpu_worker->current_rank = rank;
			
 
				-		perf_arch = &combined_worker->perf_arch;
			
 
				+			perf_arch = &sched_ctx->perf_arch;
			
 
				+		}
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -490,7 +490,14 @@ static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *wor
 
				 
			
 
				 	_starpu_driver_end_job(worker, j, &worker->perf_arch, &codelet_end, 0, profiling);
			
 
				 
			
 
				-	_starpu_driver_update_job_feedback(j, worker, &worker->perf_arch, &j->cl_start, &codelet_end, profiling);
			
 
				+	struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
			
 
				+	if(!sched_ctx)
			
 
				+		sched_ctx = _starpu_get_sched_ctx_struct(j->task->sched_ctx);
			
 
				+
			
 
				+	if(!sched_ctx->sched_policy)
			
 
				+		_starpu_driver_update_job_feedback(j, worker, &sched_ctx->perf_arch, &j->cl_start, &codelet_end, profiling);
			
 
				+	else
			
 
				+		_starpu_driver_update_job_feedback(j, worker, &worker->perf_arch, &j->cl_start, &codelet_end, profiling);
			
 
				 
			
 
				 	_starpu_push_task_output(j);
			
 
				 
			
--- a/src/drivers/driver_common/driver_common.c
+++ b/src/drivers/driver_common/driver_common.c
@@ -74,7 +74,34 @@ void _starpu_driver_start_job(struct _starpu_worker *worker, struct _starpu_job
 
				 	if (starpu_top)
			
 
				 		_starpu_top_task_started(task,workerid,codelet_start);
			
 
				 
			
 
				-	_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
			
 
				+
			
 
				+	// Find out if the worker is the master of a parallel context
			
 
				+	struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
			
 
				+	if(!sched_ctx)
			
 
				+		sched_ctx = _starpu_get_sched_ctx_struct(j->task->sched_ctx);
			
 
				+	if(!sched_ctx->sched_policy)
			
 
				+	{
			
 
				+		if(!sched_ctx->awake_workers && sched_ctx->main_master == worker->workerid)
			
 
				+		{
			
 
				+			struct starpu_worker_collection *workers = sched_ctx->workers;
			
 
				+			struct starpu_sched_ctx_iterator it;
			
 
				+
			
 
				+			if (workers->init_iterator)
			
 
				+				workers->init_iterator(workers, &it);
			
 
				+			while (workers->has_next(workers, &it))
			
 
				+			{
			
 
				+				int _workerid = workers->get_next(workers, &it);
			
 
				+				if (_workerid != workerid)
			
 
				+				{
			
 
				+					struct _starpu_worker *_worker = _starpu_get_worker_struct(_workerid);
			
 
				+					_starpu_driver_start_job(_worker, j, &_worker->perf_arch, codelet_start, rank, profiling);
			
 
				+				}
			
 
				+			}
			
 
				+		}
			
 
				+		_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, &sched_ctx->perf_arch, workerid);
			
 
				+	}
			
 
				+	else
			
 
				+		_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
			
 
				 }
			
 
				 
			
 
				 void _starpu_driver_end_job(struct _starpu_worker *worker, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
			
@@ -86,7 +113,22 @@ void _starpu_driver_end_job(struct _starpu_worker *worker, struct _starpu_job *j
 
				 	int workerid = worker->workerid;
			
 
				 	unsigned calibrate_model = 0;
			
 
				 
			
 
				-	_STARPU_TRACE_END_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
			
 
				+	// Find out if the worker is the master of a parallel context
			
 
				+	struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
			
 
				+	unsigned worker_left_ctx = 0;
			
 
				+	if(!sched_ctx)
			
 
				+		sched_ctx = _starpu_get_sched_ctx_struct(j->task->sched_ctx);
			
 
				+
			
 
				+	if (!sched_ctx->sched_policy)
			
 
				+	{
			
 
				+		_starpu_perfmodel_create_comb_if_needed(&(sched_ctx->perf_arch));
			
 
				+		_STARPU_TRACE_END_CODELET_BODY(j, j->nimpl, &(sched_ctx->perf_arch), workerid);
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		_starpu_perfmodel_create_comb_if_needed(perf_arch);
			
 
				+		_STARPU_TRACE_END_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
			
 
				+	}
			
 
				 
			
 
				 	if (cl && cl->model && cl->model->benchmarking)
			
 
				 		calibrate_model = 1;
			
@@ -104,7 +146,27 @@ void _starpu_driver_end_job(struct _starpu_worker *worker, struct _starpu_job *j
 
				 		_starpu_top_task_ended(task,workerid,codelet_end);
			
 
				 
			
 
				 	_starpu_set_worker_status(worker, STATUS_UNKNOWN);
			
 
				+
			
 
				+	if(!sched_ctx->sched_policy && !sched_ctx->awake_workers &&
			
 
				+	   sched_ctx->main_master == worker->workerid)
			
 
				+	{
			
 
				+		struct starpu_worker_collection *workers = sched_ctx->workers;
			
 
				+		struct starpu_sched_ctx_iterator it;
			
 
				+
			
 
				+		if (workers->init_iterator)
			
 
				+			workers->init_iterator(workers, &it);
			
 
				+		while (workers->has_next(workers, &it))
			
 
				+		{
			
 
				+			int _workerid = workers->get_next(workers, &it);
			
 
				+			if (_workerid != workerid)
			
 
				+			{
			
 
				+				struct _starpu_worker *_worker = _starpu_get_worker_struct(_workerid);
			
 
				+				_starpu_driver_end_job(_worker, j, &_worker->perf_arch, codelet_end, rank, profiling);
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				 }
			
 
				+
			
 
				 void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker,
			
 
				 					struct starpu_perfmodel_arch* perf_arch,
			
 
				 					struct timespec *codelet_start, struct timespec *codelet_end, int profiling)
			
@@ -117,6 +179,8 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 
				 	int calibrate_model = 0;
			
 
				 	int updated = 0;
			
 
				 
			
 
				+	_starpu_perfmodel_create_comb_if_needed(perf_arch);
			
 
				+
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 	if (cl->model && cl->model->benchmarking)
			
 
				 		calibrate_model = 1;
			
@@ -257,10 +321,10 @@ static void _starpu_worker_set_status_wakeup(int workerid)
 
				 static void _starpu_exponential_backoff(struct _starpu_worker *worker)
			
 
				 {
			
 
				 	int delay = worker->spinning_backoff;
			
 
				-	
			
 
				+
			
 
				 	if (worker->spinning_backoff < BACKOFF_MAX)
			
 
				-		worker->spinning_backoff<<=1; 
			
 
				-	
			
 
				+		worker->spinning_backoff<<=1;
			
 
				+
			
 
				 	while(delay--)
			
 
				 		STARPU_UYIELD();
			
 
				 }
			
@@ -285,6 +349,9 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 
				 			if(sched_ctx && sched_ctx->id > 0 && sched_ctx->id < STARPU_NMAX_SCHED_CTXS)
			
 
				 			{
			
 
				 				STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->parallel_sect_mutex[workerid]);
			
 
				+				if(!sched_ctx->sched_policy && sched_ctx->awake_workers) 
			
 
				+					worker->slave = sched_ctx->main_master != workerid;
			
 
				+
			
 
				 				if(sched_ctx->parallel_sect[workerid])
			
 
				 				{
			
 
				 					/* don't let the worker sleep with the sched_mutex taken */
			
@@ -442,10 +509,13 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 
				 					STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
			
 
				 					workers[i].current_rank = j->active_task_alias_count++;
			
 
				 					STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
			
 
				-					
			
 
				-					combined_worker = _starpu_get_combined_worker_struct(j->combined_workerid);
			
 
				-					workers[i].combined_workerid = j->combined_workerid;
			
 
				-					workers[i].worker_size = combined_worker->worker_size;
			
 
				+
			
 
				+					if(j->combined_workerid != -1)
			
 
				+					{
			
 
				+						combined_worker = _starpu_get_combined_worker_struct(j->combined_workerid);
			
 
				+						workers[i].combined_workerid = j->combined_workerid;
			
 
				+						workers[i].worker_size = combined_worker->worker_size;
			
 
				+					}
			
 
				 				}
			
 
				 				else
			
 
				 				{
			
@@ -520,4 +590,3 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 
				 
			
 
				 	return count;
			
 
				 }
			
 
				-
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -926,7 +926,12 @@ static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker
 
				 
			
 
				 	_starpu_driver_end_job(worker, j, &worker->perf_arch, &codelet_end, 0, profiling);
			
 
				 
			
 
				-	_starpu_driver_update_job_feedback(j, worker, &worker->perf_arch, &j->cl_start, &codelet_end, profiling);
			
 
				+	struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
			
 
				+	STARPU_ASSERT_MSG(sched_ctx != NULL, "there should be a worker %d in the ctx of this job \n", worker->workerid);
			
 
				+	if(!sched_ctx->sched_policy)
			
 
				+		_starpu_driver_update_job_feedback(j, worker, &sched_ctx->perf_arch, &j->cl_start, &codelet_end, profiling);
			
 
				+	else
			
 
				+		_starpu_driver_update_job_feedback(j, worker, &worker->perf_arch, &j->cl_start, &codelet_end, profiling);
			
 
				 
			
 
				 	_starpu_push_task_output(j);
			
 
				 
			
--- a/src/profiling/bound.c
+++ b/src/profiling/bound.c
@@ -426,7 +426,7 @@ static void _starpu_get_tasks_times(int nw, int nt, double *times)
 
				 				.footprint = tp->footprint,
			
 
				 				.footprint_is_computed = 1,
			
 
				 			};
			
 
				-			struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
			
 
				+			struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
			
 
				 			double length = _starpu_history_based_job_expected_perf(tp->cl->model, arch, &j, j.nimpl);
			
 
				 			if (isnan(length))
			
 
				 				times[w*nt+t] = NAN;
			
@@ -512,15 +512,15 @@ void starpu_bound_print_lp(FILE *output)
 
				 			};
			
 
				 			for (w = 0; w < nw; w++)
			
 
				 			{
			
 
				-				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
			
 
				-				if (_STARPU_IS_ZERO(t1->duration[arch->type][arch->devid][arch->ncore]))
			
 
				+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
			
 
				+				if (_STARPU_IS_ZERO(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
			
 
				 				{
			
 
				 					double length = _starpu_history_based_job_expected_perf(t1->cl->model, arch, &j,j.nimpl);
			
 
				 					if (isnan(length))
			
 
				 						/* Avoid problems with binary coding of doubles */
			
 
				-						t1->duration[arch->type][arch->devid][arch->ncore] = NAN;
			
 
				+						t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores] = NAN;
			
 
				 					else
			
 
				-						t1->duration[arch->type][arch->devid][arch->ncore] = length / 1000.;
			
 
				+						t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores] = length / 1000.;
			
 
				 				}
			
 
				 			}
			
 
				 			nt++;
			
@@ -545,8 +545,8 @@ void starpu_bound_print_lp(FILE *output)
 
				 		{
			
 
				 			for (w = 0; w < nw; w++)
			
 
				 			{
			
 
				-				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
			
 
				-				if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
			
 
				+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
			
 
				+				if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
			
 
				 					fprintf(output, " +t%luw%d", t1->id, w);
			
 
				 			}
			
 
				 			fprintf(output, " = 1;\n");
			
@@ -559,9 +559,9 @@ void starpu_bound_print_lp(FILE *output)
 
				 			fprintf(output, "/* %s %x */\tc%lu = s%lu", _starpu_codelet_get_model_name(t1->cl), (unsigned) t1->footprint, t1->id, t1->id);
			
 
				 			for (w = 0; w < nw; w++)
			
 
				 			{
			
 
				-				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
			
 
				-				if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
			
 
				-					fprintf(output, " + %f t%luw%d", t1->duration[arch->type][arch->devid][arch->ncore], t1->id, w);
			
 
				+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
			
 
				+				if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
			
 
				+					fprintf(output, " + %f t%luw%d", t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores], t1->id, w);
			
 
				 			}
			
 
				 			fprintf(output, ";\n");
			
 
				 		}
			
@@ -642,8 +642,8 @@ void starpu_bound_print_lp(FILE *output)
 
				 				{
			
 
				 					for (w = 0; w < nw; w++)
			
 
				 					{
			
 
				-						struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
			
 
				-						if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
			
 
				+						struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
			
 
				+						if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
			
 
				 						{
			
 
				 							fprintf(output, "s%lu - c%lu >= -3e5 + 1e5 t%luw%d + 1e5 t%luw%d + 1e5 t%luafter%lu;\n",
			
 
				 									t1->id, t2->id, t1->id, w, t2->id, w, t1->id, t2->id);
			
--- a/src/sched_policies/component_best_implementation.c
+++ b/src/sched_policies/component_best_implementation.c
@@ -23,7 +23,7 @@
 
				 /* return true if workerid can execute task, and fill task->predicted and task->predicted_transfer
			
 
				  *  according to best implementation predictions
			
 
				  */
			
 
				-static int find_best_impl(struct starpu_task * task, int workerid)
			
 
				+static int find_best_impl(unsigned sched_ctx_id, struct starpu_task * task, int workerid)
			
 
				 {
			
 
				 	double len = DBL_MAX;
			
 
				 	int best_impl = -1;
			
@@ -32,7 +32,7 @@ static int find_best_impl(struct starpu_task * task, int workerid)
 
				 	{
			
 
				 		if(starpu_worker_can_execute_task(workerid, task, impl))
			
 
				 		{
			
 
				-			struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid);
			
 
				+			struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
			
 
				 			double d = starpu_task_expected_length(task, archtype, impl);
			
 
				 			if(isnan(d))
			
 
				 			{
			
@@ -61,20 +61,20 @@ static int find_best_impl(struct starpu_task * task, int workerid)
 
				 /* set implementation, task->predicted and task->predicted_transfer with the first worker of workers that can execute that task
			
 
				  * or have to be calibrated
			
 
				  */
			
 
				-static void select_best_implementation_and_set_preds(struct starpu_bitmap * workers, struct starpu_task * task)
			
 
				+static void select_best_implementation_and_set_preds(unsigned sched_ctx_id, struct starpu_bitmap * workers, struct starpu_task * task)
			
 
				 {
			
 
				 	int workerid;
			
 
				 	for(workerid = starpu_bitmap_first(workers);
			
 
				 	    -1 != workerid;
			
 
				 	    workerid = starpu_bitmap_next(workers, workerid))
			
 
				-		if(find_best_impl(task, workerid))
			
 
				+		if(find_best_impl(sched_ctx_id, task, workerid))
			
 
				 			break;
			
 
				 }
			
 
				 
			
 
				 static int best_implementation_push_task(struct starpu_sched_component * component, struct starpu_task * task)
			
 
				 {
			
 
				 	STARPU_ASSERT(component->nchildren == 1);
			
 
				-	select_best_implementation_and_set_preds(component->workers_in_ctx, task);
			
 
				+	select_best_implementation_and_set_preds(component->tree->sched_ctx_id, component->workers_in_ctx, task);
			
 
				 	return component->children[0]->push_task(component->children[0],task);
			
 
				 }
			
 
				 
			
@@ -100,7 +100,7 @@ static struct starpu_task * best_implementation_pull_task(struct starpu_sched_co
 
				 	}
			
 
				 	if(task)
			
 
				 		/* this worker can execute this task as it was returned by a pop*/
			
 
				-		(void)find_best_impl(task, starpu_worker_get_id());
			
 
				+		(void)find_best_impl(component->tree->sched_ctx_id, task, starpu_worker_get_id());
			
 
				 	return task;
			
 
				 }
			
 
				 
			
--- a/src/sched_policies/component_eager_calibration.c
+++ b/src/sched_policies/component_eager_calibration.c
@@ -29,7 +29,7 @@ static int eager_calibration_push_task(struct starpu_sched_component * component
 
				 	    workerid != -1;
			
 
				 	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
			
 
				 	{
			
 
				-		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid);
			
 
				+		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, component->tree->sched_ctx_id);
			
 
				 		int nimpl;
			
 
				 		for(nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
			
 
				 		{
			
--- a/src/sched_policies/component_fifo.c
+++ b/src/sched_policies/component_fifo.c
@@ -66,7 +66,7 @@ static double fifo_estimated_load(struct starpu_sched_component * component)
 
				 	if(STARPU_SCHED_COMPONENT_IS_HOMOGENEOUS(component))
			
 
				 	{		
			
 
				 		int first_worker = starpu_bitmap_first(component->workers_in_ctx);
			
 
				-		relative_speedup = starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(first_worker));
			
 
				+		relative_speedup = starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(first_worker, component->tree->sched_ctx_id));
			
 
				 		STARPU_PTHREAD_MUTEX_LOCK(mutex);
			
 
				 		load = fifo->ntasks / relative_speedup;
			
 
				 		STARPU_PTHREAD_MUTEX_UNLOCK(mutex);
			
@@ -78,7 +78,7 @@ static double fifo_estimated_load(struct starpu_sched_component * component)
 
				 		for(i = starpu_bitmap_first(component->workers_in_ctx);
			
 
				 		    i != -1;
			
 
				 		    i = starpu_bitmap_next(component->workers_in_ctx, i))
			
 
				-			relative_speedup += starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(i));
			
 
				+			relative_speedup += starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(i, component->tree->sched_ctx_id));
			
 
				 		relative_speedup /= starpu_bitmap_cardinal(component->workers_in_ctx);
			
 
				 		STARPU_ASSERT(!_STARPU_IS_ZERO(relative_speedup));
			
 
				 		STARPU_PTHREAD_MUTEX_LOCK(mutex);
			
--- a/src/sched_policies/component_prio.c
+++ b/src/sched_policies/component_prio.c
@@ -79,7 +79,7 @@ static double prio_estimated_load(struct starpu_sched_component * component)
 
				 	if(STARPU_SCHED_COMPONENT_IS_HOMOGENEOUS(component))
			
 
				 	{		
			
 
				 		int first_worker = starpu_bitmap_first(component->workers_in_ctx);
			
 
				-		relative_speedup = starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(first_worker));
			
 
				+		relative_speedup = starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(first_worker, component->tree->sched_ctx_id));
			
 
				 		STARPU_PTHREAD_MUTEX_LOCK(mutex);
			
 
				 		load = prio->ntasks / relative_speedup;
			
 
				 		STARPU_PTHREAD_MUTEX_UNLOCK(mutex);
			
@@ -91,7 +91,7 @@ static double prio_estimated_load(struct starpu_sched_component * component)
 
				 		for(i = starpu_bitmap_first(component->workers_in_ctx);
			
 
				 		    i != -1;
			
 
				 		    i = starpu_bitmap_next(component->workers_in_ctx, i))
			
 
				-			relative_speedup += starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(i));
			
 
				+			relative_speedup += starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(i, component->tree->sched_ctx_id));
			
 
				 		relative_speedup /= starpu_bitmap_cardinal(component->workers_in_ctx);
			
 
				 		STARPU_ASSERT(!_STARPU_IS_ZERO(relative_speedup));
			
 
				 		STARPU_PTHREAD_MUTEX_LOCK(mutex);
			
--- a/src/sched_policies/component_random.c
+++ b/src/sched_policies/component_random.c
@@ -26,7 +26,7 @@ static double compute_relative_speedup(struct starpu_sched_component * component
 
				 	    id != -1;
			
 
				 	    id = starpu_bitmap_next(component->workers_in_ctx, id))
			
 
				 	{
			
 
				-		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(id);
			
 
				+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(id, component->tree->sched_ctx_id);
			
 
				 		sum += starpu_worker_get_relative_speedup(perf_arch);
			
 
				 
			
 
				 	}
			
--- a/src/sched_policies/component_sched.c
+++ b/src/sched_policies/component_sched.c
@@ -49,7 +49,7 @@ int starpu_sched_component_execute_preds(struct starpu_sched_component * compone
 
				 	    workerid != -1;
			
 
				 	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
			
 
				 	{
			
 
				-		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid);
			
 
				+		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, component->tree->sched_ctx_id);
			
 
				 		int nimpl;
			
 
				 		for(nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
			
 
				 		{
			
@@ -137,7 +137,7 @@ double starpu_sched_component_transfer_length(struct starpu_sched_component * co
 
				 		else
			
 
				 		{
			
 
				 			sum += starpu_task_expected_data_transfer_time(memory_node, task);
			
 
				-			/* sum += starpu_task_expected_conversion_time(task, starpu_worker_get_perf_archtype(worker), impl ?)
			
 
				+			/* sum += starpu_task_expected_conversion_time(task, starpu_worker_get_perf_archtype(worker, component->tree->sched_ctx_id), impl ?)
			
 
				 			 * I dont know what to do as we dont know what implementation would be used here...
			
 
				 			 */
			
 
				 		}
			
--- a/src/sched_policies/component_work_stealing.c
+++ b/src/sched_policies/component_work_stealing.c
@@ -211,7 +211,7 @@ double _ws_estimated_load(struct starpu_sched_component * component)
 
				 	    -1 != workerid;
			
 
				 	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
			
 
				 	{
			
 
				-		speedup += starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(workerid));
			
 
				+		speedup += starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(workerid, component->tree->sched_ctx_id));
			
 
				 	}
			
 
				 	
			
 
				 	return ntasks / speedup;
			
--- a/src/sched_policies/component_worker.c
+++ b/src/sched_policies/component_worker.c
@@ -557,7 +557,7 @@ static double simple_worker_estimated_load(struct starpu_sched_component * compo
 
				 	int ntasks_in_fifo = l ? l->ntasks : 0;
			
 
				 	return (double) (nb_task + ntasks_in_fifo)
			
 
				 		/ starpu_worker_get_relative_speedup(
			
 
				-				starpu_worker_get_perf_archtype(starpu_bitmap_first(component->workers)));
			
 
				+				starpu_worker_get_perf_archtype(starpu_bitmap_first(component->workers), component->tree->sched_ctx_id));
			
 
				 }
			
 
				 
			
 
				 static void _worker_component_deinit_data(struct starpu_sched_component * component)
			
--- a/src/sched_policies/deque_modeling_policy_data_aware.c
+++ b/src/sched_policies/deque_modeling_policy_data_aware.c
@@ -307,7 +307,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
				 	STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
			
 
				 
			
 
				         /* Sometimes workers didn't take the tasks as early as we expected */
			
 
				-	fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
			
 
				+	fifo->exp_start = isnan(fifo->exp_start) ? starpu_timing_now() : STARPU_MAX(fifo->exp_start, starpu_timing_now());
			
 
				 	fifo->exp_end = fifo->exp_start + fifo->exp_len;
			
 
				 
			
 
				 	if ((starpu_timing_now() + predicted_transfer) < fifo->exp_end)
			
@@ -417,10 +417,10 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 
				 		worker = workers->get_next_master(workers, &it);
			
 
				 		struct _starpu_fifo_taskq *fifo  = dt->queue_array[worker];
			
 
				 		unsigned memory_node = starpu_worker_get_memory_node(worker);
			
 
				-		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
			
 
				 
			
 
				 		/* Sometimes workers didn't take the tasks as early as we expected */
			
 
				-		double exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
			
 
				+		double exp_start = isnan(fifo->exp_start) ? starpu_timing_now() : STARPU_MAX(fifo->exp_start, starpu_timing_now());
			
 
				 
			
 
				 		if (!starpu_worker_can_execute_task_impl(worker, task, &impl_mask))
			
 
				 			continue;
			
@@ -558,12 +558,11 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
				 		worker = workers->get_next_master(workers, &it);
			
 
				 
			
 
				 		struct _starpu_fifo_taskq *fifo = dt->queue_array[worker];
			
 
				-		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
			
 
				 		unsigned memory_node = starpu_worker_get_memory_node(worker);
			
 
				 
			
 
				 		/* Sometimes workers didn't take the tasks as early as we expected */
			
 
				-		double exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
			
 
				-
			
 
				+		double exp_start = isnan(fifo->exp_start) ? starpu_timing_now() : STARPU_MAX(fifo->exp_start, starpu_timing_now());
			
 
				 		if (!starpu_worker_can_execute_task_impl(worker, task, &impl_mask))
			
 
				 			continue;
			
 
				 
			
@@ -588,6 +587,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
				 				local_task_length[worker_ctx][nimpl] = starpu_task_bundle_expected_length(bundle, perf_arch, nimpl);
			
 
				 				local_data_penalty[worker_ctx][nimpl] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
			
 
				 				local_power[worker_ctx][nimpl] = starpu_task_bundle_expected_power(bundle, perf_arch,nimpl);
			
 
				+
			
 
				 			}
			
 
				 			else
			
 
				 			{
			
@@ -758,12 +758,12 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
				 					selected_impl = nimpl;
			
 
				 
			
 
				 					//_STARPU_DEBUG("best fitness (worker %d) %e = alpha*(%e) + beta(%e) +gamma(%e)\n", worker, best_fitness, exp_end[worker][nimpl] - best_exp_end, local_data_penalty[worker][nimpl], local_power[worker][nimpl]);
			
 
				+
			
 
				 				}
			
 
				 			}
			
 
				 			worker_ctx++;
			
 
				 		}
			
 
				 	}
			
 
				-
			
 
				 	STARPU_ASSERT(forced_best != -1 || best != -1);
			
 
				 
			
 
				 	if (forced_best != -1)
			
@@ -778,7 +778,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
				 	}
			
 
				 	else if (task->bundle)
			
 
				 	{
			
 
				-		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(best_in_ctx);
			
 
				+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(best_in_ctx, sched_ctx_id);
			
 
				 		unsigned memory_node = starpu_worker_get_memory_node(best);
			
 
				 		model_best = starpu_task_expected_length(task, perf_arch, selected_impl);
			
 
				 		transfer_model_best = starpu_task_expected_data_transfer_time(memory_node, task);
			
@@ -791,7 +791,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
				 
			
 
				 	//_STARPU_DEBUG("Scheduler dmda: kernel (%u)\n", best_impl);
			
 
				 	starpu_task_set_implementation(task, selected_impl);
			
 
				-
			
 
				+	
			
 
				 	/* we should now have the best worker in variable "best" */
			
 
				 	return push_task_on_best_worker(task, best, model_best, transfer_model_best, prio, sched_ctx_id);
			
 
				 }
			
@@ -951,7 +951,7 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 
				 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
			
 
				 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
			
 
				 	/* Compute the expected penality */
			
 
				-	struct starpu_perfmodel_arch *perf_arch = starpu_worker_get_perf_archtype(perf_workerid);
			
 
				+	struct starpu_perfmodel_arch *perf_arch = starpu_worker_get_perf_archtype(perf_workerid, sched_ctx_id);
			
 
				 	unsigned memory_node = starpu_worker_get_memory_node(workerid);
			
 
				 
			
 
				 	double predicted = starpu_task_expected_length(task, perf_arch,
			
@@ -966,7 +966,7 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 
				 	/* Update the predictions */
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
			
 
				 	/* Sometimes workers didn't take the tasks as early as we expected */
			
 
				-	fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
			
 
				+	fifo->exp_start = isnan(fifo->exp_start) ? starpu_timing_now() : STARPU_MAX(fifo->exp_start, starpu_timing_now());
			
 
				 	fifo->exp_end = fifo->exp_start + fifo->exp_len;
			
 
				 
			
 
				 	/* If there is no prediction available, we consider the task has a null length */
			
--- a/src/sched_policies/parallel_heft.c
+++ b/src/sched_policies/parallel_heft.c
@@ -232,9 +232,9 @@ static double compute_expected_end(int workerid, double length)
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static double compute_ntasks_end(int workerid)
			
 
				+static double compute_ntasks_end(int workerid, unsigned sched_ctx_id)
			
 
				 {
			
 
				-	struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(workerid);
			
 
				+	struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
			
 
				 	starpu_pthread_mutex_t *sched_mutex;
			
 
				 	starpu_pthread_cond_t *sched_cond;
			
 
				 
			
@@ -350,14 +350,14 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 
				 			}
			
 
				 
			
 
				 
			
 
				-			struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				+			struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
			
 
				 
			
 
				 			local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch,nimpl);
			
 
				 
			
 
				 			unsigned memory_node = starpu_worker_get_memory_node(worker);
			
 
				 			local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time(memory_node, task);
			
 
				 
			
 
				-			double ntasks_end = compute_ntasks_end(worker);
			
 
				+			double ntasks_end = compute_ntasks_end(worker, sched_ctx_id);
			
 
				 
			
 
				 			if (ntasks_best == -1
			
 
				 			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
			
--- a/src/sched_policies/random_policy.c
+++ b/src/sched_policies/random_policy.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010-2014  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -47,7 +47,7 @@ static int _random_push_task(struct starpu_task *task, unsigned prio)
 
				 		unsigned impl;
			
 
				 		if(starpu_worker_can_execute_task_first_impl(worker, task, &impl))
			
 
				 		{
			
 
				-			struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				+			struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
			
 
				 			double speedup = starpu_worker_get_relative_speedup(perf_arch);
			
 
				 			alpha_sum += speedup;
			
 
				 			speedup_arr[size] = speedup;
			
--- a/src/util/starpu_task_insert_utils.c
+++ b/src/util/starpu_task_insert_utils.c
@@ -131,6 +131,10 @@ void _starpu_task_insert_get_args_size(va_list varg_list, unsigned *nbuffers, si
 
				 		{
			
 
				 			(void)va_arg(varg_list, int);
			
 
				 		}
			
 
				+		else if (arg_type==STARPU_POSSIBLY_PARALLEL)
			
 
				+		{
			
 
				+			(void)va_arg(varg_list, unsigned);
			
 
				+		}
			
 
				 		else if (arg_type==STARPU_FLOPS)
			
 
				 		{
			
 
				 			(void)va_arg(varg_list, double);
			
@@ -248,6 +252,10 @@ int _starpu_codelet_pack_args(void **arg_buffer, size_t arg_buffer_size, va_list
 
				 		{
			
 
				 			(void)va_arg(varg_list, int);
			
 
				 		}
			
 
				+		else if (arg_type==STARPU_POSSIBLY_PARALLEL)
			
 
				+		{
			
 
				+			(void)va_arg(varg_list, unsigned);
			
 
				+		}
			
 
				 		else if (arg_type==STARPU_FLOPS)
			
 
				 		{
			
 
				 			(void)va_arg(varg_list, double);
			
@@ -433,6 +441,11 @@ void _starpu_task_insert_create(void *arg_buffer, size_t arg_buffer_size, struct
 
				 			int hypervisor_tag = va_arg(varg_list, int);
			
 
				 			(*task)->hypervisor_tag = hypervisor_tag;
			
 
				 		}
			
 
				+		else if (arg_type==STARPU_POSSIBLY_PARALLEL)
			
 
				+		{
			
 
				+			unsigned possibly_parallel = va_arg(varg_list, unsigned);
			
 
				+			(*task)->possibly_parallel = possibly_parallel;
			
 
				+		}
			
 
				 		else if (arg_type==STARPU_FLOPS)
			
 
				 		{
			
 
				 			double flops = va_arg(varg_list, double);
			
--- a/src/worker_collection/worker_list.c
+++ b/src/worker_collection/worker_list.c
@@ -166,7 +166,7 @@ static int list_remove(struct starpu_worker_collection *workers, int worker)
 
				 	_rearange_workerids(masters, nmasters);
			
 
				 	if(found_master != -1)
			
 
				 		workers->nmasters--;
			
 
				-	printf("rem %d\n", found_worker);
			
 
				+
			
 
				 	return found_worker;
			
 
				 }
			
 
				 
			
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -268,6 +268,7 @@ noinst_PROGRAMS =				\
 
				 	perfmodels/user_base			\
			
 
				 	perfmodels/valid_model			\
			
 
				 	perfmodels/value_nan			\
			
 
				+	perfmodels/memory			\
			
 
				 	sched_policies/data_locality            \
			
 
				 	sched_policies/execute_all_tasks        \
			
 
				 	sched_policies/prio        		\
			
--- a/tests/perfmodels/feed.c
+++ b/tests/perfmodels/feed.c
@@ -73,15 +73,17 @@ int main(int argc, char **argv)
 
				 		measured_slow = 0.001+size*0.0000001;
			
 
				 
			
 
				 		struct starpu_perfmodel_arch arch;
			
 
				-		arch.type = STARPU_CUDA_WORKER;
			
 
				-		arch.ncore = 0;
			
 
				+		arch.ndevices = 1;
			
 
				+		arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
			
 
				+		arch.devices[0].type = STARPU_CUDA_WORKER;
			
 
				+		arch.devices[0].ncores = 0;
			
 
				 		/* Simulate Fast GPU */
			
 
				-		arch.devid = 0;
			
 
				+		arch.devices[0].devid = 0;
			
 
				 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_fast);
			
 
				 		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_fast);
			
 
				 
			
 
				 		/* Simulate Slow GPU */
			
 
				-		arch.devid = 1;
			
 
				+		arch.devices[0].devid = 1;
			
 
				 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_slow);
			
 
				 		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_slow);
			
 
				 		starpu_task_clean(&task);
			
--- a/tests/perfmodels/memory.c
+++ b/tests/perfmodels/memory.c
@@ -0,0 +1,64 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2014  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <config.h>
			
 
				+#include <starpu.h>
			
 
				+#include <core/perfmodel/perfmodel.h>
			
 
				+#include "../helper.h"
			
 
				+
			
 
				+void func(void *descr[], void *arg)
			
 
				+{
			
 
				+}
			
 
				+
			
 
				+static struct starpu_perfmodel my_model =
			
 
				+{
			
 
				+	.type = STARPU_HISTORY_BASED,
			
 
				+	.symbol = "my_model",
			
 
				+};
			
 
				+
			
 
				+static struct starpu_codelet my_codelet =
			
 
				+{
			
 
				+	.cpu_funcs = {func, NULL},
			
 
				+	.model = &my_model
			
 
				+};
			
 
				+
			
 
				+double cuda_cost_function(struct starpu_task *t, struct starpu_perfmodel_arch *a, unsigned i)
			
 
				+{
			
 
				+	t;
			
 
				+	a;
			
 
				+	return (double)i;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	int ret;
			
 
				+
			
 
				+	ret = starpu_init(NULL);
			
 
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+
			
 
				+	starpu_perfmodel_init(NULL, &my_model);
			
 
				+	starpu_perfmodel_set_per_devices_cost_function(&my_model, 0, cuda_cost_function, STARPU_CUDA_WORKER, 0, 1, -1);
			
 
				+
			
 
				+	ret = starpu_task_insert(&my_codelet, 0);
			
 
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+
			
 
				+	starpu_task_wait_for_all();
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	return EXIT_SUCCESS;
			
 
				+}
			
--- a/tests/perfmodels/regression_based.c
+++ b/tests/perfmodels/regression_based.c
@@ -128,7 +128,7 @@ static void show_task_perfs(int size, struct starpu_task *task)
 
				 		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
			
 
				 		{
			
 
				 			FPRINTF(stdout, "Expected time for %d on %s (impl %d):\t%f\n",
			
 
				-				size, name, nimpl, starpu_task_expected_length(task, starpu_worker_get_perf_archtype(workerid), nimpl));
			
 
				+				size, name, nimpl, starpu_task_expected_length(task, starpu_worker_get_perf_archtype(workerid, task->sched_ctx), nimpl));
			
 
				 		}
			
 
				 	}
			
 
				 }
			
--- a/tests/perfmodels/valid_model.c
+++ b/tests/perfmodels/valid_model.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2012, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -16,6 +16,7 @@
 
				 
			
 
				 #include <config.h>
			
 
				 #include <starpu.h>
			
 
				+#include <core/perfmodel/perfmodel.h>
			
 
				 #include "../helper.h"
			
 
				 
			
 
				 void func(void *descr[], void *arg)
			
@@ -66,7 +67,6 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 
				 	conf.sched_policy_name = "eager";
			
 
				 	conf.calibrate = 1;
			
 
				 
			
 
				-
			
 
				 	ret = starpu_init(&conf);
			
 
				 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
@@ -74,15 +74,19 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 
				 	codelet->model = model;
			
 
				 
			
 
				 	old_nsamples = 0;
			
 
				-	lmodel.is_init=0;
			
 
				+	memset(&lmodel, 0, sizeof(struct starpu_perfmodel));
			
 
				 	lmodel.type = model->type;
			
 
				 	ret = starpu_perfmodel_load_symbol(codelet->model->symbol, &lmodel);
			
 
				 	if (ret != 1)
			
 
				-		for (archtype = 0; archtype < STARPU_NARCH; archtype++)
			
 
				-			if(lmodel.per_arch[archtype] != NULL)
			
 
				-				for(devid=0; lmodel.per_arch[archtype][devid] != NULL; devid++)
			
 
				-					for(ncore=0; lmodel.per_arch[archtype][devid][ncore] != NULL; ncore++)
			
 
				-						old_nsamples += lmodel.per_arch[archtype][devid][ncore][0].regression.nsample;
			
 
				+	{
			
 
				+		int i, impl;
			
 
				+		for(i = 0; i < lmodel.state->ncombs; i++)
			
 
				+		{
			
 
				+			int comb = lmodel.state->combs[i];
			
 
				+			for(impl = 0; impl < lmodel.state->nimpls[comb]; impl++)
			
 
				+				old_nsamples += lmodel.state->per_arch[comb][impl].regression.nsample;
			
 
				+		}
			
 
				+	}
			
 
				 
			
 
				         starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, 100, sizeof(int));
			
 
				 	for (loop = 0; loop < nloops; loop++)
			
@@ -105,13 +109,18 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 
				 		starpu_shutdown();
			
 
				 		return 1;
			
 
				 	}
			
 
				-
			
 
				-	new_nsamples = 0;
			
 
				-	for (archtype = 0; archtype < STARPU_NARCH; archtype++)
			
 
				-		if(lmodel.per_arch[archtype] != NULL)
			
 
				-			for(devid=0; lmodel.per_arch[archtype][devid] != NULL; devid++)
			
 
				-				for(ncore=0; lmodel.per_arch[archtype][devid][ncore] != NULL; ncore++)
			
 
				-					new_nsamples += lmodel.per_arch[archtype][devid][ncore][0].regression.nsample;
			
 
				+	else
			
 
				+	{
			
 
				+		int i;
			
 
				+		new_nsamples = 0;
			
 
				+		for(i = 0; i < lmodel.state->ncombs; i++)
			
 
				+		{
			
 
				+			int comb = lmodel.state->combs[i];
			
 
				+			int impl;
			
 
				+			for(impl = 0; impl < lmodel.state->nimpls[comb]; impl++)
			
 
				+			     new_nsamples += lmodel.state->per_arch[comb][impl].regression.nsample;
			
 
				+		}
			
 
				+	}
			
 
				 
			
 
				 	ret = starpu_perfmodel_unload_model(&lmodel);
			
 
				 	starpu_shutdown();
			
--- a/tests/sched_policies/simple_cpu_gpu_sched.c
+++ b/tests/sched_policies/simple_cpu_gpu_sched.c
@@ -17,6 +17,7 @@
 
				 #include <starpu.h>
			
 
				 #include <starpu_scheduler.h>
			
 
				 #include "../helper.h"
			
 
				+#include <core/perfmodel/perfmodel.h>
			
 
				 
			
 
				 /*
			
 
				  * Schedulers that are aware of the expected task length provided by the
			
@@ -88,6 +89,7 @@ static struct starpu_perfmodel model_cpu_task =
 
				 	.type = STARPU_PER_ARCH,
			
 
				 	.symbol = "model_cpu_task"
			
 
				 };
			
 
				+
			
 
				 static struct starpu_perfmodel model_gpu_task =
			
 
				 {
			
 
				 	.type = STARPU_PER_ARCH,
			
@@ -95,48 +97,39 @@ static struct starpu_perfmodel model_gpu_task =
 
				 };
			
 
				 
			
 
				 static void
			
 
				-init_perfmodels(void)
			
 
				+init_perfmodels_gpu(int gpu_type)
			
 
				 {
			
 
				-	unsigned devid, ncore;
			
 
				-
			
 
				-	starpu_perfmodel_init(&model_cpu_task);
			
 
				-	starpu_perfmodel_init(&model_gpu_task);
			
 
				+	int nb_worker_gpu = starpu_worker_get_count_by_type(gpu_type);
			
 
				+	int *worker_gpu_ids = malloc(nb_worker_gpu * sizeof(int));
			
 
				+	int worker_gpu;
			
 
				 
			
 
				-	if(model_cpu_task.per_arch[STARPU_CPU_WORKER] != NULL)
			
 
				+	starpu_worker_get_ids_by_type(gpu_type, worker_gpu_ids, nb_worker_gpu);
			
 
				+	for(worker_gpu = 0 ; worker_gpu < nb_worker_gpu ; worker_gpu ++)
			
 
				 	{
			
 
				-		for(devid=0; model_cpu_task.per_arch[STARPU_CPU_WORKER][devid] != NULL; devid++)
			
 
				-		{
			
 
				-			for(ncore=0; model_cpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore] != NULL; ncore++)
			
 
				-			{
			
 
				-				model_cpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore][0].cost_function = cpu_task_cpu;
			
 
				-				model_gpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore][0].cost_function = gpu_task_cpu;
			
 
				-			}
			
 
				-		}
			
 
				-	}
			
 
				+		starpu_perfmodel_set_per_devices_cost_function(&model_cpu_task, 0, cpu_task_gpu,
			
 
				+							       gpu_type, starpu_worker_get_devid(worker_gpu_ids[worker_gpu]), 1,
			
 
				+							       -1);
			
 
				 
			
 
				-	if(model_cpu_task.per_arch[STARPU_CUDA_WORKER] != NULL)
			
 
				-	{
			
 
				-		for(devid=0; model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid] != NULL; devid++)
			
 
				-		{
			
 
				-			for(ncore=0; model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore] != NULL; ncore++)
			
 
				-			{
			
 
				-				model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore][0].cost_function = cpu_task_gpu;
			
 
				-				model_gpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore][0].cost_function = gpu_task_gpu;
			
 
				-			}
			
 
				-		}
			
 
				+		starpu_perfmodel_set_per_devices_cost_function(&model_gpu_task, 0, gpu_task_gpu,
			
 
				+							       gpu_type, starpu_worker_get_devid(worker_gpu_ids[worker_gpu]), 1,
			
 
				+							       -1);
			
 
				 	}
			
 
				+}
			
 
				 
			
 
				-	if(model_cpu_task.per_arch[STARPU_OPENCL_WORKER] != NULL)
			
 
				-	{
			
 
				-		for(devid=0; model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid] != NULL; devid++)
			
 
				-		{
			
 
				-			for(ncore=0; model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore] != NULL; ncore++)
			
 
				-			{
			
 
				-				model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore][0].cost_function = cpu_task_gpu;
			
 
				-				model_gpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore][0].cost_function = gpu_task_gpu;
			
 
				-			}
			
 
				-		}
			
 
				-	}
			
 
				+static void
			
 
				+init_perfmodels(void)
			
 
				+{
			
 
				+	unsigned devid, ncore;
			
 
				+
			
 
				+	starpu_perfmodel_init(NULL, &model_cpu_task);
			
 
				+	starpu_perfmodel_init(NULL, &model_gpu_task);
			
 
				+
			
 
				+	starpu_perfmodel_set_per_devices_cost_function(&model_cpu_task, 0, cpu_task_cpu, STARPU_CPU_WORKER, 0, 1, -1);
			
 
				+	starpu_perfmodel_set_per_devices_cost_function(&model_gpu_task, 0, gpu_task_cpu, STARPU_CPU_WORKER, 0, 1, -1);
			
 
				+
			
 
				+	// We need to set the cost function for each combination with a CUDA or an OpenCL worker
			
 
				+	init_perfmodels_gpu(STARPU_CUDA_WORKER);
			
 
				+	init_perfmodels_gpu(STARPU_OPENCL_WORKER);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -166,17 +159,19 @@ run(struct starpu_sched_policy *policy)
 
				 	struct starpu_conf conf;
			
 
				 	starpu_conf_init(&conf);
			
 
				 	conf.sched_policy = policy;
			
 
				+
			
 
				 	int ret = starpu_init(&conf);
			
 
				 	if (ret == -ENODEV)
			
 
				 		exit(STARPU_TEST_SKIPPED);
			
 
				 
			
 
				 	/* At least 1 CPU and 1 GPU are needed. */
			
 
				-	if (starpu_cpu_worker_get_count() == 0) {
			
 
				+	if (starpu_cpu_worker_get_count() == 0)
			
 
				+	{
			
 
				 		starpu_shutdown();
			
 
				 		exit(STARPU_TEST_SKIPPED);
			
 
				 	}
			
 
				-	if (starpu_cuda_worker_get_count() == 0 &&
			
 
				-	    starpu_opencl_worker_get_count() == 0) {
			
 
				+	if (starpu_cuda_worker_get_count() == 0 && starpu_opencl_worker_get_count() == 0)
			
 
				+	{
			
 
				 		starpu_shutdown();
			
 
				 		exit(STARPU_TEST_SKIPPED);
			
 
				 	}
			
@@ -202,10 +197,9 @@ run(struct starpu_sched_policy *policy)
 
				 	enum starpu_worker_archtype cpu_task_worker, gpu_task_worker;
			
 
				 	cpu_task_worker = starpu_worker_get_type(cpu_task->profiling_info->workerid);
			
 
				 	gpu_task_worker = starpu_worker_get_type(gpu_task->profiling_info->workerid);
			
 
				-	if (cpu_task_worker != STARPU_CPU_WORKER ||
			
 
				-			(gpu_task_worker != STARPU_CUDA_WORKER &&
			
 
				-			 gpu_task_worker != STARPU_OPENCL_WORKER))
			
 
				+	if (cpu_task_worker != STARPU_CPU_WORKER || (gpu_task_worker != STARPU_CUDA_WORKER && gpu_task_worker != STARPU_OPENCL_WORKER))
			
 
				 	{
			
 
				+		FPRINTF(stderr, "Tasks did not execute on expected worker\n");
			
 
				 		if (cpu_task_worker != STARPU_CPU_WORKER)
			
 
				 		{
			
 
				 			FPRINTF(stderr, "The CPU task did not run on a CPU worker\n");
			
@@ -218,8 +212,10 @@ run(struct starpu_sched_policy *policy)
 
				 		ret = 1;
			
 
				 	}
			
 
				 	else
			
 
				+	{
			
 
				+		FPRINTF(stderr, "Tasks DID execute on expected worker\n");
			
 
				 		ret = 0;
			
 
				-
			
 
				+	}
			
 
				 
			
 
				 	starpu_task_destroy(cpu_task);
			
 
				 	starpu_task_destroy(gpu_task);
			
--- a/tools/gdbinit
+++ b/tools/gdbinit
@@ -2,7 +2,7 @@
 
				 # StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				 #
			
 
				 # Copyright (C) 2010-2014  Université de Bordeaux
			
 
				-# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+# Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				 #
			
 
				 # StarPU is free software; you can redistribute it and/or modify
			
 
				 # it under the terms of the GNU Lesser General Public License as published by
			
@@ -475,6 +475,24 @@ define starpu-print-prequests
 
				   end
			
 
				 end
			
 
				 
			
 
				+define starpu-print-arch
			
 
				+  set $arch = (struct starpu_perfmodel_arch *)$arg0
			
 
				+  set $device = 0
			
 
				+  while $device < $arch->ndevices
			
 
				+    printf "  Device type %d - devid: %d - ncores: %d\n", $arch->devices[$device].type, $arch->devices[$device].devid, $arch->devices[$device].ncores
			
 
				+    set $device = $device + 1
			
 
				+  end
			
 
				+end
			
 
				+
			
 
				+define starpu-print-archs
			
 
				+  set $comb = 0
			
 
				+  while $comb < current_arch_comb
			
 
				+    printf "Combination %d with %d devices\n", $comb, arch_combs[$comb]->ndevices
			
 
				+    starpu-print-arch arch_combs[$comb]
			
 
				+    set $comb = $comb + 1
			
 
				+  end
			
 
				+end
			
 
				+
			
 
				 define starpu-print-frequests
			
 
				   set $node = 0
			
 
				   while $node < descr.nnodes
			
@@ -563,6 +581,25 @@ define starpu-memusage
 
				   end
			
 
				 end
			
 
				 
			
 
				+define starpu-print-model
			
 
				+    set $model = (struct starpu_perfmodel *)$arg0
			
 
				+    printf "Model %p type %d symbol ", $model, $model->type
			
 
				+    if $model->symbol
			
 
				+       printf "%s", $model->symbol
			
 
				+    else
			
 
				+       printf "NULL"
			
 
				+    end
			
 
				+    printf "\n"
			
 
				+end
			
 
				+
			
 
				+define starpu-print-registered-models
			
 
				+    set $node = registered_models
			
 
				+    while $node
			
 
				+    	  starpu-print-model $node->model
			
 
				+	  set $node = $node->next
			
 
				+    end
			
 
				+end
			
 
				+
			
 
				 document starpu
			
 
				 List of StarPU-specific gdb functions:
			
 
				 starpu-workers          prints a list of the StarPU workers
			
@@ -580,4 +617,8 @@ starpu-print-frequests  prints all StarPU prefetch data requests
 
				 starpu-tasks            prints a list of the tasks flowing in StarPU
			
 
				 starpu-tags             prints a list of the tags known to StarPU
			
 
				 starpu-memusage         prints the memory node usage
			
 
				+starpu-print-archs      prints all known arch combinations
			
 
				+starpu-print-arch       prints a given arch combination
			
 
				+starpu-print-registered-models prints all registered performance models
			
 
				+starpu-print-model      prints a given performance model
			
 
				 end
			
--- a/tools/starpu_perfmodel_plot.c
+++ b/tools/starpu_perfmodel_plot.c
@@ -37,33 +37,33 @@
 
				 
			
 
				 #define PROGNAME "starpu_perfmodel_plot"
			
 
				 
			
 
				-/* display all available models */
			
 
				-static int list = 0;
			
 
				-/* what kernel ? */
			
 
				-static char *symbol = NULL;
			
 
				-/* which architecture ? (NULL = all)*/
			
 
				-static char *archname = NULL;
			
 
				-/* Unless a FxT file is specified, we just display the model */
			
 
				-static int no_fxt_file = 1;
			
 
				-static int gflops = 0;
			
 
				-
			
 
				-#ifdef STARPU_USE_FXT
			
 
				-static struct starpu_fxt_codelet_event *dumped_codelets;
			
 
				-static struct starpu_fxt_options options;
			
 
				-#endif
			
 
				+struct _perfmodel_plot_options
			
 
				+{
			
 
				+	/* display all available models */
			
 
				+	int list;
			
 
				+	/* what kernel ? */
			
 
				+	char *symbol;
			
 
				+	/* which combination */
			
 
				+	int comb_is_set;
			
 
				+	int comb;
			
 
				+	/* display all available combinations of a specific model */
			
 
				+	int list_combs;
			
 
				+	int gflops;
			
 
				+	/* Set when an FxT file is specified; otherwise we just display the model */
			
 
				+	int with_fxt_file;
			
 
				+
			
 
				+	char avg_file_name[256];
			
 
				 
			
 
				 #ifdef STARPU_USE_FXT
			
 
				-static int **archtype_is_found[STARPU_NARCH];
			
 
				-
			
 
				-static char data_file_name[256];
			
 
				+	struct starpu_fxt_codelet_event *dumped_codelets;
			
 
				+	struct starpu_fxt_options fxt_options;
			
 
				+	char data_file_name[256];
			
 
				 #endif
			
 
				-static char avg_file_name[256];
			
 
				-static char gnuplot_file_name[256];
			
 
				+};
			
 
				 
			
 
				 static void usage()
			
 
				 {
			
 
				-	fprintf(stderr, "Draw a graph corresponding to the execution time of a \
			
 
				-given perfmodel\n");
			
 
				+	fprintf(stderr, "Draw a graph corresponding to the execution time of a given perfmodel\n");
			
 
				 	fprintf(stderr, "Usage: %s [ options ]\n", PROGNAME);
			
 
				         fprintf(stderr, "\n");
			
 
				 	fprintf(stderr, "One must specify a symbol with the -s option or use -l\n");
			
@@ -72,25 +72,28 @@ given perfmodel\n");
 
				         fprintf(stderr, "   -s <symbol>         specify the symbol\n");
			
 
				 	fprintf(stderr, "   -f                  draw GFlops instead of time\n");
			
 
				 	fprintf(stderr, "   -i <Fxt files>      input FxT files generated by StarPU\n");
			
 
				-        fprintf(stderr, "   -a <arch>           specify the architecture (e.g. cpu, cpu:x, cuda, cuda_d, opencl, opencl_d)\n");
			
 
				+	fprintf(stderr, "   -lc                 display all combinations of a given model\n");
			
 
				+        fprintf(stderr, "   -c <combination>    specify the combination (use the option -lc to list all combinations of a given model)\n");
			
 
				 	fprintf(stderr, "   -h, --help          display this help and exit\n");
			
 
				 	fprintf(stderr, "   -v, --version       output version information and exit\n\n");
			
 
				         fprintf(stderr, "Report bugs to <%s>.", PACKAGE_BUGREPORT);
			
 
				         fprintf(stderr, "\n");
			
 
				 }
			
 
				 
			
 
				-static void parse_args(int argc, char **argv)
			
 
				+static void parse_args(int argc, char **argv, struct _perfmodel_plot_options *options)
			
 
				 {
			
 
				+	memset(options, 0, sizeof(struct _perfmodel_plot_options));
			
 
				+
			
 
				 #ifdef STARPU_USE_FXT
			
 
				 	/* Default options */
			
 
				-	starpu_fxt_options_init(&options);
			
 
				+	starpu_fxt_options_init(&options->fxt_options);
			
 
				 
			
 
				-	options.out_paje_path = NULL;
			
 
				-	options.activity_path = NULL;
			
 
				-	options.distrib_time_path = NULL;
			
 
				-	options.dag_path = NULL;
			
 
				+	options->fxt_options.out_paje_path = NULL;
			
 
				+	options->fxt_options.activity_path = NULL;
			
 
				+	options->fxt_options.distrib_time_path = NULL;
			
 
				+	options->fxt_options.dag_path = NULL;
			
 
				 
			
 
				-	options.dumped_codelets = &dumped_codelets;
			
 
				+	options->fxt_options.dumped_codelets = &options->dumped_codelets;
			
 
				 #endif
			
 
				 
			
 
				 	/* We want to support arguments such as "-i trace_*" */
			
@@ -101,7 +104,7 @@ static void parse_args(int argc, char **argv)
 
				 	{
			
 
				 		if (strcmp(argv[i], "-s") == 0)
			
 
				 		{
			
 
				-			symbol = argv[++i];
			
 
				+			options->symbol = argv[++i];
			
 
				 			continue;
			
 
				 		}
			
 
				 
			
@@ -109,8 +112,8 @@ static void parse_args(int argc, char **argv)
 
				 		{
			
 
				 			reading_input_filenames = 1;
			
 
				 #ifdef STARPU_USE_FXT
			
 
				-			options.filenames[options.ninputfiles++] = argv[++i];
			
 
				-			no_fxt_file = 0;
			
 
				+			options->fxt_options.filenames[options->fxt_options.ninputfiles++] = argv[++i];
			
 
				+			options->with_fxt_file = 1;
			
 
				 #else
			
 
				 			fprintf(stderr, "Warning: FxT support was not enabled in StarPU: FxT traces will thus be ignored!\n");
			
 
				 #endif
			
@@ -119,19 +122,26 @@ static void parse_args(int argc, char **argv)
 
				 
			
 
				 		if (strcmp(argv[i], "-l") == 0)
			
 
				 		{
			
 
				-			list = 1;
			
 
				+			options->list = 1;
			
 
				+			continue;
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-lc") == 0)
			
 
				+		{
			
 
				+			options->list_combs = 1;
			
 
				 			continue;
			
 
				 		}
			
 
				 
			
 
				 		if (strcmp(argv[i], "-f") == 0)
			
 
				 		{
			
 
				-			gflops = 1;
			
 
				+			options->gflops = 1;
			
 
				 			continue;
			
 
				 		}
			
 
				 
			
 
				-		if (strcmp(argv[i], "-a") == 0)
			
 
				+		if (strcmp(argv[i], "-c") == 0)
			
 
				 		{
			
 
				-			archname = argv[++i];
			
 
				+			options->comb_is_set = 1;
			
 
				+			options->comb = atoi(argv[++i]);
			
 
				 			continue;
			
 
				 		}
			
 
				 
			
@@ -155,19 +165,18 @@ static void parse_args(int argc, char **argv)
 
				 		if (reading_input_filenames)
			
 
				 		{
			
 
				 #ifdef STARPU_USE_FXT
			
 
				-			options.filenames[options.ninputfiles++] = argv[i];
			
 
				+			options->fxt_options.filenames[options->fxt_options.ninputfiles++] = argv[i];
			
 
				 #endif
			
 
				 			continue;
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	if (!symbol && !list)
			
 
				+	if ((!options->symbol && !options->list) || (options->list_combs && !options->symbol))
			
 
				 	{
			
 
				 		fprintf(stderr, "Incorrect usage, aborting\n");
			
 
				                 usage();
			
 
				 		exit(-1);
			
 
				 	}
			
 
				-
			
 
				 }
			
 
				 
			
 
				 static char *replace_char(char *str, char old, char new)
			
@@ -194,27 +203,22 @@ static void print_comma(FILE *gnuplot_file, int *first)
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, int *first, unsigned nimpl)
			
 
				+static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel_arch* arch, struct starpu_perfmodel_per_arch *arch_model, int impl, int *first, struct _perfmodel_plot_options *options)
			
 
				 {
			
 
				 	char arch_name[256];
			
 
				-	starpu_perfmodel_get_arch_name(arch, arch_name, 256, nimpl);
			
 
				-
			
 
				-	struct starpu_perfmodel_per_arch *arch_model =
			
 
				-		&model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
			
 
				 
			
 
				-	if (arch_model->regression.valid || arch_model->regression.nl_valid)
			
 
				-		fprintf(stderr,"Arch: %s\n", arch_name);
			
 
				+	starpu_perfmodel_get_arch_name(arch, arch_name, 256, impl);
			
 
				 
			
 
				 #ifdef STARPU_USE_FXT
			
 
				-	if (!gflops && !no_fxt_file && archtype_is_found[arch->type][arch->devid][arch->ncore] && nimpl == 0)
			
 
				+	if (!options->gflops && options->with_fxt_file && impl == 0)
			
 
				 	{
			
 
				 		print_comma(gnuplot_file, first);
			
 
				-		fprintf(gnuplot_file, "\"< grep -w \\^%d_%d_%d %s\" using 2:3 title \"Profiling %s\"", arch->type, arch->devid, arch->ncore, data_file_name, arch_name);
			
 
				+		fprintf(gnuplot_file, "\"< grep -w \\^%s %s\" using 2:3 title \"Profiling %s\"", arch_name, options->data_file_name, replace_char(arch_name, '_', '-'));
			
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				 	/* Only display the regression model if we could actually build a model */
			
 
				-	if (!gflops && arch_model->regression.valid && !arch_model->regression.nl_valid)
			
 
				+	if (!options->gflops && arch_model->regression.valid && !arch_model->regression.nl_valid)
			
 
				 	{
			
 
				 		print_comma(gnuplot_file, first);
			
 
				 
			
@@ -226,7 +230,7 @@ static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel *mode
 
				 			arch_model->regression.alpha, arch_model->regression.beta, arch_name);
			
 
				 	}
			
 
				 
			
 
				-	if (!gflops && arch_model->regression.nl_valid)
			
 
				+	if (!options->gflops && arch_model->regression.nl_valid)
			
 
				 	{
			
 
				 		print_comma(gnuplot_file, first);
			
 
				 
			
@@ -240,90 +244,37 @@ static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel *mode
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_worker_archtype* type, int* devid, int* ncore, int *first)
			
 
				+static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, int *first, struct _perfmodel_plot_options *options)
			
 
				 {
			
 
				-	char *command;
			
 
				 	FILE *datafile;
			
 
				 	struct starpu_perfmodel_history_list *ptr;
			
 
				 	char arch_name[32];
			
 
				 	int col;
			
 
				-	size_t len;
			
 
				 	unsigned long last, minimum = 0;
			
 
				 
			
 
				-	len = 10 + strlen(avg_file_name) + 1;
			
 
				-	command = (char *) malloc(len);
			
 
				-	datafile = fopen(avg_file_name, "w");
			
 
				-	free(command);
			
 
				-
			
 
				+	datafile = fopen(options->avg_file_name, "w");
			
 
				 	col = 2;
			
 
				-	unsigned implid;
			
 
				 
			
 
				-	unsigned archmin, archmax, devmin, devmax, coremin, coremax;
			
 
				-	if(type != NULL)
			
 
				+	int i;
			
 
				+	for(i = 0; i < model->state->ncombs; i++)
			
 
				 	{
			
 
				-		archmin = *type;
			
 
				-		archmax = *type +1;
			
 
				-		if(devid != NULL)
			
 
				-		{
			
 
				-			devmin = *devid;
			
 
				-			devmax = *devid +1;
			
 
				-			if(ncore != NULL)
			
 
				-			{
			
 
				-				coremin = *ncore;
			
 
				-				coremax = *ncore +1;
			
 
				-			}
			
 
				-			else
			
 
				-			{
			
 
				-				coremin = 0;
			
 
				-				coremax = 0;
			
 
				-			}
			
 
				-		}
			
 
				-		else
			
 
				+		int comb = model->state->combs[i];
			
 
				+		if (options->comb_is_set == 0 || options->comb == comb)
			
 
				 		{
			
 
				-			devmin = 0;
			
 
				-			devmax = 0;
			
 
				-			coremin = 0;
			
 
				-			coremax = 0;
			
 
				-		}
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		archmin = 0;
			
 
				-		archmax = STARPU_NARCH;
			
 
				-		devmin = 0;
			
 
				-		devmax = 0;
			
 
				-		coremin = 0;
			
 
				-		coremax = 0;
			
 
				+			struct starpu_perfmodel_arch *arch;
			
 
				+			int impl;
			
 
				 
			
 
				-	}
			
 
				-	struct starpu_perfmodel_arch arch;
			
 
				-	unsigned archtype, dev, core;
			
 
				-	for (archtype = archmin; archtype < archmax; archtype++)
			
 
				-	{
			
 
				-		arch.type = archtype;
			
 
				-		if(model->per_arch[archtype]!=NULL)
			
 
				-		{
			
 
				-			for(dev = devmin; model->per_arch[archtype][dev] != NULL && (devmax == 0 || dev < devmax);dev++)
			
 
				+			arch = _starpu_arch_comb_get(comb);
			
 
				+			for(impl = 0; impl < model->state->nimpls[comb]; impl++)
			
 
				 			{
			
 
				-				arch.devid = dev;
			
 
				+				struct starpu_perfmodel_per_arch *arch_model = &model->state->per_arch[comb][impl];
			
 
				+				starpu_perfmodel_get_arch_name(arch, arch_name, 32, impl);
			
 
				 
			
 
				-				for(core = coremin; model->per_arch[archtype][dev][core] != NULL && (coremax == 0 || core < coremax); core++)
			
 
				+				if (arch_model->list)
			
 
				 				{
			
 
				-					arch.ncore = core;
			
 
				-					for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				-					{
			
 
				-						struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[archtype][dev][core][implid];
			
 
				-						starpu_perfmodel_get_arch_name(&arch, arch_name, 32, implid);
			
 
				-
			
 
				-						//ptrs[arch-arch1][implid] = ptr[arch-arch1][implid] = arch_model->list;
			
 
				-
			
 
				-						if (arch_model->list)
			
 
				-						{
			
 
				-							print_comma(gnuplot_file, first);
			
 
				-							fprintf(gnuplot_file, "\"%s\" using 1:%d:%d with errorlines title \"Average %s\"", avg_file_name, col, col+1, replace_char(arch_name, '_', '-'));
			
 
				-							col += 2;
			
 
				-						}
			
 
				-					}
			
 
				+					print_comma(gnuplot_file, first);
			
 
				+					fprintf(gnuplot_file, "\"%s\" using 1:%d:%d with errorlines title \"Average %s\"", options->avg_file_name, col, col+1, replace_char(arch_name, '_', '-'));
			
 
				+					col += 2;
			
 
				 				}
			
 
				 			}
			
 
				 		}
			
@@ -336,25 +287,20 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 
				 
			
 
				 		minimum = ULONG_MAX;
			
 
				 		/* Get the next minimum */
			
 
				-		for (archtype = archmin; archtype < archmax; archtype++)
			
 
				+		for(i = 0; i < model->state->ncombs; i++)
			
 
				 		{
			
 
				-			if(model->per_arch[archtype]!=NULL)
			
 
				+			int comb = model->state->combs[i];
			
 
				+			if (options->comb_is_set == 0 || options->comb == comb)
			
 
				 			{
			
 
				-				for(dev = devmin; model->per_arch[archtype][dev] != NULL && (devmax == 0 || dev < devmax);dev++)
			
 
				+				int impl;
			
 
				+				for(impl = 0; impl < model->state->nimpls[comb]; impl++)
			
 
				 				{
			
 
				-					for(core = coremin; model->per_arch[archtype][dev][core] != NULL && (coremax == 0 || core < coremax); core++)
			
 
				-				
			
 
				+					struct starpu_perfmodel_per_arch *arch_model = &model->state->per_arch[comb][impl];
			
 
				+					for (ptr = arch_model->list; ptr; ptr = ptr->next)
			
 
				 					{
			
 
				-						for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				-						{
			
 
				-							struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[archtype][dev][core][implid];
			
 
				-							for (ptr = arch_model->list; ptr; ptr = ptr->next)
			
 
				-							{
			
 
				-								unsigned long size = ptr->entry->size;
			
 
				-								if (size > last && size < minimum)
			
 
				-									minimum = size;
			
 
				-							}
			
 
				-						}
			
 
				+						unsigned long size = ptr->entry->size;
			
 
				+						if (size > last && size < minimum)
			
 
				+							minimum = size;
			
 
				 					}
			
 
				 				}
			
 
				 			}
			
@@ -364,160 +310,98 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 
				 
			
 
				 		fprintf(stderr, "%lu ", minimum);
			
 
				 		fprintf(datafile, "%-15lu ", minimum);
			
 
				-		for (archtype = archmin; archtype < archmax; archtype++)
			
 
				-			if(model->per_arch[archtype]!=NULL)
			
 
				-				for(dev = devmin; model->per_arch[archtype][dev] != NULL && (devmax == 0 || dev < devmax);dev++)
			
 
				-					for(core = coremin; model->per_arch[archtype][dev][core] != NULL && (coremax == 0 || core < coremax); core++)
			
 
				-						for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				+		for(i = 0; i < model->state->ncombs; i++)
			
 
				+		{
			
 
				+			int comb = model->state->combs[i];
			
 
				+			if (options->comb_is_set == 0 || options->comb == comb)
			
 
				+			{
			
 
				+				int impl;
			
 
				+
			
 
				+				for(impl = 0; impl < model->state->nimpls[comb]; impl++)
			
 
				+				{
			
 
				+					struct starpu_perfmodel_per_arch *arch_model = &model->state->per_arch[comb][impl];
			
 
				+					for (ptr = arch_model->list; ptr; ptr = ptr->next)
			
 
				+					{
			
 
				+						struct starpu_perfmodel_history_entry *entry = ptr->entry;
			
 
				+						if (entry->size == minimum)
			
 
				 						{
			
 
				-							struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[archtype][dev][core][implid];
			
 
				-							for (ptr = arch_model->list; ptr; ptr = ptr->next)
			
 
				-							{
			
 
				-								struct starpu_perfmodel_history_entry *entry = ptr->entry;
			
 
				-								if (entry->size == minimum)
			
 
				-								{
			
 
				-									if (gflops)
			
 
				-										fprintf(datafile, "\t%-15le\t%-15le", entry->flops / (entry->mean * 1000),
			
 
				-												entry->flops / ((entry->mean + entry->deviation) * 1000) -
			
 
				-												entry->flops / (entry->mean * 1000)
			
 
				-										       );
			
 
				-									else
			
 
				-										fprintf(datafile, "\t%-15le\t%-15le", 0.001*entry->mean, 0.001*entry->deviation);
			
 
				-									break;
			
 
				-								}
			
 
				-							}
			
 
				-							if (!ptr && arch_model->list)
			
 
				-								/* No value for this arch. */
			
 
				-								fprintf(datafile, "\t\"\"\t\"\"");
			
 
				+							if (options->gflops)
			
 
				+								fprintf(datafile, "\t%-15le\t%-15le", entry->flops / (entry->mean * 1000),
			
 
				+									entry->flops / ((entry->mean + entry->deviation) * 1000) -
			
 
				+									entry->flops / (entry->mean * 1000)
			
 
				+									);
			
 
				+							else
			
 
				+								fprintf(datafile, "\t%-15le\t%-15le", 0.001*entry->mean, 0.001*entry->deviation);
			
 
				+							break;
			
 
				 						}
			
 
				+					}
			
 
				+					if (!ptr && arch_model->list)
			
 
				+						/* No value for this arch. */
			
 
				+						fprintf(datafile, "\t\"\"\t\"\"");
			
 
				+				}
			
 
				+			}
			
 
				+		}
			
 
				 		fprintf(datafile, "\n");
			
 
				 	}
			
 
				 	fprintf(stderr, "\n");
			
 
				-	fclose(datafile);
			
 
				-}
			
 
				-
			
 
				-
			
 
				-static void display_selected_arch_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, int *first)
			
 
				-{
			
 
				-	unsigned implid;
			
 
				-	for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				-		display_perf_model(gnuplot_file, model, arch, first, implid);
			
 
				-}
			
 
				-
			
 
				-static void display_selected_device_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_worker_archtype archtype, int devid, int *first)
			
 
				-{
			
 
				-	unsigned ncore;
			
 
				-	struct starpu_perfmodel_arch arch;
			
 
				-	arch.type = archtype;
			
 
				-	arch.devid = devid;
			
 
				-	for(ncore=0; model->per_arch[archtype][devid][ncore] != NULL; ncore++)
			
 
				-	{
			
 
				-		arch.ncore = ncore;
			
 
				-		display_selected_arch_perf_models(gnuplot_file,model,&arch,first);
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-static void display_selected_archtype_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_worker_archtype archtype, int *first)
			
 
				-{
			
 
				-	unsigned devid;
			
 
				-	for(devid=0; model->per_arch[archtype][devid] != NULL; devid++)
			
 
				-		display_selected_device_perf_models(gnuplot_file,model,archtype,devid,first);
			
 
				-}
			
 
				-
			
 
				-static void display_all_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, int *first)
			
 
				-{
			
 
				-	unsigned archtype;
			
 
				-	for(archtype = 0; archtype < STARPU_NARCH; archtype++)
			
 
				-		display_selected_archtype_perf_models(gnuplot_file,model,archtype,first);
			
 
				-}
			
 
				-
			
 
				-#ifdef STARPU_USE_FXT
			
 
				-static int ** init_archtype_is_found_per_arch(int maxdevid, unsigned* maxncore_table)
			
 
				-{
			
 
				-	int devid, ncore;
			
 
				-	int ** archtype_is_found_per_arch = malloc(sizeof(*archtype_is_found_per_arch)*(maxdevid+1));
			
 
				-	archtype_is_found_per_arch[maxdevid] = NULL;
			
 
				-	for(devid=0; devid<maxdevid; devid++)
			
 
				-	{
			
 
				-		int maxncore;
			
 
				-		if(maxncore_table != NULL)
			
 
				-			maxncore = maxncore_table[devid];
			
 
				-		else
			
 
				-			maxncore = 1;
			
 
				-		
			
 
				-		archtype_is_found_per_arch[devid] = malloc(sizeof(*archtype_is_found_per_arch[devid])*(maxncore+1));
			
 
				-		archtype_is_found_per_arch[devid][maxncore] = 0;
			
 
				-		for(ncore=0; ncore<maxncore; ncore++)
			
 
				-			archtype_is_found_per_arch[devid][ncore] = 0;
			
 
				-	}
			
 
				-	return archtype_is_found_per_arch;
			
 
				 
			
 
				+	fclose(datafile);
			
 
				 }
			
 
				 
			
 
				-
			
 
				-static void init_archtype_is_found(struct starpu_perfmodel *model)
			
 
				+static void display_all_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, int *first, struct _perfmodel_plot_options *options)
			
 
				 {
			
 
				-	unsigned archtype, devid, ndevice, ncore, *maxncore;
			
 
				-
			
 
				-	for(archtype = 0; archtype < STARPU_NARCH; archtype++)
			
 
				+	int i;
			
 
				+	for(i = 0; i < model->state->ncombs; i++)
			
 
				 	{
			
 
				-	
			
 
				-		for(devid=0; model->per_arch[archtype][devid] != NULL; devid++)
			
 
				-			;
			
 
				-		ndevice = devid;
			
 
				-		if(ndevice != 0)
			
 
				+		int comb = model->state->combs[i];
			
 
				+		if (options->comb_is_set == 0 || options->comb == comb)
			
 
				 		{
			
 
				-			maxncore = malloc(sizeof(*maxncore)*ndevice);
			
 
				-			for(devid=0; devid < ndevice; devid++)
			
 
				+			struct starpu_perfmodel_arch *arch;
			
 
				+			int impl;
			
 
				+
			
 
				+			arch = _starpu_arch_comb_get(comb);
			
 
				+			for(impl = 0; impl < model->state->nimpls[comb]; impl++)
			
 
				 			{
			
 
				-				for(ncore=0; model->per_arch[archtype][devid][ncore] != NULL; ncore++)
			
 
				-					;
			
 
				-				maxncore[devid] = ncore;
			
 
				+				struct starpu_perfmodel_per_arch *archmodel = &model->state->per_arch[comb][impl];
			
 
				+				display_perf_model(gnuplot_file, arch, archmodel, impl, first, options);
			
 
				 			}
			
 
				 		}
			
 
				-		else
			
 
				-		{
			
 
				-			maxncore = NULL;
			
 
				-		}
			
 
				-
			
 
				-		archtype_is_found[archtype] = init_archtype_is_found_per_arch(ndevice,maxncore);
			
 
				-		if(maxncore != NULL)
			
 
				-			free(maxncore);
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				-
			
 
				-static void dump_data_file(FILE *data_file, struct starpu_perfmodel *model)
			
 
				+#ifdef STARPU_USE_FXT
			
 
				+static void dump_data_file(FILE *data_file, struct _perfmodel_plot_options *options)
			
 
				 {
			
 
				-	init_archtype_is_found(model);
			
 
				-
			
 
				 	int i;
			
 
				-	for (i = 0; i < options.dumped_codelets_count; i++)
			
 
				+	for (i = 0; i < options->fxt_options.dumped_codelets_count; i++)
			
 
				 	{
			
 
				-		/* Dump only if the symbol matches user's request */
			
 
				-		if (strncmp(dumped_codelets[i].symbol, symbol, (FXT_MAX_PARAMS - 4)*sizeof(unsigned long)-1) == 0)
			
 
				+		/* Dump only if the codelet symbol matches user's request (with or without the machine name) */
			
 
				+		char *tmp = strdup(options->symbol);
			
 
				+		char *dot = strchr(tmp, '.');
			
 
				+		if (dot) tmp[strlen(tmp)-strlen(dot)] = '\0';
			
 
				+		if ((strncmp(options->dumped_codelets[i].symbol, options->symbol, (FXT_MAX_PARAMS - 4)*sizeof(unsigned long)-1) == 0)
			
 
				+		    || (strncmp(options->dumped_codelets[i].symbol, tmp, (FXT_MAX_PARAMS - 4)*sizeof(unsigned long)-1) == 0))
			
 
				 		{
			
 
				-			struct starpu_perfmodel_arch* arch = &dumped_codelets[i].arch;
			
 
				-			archtype_is_found[arch->type][arch->devid][arch->ncore] = 1;
			
 
				-
			
 
				-			size_t size = dumped_codelets[i].size;
			
 
				-			float time = dumped_codelets[i].time;
			
 
				+			char *archname = options->dumped_codelets[i].perfmodel_archname;
			
 
				+			size_t size = options->dumped_codelets[i].size;
			
 
				+			float time = options->dumped_codelets[i].time;
			
 
				 
			
 
				-			fprintf(data_file, "%d_%d_%d	%f	%f\n", arch->type, arch->devid, arch->ncore, (float)size, time);
			
 
				+			fprintf(data_file, "%s	%f	%f\n", archname, (float)size, time);
			
 
				 		}
			
 
				+		free(tmp);
			
 
				 	}
			
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-static void display_selected_models(FILE *gnuplot_file, struct starpu_perfmodel *model)
			
 
				+static void display_selected_models(FILE *gnuplot_file, struct starpu_perfmodel *model, struct _perfmodel_plot_options *options)
			
 
				 {
			
 
				 	fprintf(gnuplot_file, "#!/usr/bin/gnuplot -persist\n");
			
 
				 	fprintf(gnuplot_file, "\n");
			
 
				 	fprintf(gnuplot_file, "set term postscript eps enhanced color\n");
			
 
				-	fprintf(gnuplot_file, "set output \"starpu_%s.eps\"\n", symbol);
			
 
				-	fprintf(gnuplot_file, "set title \"Model for codelet %s\"\n", replace_char(symbol, '_', '-'));
			
 
				+	fprintf(gnuplot_file, "set output \"starpu_%s.eps\"\n", options->symbol);
			
 
				+	fprintf(gnuplot_file, "set title \"Model for codelet %s\"\n", replace_char(options->symbol, '_', '-'));
			
 
				 	fprintf(gnuplot_file, "set xlabel \"Total data size\"\n");
			
 
				-	if (gflops)
			
 
				+	if (options->gflops)
			
 
				 		fprintf(gnuplot_file, "set ylabel \"GFlops\"\n");
			
 
				 	else
			
 
				 		fprintf(gnuplot_file, "set ylabel \"Time (ms)\"\n");
			
@@ -529,134 +413,32 @@ static void display_selected_models(FILE *gnuplot_file, struct starpu_perfmodel
 
				 
			
 
				 	/* If no input data is given to gnuplot, we at least need to specify an
			
 
				 	 * arbitrary range. */
			
 
				-	if (no_fxt_file)
			
 
				+	if (options->with_fxt_file == 0)
			
 
				 		fprintf(gnuplot_file, "set xrange [1:10**9]\n\n");
			
 
				 
			
 
				 	int first = 1;
			
 
				 	fprintf(gnuplot_file, "plot\t");
			
 
				 
			
 
				-	struct starpu_perfmodel_arch arch;
			
 
				-	struct _starpu_machine_config *conf = _starpu_get_machine_config();
			
 
				-
			
 
				-
			
 
				-
			
 
				-	if (archname == NULL)
			
 
				-	{
			
 
				-		/* display all architectures */
			
 
				-		display_all_perf_models(gnuplot_file, model, &first);
			
 
				-		display_history_based_perf_models(gnuplot_file, model, NULL, NULL, NULL, &first);
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		if (strcmp(archname, "cpu") == 0)
			
 
				-		{
			
 
				-			
			
 
				-			arch.type = STARPU_CPU_WORKER;
			
 
				-			arch.devid = 1;
			
 
				-			arch.ncore = 0;
			
 
				-
			
 
				-			display_selected_arch_perf_models(gnuplot_file, model, &arch, &first); 
			
 
				-			display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
			
 
				-			return;
			
 
				-		}
			
 
				-
			
 
				-		unsigned k;
			
 
				-		if (sscanf(archname, "cpu:%u", &k) == 1)
			
 
				-		{
			
 
				-			/* For combined CPU workers */
			
 
				-			if ((k < 1) || (k > conf->topology.ncpus))
			
 
				-			{
			
 
				-				fprintf(stderr, "Invalid CPU size\n");
			
 
				-				exit(-1);
			
 
				-			}
			
 
				-
			
 
				-			arch.type = STARPU_CPU_WORKER;
			
 
				-			arch.devid = 1;
			
 
				-			arch.ncore = k - 1;
			
 
				-
			
 
				-			display_selected_arch_perf_models(gnuplot_file, model, &arch, &first); 
			
 
				-			display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
			
 
				-			return;
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(archname, "cuda") == 0)
			
 
				-		{
			
 
				-			unsigned archtype = STARPU_CUDA_WORKER;
			
 
				-			display_selected_archtype_perf_models(gnuplot_file, model, archtype, &first);
			
 
				-			display_history_based_perf_models(gnuplot_file, model,  &archtype, NULL, NULL, &first);
			
 
				-			return;
			
 
				-		}
			
 
				-
			
 
				-		/* There must be a cleaner way ! */
			
 
				-		unsigned gpuid;
			
 
				-		int nmatched;
			
 
				-		nmatched = sscanf(archname, "cuda_%u", &gpuid);
			
 
				-		if (nmatched == 1)
			
 
				-		{
			
 
				-			if (gpuid < conf->topology.ncudagpus)
			
 
				-			{
			
 
				-				arch.type = STARPU_CUDA_WORKER;
			
 
				-				arch.devid = gpuid;
			
 
				-				arch.ncore = 0;
			
 
				-
			
 
				-				display_selected_arch_perf_models(gnuplot_file, model, &arch, &first);
			
 
				-				display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
			
 
				-				return;
			
 
				-			}
			
 
				-			else
			
 
				-			{
			
 
				-				fprintf(stderr, "Invalid CUDA device %d (last valid one is %d)\n", gpuid, STARPU_MAXCUDADEVS-1);
			
 
				-				exit(-1);
			
 
				-			}
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(archname, "opencl") == 0)
			
 
				-		{
			
 
				-			unsigned archtype = STARPU_OPENCL_WORKER;
			
 
				-			display_selected_archtype_perf_models(gnuplot_file, model, archtype, &first);
			
 
				-			display_history_based_perf_models(gnuplot_file, model,  &archtype, NULL, NULL, &first);
			
 
				-			return;
			
 
				-		}
			
 
				-
			
 
				-		/* There must be a cleaner way ! */
			
 
				-		nmatched = sscanf(archname, "opencl_%u", &gpuid);
			
 
				-		if (nmatched == 1)
			
 
				-		{
			
 
				-			if (gpuid < conf->topology.nopenclgpus)
			
 
				-			{
			
 
				-				arch.type = STARPU_OPENCL_WORKER;
			
 
				-				arch.devid = gpuid;
			
 
				-				arch.ncore = 0;
			
 
				-		
			
 
				-				display_selected_arch_perf_models(gnuplot_file, model, &arch, &first);
			
 
				-				display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
			
 
				-				return;
			
 
				-			}
			
 
				-			else
			
 
				-			{
			
 
				-				fprintf(stderr, "Invalid OpenCL device %d (last valid one is %d)\n", gpuid, STARPU_MAXOPENCLDEVS-1);
			
 
				-				exit(-1);
			
 
				-			}
			
 
				-		}
			
 
				-
			
 
				-		fprintf(stderr, "Unknown architecture requested, aborting.\n");
			
 
				-		exit(-1);
			
 
				-	}
			
 
				+	/* display all or selected combinations */
			
 
				+	display_all_perf_models(gnuplot_file, model, &first, options);
			
 
				+	display_history_based_perf_models(gnuplot_file, model, &first, options);
			
 
				 }
			
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				-	int ret;
			
 
				+	int ret = 0;
			
 
				 	struct starpu_perfmodel model = {};
			
 
				+	char gnuplot_file_name[256];
			
 
				+	struct _perfmodel_plot_options options;
			
 
				 
			
 
				 #if defined(_WIN32) && !defined(__CYGWIN__)
			
 
				 	WSADATA wsadata;
			
 
				 	WSAStartup(MAKEWORD(1,0), &wsadata);
			
 
				 #endif
			
 
				 
			
 
				-	parse_args(argc, argv);
			
 
				+	parse_args(argc, argv, &options);
			
 
				 
			
 
				-        if (list)
			
 
				+        if (options.list)
			
 
				 	{
			
 
				                 ret = starpu_perfmodel_list(stdout);
			
 
				                 if (ret)
			
@@ -668,35 +450,47 @@ int main(int argc, char **argv)
 
				         }
			
 
				 
			
 
				 	/* Load the performance model associated to the symbol */
			
 
				-	ret = starpu_perfmodel_load_symbol(symbol, &model);
			
 
				+	ret = starpu_perfmodel_load_symbol(options.symbol, &model);
			
 
				 	if (ret == 1)
			
 
				 	{
			
 
				-		fprintf(stderr, "The performance model for the symbol <%s> could not be loaded\n", symbol);
			
 
				+		fprintf(stderr, "The performance model for the symbol <%s> could not be loaded\n", options.symbol);
			
 
				 		return 1;
			
 
				 	}
			
 
				 
			
 
				+        if (options.list_combs)
			
 
				+	{
			
 
				+		ret = starpu_perfmodel_list_combs(stdout, &model);
			
 
				+                if (ret)
			
 
				+		{
			
 
				+                        fprintf(stderr, "Error when listing combinations for model <%s>\n", options.symbol);
			
 
				+                        return 1;
			
 
				+                }
			
 
				+		return 0;
			
 
				+
			
 
				+	}
			
 
				+
			
 
				 	/* If some FxT input was specified, we put the points on the graph */
			
 
				 #ifdef STARPU_USE_FXT
			
 
				-	if (!no_fxt_file)
			
 
				+	if (options.with_fxt_file)
			
 
				 	{
			
 
				-		starpu_fxt_generate_trace(&options);
			
 
				+		starpu_fxt_generate_trace(&options.fxt_options);
			
 
				 
			
 
				-		snprintf(data_file_name, 256, "starpu_%s.data", symbol);
			
 
				+		snprintf(options.data_file_name, 256, "starpu_%s.data", options.symbol);
			
 
				 
			
 
				-		FILE *data_file = fopen(data_file_name, "w+");
			
 
				+		FILE *data_file = fopen(options.data_file_name, "w+");
			
 
				 		STARPU_ASSERT(data_file);
			
 
				-		dump_data_file(data_file, &model);
			
 
				+		dump_data_file(data_file, &options);
			
 
				 		fclose(data_file);
			
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				-	snprintf(gnuplot_file_name, 256, "starpu_%s.gp", symbol);
			
 
				-
			
 
				-	snprintf(avg_file_name, 256, "starpu_%s_avg.data", symbol);
			
 
				+	snprintf(gnuplot_file_name, 256, "starpu_%s.gp", options.symbol);
			
 
				+	snprintf(options.avg_file_name, 256, "starpu_%s_avg.data", options.symbol);
			
 
				 
			
 
				 	FILE *gnuplot_file = fopen(gnuplot_file_name, "w+");
			
 
				 	STARPU_ASSERT(gnuplot_file);
			
 
				-	display_selected_models(gnuplot_file, &model);
			
 
				+	display_selected_models(gnuplot_file, &model, &options);
			
 
				+	fprintf(gnuplot_file,"\n");
			
 
				 	fclose(gnuplot_file);
			
 
				 
			
 
				 	/* Retrieve the current mode of the gnuplot executable */