Просмотр исходного кода

merge branches/perf_model into trunk

Nathalie Furmento лет назад: 11
Родитель
Коммит
a4d31c80da
60 измененных файлов с 2052 добавлено и 1290 удалено
  1. 3 0
      ChangeLog
  2. 2 1
      examples/Makefile.am
  3. 18 6
      examples/cholesky/cholesky_models.c
  4. 12 8
      examples/heat/lu_kernels_model.c
  5. 2 0
      examples/sched_ctx/nested_sched_ctxs.c
  6. 2 1
      examples/sched_ctx/sched_ctx.c
  7. 13 2
      examples/sched_ctx/sched_ctx_without_sched_policy.c
  8. 171 0
      examples/sched_ctx/sched_ctx_without_sched_policy_awake.c
  9. 1 1
      include/starpu_fxt.h
  10. 34 11
      include/starpu_perfmodel.h
  11. 1 0
      include/starpu_sched_ctx.h
  12. 3 1
      include/starpu_task.h
  13. 19 18
      include/starpu_task_util.h
  14. 20 6
      sc_hypervisor/examples/cholesky/cholesky_models.c
  15. 1 1
      sc_hypervisor/src/policies_utils/policy_tools.c
  16. 6 3
      sc_hypervisor/src/sc_hypervisor.c
  17. 13 11
      src/common/fxt.h
  18. 6 5
      src/core/combined_workers.c
  19. 2 2
      src/core/detect_combined_workers.c
  20. 3 2
      src/core/jobs.c
  21. 46 36
      src/core/perfmodel/perfmodel.c
  22. 21 5
      src/core/perfmodel/perfmodel.h
  23. 587 446
      src/core/perfmodel/perfmodel_history.c
  24. 61 38
      src/core/perfmodel/perfmodel_print.c
  25. 290 60
      src/core/sched_ctx.c
  26. 12 1
      src/core/sched_ctx.h
  27. 31 2
      src/core/sched_policy.c
  28. 18 15
      src/core/task.c
  29. 31 22
      src/core/topology.c
  30. 10 2
      src/core/workers.c
  31. 1 0
      src/datawizard/coherency.c
  32. 25 22
      src/datawizard/footprint.c
  33. 69 56
      src/debug/traces/starpu_fxt.c
  34. 1 0
      src/debug/traces/starpu_paje.c
  35. 16 6
      src/drivers/cpu/driver_cpu.c
  36. 8 1
      src/drivers/cuda/driver_cuda.c
  37. 79 10
      src/drivers/driver_common/driver_common.c
  38. 6 1
      src/drivers/opencl/driver_opencl.c
  39. 12 12
      src/profiling/bound.c
  40. 6 6
      src/sched_policies/component_best_implementation.c
  41. 1 1
      src/sched_policies/component_eager_calibration.c
  42. 2 2
      src/sched_policies/component_fifo.c
  43. 2 2
      src/sched_policies/component_prio.c
  44. 1 1
      src/sched_policies/component_random.c
  45. 2 2
      src/sched_policies/component_sched.c
  46. 1 1
      src/sched_policies/component_work_stealing.c
  47. 1 1
      src/sched_policies/component_worker.c
  48. 11 11
      src/sched_policies/deque_modeling_policy_data_aware.c
  49. 4 4
      src/sched_policies/parallel_heft.c
  50. 2 2
      src/sched_policies/random_policy.c
  51. 13 0
      src/util/starpu_task_insert_utils.c
  52. 1 1
      src/worker_collection/worker_list.c
  53. 1 0
      tests/Makefile.am
  54. 6 4
      tests/perfmodels/feed.c
  55. 64 0
      tests/perfmodels/memory.c
  56. 1 1
      tests/perfmodels/regression_based.c
  57. 24 15
      tests/perfmodels/valid_model.c
  58. 39 43
      tests/sched_policies/simple_cpu_gpu_sched.c
  59. 42 1
      tools/gdbinit
  60. 172 378
      tools/starpu_perfmodel_plot.c

+ 3 - 0
ChangeLog

@@ -63,6 +63,9 @@ New features:
     modes field to the task structure, which permit to define codelets taking a
     variable number of data.
   * Add support for implementing OpenMP runtimes on top of StarPU
+  * New performance model format to better represent parallel tasks.
+    Used to provide estimations for the execution times of the
+    parallel tasks on scheduling contexts or combined workers.
 
 Small features:
   * Tasks can now have a name (via the field const char *name of

+ 2 - 1
examples/Makefile.am

@@ -255,7 +255,8 @@ STARPU_EXAMPLES +=				\
 	openmp/vector_scal_omp			\
 	sched_ctx/sched_ctx_without_sched_policy\
 	sched_ctx/nested_sched_ctxs		\
-	sched_ctx/sched_ctx_without_sched_policy
+	sched_ctx/sched_ctx_without_sched_policy\
+	sched_ctx/sched_ctx_without_sched_policy_awake 
 
 if STARPU_LONG_CHECK
 STARPU_EXAMPLES +=				\

+ 18 - 6
examples/cholesky/cholesky_models.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -128,13 +128,25 @@ double cuda_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_
 }
 
 void initialize_chol_model(struct starpu_perfmodel* model, char * symbol,
-		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned),
-		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
+			   double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned),
+			   double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
 {
+	struct starpu_perfmodel_per_arch *per_arch;
+
 	model->symbol = symbol;
 	model->type = STARPU_HISTORY_BASED;
-	starpu_perfmodel_init(model);
-	model->per_arch[STARPU_CPU_WORKER][0][0][0].cost_function = cpu_cost_function;
+
+	starpu_perfmodel_init(NULL, model);
+
+	per_arch = starpu_perfmodel_get_model_per_devices(model, 0, STARPU_CPU_WORKER, 0, 1, -1);
+        per_arch->cost_function = cpu_cost_function;
+	// We could also call directly:
+	// starpu_perfmodel_set_per_devices_cost_function(model, 0, cpu_cost_function, STARPU_CPU_WORKER, 0, 1, -1);
+
 	if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) != 0)
-		model->per_arch[STARPU_CUDA_WORKER][0][0][0].cost_function = cuda_cost_function;
+	{
+	     	per_arch = starpu_perfmodel_get_model_per_devices(model, 0, STARPU_CUDA_WORKER, 0, 1, -1);
+		per_arch->cost_function = cuda_cost_function;
+
+	}
 }

+ 12 - 8
examples/heat/lu_kernels_model.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -215,15 +215,19 @@ double task_22_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch*
 }
 
 void initialize_lu_kernels_model(struct starpu_perfmodel* model, char * symbol,
-		double (*cost_function)(struct starpu_task *, unsigned),
-		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned),
-		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
+				 double (*cost_function)(struct starpu_task *, unsigned),
+				 double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned),
+				 double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
 {
 	model->symbol = symbol;
 	model->type = STARPU_HISTORY_BASED;
-	starpu_perfmodel_init(model);
-	model->cost_function = cost_function;
-	model->per_arch[STARPU_CPU_WORKER][0][0][0].cost_function = cpu_cost_function;
+
+	starpu_perfmodel_init(NULL, model);
+
+	starpu_perfmodel_set_per_devices_cost_function(model, 0, cpu_cost_function, STARPU_CPU_WORKER, 0, 1, -1);
+
 	if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) != 0)
-		model->per_arch[STARPU_CUDA_WORKER][0][0][0].cost_function = cuda_cost_function;
+	{
+		starpu_perfmodel_set_per_devices_cost_function(model, 0, cuda_cost_function, STARPU_CUDA_WORKER, 0, 1, -1);
+	}
 }

+ 2 - 0
examples/sched_ctx/nested_sched_ctxs.c

@@ -161,6 +161,7 @@ int main(int argc, char **argv)
 
 		task->cl = &sched_ctx_codelet;
 		task->cl_arg = sched_ctx1;
+		task->possibly_parallel = 1;
 
 		/*submit tasks to context*/
 		ret = starpu_task_submit_to_ctx(task,sched_ctx1);
@@ -174,6 +175,7 @@ int main(int argc, char **argv)
 
 		task->cl = &sched_ctx_codelet;
 		task->cl_arg = sched_ctx2;
+		task->possibly_parallel = 1;
 
 		/*submit tasks to context*/
 		ret = starpu_task_submit_to_ctx(task,sched_ctx2);

+ 2 - 1
examples/sched_ctx/sched_ctx.c

@@ -93,7 +93,7 @@ int main(int argc, char **argv)
 
 	/*create contexts however you want*/
 	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "eager", 0);
-	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", STARPU_SCHED_CTX_POLICY_NAME, "eager",  0);
+	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", STARPU_SCHED_CTX_POLICY_NAME, "eager", 0);
 
 	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
 	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
@@ -152,6 +152,7 @@ int main(int argc, char **argv)
 	/* wait for all tasks at the end*/
 	starpu_task_wait_for_all();
 
+	starpu_sched_ctx_add_workers(procs1, nprocs1, sched_ctx2);
 	starpu_sched_ctx_delete(sched_ctx1);
 	starpu_sched_ctx_delete(sched_ctx2);
 	printf("tasks executed %d out of %d\n", tasks_executed, ntasks/2);

+ 13 - 2
examples/sched_ctx/sched_ctx_without_sched_policy.c

@@ -59,7 +59,8 @@ static void sched_ctx_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
 static struct starpu_codelet sched_ctx_codelet =
 {
 	.cpu_funcs = {sched_ctx_func, NULL},
-	.cuda_funcs = { NULL},
+#warning FIXME: cuda_funcs should not need to be defined
+	.cuda_funcs = {sched_ctx_func, NULL},
 	.opencl_funcs = {NULL},
 	.model = NULL,
 	.nbuffers = 0,
@@ -83,8 +84,14 @@ int main(int argc, char **argv)
 	starpu_pthread_mutex_init(&mut, NULL);
 	int nprocs1 = 1;
 	int nprocs2 = 1;
-	int *procs1, *procs2;
+	int ncuda = 0;
+	int *procs1, *procs2, *procscuda;
 
+#ifdef STARPU_USE_CUDA
+	ncuda = starpu_cuda_worker_get_count();
+	procscuda = (int*)malloc(ncuda*sizeof(int));
+	starpu_worker_get_ids_by_type(STARPU_CUDA_WORKER, procscuda, ncuda);
+#endif
 #ifdef STARPU_USE_CPU
 	ncpus = starpu_cpu_worker_get_count();
 	procs1 = (int*)malloc(ncpus*sizeof(int));
@@ -108,6 +115,10 @@ int main(int argc, char **argv)
 #endif
 
 	if (ncpus == 0) goto enodev;
+	if (ncuda > 0 && nprocs1 > 1)
+	{
+		procs1[nprocs1-1] = procscuda[0];
+	}
 
 	/*create contexts however you want*/
 	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", 0);

+ 171 - 0
examples/sched_ctx/sched_ctx_without_sched_policy_awake.c

@@ -0,0 +1,171 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdint.h>
+#include <starpu.h>
+#include <omp.h>
+
+#ifdef STARPU_QUICK_CHECK
+#define NTASKS 64
+#else
+#define NTASKS 100
+#endif
+
+
+starpu_pthread_mutex_t mut;
+
+int tasks_executed[2][STARPU_NMAXWORKERS];
+/* Work performed by each task: count up to NTASKS, record the count in
+ * tasks_executed[sched_ctx-1] at the index of the calling worker, and
+ * return it. */
+int parallel_code(int sched_ctx)
+{
+	const int worker = starpu_worker_get_id();
+	int count = 0;
+	int step = 0;
+
+	while (step < NTASKS)
+	{
+		count++;
+		step++;
+	}
+
+	/* context ids are used 1-based here, hence the -1 on the row index */
+	tasks_executed[sched_ctx-1][worker] = count;
+	return count;
+}
+
+/* CPU implementation of the codelet. The tasks submitted in this file
+ * store the scheduling context id directly in cl_arg (the pointer value
+ * itself, not a pointee), so arg is decoded as an integer and never
+ * dereferenced. The data descriptors are unused. */
+static void sched_ctx_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+	/* Go through uintptr_t: casting a pointer straight to unsigned is a
+	 * pointer-to-narrower-integer conversion that compilers diagnose on
+	 * 64-bit targets. */
+	unsigned sched_ctx = (unsigned)(uintptr_t)arg;
+	parallel_code(sched_ctx);
+}
+
+
+/* Codelet used by every task of this example: only a CPU implementation
+ * (sched_ctx_func) is listed, the CUDA and OpenCL lists are empty, no
+ * performance model is attached and nbuffers is 0. */
+static struct starpu_codelet sched_ctx_codelet =
+{
+	.cpu_funcs = {sched_ctx_func, NULL},
+	.cuda_funcs = { NULL},
+	.opencl_funcs = {NULL},
+	.model = NULL,
+	.nbuffers = 0,
+	.name = "sched_ctx"
+};
+
+
+/* Splits the CPU workers between two scheduling contexts created with
+ * STARPU_SCHED_CTX_AWAKE_WORKERS, submits NTASKS tasks to each context,
+ * waits for them, and prints for each context the sum of the per-worker
+ * counts in tasks_executed divided by the context's worker count.
+ *
+ * Returns 77 when starpu_init() reports -ENODEV or when no CPU worker is
+ * available, 0 otherwise. */
+int main(int argc, char **argv)
+{
+	int i;
+	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	{
+		tasks_executed[0][i] = 0;
+		tasks_executed[1][i] = 0;
+	}
+	int ntasks = NTASKS;
+	int ret, j, k;
+	unsigned ncpus = 0;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_pthread_mutex_init(&mut, NULL);
+	int nprocs1 = 1;
+	int nprocs2 = 1;
+	/* Start from NULL so the free() calls at enodev are valid on every path. */
+	int *procs1 = NULL, *procs2 = NULL;
+
+#ifdef STARPU_USE_CPU
+	ncpus = starpu_cpu_worker_get_count();
+	/* Bail out before building the worker lists: with no CPU worker the
+	 * code below would read procs1[0] from a zero-sized allocation. */
+	if (ncpus == 0) goto enodev;
+
+	procs1 = (int*)malloc(ncpus*sizeof(int));
+	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
+
+	if(ncpus > 1)
+	{
+		/* first half of the CPU workers goes to ctx1, the rest to ctx2 */
+		nprocs1 = ncpus/2;
+		nprocs2 =  ncpus-nprocs1;
+		k = 0;
+		procs2 = (int*)malloc(nprocs2*sizeof(int));
+		for(j = nprocs1; j < nprocs1+nprocs2; j++)
+			procs2[k++] = procs1[j];
+	}
+	else
+	{
+		/* a single CPU worker is shared by both contexts */
+		procs2 = (int*)malloc(nprocs2*sizeof(int));
+		procs2[0] = procs1[0];
+
+	}
+#endif
+
+	/* still needed when STARPU_USE_CPU is not defined */
+	if (ncpus == 0) goto enodev;
+
+	/*create contexts however you want*/
+	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_AWAKE_WORKERS, 0);
+	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", STARPU_SCHED_CTX_AWAKE_WORKERS, 0);
+
+
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &sched_ctx_codelet;
+		/* the context id travels in the pointer value itself; the
+		 * explicit casts avoid an implicit integer-to-pointer conversion */
+		task->cl_arg = (void*)(uintptr_t)sched_ctx1;
+
+		/*submit tasks to context*/
+		ret = starpu_task_submit_to_ctx(task,sched_ctx1);
+
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &sched_ctx_codelet;
+		task->cl_arg = (void*)(uintptr_t)sched_ctx2;
+
+		/*submit tasks to context*/
+		ret = starpu_task_submit_to_ctx(task,sched_ctx2);
+
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	/* wait for all tasks at the end*/
+	starpu_task_wait_for_all();
+
+	starpu_sched_ctx_delete(sched_ctx1);
+	starpu_sched_ctx_delete(sched_ctx2);
+
+	int tasks_per_ctx[2];
+	tasks_per_ctx[0] = 0;
+	tasks_per_ctx[1] = 0;
+	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	{
+		tasks_per_ctx[0] += tasks_executed[0][i];
+		tasks_per_ctx[1] += tasks_executed[1][i];
+	}
+
+	/* context ids are unsigned, so print them with %u */
+	printf("ctx%u: tasks starpu executed %d out of %d\n", sched_ctx1, tasks_per_ctx[0]/nprocs1, NTASKS);
+	printf("ctx%u: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_per_ctx[1]/nprocs2, NTASKS);
+
+enodev:
+#ifdef STARPU_USE_CPU
+	free(procs1);
+	free(procs2);
+#endif
+	starpu_shutdown();
+	return ncpus == 0 ? 77 : 0;
+}

+ 1 - 1
include/starpu_fxt.h

@@ -31,7 +31,7 @@ struct starpu_fxt_codelet_event
 {
 	char symbol[256];
 	int workerid;
-	struct starpu_perfmodel_arch arch;
+	char perfmodel_archname[256];
 	uint32_t hash;
 	size_t size;
 	float time;

+ 34 - 11
include/starpu_perfmodel.h

@@ -35,13 +35,20 @@ struct starpu_data_descr;
 
 #define STARPU_NARCH STARPU_ANY_WORKER
 
-struct starpu_perfmodel_arch
+struct starpu_perfmodel_device
 {
 	enum starpu_worker_archtype type;
 	int devid;	/* identifier of the precise device */
-	int ncore;	/* number of execution in parallel, minus 1 */
+	int ncores;	/* number of execution in parallel, minus 1 */	
+};
+
+struct starpu_perfmodel_arch
+{
+	int ndevices;
+	struct starpu_perfmodel_device *devices;
 };
 
+
 struct starpu_perfmodel_history_entry
 {
 	double mean;
@@ -91,10 +98,13 @@ struct starpu_perfmodel_history_table;
 
 #define starpu_per_arch_perfmodel starpu_perfmodel_per_arch STARPU_DEPRECATED
 
+typedef double (*starpu_perfmodel_per_arch_cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+typedef size_t (*starpu_perfmodel_per_arch_size_base)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+
 struct starpu_perfmodel_per_arch
 {
-	double (*cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
-	size_t (*size_base)(struct starpu_task *, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+	starpu_perfmodel_per_arch_cost_function cost_function;
+	starpu_perfmodel_per_arch_size_base size_base;
 
 	struct starpu_perfmodel_history_table *history;
 	struct starpu_perfmodel_history_list *list;
@@ -114,6 +124,9 @@ enum starpu_perfmodel_type
 	STARPU_NL_REGRESSION_BASED
 };
 
+struct _starpu_perfmodel_state;
+typedef struct _starpu_perfmodel_state* starpu_perfmodel_state_t;
+
 struct starpu_perfmodel
 {
 	enum starpu_perfmodel_type type;
@@ -123,23 +136,31 @@ struct starpu_perfmodel
 	size_t (*size_base)(struct starpu_task *, unsigned nimpl);
 	uint32_t (*footprint)(struct starpu_task *);
 
-	struct starpu_perfmodel_per_arch**** per_arch; /*STARPU_MAXIMPLEMENTATIONS*/
-
 	const char *symbol;
 
-	unsigned is_init;
 	unsigned is_loaded;
 	unsigned benchmarking;
-	starpu_pthread_rwlock_t model_rwlock;
+	unsigned is_init;
+
+	starpu_perfmodel_state_t state;
 };
 
-void starpu_perfmodel_init(struct starpu_perfmodel *model);
-void starpu_perfmodel_init_with_file(FILE*f, struct starpu_perfmodel *model);
+void starpu_perfmodel_init(FILE *f, struct starpu_perfmodel *model);
+//void starpu_perfmodel_init_with_file(FILE*f, struct starpu_perfmodel *model);
 
-struct starpu_perfmodel_arch *starpu_worker_get_perf_archtype(int workerid);
+struct starpu_perfmodel_arch *starpu_worker_get_perf_archtype(int workerid, unsigned sched_ctx_id);
 
 int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model);
 int starpu_perfmodel_unload_model(struct starpu_perfmodel *model);
+int starpu_get_narch_combs();
+int starpu_perfmodel_arch_comb_add(int ndevices, struct starpu_perfmodel_device* devices);
+int starpu_perfmodel_arch_comb_get(int ndevices, struct starpu_perfmodel_device *devices);
+
+struct starpu_perfmodel_per_arch *starpu_perfmodel_get_model_per_arch(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, unsigned impl);
+struct starpu_perfmodel_per_arch *starpu_perfmodel_get_model_per_devices(struct starpu_perfmodel *model, int impl, ...);
+
+int starpu_perfmodel_set_per_devices_cost_function(struct starpu_perfmodel *model, int impl, starpu_perfmodel_per_arch_cost_function func, ...);
+int starpu_perfmodel_set_per_devices_size_base(struct starpu_perfmodel *model, int impl, starpu_perfmodel_per_arch_size_base func, ...);
 
 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, char *path, size_t maxlen, unsigned nimpl);
 char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype);
@@ -150,6 +171,8 @@ int starpu_perfmodel_list(FILE *output);
 void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output);
 int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char *parameter, uint32_t *footprint, FILE *output);
 
+int starpu_perfmodel_list_combs(FILE *output, struct starpu_perfmodel *model);
+
 void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned cpuid, unsigned nimpl, double measured);
 void starpu_perfmodel_directory(FILE *output);
 

+ 1 - 0
include/starpu_sched_ctx.h

@@ -30,6 +30,7 @@ extern "C"
 #define STARPU_SCHED_CTX_POLICY_MAX_PRIO	 (4<<16)
 #define STARPU_SCHED_CTX_HIERARCHY_LEVEL         (5<<16)
 #define STARPU_SCHED_CTX_NESTED                  (6<<16)
+#define STARPU_SCHED_CTX_AWAKE_WORKERS           (7<<16)
 
 unsigned starpu_sched_ctx_create(int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name, ...);
 

+ 3 - 1
include/starpu_task.h

@@ -184,6 +184,7 @@ struct starpu_task
 
 	unsigned sched_ctx;
 	int hypervisor_tag;
+	unsigned possibly_parallel;
 
 	starpu_task_bundle_t bundle;
 
@@ -232,7 +233,8 @@ struct starpu_task
 	.dyn_handles = NULL,				\
 	.dyn_interfaces = NULL,				\
 	.dyn_modes = NULL,				\
-	.name = NULL                        		\
+	.name = NULL,                        		\
+	.possibly_parallel = 0                        	\
 }
 
 #define STARPU_TASK_GET_NBUFFERS(task) ((unsigned)((task)->cl->nbuffers == STARPU_VARIABLE_NBUFFERS ? ((task)->nbuffers) : ((task)->cl->nbuffers)))

+ 19 - 18
include/starpu_task_util.h

@@ -32,24 +32,25 @@ extern "C"
 
 void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg);
 
-#define STARPU_VALUE		 (1<<18)
-#define STARPU_CALLBACK		 (2<<18)
-#define STARPU_CALLBACK_WITH_ARG (3<<18)
-#define STARPU_CALLBACK_ARG	 (4<<18)
-#define STARPU_PRIORITY		 (5<<18)
-#define STARPU_EXECUTE_ON_NODE	 (6<<18)
-#define STARPU_EXECUTE_ON_DATA	 (7<<18)
-#define STARPU_DATA_ARRAY        (8<<18)
-#define STARPU_TAG               (9<<18)
-#define STARPU_HYPERVISOR_TAG	 (10<<18)
-#define STARPU_FLOPS	         (11<<18)
-#define STARPU_SCHED_CTX	 (12<<18)
-#define STARPU_PROLOGUE_CALLBACK   (13<<18)
-#define STARPU_PROLOGUE_CALLBACK_ARG (14<<18)
-#define STARPU_PROLOGUE_CALLBACK_POP   (15<<18)
-#define STARPU_PROLOGUE_CALLBACK_POP_ARG (16<<18)
-#define STARPU_EXECUTE_ON_WORKER (17<<18)
-#define STARPU_TAG_ONLY          (18<<18)
+#define STARPU_VALUE		 (1<<20)
+#define STARPU_CALLBACK		 (2<<20)
+#define STARPU_CALLBACK_WITH_ARG (3<<20)
+#define STARPU_CALLBACK_ARG	 (4<<20)
+#define STARPU_PRIORITY		 (5<<20)
+#define STARPU_EXECUTE_ON_NODE	 (6<<20)
+#define STARPU_EXECUTE_ON_DATA	 (7<<20)
+#define STARPU_DATA_ARRAY        (8<<20)
+#define STARPU_TAG               (9<<20)
+#define STARPU_HYPERVISOR_TAG	 (10<<20)
+#define STARPU_FLOPS	         (11<<20)
+#define STARPU_SCHED_CTX	 (12<<20)
+#define STARPU_PROLOGUE_CALLBACK   (13<<20)
+#define STARPU_PROLOGUE_CALLBACK_ARG (14<<20)
+#define STARPU_PROLOGUE_CALLBACK_POP   (15<<20)
+#define STARPU_PROLOGUE_CALLBACK_POP_ARG (16<<20)
+#define STARPU_EXECUTE_ON_WORKER (17<<20)
+#define STARPU_TAG_ONLY          (18<<20)
+#define STARPU_POSSIBLY_PARALLEL    (19<<20)
 #define STARPU_WORKER_ORDER      (19<<18)
 
 struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...);

+ 20 - 6
sc_hypervisor/examples/cholesky/cholesky_models.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -26,6 +26,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_perfmodel.h>
 #include "cholesky.h"
 
 /* #define USE_PERTURBATION	1 */
@@ -127,12 +128,25 @@ double cuda_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_
 }
 
 void initialize_chol_model(struct starpu_perfmodel* model, char * symbol,
-		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned),
-		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
+			   double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned),
+			   double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
 {
+	struct starpu_perfmodel_per_arch *per_arch;
+
 	model->symbol = symbol;
 	model->type = STARPU_HISTORY_BASED;
-	starpu_perfmodel_init(model);
-	model->per_arch[STARPU_CPU_WORKER][0][0][0].cost_function = cpu_cost_function;
-	model->per_arch[STARPU_CUDA_WORKER][0][0][0].cost_function = cuda_cost_function;
+
+	starpu_perfmodel_init(NULL, model);
+
+	per_arch = starpu_perfmodel_get_model_per_devices(model, 0, STARPU_CPU_WORKER, 0, 1, -1);
+        per_arch->cost_function = cpu_cost_function;
+	// We could also call directly:
+	// starpu_perfmodel_set_per_devices_cost_function(model, 0, cpu_cost_function, STARPU_CPU_WORKER, 0, 1, -1);
+
+	if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) != 0)
+	{
+	     	per_arch = starpu_perfmodel_get_model_per_devices(model, 0, STARPU_CUDA_WORKER, 0, 1, -1);
+		per_arch->cost_function = cuda_cost_function;
+
+	}
 }

+ 1 - 1
sc_hypervisor/src/policies_utils/policy_tools.c

@@ -414,7 +414,7 @@ void sc_hypervisor_get_tasks_times(int nw, int nt, double times[nw][nt], int *wo
                 for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
                 {
 			int worker = workers == NULL ? w : workers[w];
-                        struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(worker);
+                        struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(worker, STARPU_NMAX_SCHED_CTXS);
                         double length = starpu_permodel_history_based_expected_perf(tp->cl->model, arch, tp->footprint);
 
                         if (isnan(length))

+ 6 - 3
sc_hypervisor/src/sc_hypervisor.c

@@ -382,9 +382,12 @@ void sc_hypervisor_unregister_ctx(unsigned sched_ctx)
 	int *pus;
 	unsigned npus = starpu_sched_ctx_get_workers_list(sched_ctx, &pus);
 
-	starpu_sched_ctx_set_priority(pus, npus, father, 1);
-	starpu_sched_ctx_set_priority_on_level(pus, npus, father, 1);
-	free(pus);
+	if(npus)
+	{
+		starpu_sched_ctx_set_priority(pus, npus, father, 1);
+		starpu_sched_ctx_set_priority_on_level(pus, npus, father, 1);
+		free(pus);
+	}
 
 	unsigned i;
 	for(i = 0; i < hypervisor.nsched_ctxs; i++)

+ 13 - 11
src/common/fxt.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2014  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -414,7 +414,7 @@ do {									\
 #define _STARPU_TRACE_WORKER_INIT_END(__workerid)				\
 	FUT_DO_PROBE2(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid(), (__workerid));
 
-#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype, workerid)				\
+#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, perf_arch, workerid)				\
 do {									\
         const char *model_name = _starpu_job_get_model_name((job));         \
 	if (model_name)                                                 \
@@ -442,17 +442,19 @@ do {									\
 				}					\
 			}						\
 		}							\
-		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));	\
-		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));\
-		FUT_DO_PROBE6(_STARPU_FUT_CODELET_DETAILS, (job), ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->tag_id, workerid);	\
+		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));	\
+		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
+		FUT_DO_PROBE7(_STARPU_FUT_CODELET_DETAILS, (job), ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->tag_id, workerid, ((job)->job_id)); \
 	}								\
 } while(0);
 
-#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, archtype, workerid)			\
+#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, perf_arch, workerid)			\
 do {									\
-	const size_t job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));	\
-	const uint32_t job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));\
-	FUT_DO_PROBE7(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), (archtype)->type, (archtype)->devid, (archtype)->ncore, workerid);	\
+	const size_t job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));	\
+	const uint32_t job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
+	char _archname[32]=""; \
+	starpu_perfmodel_get_arch_name(perf_arch, _archname, 32, 0);	\
+	_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), workerid, _archname); \
 } while(0);
 
 #define _STARPU_TRACE_START_EXECUTING()				\
@@ -818,8 +820,8 @@ do {										\
 #define _STARPU_TRACE_NEW_MEM_NODE(nodeid)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_START(a,b,c)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)	do {} while(0)
-#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype, workerid)	do {} while(0)
-#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, a, workerid)	do {} while(0)
+#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, perf_arch, workerid)	do {} while(0)
+#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, perf_arch, workerid)	do {} while(0)
 #define _STARPU_TRACE_START_EXECUTING()	do {} while(0)
 #define _STARPU_TRACE_END_EXECUTING()	do {} while(0)
 #define _STARPU_TRACE_START_CALLBACK(job)	do {} while(0)

+ 6 - 5
src/core/combined_workers.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2014  Université de Bordeaux
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -102,10 +102,11 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 		&config->combined_workers[combined_worker_id];
 
 	combined_worker->worker_size = nworkers;
-
-	combined_worker->perf_arch.type = config->workers[workerid_array[0]].perf_arch.type;
-	combined_worker->perf_arch.devid = config->workers[workerid_array[0]].perf_arch.devid; 
-	combined_worker->perf_arch.ncore = nworkers - 1;
+	combined_worker->perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+	combined_worker->perf_arch.ndevices = 1;
+	combined_worker->perf_arch.devices[0].type = config->workers[workerid_array[0]].perf_arch.devices[0].type;
+	combined_worker->perf_arch.devices[0].devid = config->workers[workerid_array[0]].perf_arch.devices[0].devid; 
+	combined_worker->perf_arch.devices[0].ncores = nworkers - 1;
 	combined_worker->worker_mask = config->workers[workerid_array[0]].worker_mask;
 	
 #ifdef STARPU_USE_MP

+ 2 - 2
src/core/detect_combined_workers.c

@@ -44,7 +44,7 @@ static void find_workers(hwloc_obj_t obj, int cpu_workers[STARPU_NMAXWORKERS], u
 	for(worker = _starpu_worker_list_begin(workers); worker != _starpu_worker_list_end(workers); worker = _starpu_worker_list_next(worker))
 	{
 		/* is it a CPU worker? */
-		if (worker->perf_arch.type == STARPU_CPU_WORKER && worker->perf_arch.ncore == 0)
+		if (worker->perf_arch.devices[0].type == STARPU_CPU_WORKER && worker->perf_arch.devices[0].ncores == 0)
 		{
 			_STARPU_DEBUG("worker %d is part of it\n", worker->workerid);
 			/* Add it to the combined worker */
@@ -178,7 +178,7 @@ static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers
 	for (i = 0; i < nworkers; i++)
 	{
 		struct _starpu_worker *worker = _starpu_get_worker_struct(workerids[i]);
-		if (worker->perf_arch.type == STARPU_CPU_WORKER && worker->perf_arch.ncore == 0)
+		if (worker->perf_arch.devices[0].type == STARPU_CPU_WORKER && worker->perf_arch.devices[0].ncores == 0)
 		{
 			hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->pu_depth, worker->bindid);
 			obj = obj->parent;

+ 3 - 2
src/core/jobs.c

@@ -193,6 +193,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 {
 	struct starpu_task *task = j->task;
 	unsigned sched_ctx = task->sched_ctx;
+	int workerid = starpu_worker_get_id();
 	double flops = task->flops;
 	const unsigned continuation =
 #ifdef STARPU_OPENMP
@@ -219,12 +220,11 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 		 * the callback is not done yet. */
 		j->terminated = 1;
 	}
-
 	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 
+
 #ifdef STARPU_USE_SC_HYPERVISOR
 	size_t data_size = 0;
-	int workerid = starpu_worker_get_id();
 #endif //STARPU_USE_SC_HYPERVISOR
 
 	/* We release handle reference count */
@@ -259,6 +259,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 		 * implicit dependencies any more.  */
 		_starpu_release_task_enforce_sequential_consistency(j);
 	}
+
 	/* Task does not have a cl, but has explicit data dependencies, we need
 	 * to tell them that we will not exist any more before notifying the
 	 * tasks waiting for us

+ 46 - 36
src/core/perfmodel/perfmodel.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2014  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -39,7 +39,6 @@
  *	2: models must be calibrated, existing models are overwritten.
  */
 static unsigned calibrate_flag = 0;
-
 void _starpu_set_calibrate_flag(unsigned val)
 {
 	calibrate_flag = val;
@@ -50,8 +49,15 @@ unsigned _starpu_get_calibrate_flag(void)
 	return calibrate_flag;
 }
 
-struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid)
+struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid, unsigned sched_ctx_id)
 {
+	if(sched_ctx_id != STARPU_NMAX_SCHED_CTXS)
+	{
+		unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
+		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+			return _starpu_sched_ctx_get_perf_archtype(child_sched_ctx);
+	}
+
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 
 	/* This workerid may either be a basic worker or a combined worker */
@@ -60,6 +66,7 @@ struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid)
 	if (workerid < (int)config->topology.nworkers)
 		return &config->workers[workerid].perf_arch;
 
+
 	/* We have a combined worker */
 	unsigned ncombinedworkers = config->topology.ncombinedworkers;
 	STARPU_ASSERT(workerid < (int)(ncombinedworkers + nworkers));
@@ -72,10 +79,17 @@ struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid)
 
 static double per_arch_task_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, struct starpu_task *task, unsigned nimpl)
 {
+	int comb;
 	double (*per_arch_cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 
-	per_arch_cost_function = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].cost_function;
+	comb = starpu_perfmodel_arch_comb_get(arch->ndevices, arch->devices);
+	if (comb == -1)
+		return NAN;
+	if (model->state->per_arch[comb] == NULL)
+		// The model has not been executed on this combination
+		return NAN;
 
+	per_arch_cost_function = model->state->per_arch[comb][nimpl].cost_function;
 	STARPU_ASSERT_MSG(per_arch_cost_function, "STARPU_PER_ARCH needs per-arch cost_function to be defined");
 
 	return per_arch_cost_function(task, arch, nimpl);
@@ -87,26 +101,23 @@ static double per_arch_task_expected_perf(struct starpu_perfmodel *model, struct
 
 double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch* perf_arch)
 {
-	if (perf_arch->type == STARPU_CPU_WORKER)
-	{
-		return _STARPU_CPU_ALPHA * (perf_arch->ncore + 1);
-	}
-	else if (perf_arch->type == STARPU_CUDA_WORKER)
-	{
-		return _STARPU_CUDA_ALPHA;
-	}
-	else if (perf_arch->type == STARPU_OPENCL_WORKER)
+	double speedup = 0;
+	int dev;
+	for(dev = 0; dev < perf_arch->ndevices; dev++)
 	{
-		return _STARPU_OPENCL_ALPHA;
+		double coef = 0.0;
+		if (perf_arch->devices[dev].type == STARPU_CPU_WORKER)
+			coef = _STARPU_CPU_ALPHA;
+		else if (perf_arch->devices[dev].type == STARPU_CUDA_WORKER)
+			coef = _STARPU_CUDA_ALPHA;
+		else if (perf_arch->devices[dev].type == STARPU_OPENCL_WORKER)
+			coef = _STARPU_OPENCL_ALPHA;
+		else if (perf_arch->devices[dev].type == STARPU_MIC_WORKER)
+			coef =  _STARPU_MIC_ALPHA;
+
+		speedup += coef * (perf_arch->devices[dev].ncores + 1);
 	}
-	else if (perf_arch->type == STARPU_MIC_WORKER)
-	{
-		return _STARPU_MIC_ALPHA * (perf_arch->ncore + 1);
-	}
-	STARPU_ABORT();
-
-	/* Never reached ! */
-	return NAN;
+	return speedup == 0 ? NAN : speedup;
 }
 
 static double common_task_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct starpu_task *task, unsigned nimpl)
@@ -124,13 +135,18 @@ static double common_task_expected_perf(struct starpu_perfmodel *model, struct s
 	return (exp/alpha);
 }
 
-void _starpu_load_perfmodel(struct starpu_perfmodel *model)
+void _starpu_init_and_load_perfmodel(struct starpu_perfmodel *model)
 {
 	if (!model || model->is_loaded)
 		return;
 
-	int load_model = _starpu_register_model(model);
-	if (!load_model)
+	starpu_perfmodel_init(NULL, model);
+
+	// Check if a symbol is defined before trying to load the model from a file
+	if (!model->symbol)
+		return;
+
+	if (model->is_loaded)
 		return;
 
 	switch (model->type)
@@ -160,30 +176,22 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 {
 	if (model)
 	{
-		if (model->symbol)
-			_starpu_load_perfmodel(model);
+		_starpu_init_and_load_perfmodel(model);
 
 		struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 
 		switch (model->type)
 		{
 			case STARPU_PER_ARCH:
-
 				return per_arch_task_expected_perf(model, arch, task, nimpl);
 			case STARPU_COMMON:
 				return common_task_expected_perf(model, arch, task, nimpl);
-
 			case STARPU_HISTORY_BASED:
-
 				return _starpu_history_based_job_expected_perf(model, arch, j, nimpl);
 			case STARPU_REGRESSION_BASED:
-
 				return _starpu_regression_based_job_expected_perf(model, arch, j, nimpl);
-
 			case STARPU_NL_REGRESSION_BASED:
-
 				return _starpu_non_linear_regression_based_job_expected_perf(model, arch, j,nimpl);
-
 			default:
 				STARPU_ABORT();
 		}
@@ -207,6 +215,8 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 					    struct starpu_perfmodel_arch* arch,
 					    unsigned nimpl)
 {
+	if(arch->ndevices > 1)
+		return -1.0;
 	unsigned i;
 	double sum = 0.0;
 	enum starpu_node_kind node_kind;
@@ -220,8 +230,8 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 		handle = STARPU_TASK_GET_HANDLE(task, i);
 		if (!_starpu_data_is_multiformat_handle(handle))
 			continue;
-		
-		switch(arch->type)
+
+		switch(arch->devices[0].type)
 		{
 			case STARPU_CPU_WORKER:
 				node_kind = STARPU_CPU_RAM;

+ 21 - 5
src/core/perfmodel/perfmodel.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2013  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -38,7 +38,20 @@ extern "C"
  * differents versions of StarPU having different performance model
  * formats.
  */
-#define _STARPU_PERFMODEL_VERSION 43
+#define _STARPU_PERFMODEL_VERSION 44
+
+struct _starpu_perfmodel_state
+{
+	struct starpu_perfmodel_per_arch** per_arch; /*STARPU_MAXIMPLEMENTATIONS*/
+	int** per_arch_is_set; /*STARPU_MAXIMPLEMENTATIONS*/
+
+	starpu_pthread_rwlock_t model_rwlock;
+	int *nimpls;
+	int *nimpls_set;
+	int ncombs;  /* The number of combinations currently used by the model */
+	int ncombs_set; /* The number of combinations allocated in the per-combination arrays such as nimpls. NOTE(review): this comment used to name "ncombs" as an array, but ncombs is a scalar; presumably combs was meant -- confirm */
+	int *combs;
+};
 
 struct _starpu_perfmodel_list
 {
@@ -56,11 +69,10 @@ void _starpu_get_perf_model_dir_bus(char *path, size_t maxlen);
 void _starpu_get_perf_model_dir_debug(char *path, size_t maxlen);
 
 double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
-int _starpu_register_model(struct starpu_perfmodel *model);
 void _starpu_load_per_arch_based_model(struct starpu_perfmodel *model);
 void _starpu_load_common_based_model(struct starpu_perfmodel *model);
 void _starpu_load_history_based_model(struct starpu_perfmodel *model, unsigned scan_history);
-void _starpu_load_perfmodel(struct starpu_perfmodel *model);
+void _starpu_init_and_load_perfmodel(struct starpu_perfmodel *model);
 void _starpu_initialize_registered_performance_models(void);
 void _starpu_deinitialize_registered_performance_models(void);
 void _starpu_deinitialize_performance_model(struct starpu_perfmodel *model);
@@ -71,6 +83,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 					struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
 void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch,
 				unsigned cpuid, double measured, unsigned nimpl);
+int _starpu_perfmodel_create_comb_if_needed(struct starpu_perfmodel_arch* arch);
 
 void _starpu_create_sampling_directory_if_needed(void);
 
@@ -86,13 +99,16 @@ int *_starpu_get_cuda_affinity_vector(unsigned gpuid);
 int *_starpu_get_opencl_affinity_vector(unsigned gpuid);
 #endif
 
-
 void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double bandwidth_read, 
 					    double latency_write, double latency_read, unsigned node);
 
 int _starpu_read_double(FILE *f, char *format, double *val);
 void _starpu_simgrid_get_platform_path(char *path, size_t maxlen);
 
+struct starpu_perfmodel_arch * _starpu_arch_comb_get(int comb);
+
+void _starpu_perfmodel_realloc(struct starpu_perfmodel *model, int nb);
+
 #ifdef __cplusplus
 }
 #endif

Разница между файлами не показана из-за своего большого размера
+ 587 - 446
src/core/perfmodel/perfmodel_history.c


+ 61 - 38
src/core/perfmodel/perfmodel_print.c

@@ -19,6 +19,7 @@
 #include <starpu.h>
 #include <starpu_perfmodel.h>
 #include <common/config.h>
+#include "perfmodel.h"
 
 static
 void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per_arch_model, char *parameter, uint32_t *footprint, FILE *output)
@@ -63,13 +64,16 @@ void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per
 
 void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output)
 {
-	struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
+	int comb = starpu_perfmodel_arch_comb_get(arch->ndevices, arch->devices);
+	STARPU_ASSERT(comb != -1);
+
+	struct starpu_perfmodel_per_arch *arch_model = &model->state->per_arch[comb][nimpl];
 	char archname[32];
 
 	if (arch_model->regression.nsample || arch_model->regression.valid || arch_model->regression.nl_valid || arch_model->list)
 	{
 		starpu_perfmodel_get_arch_name(arch, archname, 32, nimpl);
-		fprintf(output, "performance model for %s\n", archname);
+		fprintf(output, "# performance model for %s\n", archname);
 	}
 
 	if (parameter == NULL)
@@ -170,24 +174,13 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 {
 	if (arch == NULL)
 	{
-		/* display all architectures */
-		unsigned archtype, devid, ncore, implid;
-		struct starpu_perfmodel_arch perf_arch;
-		for (archtype = 0; archtype < STARPU_NARCH; archtype++)
+		int comb, impl;
+		for(comb = 0; comb < starpu_get_narch_combs(); comb++)
 		{
-			perf_arch.type = archtype;
-			for(devid = 0; model->per_arch[archtype][devid] != NULL; devid++)
-			{
-				perf_arch.devid = devid;
-				for(ncore = 0; model->per_arch[archtype][devid][ncore] != NULL; ncore++)
-				{
-					perf_arch.ncore = ncore;
-					for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-					{ /* Display all codelets on each arch */
-						starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
-					}
-				}
-			}
+			struct starpu_perfmodel_arch *arch_comb = _starpu_arch_comb_get(comb);
+			int nimpls = model->state ? model->state->nimpls[comb] : 0;
+			for(impl = 0; impl < nimpls; impl++)
+				starpu_perfmodel_print(model, arch_comb, impl, parameter, footprint, output);
 		}
 	}
 	else
@@ -196,11 +189,17 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 		{
 			int implid;
 			struct starpu_perfmodel_arch perf_arch;
-			perf_arch.type = STARPU_CPU_WORKER;
-			perf_arch.devid = 0;
-			perf_arch.ncore = 0;
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+			perf_arch.ndevices = 1;
+			perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+			perf_arch.devices[0].type = STARPU_CPU_WORKER;
+			perf_arch.devices[0].devid = 0;
+			perf_arch.devices[0].ncores = 1;
+			int comb = starpu_perfmodel_arch_comb_get(perf_arch.ndevices, perf_arch.devices);
+			STARPU_ASSERT(comb != -1);
+			int nimpls = model->state->nimpls[comb];
+			for (implid = 0; implid < nimpls; implid++)
 				starpu_perfmodel_print(model, &perf_arch,implid, parameter, footprint, output); /* Display all codelets on cpu */
+			free(perf_arch.devices);
 			return 0;
 		}
 
@@ -216,28 +215,44 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 
 			int implid;
 			struct starpu_perfmodel_arch perf_arch;
-			perf_arch.type = STARPU_CPU_WORKER;
-			perf_arch.devid = 0;
-			perf_arch.ncore = k-1;
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+			perf_arch.ndevices = 1;
+			perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+			perf_arch.devices[0].type = STARPU_CPU_WORKER;
+			perf_arch.devices[0].devid = 0;
+			perf_arch.devices[0].ncores = k-1;
+			int comb = starpu_perfmodel_arch_comb_get(perf_arch.ndevices, perf_arch.devices);
+			STARPU_ASSERT(comb != -1);
+			int nimpls = model->state->nimpls[comb];
+
+			for (implid = 0; implid < nimpls; implid++)
 				starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
+			free(perf_arch.devices);
 			return 0;
 		}
 
 		if (strcmp(arch, "cuda") == 0)
 		{
-			unsigned devid;
 			int implid;
 			struct starpu_perfmodel_arch perf_arch;
-			perf_arch.type = STARPU_CUDA_WORKER;
-			perf_arch.ncore = 0;
 
-			for (devid = 0; model->per_arch[STARPU_CUDA_WORKER] != NULL; devid++)
+			perf_arch.ndevices = 1;
+			perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+			perf_arch.devices[0].type = STARPU_CUDA_WORKER;
+			perf_arch.devices[0].ncores = 1;
+			int comb;
+			for(comb = 0; comb < starpu_get_narch_combs(); comb++)
 			{
-				perf_arch.devid = devid;
-				for (implid = 0; implid <STARPU_MAXIMPLEMENTATIONS; implid ++)
-					starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
+				struct starpu_perfmodel_arch *arch_comb = _starpu_arch_comb_get(comb);
+				if(arch_comb->ndevices == 1 && arch_comb->devices[0].type == STARPU_CUDA_WORKER)
+				{
+					perf_arch.devices[0].devid = arch_comb->devices[0].devid;
+					int nimpls = model->state->nimpls[comb];
+
+					for (implid = 0; implid < nimpls; implid++)
+						starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
+				}
 			}
+			free(perf_arch.devices);
 			return 0;
 		}
 
@@ -248,11 +263,19 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 		if (nmatched == 1)
 		{
+			/* print the model of the single CUDA device identified by gpuid */
 			struct starpu_perfmodel_arch perf_arch;
-			perf_arch.type = STARPU_CUDA_WORKER;
-			perf_arch.devid = gpuid;
-			perf_arch.ncore = 0;
+			perf_arch.ndevices = 1;
+			perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+
+			perf_arch.devices[0].type = STARPU_CUDA_WORKER;
+			perf_arch.devices[0].devid = gpuid;
+			perf_arch.devices[0].ncores = 1;
+
+			int comb = starpu_perfmodel_arch_comb_get(perf_arch.ndevices, perf_arch.devices);
+			STARPU_ASSERT(comb != -1);
+			int nimpls = model->state->nimpls[comb];
+
 			int implid;
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+			for (implid = 0; implid < nimpls; implid++)
 				starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
+			/* the device array is heap-allocated above: release it before returning,
+			   as the other branches of this function do */
+			free(perf_arch.devices);
 			return 0;
 		}

+ 290 - 60
src/core/sched_ctx.c

@@ -34,11 +34,13 @@ static size_t data_size[STARPU_NMAX_SCHED_CTXS][STARPU_NMAXWORKERS];
 static unsigned _starpu_get_first_free_sched_ctx(struct _starpu_machine_config *config);
 static void _starpu_sched_ctx_add_workers_to_master(unsigned sched_ctx_id, int *workerids, int nworkers, int new_master);
 static void _starpu_sched_ctx_wake_these_workers_up(unsigned sched_ctx_id, int *workerids, int nworkers);
+static int _starpu_sched_ctx_find_master(unsigned sched_ctx_id, int *workerids, int nworkers);
+static void _starpu_sched_ctx_set_master(struct _starpu_sched_ctx *sched_ctx, int *workerids, int nworkers, int master);
 
 static void _starpu_worker_gets_into_ctx(unsigned sched_ctx_id, struct _starpu_worker *worker)
 {
 	unsigned ret_sched_ctx = _starpu_sched_ctx_list_get_sched_ctx(worker->sched_ctx_list, sched_ctx_id);
-	/* the worker was planning to go away in another ctx but finally he changed his mind & 
+	/* the worker was planning to go away in another ctx but finally he changed his mind &
 	   he's staying */
 	if (ret_sched_ctx == STARPU_NMAX_SCHED_CTXS)
 	{
@@ -58,13 +60,16 @@ void _starpu_worker_gets_out_of_ctx(unsigned sched_ctx_id, struct _starpu_worker
 	/* remove context from worker */
 	if(ret_sched_ctx != STARPU_NMAX_SCHED_CTXS)
 	{
-		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-		if(sched_ctx && sched_ctx->sched_policy && sched_ctx->sched_policy->remove_workers)
-		{
-			_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
-			sched_ctx->sched_policy->remove_workers(sched_ctx_id, &worker->workerid, 1);
-			_STARPU_TRACE_WORKER_SCHEDULING_POP;
-		}
+		/* Do not remove the scheduling data here: tasks may still be running, and when
+		   their post_exec runs the scheduling data would no longer be there. Do it when
+		   the context is deleted, since it is really not needed any more at that point. */
+		/* struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id); */
+		/* if(sched_ctx && sched_ctx->sched_policy && sched_ctx->sched_policy->remove_workers) */
+		/* { */
+		/* 	_STARPU_TRACE_WORKER_SCHEDULING_PUSH; */
+		/* 	sched_ctx->sched_policy->remove_workers(sched_ctx_id, &worker->workerid, 1); */
+		/* 	_STARPU_TRACE_WORKER_SCHEDULING_POP; */
+		/* } */
 		_starpu_sched_ctx_list_remove(&worker->sched_ctx_list, sched_ctx_id);
 		worker->nsched_ctxs--;
 	}
@@ -143,7 +148,10 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 	int nworkers_to_add = nworkers == -1 ? (int)config->topology.nworkers : nworkers;
 	int workers_to_add[nworkers_to_add];
 
-
+	struct starpu_perfmodel_device devices[nworkers_to_add];
+	int ndevices = 0;
+	struct _starpu_worker *str_worker = NULL;
+	int worker;
 	int i = 0;
 	for(i = 0; i < nworkers_to_add; i++)
 	{
@@ -151,7 +159,7 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 		/* if the function is called at the creation of the context it's no need to do this verif */
 		if(added_workers)
 		{
-			int worker = workers->add(workers, (workerids == NULL ? i : workerids[i]));
+			worker = workers->add(workers, (workerids == NULL ? i : workerids[i]));
 			if(worker >= 0)
 				added_workers[(*n_added_workers)++] = worker;
 			else
@@ -169,22 +177,133 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 		}
 		else
 		{
-			int worker = (workerids == NULL ? i : workerids[i]);
+			worker = (workerids == NULL ? i : workerids[i]);
 			workers->add(workers, worker);
 			workers_to_add[i] = worker;
-			struct _starpu_worker *str_worker = _starpu_get_worker_struct(worker);
+			str_worker = _starpu_get_worker_struct(worker);
 			str_worker->tmp_sched_ctx = (int)sched_ctx->id;
+		}
+	}
+
+	int *wa;
+	int na;
+	if(added_workers)
+	{
+		na = *n_added_workers;
+		wa = added_workers;
+	}
+	else
+	{
+		na = nworkers_to_add;
+		wa = workers_to_add;
+	}
+
+	for(i = 0; i < na; i++)
+	{
+		worker = wa[i];
+		str_worker = _starpu_get_worker_struct(worker);
+		int dev1, dev2;
+		unsigned found = 0;
+		for(dev1 = 0; dev1 < str_worker->perf_arch.ndevices; dev1++)
+		{
+			for(dev2 = 0; dev2 < ndevices; dev2++)
+			{
+				if(devices[dev2].type == str_worker->perf_arch.devices[dev1].type &&
+				   devices[dev2].devid == str_worker->perf_arch.devices[dev1].devid)
+				{
+					devices[dev2].ncores += str_worker->perf_arch.devices[dev1].ncores;
+					found = 1;
+					break;
+				}
+			}
+			if(!found)
+			{
+				devices[ndevices].type = str_worker->perf_arch.devices[dev1].type;
+				devices[ndevices].devid = str_worker->perf_arch.devices[dev1].devid;
+				devices[ndevices].ncores = str_worker->perf_arch.devices[dev1].ncores;
+				ndevices++;
+			}
+			else
+				found = 0;
+		}
+	}
+
+	if(ndevices > 0)
+	{
+
+		if(sched_ctx->perf_arch.devices == NULL)
+			sched_ctx->perf_arch.devices = (struct starpu_perfmodel_device*)malloc(ndevices*sizeof(struct starpu_perfmodel_device));
+		else
+		{
+			int nfinal_devices = 0;
+			int dev1, dev2;
+			unsigned found = 0;
+			for(dev1 = 0; dev1 < ndevices; dev1++)
+			{
+				for(dev2 = 0; dev2 < sched_ctx->perf_arch.ndevices; dev2++)
+				{
+					if(sched_ctx->perf_arch.devices[dev2].type == devices[dev1].type && sched_ctx->perf_arch.devices[dev2].devid == devices[dev1].devid)
+						found = 1;
+				}
+				
+				if(!found)
+				{
+					nfinal_devices++;
+				}
+				else
+					found = 0;
+				
+			}
+
+
+			int nsize =  (sched_ctx->perf_arch.ndevices+nfinal_devices);
+			sched_ctx->perf_arch.devices  = (struct starpu_perfmodel_device*)realloc(sched_ctx->perf_arch.devices, nsize*sizeof(struct starpu_perfmodel_device));
+			
+		}
+
+		int dev1, dev2;
+		unsigned found = 0;
+		for(dev1 = 0; dev1 < ndevices; dev1++)
+		{
+			for(dev2 = 0; dev2 < sched_ctx->perf_arch.ndevices; dev2++)
+			{
+				if(sched_ctx->perf_arch.devices[dev2].type == devices[dev1].type && sched_ctx->perf_arch.devices[dev2].devid == devices[dev1].devid)
+				{
+					if(sched_ctx->perf_arch.devices[dev2].type == STARPU_CPU_WORKER)
+						sched_ctx->perf_arch.devices[dev2].ncores += devices[dev1].ncores;
+				     
+					found = 1;
+				}
+			}
+
+			if(!found)
+			{
+				sched_ctx->perf_arch.devices[sched_ctx->perf_arch.ndevices].type = devices[dev1].type;
+				sched_ctx->perf_arch.devices[sched_ctx->perf_arch.ndevices].devid = devices[dev1].devid;
+				sched_ctx->perf_arch.devices[sched_ctx->perf_arch.ndevices].ncores = devices[dev1].ncores;
+				sched_ctx->perf_arch.ndevices++;
+			}
+			else
+				found = 0;
 
 		}
 	}
 
 	if(!sched_ctx->sched_policy)
 	{
-		if(sched_ctx->main_master == -1)
-			sched_ctx->main_master = starpu_sched_ctx_book_workers_for_task(sched_ctx->id, workerids, nworkers);
+		if(!sched_ctx->awake_workers)
+		{
+			if(sched_ctx->main_master == -1)
+				sched_ctx->main_master = starpu_sched_ctx_book_workers_for_task(sched_ctx->id, wa, na);
+			else
+			{
+				_starpu_sched_ctx_add_workers_to_master(sched_ctx->id, wa, na, sched_ctx->main_master);
+			}
+		}
 		else
 		{
-			_starpu_sched_ctx_add_workers_to_master(sched_ctx->id, workerids, nworkers, sched_ctx->main_master);
+			sched_ctx->main_master = _starpu_sched_ctx_find_master(sched_ctx->id, wa, na);
+			_starpu_sched_ctx_set_master(sched_ctx, wa, na, sched_ctx->main_master);
 		}
 	}
 	else if(sched_ctx->sched_policy->add_workers)
@@ -196,7 +315,9 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 				sched_ctx->sched_policy->add_workers(sched_ctx->id, added_workers, *n_added_workers);
 		}
 		else
+		{
 			sched_ctx->sched_policy->add_workers(sched_ctx->id, workers_to_add, nworkers_to_add);
+		}
 		_STARPU_TRACE_WORKER_SCHEDULING_POP;
 	}
 	return;
@@ -207,8 +328,10 @@ static void _starpu_remove_workers_from_sched_ctx(struct _starpu_sched_ctx *sche
 {
 	struct starpu_worker_collection *workers = sched_ctx->workers;
 
-	int i = 0;
+	struct starpu_perfmodel_device devices[workers->nworkers];
+	int ndevices = 0;
 
+	int i = 0;
 	for(i = 0; i < nworkers; i++)
 	{
 		if(workers->nworkers > 0)
@@ -222,26 +345,80 @@ static void _starpu_remove_workers_from_sched_ctx(struct _starpu_sched_ctx *sche
 		}
 	}
 
+	int worker;
+	unsigned found = 0;
+	int dev;
+	struct starpu_sched_ctx_iterator it;
+	if(workers->init_iterator)
+		workers->init_iterator(workers, &it);
+
+	/* collect into devices[] the distinct (type, devid) devices of the workers
+	   returned by the collection iterator, summing the cores of CPU devices */
+	while(workers->has_next(workers, &it))
+	{
+		worker = workers->get_next(workers, &it);
+		struct _starpu_worker *str_worker = _starpu_get_worker_struct(worker);
+		for(dev = 0; dev < str_worker->perf_arch.ndevices; dev++)
+		{
+			int dev2;
+			for(dev2 = 0; dev2 < ndevices; dev2++)
+			{
+				if(devices[dev2].type == str_worker->perf_arch.devices[dev].type &&
+				   devices[dev2].devid == str_worker->perf_arch.devices[dev].devid)
+				{
+					if(devices[dev2].type == STARPU_CPU_WORKER)
+						devices[dev2].ncores += str_worker->perf_arch.devices[dev].ncores;
+
+					/* only an entry with the same type and devid counts as
+					   already recorded; setting this outside the test would
+					   hide every device seen after the first one */
+					found = 1;
+				}
+			}
+			if(!found)
+			{
+				/* first time this device is seen: append it */
+				devices[ndevices].type = str_worker->perf_arch.devices[dev].type;
+				devices[ndevices].devid = str_worker->perf_arch.devices[dev].devid;
+				devices[ndevices].ncores = str_worker->perf_arch.devices[dev].ncores;
+				ndevices++;
+			}
+			else
+				found = 0;
+		}
+		found = 0;
+
+	}
+	sched_ctx->perf_arch.ndevices = ndevices;
+	for(dev = 0; dev < ndevices; dev++)
+	{
+		sched_ctx->perf_arch.devices[dev].type = devices[dev].type;
+		sched_ctx->perf_arch.devices[dev].devid = devices[dev].devid;
+		sched_ctx->perf_arch.devices[dev].ncores = devices[dev].ncores;
+	}
+
 	if(!sched_ctx->sched_policy)
-		_starpu_sched_ctx_wake_these_workers_up(sched_ctx->id, removed_workers, *n_removed_workers);
+	{
+		if(!sched_ctx->awake_workers)
+		{
+			_starpu_sched_ctx_wake_these_workers_up(sched_ctx->id, removed_workers, *n_removed_workers);
+		}
+	}
 
 	return;
 }
 
 static void _starpu_sched_ctx_free_scheduling_data(struct _starpu_sched_ctx *sched_ctx)
 {
-	int *workerids = NULL;
-
-	unsigned nworkers_ctx = starpu_sched_ctx_get_workers_list(sched_ctx->id, &workerids);
-
-	if(nworkers_ctx > 0 && sched_ctx->sched_policy->remove_workers)
+	if(sched_ctx->sched_policy && sched_ctx->sched_policy->remove_workers)
 	{
-		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
-		sched_ctx->sched_policy->remove_workers(sched_ctx->id, workerids, nworkers_ctx);
-		_STARPU_TRACE_WORKER_SCHEDULING_POP;
+		int *workerids = NULL;
+		
+		unsigned nworkers_ctx = starpu_sched_ctx_get_workers_list(sched_ctx->id, &workerids);
+		
+		if(nworkers_ctx > 0)
+		{
+			_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
+			sched_ctx->sched_policy->remove_workers(sched_ctx->id, workerids, nworkers_ctx);
+			_STARPU_TRACE_WORKER_SCHEDULING_POP;
+		}
+		
+		free(workerids);
 	}
-
-	free(workerids);
 	return;
 
 }
@@ -275,7 +452,7 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 						   int nworkers_ctx, unsigned is_initial_sched,
 						   const char *sched_ctx_name,
 						   int min_prio_set, int min_prio,
-						   int max_prio_set, int max_prio)
+						   int max_prio_set, int max_prio, unsigned awake_workers)
 {
 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
 
@@ -314,7 +491,9 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 
 	sched_ctx->ready_flops = 0.0;
 	sched_ctx->main_master = -1;
-	
+	sched_ctx->perf_arch.devices = NULL;
+	sched_ctx->perf_arch.ndevices = 0;
+
 	int w;
 	for(w = 0; w < nworkers; w++)
 	{
@@ -323,22 +502,24 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 
 		STARPU_PTHREAD_COND_INIT(&sched_ctx->parallel_sect_cond[w], NULL);
 		STARPU_PTHREAD_MUTEX_INIT(&sched_ctx->parallel_sect_mutex[w], NULL);
-		
+
 		sched_ctx->master[w] = -1;
 		sched_ctx->parallel_sect[w] = 0;
 		sched_ctx->sleeping[w] = 0;
 	}
 
-	
+
         /*init the strategy structs and the worker_collection of the ressources of the context */
 	if(policy)
+	{
 		_starpu_init_sched_policy(config, sched_ctx, policy);
+		sched_ctx->awake_workers = 1;
+	}
 	else
+	{
+		sched_ctx->awake_workers = awake_workers;
 		starpu_sched_ctx_create_worker_collection(sched_ctx->id, STARPU_WORKER_LIST);
-	
-        /* construct the collection of workers(list/tree/etc.) */
-	sched_ctx->workers->init(sched_ctx->workers);
-
+	}
 
 	/* after having an worker_collection on the ressources add them */
 	_starpu_add_workers_to_sched_ctx(sched_ctx, workerids, nworkers_ctx, NULL, NULL);
@@ -365,7 +546,7 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 			worker->nsched_ctxs++;
 		}
 	}
-	
+
 	return sched_ctx;
 }
 
@@ -504,7 +685,7 @@ unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const
 	for(i = 0; i < nw; i++)
 		printf("%d ", workers[i]);
 	printf("\n");
-	sched_ctx = _starpu_create_sched_ctx(selected_policy, workers, nw, 0, sched_ctx_name, 0, 0, 0, 0);
+	sched_ctx = _starpu_create_sched_ctx(selected_policy, workers, nw, 0, sched_ctx_name, 0, 0, 0, 0, 1);
 	sched_ctx->min_ncpus = min_ncpus;
 	sched_ctx->max_ncpus = max_ncpus;
 	sched_ctx->min_ngpus = min_ngpus;
@@ -533,6 +714,7 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 	struct starpu_sched_policy *sched_policy = NULL;
 	unsigned hierarchy_level = 0;
 	unsigned nesting_sched_ctx = STARPU_NMAX_SCHED_CTXS;
+	unsigned awake_workers = 0;
 
 	va_start(varg_list, sched_ctx_name);
 	while ((arg_type = va_arg(varg_list, int)) != 0)
@@ -565,6 +747,10 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 		{
 			nesting_sched_ctx = va_arg(varg_list, unsigned);
 		}
+		else if (arg_type == STARPU_SCHED_CTX_AWAKE_WORKERS)
+		{
+			awake_workers = 1;
+		}
 		else
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d\n", arg_type);
@@ -574,7 +760,7 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 	va_end(varg_list);
 
 	struct _starpu_sched_ctx *sched_ctx = NULL;
-	sched_ctx = _starpu_create_sched_ctx(sched_policy, workerids, nworkers, 0, sched_ctx_name, min_prio_set, min_prio, max_prio_set, max_prio);
+	sched_ctx = _starpu_create_sched_ctx(sched_policy, workerids, nworkers, 0, sched_ctx_name, min_prio_set, min_prio, max_prio_set, max_prio, awake_workers);
 	sched_ctx->hierarchy_level = hierarchy_level;
 	sched_ctx->nesting_sched_ctx = nesting_sched_ctx;
 
@@ -617,7 +803,16 @@ static void _starpu_delete_sched_ctx(struct _starpu_sched_ctx *sched_ctx)
 		free(sched_ctx->sched_policy);
 		sched_ctx->sched_policy = NULL;
 	}
-	
+	else
+	{
+		starpu_sched_ctx_delete_worker_collection(sched_ctx->id);
+	}
+
+	if (sched_ctx->perf_arch.devices)
+	{
+		free(sched_ctx->perf_arch.devices);
+		sched_ctx->perf_arch.devices = NULL;
+	}
 
 	STARPU_PTHREAD_MUTEX_DESTROY(&sched_ctx->empty_ctx_mutex);
 	sched_ctx->id = STARPU_NMAX_SCHED_CTXS;
@@ -653,13 +848,13 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 
 	int *workerids;
 	unsigned nworkers_ctx = starpu_sched_ctx_get_workers_list(sched_ctx->id, &workerids);
-	
+
 	/*if both of them have all the ressources is pointless*/
 	/*trying to transfer ressources from one ctx to the other*/
 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
 	unsigned nworkers = config->topology.nworkers;
 
-	if(nworkers_ctx > 0 && inheritor_sched_ctx && inheritor_sched_ctx->id != STARPU_NMAX_SCHED_CTXS && 
+	if(nworkers_ctx > 0 && inheritor_sched_ctx && inheritor_sched_ctx->id != STARPU_NMAX_SCHED_CTXS &&
 	   !(nworkers_ctx == nworkers && nworkers_ctx == inheritor_sched_ctx->workers->nworkers))
 	{
 		starpu_sched_ctx_add_workers(workerids, nworkers_ctx, inheritor_sched_ctx_id);
@@ -674,10 +869,10 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 		/*if btw the mutex release & the mutex lock the context has changed take care to free all
 		  scheduling data before deleting the context */
 		_starpu_update_workers_without_ctx(workerids, nworkers_ctx, sched_ctx_id, 1);
-//		_starpu_sched_ctx_free_scheduling_data(sched_ctx);
+		_starpu_sched_ctx_free_scheduling_data(sched_ctx);
 		_starpu_delete_sched_ctx(sched_ctx);
-
 	}
+
 	STARPU_PTHREAD_RWLOCK_UNLOCK(&changing_ctx_mutex[sched_ctx_id]);
 	/* workerids is malloc-ed in starpu_sched_ctx_get_workers_list, don't forget to free it when
 	   you don't use it anymore */
@@ -734,7 +929,7 @@ void _starpu_fetch_tasks_from_empty_ctx_list(struct _starpu_sched_ctx *sched_ctx
                 /* you're not suppose to get here if you deleted the context
 		   so no point in having the mutex locked */
 		STARPU_PTHREAD_RWLOCK_UNLOCK(&changing_ctx_mutex[sched_ctx->id]);
-	
+
 	while(!starpu_task_list_empty(&sched_ctx->empty_ctx_tasks))
 	{
 		if(unlocked)
@@ -814,7 +1009,7 @@ void starpu_sched_ctx_add_workers(int *workers_to_add, int nworkers_to_add, unsi
 	if(sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
 	{
 		_starpu_add_workers_to_sched_ctx(sched_ctx, workers_to_add, nworkers_to_add, added_workers, &n_added_workers);
-		
+
 		if(n_added_workers > 0)
 		{
 			_starpu_update_workers_with_ctx(added_workers, n_added_workers, sched_ctx->id);
@@ -874,13 +1069,13 @@ int _starpu_nworkers_able_to_execute_task(struct starpu_task *task, struct _star
 
 	STARPU_PTHREAD_RWLOCK_WRLOCK(&changing_ctx_mutex[sched_ctx->id]);
 	struct starpu_worker_collection *workers = sched_ctx->workers;
-	
+
 	struct starpu_sched_ctx_iterator it;
 
 	workers->init_iterator(workers, &it);
-	while(workers->has_next(workers, &it))
+	while(workers->has_next_master(workers, &it))
 	{
-		worker = workers->get_next(workers, &it);
+		worker = workers->get_next_master(workers, &it);
 		STARPU_ASSERT_MSG(worker < STARPU_NMAXWORKERS, "worker id %d", worker);
 		if (starpu_worker_can_execute_task_first_impl(worker, task, NULL))
 			nworkers++;
@@ -951,7 +1146,7 @@ void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 
 				int *workerids = NULL;
 				unsigned nworkers = starpu_sched_ctx_get_workers_list(sched_ctx->id, &workerids);
-				
+
 				if(nworkers > 0)
 				{
 					starpu_sched_ctx_add_workers(workerids, nworkers, sched_ctx->inheritor);
@@ -1172,6 +1367,9 @@ struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsig
 
 	}
 
+        /* construct the collection of workers(list/tree/etc.) */
+	sched_ctx->workers->init(sched_ctx->workers);
+
 	return sched_ctx->workers;
 }
 
@@ -1196,6 +1394,7 @@ unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerid
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct starpu_worker_collection *workers = sched_ctx->workers;
+	if(!workers) { *workerids = NULL; return 0; } /* no worker collection: report 0 workers and hand back NULL so that a caller's unconditional free() stays harmless */
 	*workerids = (int*)malloc(workers->nworkers*sizeof(int));
 	int worker;
 	unsigned nworkers = 0;
@@ -1216,6 +1415,7 @@ void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id)
 	sched_ctx->workers->deinit(sched_ctx->workers);
 
 	free(sched_ctx->workers);
+	sched_ctx->workers = NULL;
 }
 
 struct starpu_worker_collection* starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id)
@@ -1351,7 +1551,7 @@ unsigned starpu_sched_ctx_worker_get_id(unsigned sched_ctx_id)
 			return workerid;
 	return -1;
 }
-		 
+
 unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid)
 {
 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
@@ -1397,7 +1597,7 @@ void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id)
 void _starpu_sched_ctx_post_exec_task_cb(int workerid, struct starpu_task *task, size_t data_size2, uint32_t footprint)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
-	if(sched_ctx != NULL && task->sched_ctx != _starpu_get_initial_sched_ctx()->id && 
+	if(sched_ctx != NULL && task->sched_ctx != _starpu_get_initial_sched_ctx()->id &&
 	   task->sched_ctx != STARPU_NMAX_SCHED_CTXS  && sched_ctx->perf_counters != NULL)
 	{
 		flops[task->sched_ctx][workerid] += task->flops;
@@ -1532,13 +1732,13 @@ unsigned _starpu_sched_ctx_last_worker_awake(struct _starpu_worker *worker)
         for (l = worker->sched_ctx_list; l; l = l->next)
         {
 		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
-		
+
 		unsigned last_worker_awake = 1;
 		struct starpu_worker_collection *workers = sched_ctx->workers;
 		struct starpu_sched_ctx_iterator it;
 
 		int workerid;
-		
+
 		workers->init_iterator(workers, &it);
 		while(workers->has_next(workers, &it))
 		{
@@ -1575,7 +1775,7 @@ void starpu_sched_ctx_bind_current_thread_to_cpuid(unsigned cpuid STARPU_ATTRIBU
 							  config->pu_depth, cpuid);
 		hwloc_bitmap_t set = obj->cpuset;
 		int ret;
-		
+
 		hwloc_bitmap_singlify(set);
 		ret = hwloc_set_cpubind (config->topology.hwtopology, set,
 					 HWLOC_CPUBIND_THREAD);
@@ -1621,7 +1821,7 @@ unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned
 	struct _starpu_sched_ctx_list *l = NULL;
 	struct _starpu_sched_ctx *sched_ctx = NULL;
 	for (l = worker->sched_ctx_list; l; l = l->next)
-	{ 
+	{
 		 sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
 		if(sched_ctx-> main_master == workerid && sched_ctx->nesting_sched_ctx == sched_ctx_id)
 			return sched_ctx->id;
@@ -1630,6 +1830,18 @@ unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned
 
 }
 
+struct _starpu_sched_ctx *_starpu_sched_ctx_get_sched_ctx_for_worker_and_job(struct _starpu_worker *worker, struct _starpu_job *j)
+{ /* Return the context in worker->sched_ctx_list whose id equals j->task->sched_ctx, or NULL if none matches. NOTE(review): the prototype's comment speaks of the worker being the master of a parallel context; no master check is done here - confirm callers do not rely on one. */
+	struct _starpu_sched_ctx_list *l = NULL;
+	for (l = worker->sched_ctx_list; l; l = l->next) /* walk every context listed for this worker */
+	{
+		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
+		if (j->task->sched_ctx == sched_ctx->id) /* compare against the context id stored in the task */
+			return sched_ctx;
+	}
+	return NULL; /* the task's context id is not in this worker's list */
+}
+
 void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double ready_flops)
 {
         _starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx_id);
@@ -1863,7 +2075,7 @@ static int _starpu_sched_ctx_find_master(unsigned sched_ctx_id, int *workerids,
                 if (master > -1)
 		{
                         int already_seen = 0;
-                        //Could create a function for this. Basically searching an element in an array.                                                                                                             
+                        //Could create a function for this. Basically searching an element in an array.
                         for (i = 0 ; i < npotential_masters; i++)
                         {
                                 if (potential_masters[i] == master)
@@ -1881,7 +2093,7 @@ static int _starpu_sched_ctx_find_master(unsigned sched_ctx_id, int *workerids,
 
         for (i = 0 ; i < npotential_masters ; i++) {
 		int master_is_in_section = 0;
-		//Could create a function for this. Basically searching an element in an array.                                                                                                                     
+		//Could create a function for this. Basically searching an element in an array.
 		for (w = 0 ; w < nworkers ; w++)
 		{
 			if (workerids[w] == potential_masters[i])
@@ -1917,7 +2129,7 @@ static void _starpu_sched_ctx_add_workers_to_master(unsigned sched_ctx_id, int *
 	int nwake_up = 0;
 	int put_to_sleep[nworkers];
 	int wake_up[nworkers];
-	
+
 	for(w = 0 ; w < nworkers ; w++)
 	{
 		int master = sched_ctx->master[workerids[w]];
@@ -1934,9 +2146,19 @@ static void _starpu_sched_ctx_add_workers_to_master(unsigned sched_ctx_id, int *
 
 }
 
+static void _starpu_sched_ctx_set_master(struct _starpu_sched_ctx *sched_ctx, int *workerids, int nworkers, int master)
+{ /* Store `master` in sched_ctx->master[] for each of the `nworkers` ids found in `workerids`. */
+	int i;
+	for(i = 0; i < nworkers; i++)
+	{
+		if(workerids[i] != master) /* the master's own slot is left untouched */
+			sched_ctx->master[workerids[i]] = master;
+	}
+}
+
 int starpu_sched_ctx_book_workers_for_task(unsigned sched_ctx_id, int *workerids, int nworkers)
-{ 
-	int new_master = _starpu_sched_ctx_find_master(sched_ctx_id, workerids, nworkers);	
+{
+	int new_master = _starpu_sched_ctx_find_master(sched_ctx_id, workerids, nworkers);
 	_starpu_sched_ctx_add_workers_to_master(sched_ctx_id, workerids, nworkers, new_master);
 	return new_master;
 }
@@ -1947,12 +2169,20 @@ void starpu_sched_ctx_unbook_workers_for_task(unsigned sched_ctx_id, int master)
 	_starpu_sched_ctx_wake_up_workers(sched_ctx_id, master);
 }
 
+struct starpu_perfmodel_arch * _starpu_sched_ctx_get_perf_archtype(unsigned sched_ctx_id)
+{ /* Return the address of the perf_arch member of the context identified by sched_ctx_id. */
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	return &sched_ctx->perf_arch; /* points into the context struct itself, not at a copy */
+}
+
 int starpu_sched_ctx_get_worker_rank(unsigned sched_ctx_id)
 {
 	int idx = 0;
 	int curr_workerid = starpu_worker_get_id();
 	int worker;
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	if(sched_ctx->sched_policy || !sched_ctx->awake_workers)
+		return -1;
 	struct starpu_worker_collection *workers = sched_ctx->workers;
 
 	struct starpu_sched_ctx_iterator it;

+ 12 - 1
src/core/sched_ctx.h

@@ -150,6 +150,13 @@ struct _starpu_sched_ctx
 	/* ctx nesting the current ctx */
 	unsigned nesting_sched_ctx;
 
+	/* perf model for the device comb of the ctx */
+	struct starpu_perfmodel_arch perf_arch;
+
+	/* for ctxs without policy: flag to indicate that we want to get
+	   the threads to sleep in order to replace them with other threads or leave
+	   them awake & use them in the parallel code*/
+	unsigned awake_workers;
 };
 
 struct _starpu_machine_config;
@@ -160,7 +167,7 @@ void _starpu_init_all_sched_ctxs(struct _starpu_machine_config *config);
 /* allocate all structures belonging to a context */
 struct _starpu_sched_ctx*  _starpu_create_sched_ctx(struct starpu_sched_policy *policy, int *workerid, int nworkerids, unsigned is_init_sched, const char *sched_name,
 						    int min_prio_set, int min_prio,
-						    int max_prio_set, int max_prio);
+						    int max_prio_set, int max_prio, unsigned awake_workers);
 
 /* delete all sched_ctx */
 void _starpu_delete_all_sched_ctxs();
@@ -224,10 +231,14 @@ void _starpu_fetch_tasks_from_empty_ctx_list(struct _starpu_sched_ctx *sched_ctx
 
 unsigned _starpu_sched_ctx_allow_hypervisor(unsigned sched_ctx_id);
 
+struct starpu_perfmodel_arch * _starpu_sched_ctx_get_perf_archtype(unsigned sched_ctx);
 #ifdef STARPU_USE_SC_HYPERVISOR
 /* Notifies the hypervisor that a tasks was poped from the workers' list */
 void _starpu_sched_ctx_post_exec_task_cb(int workerid, struct starpu_task *task, size_t data_size, uint32_t footprint);
 
 #endif //STARPU_USE_SC_HYPERVISOR
 
+/* if the worker is the master of a parallel context, and the job is meant to be executed on this parallel context, return a pointer to the context */
+struct _starpu_sched_ctx *_starpu_sched_ctx_get_sched_ctx_for_worker_and_job(struct _starpu_worker *worker, struct _starpu_job *j);
+
 #endif // __SCHED_CONTEXT_H__

+ 31 - 2
src/core/sched_policy.c

@@ -438,7 +438,7 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 
 	_starpu_profiling_set_task_push_start_time(task);
 
-	int ret;
+	int ret = 0;
 	if (STARPU_UNLIKELY(task->execute_on_a_specific_worker))
 	{
 		unsigned node = starpu_worker_get_memory_node(task->workerid);
@@ -469,7 +469,36 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 
 		if(!sched_ctx->sched_policy)
 		{
-			ret = _starpu_push_task_on_specific_worker(task, sched_ctx->main_master);
+			if(!sched_ctx->awake_workers)
+				ret = _starpu_push_task_on_specific_worker(task, sched_ctx->main_master);
+			else
+			{
+				struct starpu_worker_collection *workers = sched_ctx->workers;
+				
+				struct _starpu_job *job = _starpu_get_job_associated_to_task(task);
+				job->task_size = workers->nworkers;
+				job->combined_workerid = -1; // workerid; its a ctx not combined worker
+				job->active_task_alias_count = 0;
+
+				STARPU_PTHREAD_BARRIER_INIT(&job->before_work_barrier, NULL, workers->nworkers);
+				STARPU_PTHREAD_BARRIER_INIT(&job->after_work_barrier, NULL, workers->nworkers);
+				
+				/* Note: we have to call that early, or else the task may have
+				 * disappeared already */
+				starpu_push_task_end(task);
+
+				unsigned workerid;
+				struct starpu_sched_ctx_iterator it;
+				if(workers->init_iterator)
+					workers->init_iterator(workers, &it);
+
+				while(workers->has_next(workers, &it))
+				{
+					workerid = workers->get_next(workers, &it);
+					struct starpu_task *alias = starpu_task_dup(task);
+					ret |= _starpu_push_task_on_specific_worker(alias, workerid);
+				}
+			}
 		}
 		else
 		{

+ 18 - 15
src/core/task.c

@@ -288,10 +288,13 @@ int _starpu_submit_job(struct _starpu_job *j)
 	   && sched_ctx->perf_counters != NULL)
 	{
 		struct starpu_perfmodel_arch arch;
-		arch.type = STARPU_CPU_WORKER;
-		arch.devid = 0;
-		arch.ncore = 0;
+		arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+		arch.ndevices = 1;
+		arch.devices[0].type = STARPU_CPU_WORKER;
+		arch.devices[0].devid = 0;
+		arch.devices[0].ncores = 1;
 		_starpu_compute_buffers_footprint(j->task->cl->model, &arch, 0, j);
+		free(arch.devices);
 		int i;
 		size_t data_size = 0;
 		if (j->task->cl)
@@ -543,11 +546,11 @@ int starpu_task_submit(struct starpu_task *task)
 			_starpu_detect_implicit_data_deps(task);
 		}
 
-		if (task->cl->model && task->cl->model->symbol)
-			_starpu_load_perfmodel(task->cl->model);
+		if (task->cl->model)
+			_starpu_init_and_load_perfmodel(task->cl->model);
 
-		if (task->cl->power_model && task->cl->power_model->symbol)
-			_starpu_load_perfmodel(task->cl->power_model);
+		if (task->cl->power_model)
+			_starpu_init_and_load_perfmodel(task->cl->power_model);
 	}
 
 	if (bundle)
@@ -562,11 +565,11 @@ int starpu_task_submit(struct starpu_task *task)
 
 		while (entry)
 		{
-			if (entry->task->cl->model && entry->task->cl->model->symbol)
-				_starpu_load_perfmodel(entry->task->cl->model);
+			if (entry->task->cl->model)
+				_starpu_init_and_load_perfmodel(entry->task->cl->model);
 
-			if (entry->task->cl->power_model && entry->task->cl->power_model->symbol)
-				_starpu_load_perfmodel(entry->task->cl->power_model);
+			if (entry->task->cl->power_model)
+				_starpu_init_and_load_perfmodel(entry->task->cl->power_model);
 
 			entry = entry->next;
 		}
@@ -626,10 +629,10 @@ int _starpu_task_submit_nodeps(struct starpu_task *task)
 	if (task->cl)
 	{
 		if (task->cl->model)
-			_starpu_load_perfmodel(task->cl->model);
+			_starpu_init_and_load_perfmodel(task->cl->model);
 
 		if (task->cl->power_model)
-			_starpu_load_perfmodel(task->cl->power_model);
+			_starpu_init_and_load_perfmodel(task->cl->power_model);
 	}
 
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
@@ -687,10 +690,10 @@ int _starpu_task_submit_conversion_task(struct starpu_task *task,
 
 	/* We should factorize that */
 	if (task->cl->model)
-		_starpu_load_perfmodel(task->cl->model);
+		_starpu_init_and_load_perfmodel(task->cl->model);
 
 	if (task->cl->power_model)
-		_starpu_load_perfmodel(task->cl->power_model);
+		_starpu_init_and_load_perfmodel(task->cl->power_model);
 
 	/* We retain handle reference count */
 	unsigned i;

+ 31 - 22
src/core/topology.c

@@ -334,7 +334,6 @@ _starpu_init_mic_topology (struct _starpu_machine_config *config, long mic_idx)
 	topology->nhwmiccores[mic_idx] = nbcores;
 }
 
-
 static int
 _starpu_init_mic_node (struct _starpu_machine_config *config, int mic_idx,
 		       COIENGINE *coi_handle, COIPROCESS *coi_process)
@@ -403,8 +402,6 @@ _starpu_init_mic_node (struct _starpu_machine_config *config, int mic_idx,
 }
 #endif
 
-
-
 static void
 _starpu_init_topology (struct _starpu_machine_config *config)
 {
@@ -732,7 +729,6 @@ _starpu_init_mic_config (struct _starpu_machine_config *config,
 	topology->nworkers += topology->nmiccores[mic_idx];
     }
 
-
 #ifdef STARPU_USE_MIC
 static COIENGINE handles[2];
 static COIPROCESS process[2];
@@ -870,12 +866,15 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 		for (i = 0; i < nworker_per_cuda; i++)
 		{
 			int worker_idx = topology->nworkers + cudagpu * nworker_per_cuda + i;
+
 			config->workers[worker_idx].arch = STARPU_CUDA_WORKER;
-			config->workers[worker_idx].perf_arch.type = STARPU_CUDA_WORKER;
-			config->workers[worker_idx].perf_arch.devid = devid;
+			config->workers[worker_idx].perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+			config->workers[worker_idx].perf_arch.ndevices = 1;
+			config->workers[worker_idx].perf_arch.devices[0].type = STARPU_CUDA_WORKER;
+			config->workers[worker_idx].perf_arch.devices[0].devid = devid;
 			// TODO: fix perfmodels etc.
 			//config->workers[worker_idx].perf_arch.ncore = nworker_per_cuda - 1;
-			config->workers[worker_idx].perf_arch.ncore = 0;
+			config->workers[worker_idx].perf_arch.devices[0].ncores = 1;
 			config->workers[worker_idx].devid = devid;
 			config->workers[worker_idx].subworkerid = i;
 			config->workers[worker_idx].worker_mask = STARPU_CUDA;
@@ -948,9 +947,11 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 			break;
 		}
 		config->workers[worker_idx].arch = STARPU_OPENCL_WORKER;
-		config->workers[worker_idx].perf_arch.type = STARPU_OPENCL_WORKER;
-		config->workers[worker_idx].perf_arch.devid = devid;
-		config->workers[worker_idx].perf_arch.ncore = 0;
+		config->workers[worker_idx].perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+		config->workers[worker_idx].perf_arch.ndevices = 1;
+		config->workers[worker_idx].perf_arch.devices[0].type = STARPU_OPENCL_WORKER;
+		config->workers[worker_idx].perf_arch.devices[0].devid = devid;
+		config->workers[worker_idx].perf_arch.devices[0].ncores = 1;
 		config->workers[worker_idx].subworkerid = 0;
 		config->workers[worker_idx].devid = devid;
 		config->workers[worker_idx].worker_mask = STARPU_OPENCL;
@@ -1010,9 +1011,12 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 	{
 		config->workers[topology->nworkers + sccdev].arch = STARPU_SCC_WORKER;
 		int devid = _starpu_get_next_scc_deviceid(config);
-		config->workers[topology->nworkers + sccdev].perf_arch.type = STARPU_SCC_WORKER;
-		config->workers[topology->nworkers + sccdev].perf_arch.devid = sccdev;
-		config->workers[topology->nworkers + sccdev].perf_arch.ncore = 0;
+		config->workers[topology->nworkers + sccdev].perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device)); /* one-entry device array; the cast must be to a pointer type, as for the CPU/CUDA/OpenCL workers */
+		config->workers[topology->nworkers + sccdev].perf_arch.ndevices = 1;
+
+		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].type = STARPU_SCC_WORKER;
+		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].devid = sccdev;
+		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].ncores = 1; /* field is `ncores` in struct starpu_perfmodel_device, matching the other worker kinds */
 		config->workers[topology->nworkers + sccdev].subworkerid = 0;
 		config->workers[topology->nworkers + sccdev].devid = devid;
 		config->workers[topology->nworkers + sccdev].worker_mask = STARPU_SCC;
@@ -1076,9 +1080,11 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 	{
 		int worker_idx = topology->nworkers + cpu;
 		config->workers[worker_idx].arch = STARPU_CPU_WORKER;
-		config->workers[worker_idx].perf_arch.type = STARPU_CPU_WORKER;
-		config->workers[worker_idx].perf_arch.devid = 0;
-		config->workers[worker_idx].perf_arch.ncore = 0;
+		config->workers[worker_idx].perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+		config->workers[worker_idx].perf_arch.ndevices = 1;
+		config->workers[worker_idx].perf_arch.devices[0].type = STARPU_CPU_WORKER;
+		config->workers[worker_idx].perf_arch.devices[0].devid = 0;
+		config->workers[worker_idx].perf_arch.devices[0].ncores = 1;
 		config->workers[worker_idx].subworkerid = 0;
 		config->workers[worker_idx].devid = cpu;
 		config->workers[worker_idx].worker_mask = STARPU_CPU;
@@ -1096,8 +1102,6 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 	return 0;
 }
 
-
-
 void
 _starpu_bind_thread_on_cpu (
 	struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED,
@@ -1168,7 +1172,6 @@ _starpu_bind_thread_on_cpu (
 #endif
 }
 
-
 void
 _starpu_bind_thread_on_cpus (
 	struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED,
@@ -1211,7 +1214,6 @@ _starpu_bind_thread_on_cpus (
 #endif
 }
 
-
 static void
 _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_config STARPU_ATTRIBUTE_UNUSED)
 {
@@ -1467,7 +1469,6 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 	}
 }
 
-
 int
 _starpu_build_topology (struct _starpu_machine_config *config, int no_mp_config)
 {
@@ -1544,8 +1545,9 @@ _starpu_destroy_topology (
 	unsigned worker;
 	for (worker = 0; worker < config->topology.nworkers; worker++)
 	{
-#ifdef STARPU_HAVE_HWLOC
 		struct _starpu_worker *workerarg = &config->workers[worker];
+		free(workerarg->perf_arch.devices);
+#ifdef STARPU_HAVE_HWLOC
 		hwloc_bitmap_free(workerarg->hwloc_cpu_set);
 		if (workerarg->bindid != -1)
 		{
@@ -1561,6 +1563,13 @@ _starpu_destroy_topology (
 #endif
 	}
 
+	unsigned combined_worker_id;
+	for(combined_worker_id=0 ; combined_worker_id < config->topology.ncombinedworkers ; combined_worker_id++)
+	{
+		struct _starpu_combined_worker *combined_worker = &config->combined_workers[combined_worker_id];
+		free(combined_worker->perf_arch.devices);
+	}
+
 #ifdef STARPU_HAVE_HWLOC
 	hwloc_topology_destroy(config->topology.hwtopology);
 #endif

+ 10 - 2
src/core/workers.c

@@ -287,7 +287,15 @@ static inline int _starpu_can_use_nth_implementation(enum starpu_worker_archtype
 int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
-	if(sched_ctx->parallel_sect[workerid]) return 0;
+
+	/* if the task can't be parallel don't submit it to a ctx */
+	unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx->id);
+        if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+		if(!task->possibly_parallel) return 0;
+
+	/* if the worker is blocked in a parallel ctx don't submit tasks on it */
+	if(sched_ctx->parallel_sect[workerid] ) return 0;
+
 	/* TODO: check that the task operand sizes will fit on that device */
 	return (task->cl->where & config.workers[workerid].worker_mask) &&
 		_starpu_can_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl) &&
@@ -1192,7 +1200,7 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 	if (!is_a_sink)
 	{
 		struct starpu_sched_policy *selected_policy = _starpu_select_sched_policy(&config, config.conf->sched_policy_name);
-		_starpu_create_sched_ctx(selected_policy, NULL, -1, 1, "init", 0, 0, 0, 0);
+		_starpu_create_sched_ctx(selected_policy, NULL, -1, 1, "init", 0, 0, 0, 0, 1);
 	}
 
 	_starpu_initialize_registered_performance_models();

+ 1 - 0
src/datawizard/coherency.c

@@ -26,6 +26,7 @@
 #include <math.h>
 #include <core/task.h>
 #include <starpu_scheduler.h>
+#include <core/workers.h>
 
 static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node);
 int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)

+ 25 - 22
src/datawizard/footprint.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2011, 2013-2014  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -38,7 +38,7 @@ uint32_t starpu_task_data_footprint(struct starpu_task *task)
 	return footprint;
 }
 
-uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned nimpl, struct _starpu_job *j)
+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, struct _starpu_job *j)
 {
 	if (j->footprint_is_computed)
 		return j->footprint;
@@ -47,27 +47,30 @@ uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struc
 
 	struct starpu_task *task = j->task;
 
-	if (model != NULL && model->footprint != NULL)
+	if (model)
 	{
-		footprint = model->footprint(task);
-	}
-	else if (model != NULL && model->per_arch &&
-			model->per_arch[arch->type] != NULL &&
-			model->per_arch[arch->type][arch->devid] != NULL &&
-			model->per_arch[arch->type][arch->devid][arch->ncore] != NULL &&
-			model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base)
-	{
-		size_t size = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base(task, arch, nimpl);
-		footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
-	}
-	else if (model && model->size_base)
-	{
-		size_t size = model->size_base(task, nimpl);
-		footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
-	}
-	else
-	{
-		footprint = starpu_task_data_footprint(task);
+		if (model->footprint)
+		{
+			footprint = model->footprint(task);
+		}
+		else
+		{
+			struct starpu_perfmodel_per_arch *per_arch = starpu_perfmodel_get_model_per_arch(model, arch, nimpl);
+			if (per_arch != NULL && per_arch->size_base)
+			{
+				size_t size = per_arch->size_base(task, arch, nimpl);
+				footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
+			}
+			else if (model->size_base)
+			{
+				size_t size = model->size_base(task, nimpl);
+				footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
+			}
+			else
+			{
+				footprint = starpu_task_data_footprint(task);
+			}
+		}
 	}
 
 	j->footprint = footprint;

+ 69 - 56
src/debug/traces/starpu_fxt.c

@@ -341,7 +341,7 @@ static void thread_pop_state(double time, const char *prefix, long unsigned int
 }
 
 #ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
-static void worker_set_detailed_state(double time, const char *prefix, long unsigned int workerid, const char *name, unsigned long size, const char *parameters, unsigned long footprint, unsigned long long tag)
+static void worker_set_detailed_state(double time, const char *prefix, long unsigned int workerid, const char *name, unsigned long size, const char *parameters, unsigned long footprint, unsigned long long tag, unsigned long job_id)
 {
 #ifdef STARPU_HAVE_POTI
 	char container[STARPU_POTI_STR_LEN];
@@ -349,7 +349,7 @@ static void worker_set_detailed_state(double time, const char *prefix, long unsi
 	/* TODO: set detailed state */
 	poti_SetState(time, container, "WS", name);
 #else
-	fprintf(out_paje_file, "20	%.9f	%sw%lu	WS	%s	%lu	%s	%08lx	%016llx\n", time, prefix, workerid, name, size, parameters, footprint, tag);
+	fprintf(out_paje_file, "20	%.9f	%sw%lu	WS	%s	%lu	%s	%08lx	%016llx	%lu\n", time, prefix, workerid, name, size, parameters, footprint, tag, job_id);
 #endif
 }
 #endif
@@ -422,6 +422,8 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
 	char *kindstr = "";
 	struct starpu_perfmodel_arch arch;
+	arch.ndevices = 1;
+	arch.devices = (struct starpu_perfmodel_device *)malloc(sizeof(struct starpu_perfmodel_device));
 
 	switch (ev->param[0])
 	{
@@ -432,37 +434,37 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 		case _STARPU_FUT_CPU_KEY:
 			set_next_cpu_worker_color(workerid);
 			kindstr = "CPU";
-			arch.type = STARPU_CPU_WORKER;
-			arch.devid = 0;
-			arch.ncore = 0;
+			arch.devices[0].type = STARPU_CPU_WORKER;
+			arch.devices[0].devid = 0;
+			arch.devices[0].ncores = 1;
 			break;
 		case _STARPU_FUT_CUDA_KEY:
 			set_next_cuda_worker_color(workerid);
 			kindstr = "CUDA";
-			arch.type = STARPU_CUDA_WORKER;
-			arch.devid = devid;
-			arch.ncore = 0;
+			arch.devices[0].type = STARPU_CUDA_WORKER;
+			arch.devices[0].devid = devid;
+			arch.devices[0].ncores = 1;
 			break;
 		case _STARPU_FUT_OPENCL_KEY:
 			set_next_opencl_worker_color(workerid);
 			kindstr = "OPENCL";
-			arch.type = STARPU_OPENCL_WORKER;
-			arch.devid = devid;
-			arch.ncore = 0;
+			arch.devices[0].type = STARPU_OPENCL_WORKER;
+			arch.devices[0].devid = devid;
+			arch.devices[0].ncores = 1;
 			break;
 		case _STARPU_FUT_MIC_KEY:
 			set_next_mic_worker_color(workerid);
 			kindstr = "mic";
-			arch.type = STARPU_MIC_WORKER;
-			arch.devid = devid;
-			arch.ncore = 0;
+			arch.devices[0].type = STARPU_MIC_WORKER;
+			arch.devices[0].devid = devid;
+			arch.devices[0].ncores = 1;
 			break;
 		case _STARPU_FUT_SCC_KEY:
 			set_next_scc_worker_color(workerid);
 			kindstr = "scc";
-			arch.type = STARPU_SCC_WORKER;
-			arch.devid = devid;
-			arch.ncore = 0;
+			arch.devices[0].type = STARPU_SCC_WORKER;
+			arch.devices[0].devid = devid;
+			arch.devices[0].ncores = 1;
 			break;
 		default:
 			STARPU_ABORT();
@@ -498,7 +500,7 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 		thread_set_state(get_event_time_stamp(ev, options), prefix, threadid, "I");
 
 	if (activity_file)
-	fprintf(activity_file, "name\t%d\t%s %d\n", workerid, kindstr, devid);
+		fprintf(activity_file, "name\t%d\t%s %d\n", workerid, kindstr, devid);
 
 	snprintf(options->worker_names[workerid], 256, "%s %d", kindstr, devid);
 	options->worker_archtypes[workerid] = arch;
@@ -749,6 +751,7 @@ static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 {
 #ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
 	int worker = ev->param[5];
+	unsigned long job_id = ev->param[6];
 
 	unsigned sched_ctx = ev->param[1];
 	if (worker < 0) return;
@@ -760,12 +763,15 @@ static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 		int i;
 		char parameters[256];
 		size_t eaten = 0;
+		if (!last_codelet_parameter[worker])
+			eaten += snprintf(parameters + eaten, sizeof(parameters) - eaten, "nodata");
+		else
 		for (i = 0; i < last_codelet_parameter[worker] && i < MAX_PARAMETERS; i++)
 		{
 			eaten += snprintf(parameters + eaten, sizeof(parameters) - eaten, "%s%s", i?"_":"", last_codelet_parameter_description[worker][i]);
 		}
 
-		worker_set_detailed_state(last_codelet_start[worker], prefix, ev->param[5], last_codelet_symbol[worker], ev->param[2], parameters, ev->param[3], ev->param[4]);
+		worker_set_detailed_state(last_codelet_start[worker], prefix, worker, last_codelet_symbol[worker], ev->param[2], parameters, ev->param[3], ev->param[4], job_id);
 		if (sched_ctx != 0)
 		{
 #ifdef STARPU_HAVE_POTI
@@ -775,7 +781,7 @@ static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 			worker_container_alias(container, STARPU_POTI_STR_LEN, prefix, ev->param[5]);
 			poti_SetState(last_codelet_start[worker], container, ctx, last_codelet_symbol[worker]);
 #else
-			fprintf(out_paje_file, "20	%.9f	%sw%"PRIu64"	Ctx%d	%s	%08lx	%lu	%016llx\n", last_codelet_start[worker], prefix, ev->param[2], sched_ctx, last_codelet_symbol[worker], (unsigned long) ev->param[2], (unsigned long) ev->param[3], (unsigned long long) ev->param[4]);
+			fprintf(out_paje_file, "20	%.9f	%sw%"PRIu64"	Ctx%d	%s	%lu	%s	%08lx	%016llx	%lu\n", last_codelet_start[worker], prefix, ev->param[5], sched_ctx, last_codelet_symbol[worker], (unsigned long) ev->param[2], parameters, (unsigned long) ev->param[3], (unsigned long long) ev->param[4], job_id); /* the "w<id>" container is named after the worker id (param[5]), as in the POTI branch; param[2] is the codelet size */
 #endif
 		}
 	}
@@ -787,7 +793,7 @@ static struct starpu_fxt_codelet_event *dumped_codelets;
 
 static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
-	int worker = ev->param[6];
+	int worker = ev->param[3];
 	if (worker < 0) return;
 
 	char *prefix = options->file_prefix;
@@ -798,15 +804,15 @@ static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 	uint32_t codelet_hash = ev->param[2];
 
 	if (out_paje_file)
-		worker_set_state(end_codelet_time, prefix, ev->param[6], "I");
+		worker_set_state(end_codelet_time, prefix, worker, "I");
 
 	double codelet_length = (end_codelet_time - last_codelet_start[worker]);
 
 	update_accumulated_time(worker, 0.0, codelet_length, end_codelet_time, 0);
 
 	if (distrib_time)
-	fprintf(distrib_time, "%s\t%s%d\t%ld\t%"PRIx32"\t%.9f\n", last_codelet_symbol[worker],
-			prefix, worker, (unsigned long) codelet_size, codelet_hash, codelet_length);
+	     fprintf(distrib_time, "%s\t%s%d\t%ld\t%"PRIx32"\t%.9f\n", last_codelet_symbol[worker],
+		     prefix, worker, (unsigned long) codelet_size, codelet_hash, codelet_length);
 
 	if (options->dumped_codelets)
 	{
@@ -815,10 +821,7 @@ static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 
 		snprintf(dumped_codelets[dumped_codelets_count - 1].symbol, 256, "%s", last_codelet_symbol[worker]);
 		dumped_codelets[dumped_codelets_count - 1].workerid = worker;
-		dumped_codelets[dumped_codelets_count - 1].arch.type = ev->param[3];
-		dumped_codelets[dumped_codelets_count - 1].arch.devid = ev->param[4];
-		dumped_codelets[dumped_codelets_count - 1].arch.ncore = ev->param[5];
-
+		snprintf(dumped_codelets[dumped_codelets_count - 1].perfmodel_archname, 256, "%s", (char *)&ev->param[4]);
 		dumped_codelets[dumped_codelets_count - 1].size = codelet_size;
 		dumped_codelets[dumped_codelets_count - 1].hash = codelet_hash;
 		dumped_codelets[dumped_codelets_count - 1].time = codelet_length;
@@ -1055,10 +1058,10 @@ static void handle_work_stealing(struct fxt_ev_64 *ev, struct starpu_fxt_options
 	unsigned src = ev->param[1];
 	unsigned size = 0;
 	unsigned comid = 0;
-	
+
 	char *prefix = options->file_prefix;
 
-	
+
 	if (out_paje_file)
 	{
 		double time = get_event_time_stamp(ev, options);
@@ -1603,12 +1606,8 @@ void _starpu_fxt_display_bandwidth(struct starpu_fxt_options *options)
 	}
 }
 
-/*
- *	Public functions
- */
-
 static
-void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *options)
+void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *options)
 {
 	/* Open the trace file */
 	int fd_in;
@@ -1795,6 +1794,9 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				     handle_data_copy();
 				break;
 
+			case _STARPU_FUT_DATA_LOAD:
+			     	break;
+
 			case _STARPU_FUT_START_DRIVER_COPY:
 				if (!options->no_bus)
 					handle_start_driver_copy(&ev, options);
@@ -2062,6 +2064,16 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				handle_hypervisor_end(&ev, options);
 				break;
 
+			/* We can safely ignore FUT internal events */
+			case FUT_SETUP_CODE:
+			case FUT_CALIBRATE0_CODE:
+			case FUT_CALIBRATE1_CODE:
+			case FUT_CALIBRATE2_CODE:
+			case FUT_KEYCHANGE_CODE:
+			case FUT_NEW_LWP_CODE:
+			case FUT_GCC_INSTRUMENT_ENTRY_CODE:
+				break;
+
 			default:
 #ifdef STARPU_VERBOSE
 				fprintf(stderr, "unknown event.. %x at time %llx WITH OFFSET %llx\n",
@@ -2094,7 +2106,7 @@ void starpu_fxt_options_init(struct starpu_fxt_options *options)
 }
 
 static
-void starpu_fxt_distrib_file_init(struct starpu_fxt_options *options)
+void _starpu_fxt_distrib_file_init(struct starpu_fxt_options *options)
 {
 	dumped_codelets_count = 0;
 	dumped_codelets = NULL;
@@ -2110,7 +2122,7 @@ void starpu_fxt_distrib_file_init(struct starpu_fxt_options *options)
 }
 
 static
-void starpu_fxt_distrib_file_close(struct starpu_fxt_options *options)
+void _starpu_fxt_distrib_file_close(struct starpu_fxt_options *options)
 {
 	if (distrib_time)
 		fclose(distrib_time);
@@ -2123,7 +2135,7 @@ void starpu_fxt_distrib_file_close(struct starpu_fxt_options *options)
 }
 
 static
-void starpu_fxt_activity_file_init(struct starpu_fxt_options *options)
+void _starpu_fxt_activity_file_init(struct starpu_fxt_options *options)
 {
 	if (options->activity_path)
 		activity_file = fopen(options->activity_path, "w+");
@@ -2132,14 +2144,14 @@ void starpu_fxt_activity_file_init(struct starpu_fxt_options *options)
 }
 
 static
-void starpu_fxt_activity_file_close(void)
+void _starpu_fxt_activity_file_close(void)
 {
 	if (activity_file)
 		fclose(activity_file);
 }
 
 static
-void starpu_fxt_paje_file_init(struct starpu_fxt_options *options)
+void _starpu_fxt_paje_file_init(struct starpu_fxt_options *options)
 {
 	/* create a new file */
 	if (options->out_paje_path)
@@ -2164,13 +2176,14 @@ void starpu_fxt_paje_file_init(struct starpu_fxt_options *options)
 }
 
 static
-void starpu_fxt_paje_file_close(void)
+void _starpu_fxt_paje_file_close(void)
 {
 	if (out_paje_file)
 		fclose(out_paje_file);
 }
 
-static uint64_t starpu_fxt_find_start_time(char *filename_in)
+static
+uint64_t _starpu_fxt_find_start_time(char *filename_in)
 {
 	/* Open the trace file */
 	int fd_in;
@@ -2209,24 +2222,24 @@ static uint64_t starpu_fxt_find_start_time(char *filename_in)
 void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 {
 	_starpu_fxt_dag_init(options->dag_path);
-	starpu_fxt_distrib_file_init(options);
-	starpu_fxt_activity_file_init(options);
+	_starpu_fxt_distrib_file_init(options);
+	_starpu_fxt_activity_file_init(options);
 
-	starpu_fxt_paje_file_init(options);
+	_starpu_fxt_paje_file_init(options);
 
 	if (options->ninputfiles == 0)
 	{
-	     return;
+		return;
 	}
 	else if (options->ninputfiles == 1)
 	{
 		/* we usually only have a single trace */
-		uint64_t file_start_time = starpu_fxt_find_start_time(options->filenames[0]);
+		uint64_t file_start_time = _starpu_fxt_find_start_time(options->filenames[0]);
 		options->file_prefix = "";
 		options->file_offset = file_start_time;
 		options->file_rank = -1;
 
-		starpu_fxt_parse_new_file(options->filenames[0], options);
+		_starpu_fxt_parse_new_file(options->filenames[0], options);
 	}
 	else
 	{
@@ -2259,7 +2272,7 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 		/* Compute all start_k */
 		for (inputfile = 0; inputfile < options->ninputfiles; inputfile++)
 		{
-			uint64_t file_start = starpu_fxt_find_start_time(options->filenames[inputfile]);
+			uint64_t file_start = _starpu_fxt_find_start_time(options->filenames[inputfile]);
 			start_k[inputfile] = file_start;
 		}
 
@@ -2267,9 +2280,9 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 		for (inputfile = 0; inputfile < options->ninputfiles; inputfile++)
 		{
 			int ret = _starpu_fxt_mpi_find_sync_point(options->filenames[inputfile],
-						&sync_k[inputfile],
-						&unique_keys[inputfile],
-						&rank_k[inputfile]);
+								  &sync_k[inputfile],
+								  &unique_keys[inputfile],
+								  &rank_k[inputfile]);
 			if (ret == -1)
 			{
 				/* There was no sync point, we assume there is no offset */
@@ -2326,7 +2339,7 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 			options->file_offset = offsets[inputfile];
 			options->file_rank = filerank;
 
-			starpu_fxt_parse_new_file(options->filenames[inputfile], options);
+			_starpu_fxt_parse_new_file(options->filenames[inputfile], options);
 		}
 
 		/* display the MPI transfers if possible */
@@ -2337,9 +2350,9 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 	_starpu_fxt_display_bandwidth(options);
 
 	/* close the different files */
-	starpu_fxt_paje_file_close();
-	starpu_fxt_activity_file_close();
-	starpu_fxt_distrib_file_close(options);
+	_starpu_fxt_paje_file_close();
+	_starpu_fxt_activity_file_close();
+	_starpu_fxt_distrib_file_close(options);
 
 	_starpu_fxt_dag_terminate();
 

+ 1 - 0
src/debug/traces/starpu_paje.c

@@ -140,6 +140,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	fprintf(file, "%%	Params	string\n");
 	fprintf(file, "%%	Footprint	string\n");
 	fprintf(file, "%%	Tag	string\n");
+	fprintf(file, "%%	JobId	string\n");
 	fprintf(file, "%%EndEventDef\n");
 #endif
 #endif

+ 16 - 6
src/drivers/cpu/driver_cpu.c

@@ -240,13 +240,23 @@ int _starpu_cpu_driver_run_once(struct _starpu_worker *cpu_worker)
 		rank = j->active_task_alias_count++;
 		STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 
-		struct _starpu_combined_worker *combined_worker;
-		combined_worker = _starpu_get_combined_worker_struct(j->combined_workerid);
+		if(j->combined_workerid != -1)
+		{
+			struct _starpu_combined_worker *combined_worker;
+			combined_worker = _starpu_get_combined_worker_struct(j->combined_workerid);
+			
+			cpu_worker->combined_workerid = j->combined_workerid;
+			cpu_worker->worker_size = combined_worker->worker_size;
+			cpu_worker->current_rank = rank;
+			perf_arch = &combined_worker->perf_arch;
+		}
+		else
+		{
+			struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(cpu_worker, j);
+			STARPU_ASSERT_MSG(sched_ctx != NULL, "there should be a worker %d in the ctx of this job \n", cpu_worker->workerid);
 
-		cpu_worker->combined_workerid = j->combined_workerid;
-		cpu_worker->worker_size = combined_worker->worker_size;
-		cpu_worker->current_rank = rank;
-		perf_arch = &combined_worker->perf_arch;
+			perf_arch = &sched_ctx->perf_arch;
+		}
 	}
 	else
 	{

+ 8 - 1
src/drivers/cuda/driver_cuda.c

@@ -490,7 +490,14 @@ static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *wor
 
 	_starpu_driver_end_job(worker, j, &worker->perf_arch, &codelet_end, 0, profiling);
 
-	_starpu_driver_update_job_feedback(j, worker, &worker->perf_arch, &j->cl_start, &codelet_end, profiling);
+	struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
+	if(!sched_ctx)
+		sched_ctx = _starpu_get_sched_ctx_struct(j->task->sched_ctx);
+
+	if(!sched_ctx->sched_policy)
+		_starpu_driver_update_job_feedback(j, worker, &sched_ctx->perf_arch, &j->cl_start, &codelet_end, profiling);
+	else
+		_starpu_driver_update_job_feedback(j, worker, &worker->perf_arch, &j->cl_start, &codelet_end, profiling);
 
 	_starpu_push_task_output(j);
 

+ 79 - 10
src/drivers/driver_common/driver_common.c

@@ -74,7 +74,34 @@ void _starpu_driver_start_job(struct _starpu_worker *worker, struct _starpu_job
 	if (starpu_top)
 		_starpu_top_task_started(task,workerid,codelet_start);
 
-	_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
+
+	// Find out if the worker is the master of a parallel context
+	struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
+	if(!sched_ctx)
+		sched_ctx = _starpu_get_sched_ctx_struct(j->task->sched_ctx);
+	if(!sched_ctx->sched_policy)
+	{
+		if(!sched_ctx->awake_workers && sched_ctx->main_master == worker->workerid)
+		{
+			struct starpu_worker_collection *workers = sched_ctx->workers;
+			struct starpu_sched_ctx_iterator it;
+
+			if (workers->init_iterator)
+				workers->init_iterator(workers, &it);
+			while (workers->has_next(workers, &it))
+			{
+				int _workerid = workers->get_next(workers, &it);
+				if (_workerid != workerid)
+				{
+					struct _starpu_worker *_worker = _starpu_get_worker_struct(_workerid);
+					_starpu_driver_start_job(_worker, j, &_worker->perf_arch, codelet_start, rank, profiling);
+				}
+			}
+		}
+		_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, &sched_ctx->perf_arch, workerid);
+	}
+	else
+		_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
 }
 
 void _starpu_driver_end_job(struct _starpu_worker *worker, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
@@ -86,7 +113,22 @@ void _starpu_driver_end_job(struct _starpu_worker *worker, struct _starpu_job *j
 	int workerid = worker->workerid;
 	unsigned calibrate_model = 0;
 
-	_STARPU_TRACE_END_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
+	// Find out if the worker is the master of a parallel context
+	struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
+	unsigned worker_left_ctx = 0;
+	if(!sched_ctx)
+		sched_ctx = _starpu_get_sched_ctx_struct(j->task->sched_ctx);
+
+	if (!sched_ctx->sched_policy)
+	{
+		_starpu_perfmodel_create_comb_if_needed(&(sched_ctx->perf_arch));
+		_STARPU_TRACE_END_CODELET_BODY(j, j->nimpl, &(sched_ctx->perf_arch), workerid);
+	}
+	else
+	{
+		_starpu_perfmodel_create_comb_if_needed(perf_arch);
+		_STARPU_TRACE_END_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
+	}
 
 	if (cl && cl->model && cl->model->benchmarking)
 		calibrate_model = 1;
@@ -104,7 +146,27 @@ void _starpu_driver_end_job(struct _starpu_worker *worker, struct _starpu_job *j
 		_starpu_top_task_ended(task,workerid,codelet_end);
 
 	_starpu_set_worker_status(worker, STATUS_UNKNOWN);
+
+	if(!sched_ctx->sched_policy && !sched_ctx->awake_workers &&
+	   sched_ctx->main_master == worker->workerid)
+	{
+		struct starpu_worker_collection *workers = sched_ctx->workers;
+		struct starpu_sched_ctx_iterator it;
+
+		if (workers->init_iterator)
+			workers->init_iterator(workers, &it);
+		while (workers->has_next(workers, &it))
+		{
+			int _workerid = workers->get_next(workers, &it);
+			if (_workerid != workerid)
+			{
+				struct _starpu_worker *_worker = _starpu_get_worker_struct(_workerid);
+				_starpu_driver_end_job(_worker, j, &_worker->perf_arch, codelet_end, rank, profiling);
+			}
+		}
+	}
 }
+
 void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker,
 					struct starpu_perfmodel_arch* perf_arch,
 					struct timespec *codelet_start, struct timespec *codelet_end, int profiling)
@@ -117,6 +179,8 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 	int calibrate_model = 0;
 	int updated = 0;
 
+	_starpu_perfmodel_create_comb_if_needed(perf_arch);
+
 #ifndef STARPU_SIMGRID
 	if (cl->model && cl->model->benchmarking)
 		calibrate_model = 1;
@@ -257,10 +321,10 @@ static void _starpu_worker_set_status_wakeup(int workerid)
 static void _starpu_exponential_backoff(struct _starpu_worker *worker)
 {
 	int delay = worker->spinning_backoff;
-	
+
 	if (worker->spinning_backoff < BACKOFF_MAX)
-		worker->spinning_backoff<<=1; 
-	
+		worker->spinning_backoff<<=1;
+
 	while(delay--)
 		STARPU_UYIELD();
 }
@@ -285,6 +349,9 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 			if(sched_ctx && sched_ctx->id > 0 && sched_ctx->id < STARPU_NMAX_SCHED_CTXS)
 			{
 				STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->parallel_sect_mutex[workerid]);
+				if(!sched_ctx->sched_policy && sched_ctx->awake_workers) 
+					worker->slave = sched_ctx->main_master != workerid;
+
 				if(sched_ctx->parallel_sect[workerid])
 				{
 					/* don't let the worker sleep with the sched_mutex taken */
@@ -442,10 +509,13 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 					STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 					workers[i].current_rank = j->active_task_alias_count++;
 					STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
-					
-					combined_worker = _starpu_get_combined_worker_struct(j->combined_workerid);
-					workers[i].combined_workerid = j->combined_workerid;
-					workers[i].worker_size = combined_worker->worker_size;
+
+					if(j->combined_workerid != -1)
+					{
+						combined_worker = _starpu_get_combined_worker_struct(j->combined_workerid);
+						workers[i].combined_workerid = j->combined_workerid;
+						workers[i].worker_size = combined_worker->worker_size;
+					}
 				}
 				else
 				{
@@ -520,4 +590,3 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 
 	return count;
 }
-

+ 6 - 1
src/drivers/opencl/driver_opencl.c

@@ -926,7 +926,12 @@ static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker
 
 	_starpu_driver_end_job(worker, j, &worker->perf_arch, &codelet_end, 0, profiling);
 
-	_starpu_driver_update_job_feedback(j, worker, &worker->perf_arch, &j->cl_start, &codelet_end, profiling);
+	struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
+	STARPU_ASSERT_MSG(sched_ctx != NULL, "there should be a worker %d in the ctx of this job \n", worker->workerid);
+	if(!sched_ctx->sched_policy)
+		_starpu_driver_update_job_feedback(j, worker, &sched_ctx->perf_arch, &j->cl_start, &codelet_end, profiling);
+	else
+		_starpu_driver_update_job_feedback(j, worker, &worker->perf_arch, &j->cl_start, &codelet_end, profiling);
 
 	_starpu_push_task_output(j);
 

+ 12 - 12
src/profiling/bound.c

@@ -426,7 +426,7 @@ static void _starpu_get_tasks_times(int nw, int nt, double *times)
 				.footprint = tp->footprint,
 				.footprint_is_computed = 1,
 			};
-			struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
+			struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
 			double length = _starpu_history_based_job_expected_perf(tp->cl->model, arch, &j, j.nimpl);
 			if (isnan(length))
 				times[w*nt+t] = NAN;
@@ -512,15 +512,15 @@ void starpu_bound_print_lp(FILE *output)
 			};
 			for (w = 0; w < nw; w++)
 			{
-				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
-				if (_STARPU_IS_ZERO(t1->duration[arch->type][arch->devid][arch->ncore]))
+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
+				if (_STARPU_IS_ZERO(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
 				{
 					double length = _starpu_history_based_job_expected_perf(t1->cl->model, arch, &j,j.nimpl);
 					if (isnan(length))
 						/* Avoid problems with binary coding of doubles */
-						t1->duration[arch->type][arch->devid][arch->ncore] = NAN;
+						t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores] = NAN;
 					else
-						t1->duration[arch->type][arch->devid][arch->ncore] = length / 1000.;
+						t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores] = length / 1000.;
 				}
 			}
 			nt++;
@@ -545,8 +545,8 @@ void starpu_bound_print_lp(FILE *output)
 		{
 			for (w = 0; w < nw; w++)
 			{
-				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
-				if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
+				if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
 					fprintf(output, " +t%luw%d", t1->id, w);
 			}
 			fprintf(output, " = 1;\n");
@@ -559,9 +559,9 @@ void starpu_bound_print_lp(FILE *output)
 			fprintf(output, "/* %s %x */\tc%lu = s%lu", _starpu_codelet_get_model_name(t1->cl), (unsigned) t1->footprint, t1->id, t1->id);
 			for (w = 0; w < nw; w++)
 			{
-				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
-				if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
-					fprintf(output, " + %f t%luw%d", t1->duration[arch->type][arch->devid][arch->ncore], t1->id, w);
+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
+				if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
+					fprintf(output, " + %f t%luw%d", t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores], t1->id, w);
 			}
 			fprintf(output, ";\n");
 		}
@@ -642,8 +642,8 @@ void starpu_bound_print_lp(FILE *output)
 				{
 					for (w = 0; w < nw; w++)
 					{
-						struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
-						if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
+						struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
+						if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
 						{
 							fprintf(output, "s%lu - c%lu >= -3e5 + 1e5 t%luw%d + 1e5 t%luw%d + 1e5 t%luafter%lu;\n",
 									t1->id, t2->id, t1->id, w, t2->id, w, t1->id, t2->id);

+ 6 - 6
src/sched_policies/component_best_implementation.c

@@ -23,7 +23,7 @@
 /* return true if workerid can execute task, and fill task->predicted and task->predicted_transfer
  *  according to best implementation predictions
  */
-static int find_best_impl(struct starpu_task * task, int workerid)
+static int find_best_impl(unsigned sched_ctx_id, struct starpu_task * task, int workerid)
 {
 	double len = DBL_MAX;
 	int best_impl = -1;
@@ -32,7 +32,7 @@ static int find_best_impl(struct starpu_task * task, int workerid)
 	{
 		if(starpu_worker_can_execute_task(workerid, task, impl))
 		{
-			struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid);
+			struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
 			double d = starpu_task_expected_length(task, archtype, impl);
 			if(isnan(d))
 			{
@@ -61,20 +61,20 @@ static int find_best_impl(struct starpu_task * task, int workerid)
 /* set implementation, task->predicted and task->predicted_transfer with the first worker of workers that can execute that task
  * or have to be calibrated
  */
-static void select_best_implementation_and_set_preds(struct starpu_bitmap * workers, struct starpu_task * task)
+static void select_best_implementation_and_set_preds(unsigned sched_ctx_id, struct starpu_bitmap * workers, struct starpu_task * task)
 {
 	int workerid;
 	for(workerid = starpu_bitmap_first(workers);
 	    -1 != workerid;
 	    workerid = starpu_bitmap_next(workers, workerid))
-		if(find_best_impl(task, workerid))
+		if(find_best_impl(sched_ctx_id, task, workerid))
 			break;
 }
 
 static int best_implementation_push_task(struct starpu_sched_component * component, struct starpu_task * task)
 {
 	STARPU_ASSERT(component->nchildren == 1);
-	select_best_implementation_and_set_preds(component->workers_in_ctx, task);
+	select_best_implementation_and_set_preds(component->tree->sched_ctx_id, component->workers_in_ctx, task);
 	return component->children[0]->push_task(component->children[0],task);
 }
 
@@ -100,7 +100,7 @@ static struct starpu_task * best_implementation_pull_task(struct starpu_sched_co
 	}
 	if(task)
 		/* this worker can execute this task as it was returned by a pop*/
-		(void)find_best_impl(task, starpu_worker_get_id());
+		(void)find_best_impl(component->tree->sched_ctx_id, task, starpu_worker_get_id());
 	return task;
 }
 

+ 1 - 1
src/sched_policies/component_eager_calibration.c

@@ -29,7 +29,7 @@ static int eager_calibration_push_task(struct starpu_sched_component * component
 	    workerid != -1;
 	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
 	{
-		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid);
+		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, component->tree->sched_ctx_id);
 		int nimpl;
 		for(nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{

+ 2 - 2
src/sched_policies/component_fifo.c

@@ -66,7 +66,7 @@ static double fifo_estimated_load(struct starpu_sched_component * component)
 	if(STARPU_SCHED_COMPONENT_IS_HOMOGENEOUS(component))
 	{		
 		int first_worker = starpu_bitmap_first(component->workers_in_ctx);
-		relative_speedup = starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(first_worker));
+		relative_speedup = starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(first_worker, component->tree->sched_ctx_id));
 		STARPU_PTHREAD_MUTEX_LOCK(mutex);
 		load = fifo->ntasks / relative_speedup;
 		STARPU_PTHREAD_MUTEX_UNLOCK(mutex);
@@ -78,7 +78,7 @@ static double fifo_estimated_load(struct starpu_sched_component * component)
 		for(i = starpu_bitmap_first(component->workers_in_ctx);
 		    i != -1;
 		    i = starpu_bitmap_next(component->workers_in_ctx, i))
-			relative_speedup += starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(i));
+			relative_speedup += starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(i, component->tree->sched_ctx_id));
 		relative_speedup /= starpu_bitmap_cardinal(component->workers_in_ctx);
 		STARPU_ASSERT(!_STARPU_IS_ZERO(relative_speedup));
 		STARPU_PTHREAD_MUTEX_LOCK(mutex);

+ 2 - 2
src/sched_policies/component_prio.c

@@ -79,7 +79,7 @@ static double prio_estimated_load(struct starpu_sched_component * component)
 	if(STARPU_SCHED_COMPONENT_IS_HOMOGENEOUS(component))
 	{		
 		int first_worker = starpu_bitmap_first(component->workers_in_ctx);
-		relative_speedup = starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(first_worker));
+		relative_speedup = starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(first_worker, component->tree->sched_ctx_id));
 		STARPU_PTHREAD_MUTEX_LOCK(mutex);
 		load = prio->ntasks / relative_speedup;
 		STARPU_PTHREAD_MUTEX_UNLOCK(mutex);
@@ -91,7 +91,7 @@ static double prio_estimated_load(struct starpu_sched_component * component)
 		for(i = starpu_bitmap_first(component->workers_in_ctx);
 		    i != -1;
 		    i = starpu_bitmap_next(component->workers_in_ctx, i))
-			relative_speedup += starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(i));
+			relative_speedup += starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(i, component->tree->sched_ctx_id));
 		relative_speedup /= starpu_bitmap_cardinal(component->workers_in_ctx);
 		STARPU_ASSERT(!_STARPU_IS_ZERO(relative_speedup));
 		STARPU_PTHREAD_MUTEX_LOCK(mutex);

+ 1 - 1
src/sched_policies/component_random.c

@@ -26,7 +26,7 @@ static double compute_relative_speedup(struct starpu_sched_component * component
 	    id != -1;
 	    id = starpu_bitmap_next(component->workers_in_ctx, id))
 	{
-		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(id);
+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(id, component->tree->sched_ctx_id);
 		sum += starpu_worker_get_relative_speedup(perf_arch);
 
 	}

+ 2 - 2
src/sched_policies/component_sched.c

@@ -49,7 +49,7 @@ int starpu_sched_component_execute_preds(struct starpu_sched_component * compone
 	    workerid != -1;
 	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
 	{
-		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid);
+		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, component->tree->sched_ctx_id);
 		int nimpl;
 		for(nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
@@ -137,7 +137,7 @@ double starpu_sched_component_transfer_length(struct starpu_sched_component * co
 		else
 		{
 			sum += starpu_task_expected_data_transfer_time(memory_node, task);
-			/* sum += starpu_task_expected_conversion_time(task, starpu_worker_get_perf_archtype(worker), impl ?)
+			/* sum += starpu_task_expected_conversion_time(task, starpu_worker_get_perf_archtype(worker, component->tree->sched_ctx_id), impl ?)
 			 * I dont know what to do as we dont know what implementation would be used here...
 			 */
 		}

+ 1 - 1
src/sched_policies/component_work_stealing.c

@@ -211,7 +211,7 @@ double _ws_estimated_load(struct starpu_sched_component * component)
 	    -1 != workerid;
 	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
 	{
-		speedup += starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(workerid));
+		speedup += starpu_worker_get_relative_speedup(starpu_worker_get_perf_archtype(workerid, component->tree->sched_ctx_id));
 	}
 	
 	return ntasks / speedup;

+ 1 - 1
src/sched_policies/component_worker.c

@@ -557,7 +557,7 @@ static double simple_worker_estimated_load(struct starpu_sched_component * compo
 	int ntasks_in_fifo = l ? l->ntasks : 0;
 	return (double) (nb_task + ntasks_in_fifo)
 		/ starpu_worker_get_relative_speedup(
-				starpu_worker_get_perf_archtype(starpu_bitmap_first(component->workers)));
+				starpu_worker_get_perf_archtype(starpu_bitmap_first(component->workers), component->tree->sched_ctx_id));
 }
 
 static void _worker_component_deinit_data(struct starpu_sched_component * component)

+ 11 - 11
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -307,7 +307,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 
         /* Sometimes workers didn't take the tasks as early as we expected */
-	fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
+	fifo->exp_start = isnan(fifo->exp_start) ? starpu_timing_now() : STARPU_MAX(fifo->exp_start, starpu_timing_now());
 	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
 	if ((starpu_timing_now() + predicted_transfer) < fifo->exp_end)
@@ -417,10 +417,10 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 		worker = workers->get_next_master(workers, &it);
 		struct _starpu_fifo_taskq *fifo  = dt->queue_array[worker];
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
-		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
 
 		/* Sometimes workers didn't take the tasks as early as we expected */
-		double exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
+		double exp_start = isnan(fifo->exp_start) ? starpu_timing_now() : STARPU_MAX(fifo->exp_start, starpu_timing_now());
 
 		if (!starpu_worker_can_execute_task_impl(worker, task, &impl_mask))
 			continue;
@@ -558,12 +558,11 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 		worker = workers->get_next_master(workers, &it);
 
 		struct _starpu_fifo_taskq *fifo = dt->queue_array[worker];
-		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 
 		/* Sometimes workers didn't take the tasks as early as we expected */
-		double exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
-
+		double exp_start = isnan(fifo->exp_start) ? starpu_timing_now() : STARPU_MAX(fifo->exp_start, starpu_timing_now());
 		if (!starpu_worker_can_execute_task_impl(worker, task, &impl_mask))
 			continue;
 
@@ -588,6 +587,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 				local_task_length[worker_ctx][nimpl] = starpu_task_bundle_expected_length(bundle, perf_arch, nimpl);
 				local_data_penalty[worker_ctx][nimpl] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
 				local_power[worker_ctx][nimpl] = starpu_task_bundle_expected_power(bundle, perf_arch,nimpl);
+
 			}
 			else
 			{
@@ -758,12 +758,12 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 					selected_impl = nimpl;
 
 					//_STARPU_DEBUG("best fitness (worker %d) %e = alpha*(%e) + beta(%e) +gamma(%e)\n", worker, best_fitness, exp_end[worker][nimpl] - best_exp_end, local_data_penalty[worker][nimpl], local_power[worker][nimpl]);
+
 				}
 			}
 			worker_ctx++;
 		}
 	}
-
 	STARPU_ASSERT(forced_best != -1 || best != -1);
 
 	if (forced_best != -1)
@@ -778,7 +778,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	}
 	else if (task->bundle)
 	{
-		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(best_in_ctx);
+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(best_in_ctx, sched_ctx_id);
 		unsigned memory_node = starpu_worker_get_memory_node(best);
 		model_best = starpu_task_expected_length(task, perf_arch, selected_impl);
 		transfer_model_best = starpu_task_expected_data_transfer_time(memory_node, task);
@@ -791,7 +791,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
 	//_STARPU_DEBUG("Scheduler dmda: kernel (%u)\n", best_impl);
 	starpu_task_set_implementation(task, selected_impl);
-
+	
 	/* we should now have the best worker in variable "best" */
 	return push_task_on_best_worker(task, best, model_best, transfer_model_best, prio, sched_ctx_id);
 }
@@ -951,7 +951,7 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 	/* Compute the expected penality */
-	struct starpu_perfmodel_arch *perf_arch = starpu_worker_get_perf_archtype(perf_workerid);
+	struct starpu_perfmodel_arch *perf_arch = starpu_worker_get_perf_archtype(perf_workerid, sched_ctx_id);
 	unsigned memory_node = starpu_worker_get_memory_node(workerid);
 
 	double predicted = starpu_task_expected_length(task, perf_arch,
@@ -966,7 +966,7 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 	/* Update the predictions */
 	STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 	/* Sometimes workers didn't take the tasks as early as we expected */
-	fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
+	fifo->exp_start = isnan(fifo->exp_start) ? starpu_timing_now() : STARPU_MAX(fifo->exp_start, starpu_timing_now());
 	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
 	/* If there is no prediction available, we consider the task has a null length */

+ 4 - 4
src/sched_policies/parallel_heft.c

@@ -232,9 +232,9 @@ static double compute_expected_end(int workerid, double length)
 	}
 }
 
-static double compute_ntasks_end(int workerid)
+static double compute_ntasks_end(int workerid, unsigned sched_ctx_id)
 {
-	struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(workerid);
+	struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
 	starpu_pthread_mutex_t *sched_mutex;
 	starpu_pthread_cond_t *sched_cond;
 
@@ -350,14 +350,14 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 			}
 
 
-			struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
+			struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
 
 			local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch,nimpl);
 
 			unsigned memory_node = starpu_worker_get_memory_node(worker);
 			local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time(memory_node, task);
 
-			double ntasks_end = compute_ntasks_end(worker);
+			double ntasks_end = compute_ntasks_end(worker, sched_ctx_id);
 
 			if (ntasks_best == -1
 			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */

+ 2 - 2
src/sched_policies/random_policy.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2014  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -47,7 +47,7 @@ static int _random_push_task(struct starpu_task *task, unsigned prio)
 		unsigned impl;
 		if(starpu_worker_can_execute_task_first_impl(worker, task, &impl))
 		{
-			struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
+			struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
 			double speedup = starpu_worker_get_relative_speedup(perf_arch);
 			alpha_sum += speedup;
 			speedup_arr[size] = speedup;

+ 13 - 0
src/util/starpu_task_insert_utils.c

@@ -131,6 +131,10 @@ void _starpu_task_insert_get_args_size(va_list varg_list, unsigned *nbuffers, si
 		{
 			(void)va_arg(varg_list, int);
 		}
+		else if (arg_type==STARPU_POSSIBLY_PARALLEL)
+		{
+			(void)va_arg(varg_list, unsigned);
+		}
 		else if (arg_type==STARPU_FLOPS)
 		{
 			(void)va_arg(varg_list, double);
@@ -248,6 +252,10 @@ int _starpu_codelet_pack_args(void **arg_buffer, size_t arg_buffer_size, va_list
 		{
 			(void)va_arg(varg_list, int);
 		}
+		else if (arg_type==STARPU_POSSIBLY_PARALLEL)
+		{
+			(void)va_arg(varg_list, unsigned);
+		}
 		else if (arg_type==STARPU_FLOPS)
 		{
 			(void)va_arg(varg_list, double);
@@ -433,6 +441,11 @@ void _starpu_task_insert_create(void *arg_buffer, size_t arg_buffer_size, struct
 			int hypervisor_tag = va_arg(varg_list, int);
 			(*task)->hypervisor_tag = hypervisor_tag;
 		}
+		else if (arg_type==STARPU_POSSIBLY_PARALLEL)
+		{
+			unsigned possibly_parallel = va_arg(varg_list, unsigned);
+			(*task)->possibly_parallel = possibly_parallel;
+		}
 		else if (arg_type==STARPU_FLOPS)
 		{
 			double flops = va_arg(varg_list, double);

+ 1 - 1
src/worker_collection/worker_list.c

@@ -166,7 +166,7 @@ static int list_remove(struct starpu_worker_collection *workers, int worker)
 	_rearange_workerids(masters, nmasters);
 	if(found_master != -1)
 		workers->nmasters--;
-	printf("rem %d\n", found_worker);
+
 	return found_worker;
 }
 

+ 1 - 0
tests/Makefile.am

@@ -268,6 +268,7 @@ noinst_PROGRAMS =				\
 	perfmodels/user_base			\
 	perfmodels/valid_model			\
 	perfmodels/value_nan			\
+	perfmodels/memory			\
 	sched_policies/data_locality            \
 	sched_policies/execute_all_tasks        \
 	sched_policies/prio        		\

+ 6 - 4
tests/perfmodels/feed.c

@@ -73,15 +73,17 @@ int main(int argc, char **argv)
 		measured_slow = 0.001+size*0.0000001;
 
 		struct starpu_perfmodel_arch arch;
-		arch.type = STARPU_CUDA_WORKER;
-		arch.ncore = 0;
+		arch.ndevices = 1;
+		arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+		arch.devices[0].type = STARPU_CUDA_WORKER;
+		arch.devices[0].ncores = 0;
 		/* Simulate Fast GPU */
-		arch.devid = 0;
+		arch.devices[0].devid = 0;
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_fast);
 		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_fast);
 
 		/* Simulate Slow GPU */
-		arch.devid = 1;
+		arch.devices[0].devid = 1;
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_slow);
 		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_slow);
 		starpu_task_clean(&task);

+ 64 - 0
tests/perfmodels/memory.c

@@ -0,0 +1,64 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <config.h>
+#include <starpu.h>
+#include <core/perfmodel/perfmodel.h>
+#include "../helper.h"
+
+void func(void *descr[], void *arg) /* CPU kernel used by my_codelet; intentionally empty */
+{
+}
+
+static struct starpu_perfmodel my_model = /* performance model attached to my_codelet */
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "my_model",
+};
+
+static struct starpu_codelet my_codelet = /* codelet submitted by main(); CPU implementation only */
+{
+	.cpu_funcs = {func, NULL},
+	.model = &my_model /* use the performance model declared above */
+};
+
+/* Dummy cost function: returns its third argument `i' converted to double. */
+double cuda_cost_function(struct starpu_task *t, struct starpu_perfmodel_arch *a, unsigned i)
+{
+	/* The task and the architecture are deliberately ignored; the casts
+	 * silence unused-parameter warnings without leaving no-effect
+	 * expression statements behind. */
+	(void)t;
+	(void)a;
+	return (double)i;
+}
+
+/*
+ * Initialise StarPU, register a per-device cost function on my_model,
+ * submit one task using my_codelet, wait for it and shut StarPU down.
+ * Returns STARPU_TEST_SKIPPED when StarPU reports -ENODEV.
+ */
+int main(int argc, char **argv)
+{
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_perfmodel_init(NULL, &my_model);
+	starpu_perfmodel_set_per_devices_cost_function(&my_model, 0, cuda_cost_function, STARPU_CUDA_WORKER, 0, 1, -1);
+
+	ret = starpu_task_insert(&my_codelet, 0);
+	if (ret == -ENODEV)
+	{
+		/* StarPU was successfully initialised above: shut it down
+		 * before skipping the test. */
+		starpu_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+	/* Report the function that was actually called */
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+	starpu_task_wait_for_all();
+	starpu_shutdown();
+
+	return EXIT_SUCCESS;
+}

+ 1 - 1
tests/perfmodels/regression_based.c

@@ -128,7 +128,7 @@ static void show_task_perfs(int size, struct starpu_task *task)
 		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
 			FPRINTF(stdout, "Expected time for %d on %s (impl %d):\t%f\n",
-				size, name, nimpl, starpu_task_expected_length(task, starpu_worker_get_perf_archtype(workerid), nimpl));
+				size, name, nimpl, starpu_task_expected_length(task, starpu_worker_get_perf_archtype(workerid, task->sched_ctx), nimpl));
 		}
 	}
 }

+ 24 - 15
tests/perfmodels/valid_model.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,6 +16,7 @@
 
 #include <config.h>
 #include <starpu.h>
+#include <core/perfmodel/perfmodel.h>
 #include "../helper.h"
 
 void func(void *descr[], void *arg)
@@ -66,7 +67,6 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 	conf.sched_policy_name = "eager";
 	conf.calibrate = 1;
 
-
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
@@ -74,15 +74,19 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 	codelet->model = model;
 
 	old_nsamples = 0;
-	lmodel.is_init=0;
+	memset(&lmodel, 0, sizeof(struct starpu_perfmodel));
 	lmodel.type = model->type;
 	ret = starpu_perfmodel_load_symbol(codelet->model->symbol, &lmodel);
 	if (ret != 1)
-		for (archtype = 0; archtype < STARPU_NARCH; archtype++)
-			if(lmodel.per_arch[archtype] != NULL)
-				for(devid=0; lmodel.per_arch[archtype][devid] != NULL; devid++)
-					for(ncore=0; lmodel.per_arch[archtype][devid][ncore] != NULL; ncore++)
-						old_nsamples += lmodel.per_arch[archtype][devid][ncore][0].regression.nsample;
+	{
+		int i, impl;
+		for(i = 0; i < lmodel.state->ncombs; i++)
+		{
+			int comb = lmodel.state->combs[i];
+			for(impl = 0; impl < lmodel.state->nimpls[comb]; impl++)
+				old_nsamples += lmodel.state->per_arch[comb][impl].regression.nsample;
+		}
+	}
 
         starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, 100, sizeof(int));
 	for (loop = 0; loop < nloops; loop++)
@@ -105,13 +109,18 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 		starpu_shutdown();
 		return 1;
 	}
-
-	new_nsamples = 0;
-	for (archtype = 0; archtype < STARPU_NARCH; archtype++)
-		if(lmodel.per_arch[archtype] != NULL)
-			for(devid=0; lmodel.per_arch[archtype][devid] != NULL; devid++)
-				for(ncore=0; lmodel.per_arch[archtype][devid][ncore] != NULL; ncore++)
-					new_nsamples += lmodel.per_arch[archtype][devid][ncore][0].regression.nsample;
+	else
+	{
+		int i;
+		new_nsamples = 0;
+		for(i = 0; i < lmodel.state->ncombs; i++)
+		{
+			int comb = lmodel.state->combs[i];
+			int impl;
+			for(impl = 0; impl < lmodel.state->nimpls[comb]; impl++)
+			     new_nsamples += lmodel.state->per_arch[comb][impl].regression.nsample;
+		}
+	}
 
 	ret = starpu_perfmodel_unload_model(&lmodel);
 	starpu_shutdown();

+ 39 - 43
tests/sched_policies/simple_cpu_gpu_sched.c

@@ -17,6 +17,7 @@
 #include <starpu.h>
 #include <starpu_scheduler.h>
 #include "../helper.h"
+#include <core/perfmodel/perfmodel.h>
 
 /*
  * Schedulers that are aware of the expected task length provided by the
@@ -88,6 +89,7 @@ static struct starpu_perfmodel model_cpu_task =
 	.type = STARPU_PER_ARCH,
 	.symbol = "model_cpu_task"
 };
+
 static struct starpu_perfmodel model_gpu_task =
 {
 	.type = STARPU_PER_ARCH,
@@ -95,48 +97,39 @@ static struct starpu_perfmodel model_gpu_task =
 };
 
 static void
-init_perfmodels(void)
+init_perfmodels_gpu(int gpu_type)
 {
-	unsigned devid, ncore;
-
-	starpu_perfmodel_init(&model_cpu_task);
-	starpu_perfmodel_init(&model_gpu_task);
+	int nb_worker_gpu = starpu_worker_get_count_by_type(gpu_type);
+	int *worker_gpu_ids = malloc(nb_worker_gpu * sizeof(int));
+	int worker_gpu;
 
-	if(model_cpu_task.per_arch[STARPU_CPU_WORKER] != NULL)
+	starpu_worker_get_ids_by_type(gpu_type, worker_gpu_ids, nb_worker_gpu);
+	for(worker_gpu = 0 ; worker_gpu < nb_worker_gpu ; worker_gpu ++)
 	{
-		for(devid=0; model_cpu_task.per_arch[STARPU_CPU_WORKER][devid] != NULL; devid++)
-		{
-			for(ncore=0; model_cpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore] != NULL; ncore++)
-			{
-				model_cpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore][0].cost_function = cpu_task_cpu;
-				model_gpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore][0].cost_function = gpu_task_cpu;
-			}
-		}
-	}
+		starpu_perfmodel_set_per_devices_cost_function(&model_cpu_task, 0, cpu_task_gpu,
+							       gpu_type, starpu_worker_get_devid(worker_gpu_ids[worker_gpu]), 1,
+							       -1);
 
-	if(model_cpu_task.per_arch[STARPU_CUDA_WORKER] != NULL)
-	{
-		for(devid=0; model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid] != NULL; devid++)
-		{
-			for(ncore=0; model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore] != NULL; ncore++)
-			{
-				model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore][0].cost_function = cpu_task_gpu;
-				model_gpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore][0].cost_function = gpu_task_gpu;
-			}
-		}
+		starpu_perfmodel_set_per_devices_cost_function(&model_gpu_task, 0, gpu_task_gpu,
+							       gpu_type, starpu_worker_get_devid(worker_gpu_ids[worker_gpu]), 1,
+							       -1);
 	}
+}
 
-	if(model_cpu_task.per_arch[STARPU_OPENCL_WORKER] != NULL)
-	{
-		for(devid=0; model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid] != NULL; devid++)
-		{
-			for(ncore=0; model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore] != NULL; ncore++)
-			{
-				model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore][0].cost_function = cpu_task_gpu;
-				model_gpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore][0].cost_function = gpu_task_gpu;
-			}
-		}
-	}
+/*
+ * Initialise model_cpu_task and model_gpu_task, register their CPU cost
+ * functions, then register the GPU cost functions for CUDA and OpenCL
+ * workers through init_perfmodels_gpu().
+ */
+static void
+init_perfmodels(void)
+{
+	starpu_perfmodel_init(NULL, &model_cpu_task);
+	starpu_perfmodel_init(NULL, &model_gpu_task);
+
+	starpu_perfmodel_set_per_devices_cost_function(&model_cpu_task, 0, cpu_task_cpu, STARPU_CPU_WORKER, 0, 1, -1);
+	starpu_perfmodel_set_per_devices_cost_function(&model_gpu_task, 0, gpu_task_cpu, STARPU_CPU_WORKER, 0, 1, -1);
+
+	// We need to set the cost function for each combination with a CUDA or an OpenCL worker
+	init_perfmodels_gpu(STARPU_CUDA_WORKER);
+	init_perfmodels_gpu(STARPU_OPENCL_WORKER);
 }
 
 /*
@@ -166,17 +159,19 @@ run(struct starpu_sched_policy *policy)
 	struct starpu_conf conf;
 	starpu_conf_init(&conf);
 	conf.sched_policy = policy;
+
 	int ret = starpu_init(&conf);
 	if (ret == -ENODEV)
 		exit(STARPU_TEST_SKIPPED);
 
 	/* At least 1 CPU and 1 GPU are needed. */
-	if (starpu_cpu_worker_get_count() == 0) {
+	if (starpu_cpu_worker_get_count() == 0)
+	{
 		starpu_shutdown();
 		exit(STARPU_TEST_SKIPPED);
 	}
-	if (starpu_cuda_worker_get_count() == 0 &&
-	    starpu_opencl_worker_get_count() == 0) {
+	if (starpu_cuda_worker_get_count() == 0 && starpu_opencl_worker_get_count() == 0)
+	{
 		starpu_shutdown();
 		exit(STARPU_TEST_SKIPPED);
 	}
@@ -202,10 +197,9 @@ run(struct starpu_sched_policy *policy)
 	enum starpu_worker_archtype cpu_task_worker, gpu_task_worker;
 	cpu_task_worker = starpu_worker_get_type(cpu_task->profiling_info->workerid);
 	gpu_task_worker = starpu_worker_get_type(gpu_task->profiling_info->workerid);
-	if (cpu_task_worker != STARPU_CPU_WORKER ||
-			(gpu_task_worker != STARPU_CUDA_WORKER &&
-			 gpu_task_worker != STARPU_OPENCL_WORKER))
+	if (cpu_task_worker != STARPU_CPU_WORKER || (gpu_task_worker != STARPU_CUDA_WORKER && gpu_task_worker != STARPU_OPENCL_WORKER))
 	{
+		FPRINTF(stderr, "Tasks did not execute on expected worker\n");
 		if (cpu_task_worker != STARPU_CPU_WORKER)
 		{
 			FPRINTF(stderr, "The CPU task did not run on a CPU worker\n");
@@ -218,8 +212,10 @@ run(struct starpu_sched_policy *policy)
 		ret = 1;
 	}
 	else
+	{
+		FPRINTF(stderr, "Tasks DID execute on expected worker\n");
 		ret = 0;
-
+	}
 
 	starpu_task_destroy(cpu_task);
 	starpu_task_destroy(gpu_task);

+ 42 - 1
tools/gdbinit

@@ -2,7 +2,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2010-2014  Université de Bordeaux
-# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -475,6 +475,24 @@ define starpu-print-prequests
   end
 end
 
+define starpu-print-arch
+  set $arch = (struct starpu_perfmodel_arch *)$arg0
+  set $device = 0
+  while $device < $arch->ndevices
+    printf "  Device type %d - devid: %d - ncores: %d\n", $arch->devices[$device].type, $arch->devices[$device].devid, $arch->devices[$device].ncores
+    set $device = $device + 1
+  end
+end
+
+define starpu-print-archs
+  set $comb = 0
+  while $comb < current_arch_comb
+    printf "Combination %d with %d devices\n", $comb, arch_combs[$comb]->ndevices
+    starpu-print-arch arch_combs[$comb]
+    set $comb = $comb + 1
+  end
+end
+
 define starpu-print-frequests
   set $node = 0
   while $node < descr.nnodes
@@ -563,6 +581,25 @@ define starpu-memusage
   end
 end
 
+define starpu-print-model
+    set $model = (struct starpu_perfmodel *)$arg0
+    printf "Model %p type %d symbol ", $model, $model->type
+    if $model->symbol
+       printf "%s", $model->symbol
+    else
+       printf "NULL"
+    end
+    printf "\n"
+end
+
+define starpu-print-registered-models
+    set $node = registered_models
+    while $node
+    	  starpu-print-model $node->model
+	  set $node = $node->next
+    end
+end
+
 document starpu
 List of StarPU-specific gdb functions:
 starpu-workers          prints a list of the StarPU workers
@@ -580,4 +617,8 @@ starpu-print-frequests  prints all StarPU prefetch data requests
 starpu-tasks            prints a list of the tasks flowing in StarPU
 starpu-tags             prints a list of the tags known to StarPU
 starpu-memusage         prints the memory node usage
+starpu-print-archs      prints all known arch combinations
+starpu-print-arch       prints a given arch combination
+starpu-print-registered-models prints all registered performance models
+starpu-print-model      prints a given performance model
 end

+ 172 - 378
tools/starpu_perfmodel_plot.c

@@ -37,33 +37,33 @@
 
 #define PROGNAME "starpu_perfmodel_plot"
 
-/* display all available models */
-static int list = 0;
-/* what kernel ? */
-static char *symbol = NULL;
-/* which architecture ? (NULL = all)*/
-static char *archname = NULL;
-/* Unless a FxT file is specified, we just display the model */
-static int no_fxt_file = 1;
-static int gflops = 0;
-
-#ifdef STARPU_USE_FXT
-static struct starpu_fxt_codelet_event *dumped_codelets;
-static struct starpu_fxt_options options;
-#endif
+struct _perfmodel_plot_options
+{
+	/* display all available models */
+	int list;
+	/* what kernel ? */
+	char *symbol;
+	/* which combination */
+	int comb_is_set;
+	int comb;
+	/* display all available combinations of a specific model */
+	int list_combs;
+	int gflops;
+	/* Unless a FxT file is specified, we just display the model */
+	int with_fxt_file;
+
+	char avg_file_name[256];
 
 #ifdef STARPU_USE_FXT
-static int **archtype_is_found[STARPU_NARCH];
-
-static char data_file_name[256];
+	struct starpu_fxt_codelet_event *dumped_codelets;
+	struct starpu_fxt_options fxt_options;
+	char data_file_name[256];
 #endif
-static char avg_file_name[256];
-static char gnuplot_file_name[256];
+};
 
 static void usage()
 {
-	fprintf(stderr, "Draw a graph corresponding to the execution time of a \
-given perfmodel\n");
+	fprintf(stderr, "Draw a graph corresponding to the execution time of a given perfmodel\n");
 	fprintf(stderr, "Usage: %s [ options ]\n", PROGNAME);
         fprintf(stderr, "\n");
 	fprintf(stderr, "One must specify a symbol with the -s option or use -l\n");
@@ -72,25 +72,28 @@ given perfmodel\n");
         fprintf(stderr, "   -s <symbol>         specify the symbol\n");
 	fprintf(stderr, "   -f                  draw GFlops instead of time\n");
 	fprintf(stderr, "   -i <Fxt files>      input FxT files generated by StarPU\n");
-        fprintf(stderr, "   -a <arch>           specify the architecture (e.g. cpu, cpu:x, cuda, cuda_d, opencl, opencl_d)\n");
+	fprintf(stderr, "   -lc                 display all combinations of a given model\n");
+        fprintf(stderr, "   -c <combination>    specify the combination (use the option -lc to list all combinations of a given model)\n");
 	fprintf(stderr, "   -h, --help          display this help and exit\n");
 	fprintf(stderr, "   -v, --version       output version information and exit\n\n");
         fprintf(stderr, "Report bugs to <%s>.", PACKAGE_BUGREPORT);
         fprintf(stderr, "\n");
 }
 
-static void parse_args(int argc, char **argv)
+static void parse_args(int argc, char **argv, struct _perfmodel_plot_options *options)
 {
+	memset(options, 0, sizeof(struct _perfmodel_plot_options));
+
 #ifdef STARPU_USE_FXT
 	/* Default options */
-	starpu_fxt_options_init(&options);
+	starpu_fxt_options_init(&options->fxt_options);
 
-	options.out_paje_path = NULL;
-	options.activity_path = NULL;
-	options.distrib_time_path = NULL;
-	options.dag_path = NULL;
+	options->fxt_options.out_paje_path = NULL;
+	options->fxt_options.activity_path = NULL;
+	options->fxt_options.distrib_time_path = NULL;
+	options->fxt_options.dag_path = NULL;
 
-	options.dumped_codelets = &dumped_codelets;
+	options->fxt_options.dumped_codelets = &options->dumped_codelets;
 #endif
 
 	/* We want to support arguments such as "-i trace_*" */
@@ -101,7 +104,7 @@ static void parse_args(int argc, char **argv)
 	{
 		if (strcmp(argv[i], "-s") == 0)
 		{
-			symbol = argv[++i];
+			options->symbol = argv[++i];
 			continue;
 		}
 
@@ -109,8 +112,8 @@ static void parse_args(int argc, char **argv)
 		{
 			reading_input_filenames = 1;
 #ifdef STARPU_USE_FXT
-			options.filenames[options.ninputfiles++] = argv[++i];
-			no_fxt_file = 0;
+			options->fxt_options.filenames[options->fxt_options.ninputfiles++] = argv[++i];
+			options->with_fxt_file = 1;
 #else
 			fprintf(stderr, "Warning: FxT support was not enabled in StarPU: FxT traces will thus be ignored!\n");
 #endif
@@ -119,19 +122,26 @@ static void parse_args(int argc, char **argv)
 
 		if (strcmp(argv[i], "-l") == 0)
 		{
-			list = 1;
+			options->list = 1;
+			continue;
+		}
+
+		if (strcmp(argv[i], "-lc") == 0)
+		{
+			options->list_combs = 1;
 			continue;
 		}
 
 		if (strcmp(argv[i], "-f") == 0)
 		{
-			gflops = 1;
+			options->gflops = 1;
 			continue;
 		}
 
-		if (strcmp(argv[i], "-a") == 0)
+		if (strcmp(argv[i], "-c") == 0)
 		{
-			archname = argv[++i];
+			options->comb_is_set = 1;
+			options->comb = atoi(argv[++i]);
 			continue;
 		}
 
@@ -155,19 +165,18 @@ static void parse_args(int argc, char **argv)
 		if (reading_input_filenames)
 		{
 #ifdef STARPU_USE_FXT
-			options.filenames[options.ninputfiles++] = argv[i];
+			options->fxt_options.filenames[options->fxt_options.ninputfiles++] = argv[i];
 #endif
 			continue;
 		}
 	}
 
-	if (!symbol && !list)
+	if ((!options->symbol && !options->list) || (options->list_combs && !options->symbol))
 	{
 		fprintf(stderr, "Incorrect usage, aborting\n");
                 usage();
 		exit(-1);
 	}
-
 }
 
 static char *replace_char(char *str, char old, char new)
@@ -194,27 +203,22 @@ static void print_comma(FILE *gnuplot_file, int *first)
 	}
 }
 
-static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, int *first, unsigned nimpl)
+static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel_arch* arch, struct starpu_perfmodel_per_arch *arch_model, int impl, int *first, struct _perfmodel_plot_options *options)
 {
 	char arch_name[256];
-	starpu_perfmodel_get_arch_name(arch, arch_name, 256, nimpl);
-
-	struct starpu_perfmodel_per_arch *arch_model =
-		&model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
 
-	if (arch_model->regression.valid || arch_model->regression.nl_valid)
-		fprintf(stderr,"Arch: %s\n", arch_name);
+	starpu_perfmodel_get_arch_name(arch, arch_name, 256, impl);
 
 #ifdef STARPU_USE_FXT
-	if (!gflops && !no_fxt_file && archtype_is_found[arch->type][arch->devid][arch->ncore] && nimpl == 0)
+	if (!options->gflops && options->with_fxt_file && impl == 0)
 	{
 		print_comma(gnuplot_file, first);
-		fprintf(gnuplot_file, "\"< grep -w \\^%d_%d_%d %s\" using 2:3 title \"Profiling %s\"", arch->type, arch->devid, arch->ncore, data_file_name, arch_name);
+		fprintf(gnuplot_file, "\"< grep -w \\^%s %s\" using 2:3 title \"Profiling %s\"", arch_name, options->data_file_name, replace_char(arch_name, '_', '-'));
 	}
 #endif
 
 	/* Only display the regression model if we could actually build a model */
-	if (!gflops && arch_model->regression.valid && !arch_model->regression.nl_valid)
+	if (!options->gflops && arch_model->regression.valid && !arch_model->regression.nl_valid)
 	{
 		print_comma(gnuplot_file, first);
 
@@ -226,7 +230,7 @@ static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel *mode
 			arch_model->regression.alpha, arch_model->regression.beta, arch_name);
 	}
 
-	if (!gflops && arch_model->regression.nl_valid)
+	if (!options->gflops && arch_model->regression.nl_valid)
 	{
 		print_comma(gnuplot_file, first);
 
@@ -240,90 +244,37 @@ static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel *mode
 	}
 }
 
-static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_worker_archtype* type, int* devid, int* ncore, int *first)
+static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, int *first, struct _perfmodel_plot_options *options)
 {
-	char *command;
 	FILE *datafile;
 	struct starpu_perfmodel_history_list *ptr;
 	char arch_name[32];
 	int col;
-	size_t len;
 	unsigned long last, minimum = 0;
 
-	len = 10 + strlen(avg_file_name) + 1;
-	command = (char *) malloc(len);
-	datafile = fopen(avg_file_name, "w");
-	free(command);
-
+	datafile = fopen(options->avg_file_name, "w");
 	col = 2;
-	unsigned implid;
 
-	unsigned archmin, archmax, devmin, devmax, coremin, coremax;
-	if(type != NULL)
+	int i;
+	for(i = 0; i < model->state->ncombs; i++)
 	{
-		archmin = *type;
-		archmax = *type +1;
-		if(devid != NULL)
-		{
-			devmin = *devid;
-			devmax = *devid +1;
-			if(ncore != NULL)
-			{
-				coremin = *ncore;
-				coremax = *ncore +1;
-			}
-			else
-			{
-				coremin = 0;
-				coremax = 0;
-			}
-		}
-		else
+		int comb = model->state->combs[i];
+		if (options->comb_is_set == 0 || options->comb == comb)
 		{
-			devmin = 0;
-			devmax = 0;
-			coremin = 0;
-			coremax = 0;
-		}
-	}
-	else
-	{
-		archmin = 0;
-		archmax = STARPU_NARCH;
-		devmin = 0;
-		devmax = 0;
-		coremin = 0;
-		coremax = 0;
+			struct starpu_perfmodel_arch *arch;
+			int impl;
 
-	}
-	struct starpu_perfmodel_arch arch;
-	unsigned archtype, dev, core;
-	for (archtype = archmin; archtype < archmax; archtype++)
-	{
-		arch.type = archtype;
-		if(model->per_arch[archtype]!=NULL)
-		{
-			for(dev = devmin; model->per_arch[archtype][dev] != NULL && (devmax == 0 || dev < devmax);dev++)
+			arch = _starpu_arch_comb_get(comb);
+			for(impl = 0; impl < model->state->nimpls[comb]; impl++)
 			{
-				arch.devid = dev;
+				struct starpu_perfmodel_per_arch *arch_model = &model->state->per_arch[comb][impl];
+				starpu_perfmodel_get_arch_name(arch, arch_name, 32, impl);
 
-				for(core = coremin; model->per_arch[archtype][dev][core] != NULL && (coremax == 0 || core < coremax); core++)
+				if (arch_model->list)
 				{
-					arch.ncore = core;
-					for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-					{
-						struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[archtype][dev][core][implid];
-						starpu_perfmodel_get_arch_name(&arch, arch_name, 32, implid);
-
-						//ptrs[arch-arch1][implid] = ptr[arch-arch1][implid] = arch_model->list;
-
-						if (arch_model->list)
-						{
-							print_comma(gnuplot_file, first);
-							fprintf(gnuplot_file, "\"%s\" using 1:%d:%d with errorlines title \"Average %s\"", avg_file_name, col, col+1, replace_char(arch_name, '_', '-'));
-							col += 2;
-						}
-					}
+					print_comma(gnuplot_file, first);
+					fprintf(gnuplot_file, "\"%s\" using 1:%d:%d with errorlines title \"Average %s\"", options->avg_file_name, col, col+1, replace_char(arch_name, '_', '-'));
+					col += 2;
 				}
 			}
 		}
@@ -336,25 +287,20 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 
 		minimum = ULONG_MAX;
 		/* Get the next minimum */
-		for (archtype = archmin; archtype < archmax; archtype++)
+		for(i = 0; i < model->state->ncombs; i++)
 		{
-			if(model->per_arch[archtype]!=NULL)
+			int comb = model->state->combs[i];
+			if (options->comb_is_set == 0 || options->comb == comb)
 			{
-				for(dev = devmin; model->per_arch[archtype][dev] != NULL && (devmax == 0 || dev < devmax);dev++)
+				int impl;
+				for(impl = 0; impl < model->state->nimpls[comb]; impl++)
 				{
-					for(core = coremin; model->per_arch[archtype][dev][core] != NULL && (coremax == 0 || core < coremax); core++)
-				
+					struct starpu_perfmodel_per_arch *arch_model = &model->state->per_arch[comb][impl];
+					for (ptr = arch_model->list; ptr; ptr = ptr->next)
 					{
-						for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-						{
-							struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[archtype][dev][core][implid];
-							for (ptr = arch_model->list; ptr; ptr = ptr->next)
-							{
-								unsigned long size = ptr->entry->size;
-								if (size > last && size < minimum)
-									minimum = size;
-							}
-						}
+						unsigned long size = ptr->entry->size;
+						if (size > last && size < minimum)
+							minimum = size;
 					}
 				}
 			}
@@ -364,160 +310,98 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 
 		fprintf(stderr, "%lu ", minimum);
 		fprintf(datafile, "%-15lu ", minimum);
-		for (archtype = archmin; archtype < archmax; archtype++)
-			if(model->per_arch[archtype]!=NULL)
-				for(dev = devmin; model->per_arch[archtype][dev] != NULL && (devmax == 0 || dev < devmax);dev++)
-					for(core = coremin; model->per_arch[archtype][dev][core] != NULL && (coremax == 0 || core < coremax); core++)
-						for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+		for(i = 0; i < model->state->ncombs; i++)
+		{
+			int comb = model->state->combs[i];
+			if (options->comb_is_set == 0 || options->comb == comb)
+			{
+				int impl;
+
+				for(impl = 0; impl < model->state->nimpls[comb]; impl++)
+				{
+					struct starpu_perfmodel_per_arch *arch_model = &model->state->per_arch[comb][impl];
+					for (ptr = arch_model->list; ptr; ptr = ptr->next)
+					{
+						struct starpu_perfmodel_history_entry *entry = ptr->entry;
+						if (entry->size == minimum)
 						{
-							struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[archtype][dev][core][implid];
-							for (ptr = arch_model->list; ptr; ptr = ptr->next)
-							{
-								struct starpu_perfmodel_history_entry *entry = ptr->entry;
-								if (entry->size == minimum)
-								{
-									if (gflops)
-										fprintf(datafile, "\t%-15le\t%-15le", entry->flops / (entry->mean * 1000),
-												entry->flops / ((entry->mean + entry->deviation) * 1000) -
-												entry->flops / (entry->mean * 1000)
-										       );
-									else
-										fprintf(datafile, "\t%-15le\t%-15le", 0.001*entry->mean, 0.001*entry->deviation);
-									break;
-								}
-							}
-							if (!ptr && arch_model->list)
-								/* No value for this arch. */
-								fprintf(datafile, "\t\"\"\t\"\"");
+							if (options->gflops)
+								fprintf(datafile, "\t%-15le\t%-15le", entry->flops / (entry->mean * 1000),
+									entry->flops / ((entry->mean + entry->deviation) * 1000) -
+									entry->flops / (entry->mean * 1000)
+									);
+							else
+								fprintf(datafile, "\t%-15le\t%-15le", 0.001*entry->mean, 0.001*entry->deviation);
+							break;
 						}
+					}
+					if (!ptr && arch_model->list)
+						/* No value for this arch. */
+						fprintf(datafile, "\t\"\"\t\"\"");
+				}
+			}
+		}
 		fprintf(datafile, "\n");
 	}
 	fprintf(stderr, "\n");
-	fclose(datafile);
-}
-
-
-static void display_selected_arch_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, int *first)
-{
-	unsigned implid;
-	for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-		display_perf_model(gnuplot_file, model, arch, first, implid);
-}
-
-static void display_selected_device_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_worker_archtype archtype, int devid, int *first)
-{
-	unsigned ncore;
-	struct starpu_perfmodel_arch arch;
-	arch.type = archtype;
-	arch.devid = devid;
-	for(ncore=0; model->per_arch[archtype][devid][ncore] != NULL; ncore++)
-	{
-		arch.ncore = ncore;
-		display_selected_arch_perf_models(gnuplot_file,model,&arch,first);
-	}
-}
-
-static void display_selected_archtype_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_worker_archtype archtype, int *first)
-{
-	unsigned devid;
-	for(devid=0; model->per_arch[archtype][devid] != NULL; devid++)
-		display_selected_device_perf_models(gnuplot_file,model,archtype,devid,first);
-}
-
-static void display_all_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, int *first)
-{
-	unsigned archtype;
-	for(archtype = 0; archtype < STARPU_NARCH; archtype++)
-		display_selected_archtype_perf_models(gnuplot_file,model,archtype,first);
-}
-
-#ifdef STARPU_USE_FXT
-static int ** init_archtype_is_found_per_arch(int maxdevid, unsigned* maxncore_table)
-{
-	int devid, ncore;
-	int ** archtype_is_found_per_arch = malloc(sizeof(*archtype_is_found_per_arch)*(maxdevid+1));
-	archtype_is_found_per_arch[maxdevid] = NULL;
-	for(devid=0; devid<maxdevid; devid++)
-	{
-		int maxncore;
-		if(maxncore_table != NULL)
-			maxncore = maxncore_table[devid];
-		else
-			maxncore = 1;
-		
-		archtype_is_found_per_arch[devid] = malloc(sizeof(*archtype_is_found_per_arch[devid])*(maxncore+1));
-		archtype_is_found_per_arch[devid][maxncore] = 0;
-		for(ncore=0; ncore<maxncore; ncore++)
-			archtype_is_found_per_arch[devid][ncore] = 0;
-	}
-	return archtype_is_found_per_arch;
 
+	fclose(datafile);
 }
 
-
-static void init_archtype_is_found(struct starpu_perfmodel *model)
+static void display_all_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, int *first, struct _perfmodel_plot_options *options)
 {
-	unsigned archtype, devid, ndevice, ncore, *maxncore;
-
-	for(archtype = 0; archtype < STARPU_NARCH; archtype++)
+	int i;
+	for(i = 0; i < model->state->ncombs; i++)
 	{
-	
-		for(devid=0; model->per_arch[archtype][devid] != NULL; devid++)
-			;
-		ndevice = devid;
-		if(ndevice != 0)
+		int comb = model->state->combs[i];
+		if (options->comb_is_set == 0 || options->comb == comb)
 		{
-			maxncore = malloc(sizeof(*maxncore)*ndevice);
-			for(devid=0; devid < ndevice; devid++)
+			struct starpu_perfmodel_arch *arch;
+			int impl;
+
+			arch = _starpu_arch_comb_get(comb);
+			for(impl = 0; impl < model->state->nimpls[comb]; impl++)
 			{
-				for(ncore=0; model->per_arch[archtype][devid][ncore] != NULL; ncore++)
-					;
-				maxncore[devid] = ncore;
+				struct starpu_perfmodel_per_arch *archmodel = &model->state->per_arch[comb][impl];
+				display_perf_model(gnuplot_file, arch, archmodel, impl, first, options);
 			}
 		}
-		else
-		{
-			maxncore = NULL;
-		}
-
-		archtype_is_found[archtype] = init_archtype_is_found_per_arch(ndevice,maxncore);
-		if(maxncore != NULL)
-			free(maxncore);
 	}
 }
 
-
-static void dump_data_file(FILE *data_file, struct starpu_perfmodel *model)
+#ifdef STARPU_USE_FXT
+static void dump_data_file(FILE *data_file, struct _perfmodel_plot_options *options)
 {
-	init_archtype_is_found(model);
-
 	int i;
-	for (i = 0; i < options.dumped_codelets_count; i++)
+	for (i = 0; i < options->fxt_options.dumped_codelets_count; i++)
 	{
-		/* Dump only if the symbol matches user's request */
-		if (strncmp(dumped_codelets[i].symbol, symbol, (FXT_MAX_PARAMS - 4)*sizeof(unsigned long)-1) == 0)
+		/* Dump only if the codelet symbol matches user's request (with or without the machine name) */
+		char *tmp = strdup(options->symbol);
+		char *dot = strchr(tmp, '.');
+		if (dot) tmp[strlen(tmp)-strlen(dot)] = '\0';
+		if ((strncmp(options->dumped_codelets[i].symbol, options->symbol, (FXT_MAX_PARAMS - 4)*sizeof(unsigned long)-1) == 0)
+		    || (strncmp(options->dumped_codelets[i].symbol, tmp, (FXT_MAX_PARAMS - 4)*sizeof(unsigned long)-1) == 0))
 		{
-			struct starpu_perfmodel_arch* arch = &dumped_codelets[i].arch;
-			archtype_is_found[arch->type][arch->devid][arch->ncore] = 1;
-
-			size_t size = dumped_codelets[i].size;
-			float time = dumped_codelets[i].time;
+			char *archname = options->dumped_codelets[i].perfmodel_archname;
+			size_t size = options->dumped_codelets[i].size;
+			float time = options->dumped_codelets[i].time;
 
-			fprintf(data_file, "%d_%d_%d	%f	%f\n", arch->type, arch->devid, arch->ncore, (float)size, time);
+			fprintf(data_file, "%s	%f	%f\n", archname, (float)size, time);
 		}
+		free(tmp);
 	}
 }
 #endif
 
-static void display_selected_models(FILE *gnuplot_file, struct starpu_perfmodel *model)
+static void display_selected_models(FILE *gnuplot_file, struct starpu_perfmodel *model, struct _perfmodel_plot_options *options)
 {
 	fprintf(gnuplot_file, "#!/usr/bin/gnuplot -persist\n");
 	fprintf(gnuplot_file, "\n");
 	fprintf(gnuplot_file, "set term postscript eps enhanced color\n");
-	fprintf(gnuplot_file, "set output \"starpu_%s.eps\"\n", symbol);
-	fprintf(gnuplot_file, "set title \"Model for codelet %s\"\n", replace_char(symbol, '_', '-'));
+	fprintf(gnuplot_file, "set output \"starpu_%s.eps\"\n", options->symbol);
+	fprintf(gnuplot_file, "set title \"Model for codelet %s\"\n", replace_char(options->symbol, '_', '-'));
 	fprintf(gnuplot_file, "set xlabel \"Total data size\"\n");
-	if (gflops)
+	if (options->gflops)
 		fprintf(gnuplot_file, "set ylabel \"GFlops\"\n");
 	else
 		fprintf(gnuplot_file, "set ylabel \"Time (ms)\"\n");
@@ -529,134 +413,32 @@ static void display_selected_models(FILE *gnuplot_file, struct starpu_perfmodel
 
 	/* If no input data is given to gnuplot, we at least need to specify an
 	 * arbitrary range. */
-	if (no_fxt_file)
+	if (options->with_fxt_file == 0)
 		fprintf(gnuplot_file, "set xrange [1:10**9]\n\n");
 
 	int first = 1;
 	fprintf(gnuplot_file, "plot\t");
 
-	struct starpu_perfmodel_arch arch;
-	struct _starpu_machine_config *conf = _starpu_get_machine_config();
-
-
-
-	if (archname == NULL)
-	{
-		/* display all architectures */
-		display_all_perf_models(gnuplot_file, model, &first);
-		display_history_based_perf_models(gnuplot_file, model, NULL, NULL, NULL, &first);
-	}
-	else
-	{
-		if (strcmp(archname, "cpu") == 0)
-		{
-			
-			arch.type = STARPU_CPU_WORKER;
-			arch.devid = 1;
-			arch.ncore = 0;
-
-			display_selected_arch_perf_models(gnuplot_file, model, &arch, &first); 
-			display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
-			return;
-		}
-
-		unsigned k;
-		if (sscanf(archname, "cpu:%u", &k) == 1)
-		{
-			/* For combined CPU workers */
-			if ((k < 1) || (k > conf->topology.ncpus))
-			{
-				fprintf(stderr, "Invalid CPU size\n");
-				exit(-1);
-			}
-
-			arch.type = STARPU_CPU_WORKER;
-			arch.devid = 1;
-			arch.ncore = k - 1;
-
-			display_selected_arch_perf_models(gnuplot_file, model, &arch, &first); 
-			display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
-			return;
-		}
-
-		if (strcmp(archname, "cuda") == 0)
-		{
-			unsigned archtype = STARPU_CUDA_WORKER;
-			display_selected_archtype_perf_models(gnuplot_file, model, archtype, &first);
-			display_history_based_perf_models(gnuplot_file, model,  &archtype, NULL, NULL, &first);
-			return;
-		}
-
-		/* There must be a cleaner way ! */
-		unsigned gpuid;
-		int nmatched;
-		nmatched = sscanf(archname, "cuda_%u", &gpuid);
-		if (nmatched == 1)
-		{
-			if (gpuid < conf->topology.ncudagpus)
-			{
-				arch.type = STARPU_CUDA_WORKER;
-				arch.devid = gpuid;
-				arch.ncore = 0;
-
-				display_selected_arch_perf_models(gnuplot_file, model, &arch, &first);
-				display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
-				return;
-			}
-			else
-			{
-				fprintf(stderr, "Invalid CUDA device %d (last valid one is %d)\n", gpuid, STARPU_MAXCUDADEVS-1);
-				exit(-1);
-			}
-		}
-
-		if (strcmp(archname, "opencl") == 0)
-		{
-			unsigned archtype = STARPU_OPENCL_WORKER;
-			display_selected_archtype_perf_models(gnuplot_file, model, archtype, &first);
-			display_history_based_perf_models(gnuplot_file, model,  &archtype, NULL, NULL, &first);
-			return;
-		}
-
-		/* There must be a cleaner way ! */
-		nmatched = sscanf(archname, "opencl_%u", &gpuid);
-		if (nmatched == 1)
-		{
-			if (gpuid < conf->topology.nopenclgpus)
-			{
-				arch.type = STARPU_OPENCL_WORKER;
-				arch.devid = gpuid;
-				arch.ncore = 0;
-		
-				display_selected_arch_perf_models(gnuplot_file, model, &arch, &first);
-				display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
-				return;
-			}
-			else
-			{
-				fprintf(stderr, "Invalid OpenCL device %d (last valid one is %d)\n", gpuid, STARPU_MAXOPENCLDEVS-1);
-				exit(-1);
-			}
-		}
-
-		fprintf(stderr, "Unknown architecture requested, aborting.\n");
-		exit(-1);
-	}
+	/* display all or selected combinations */
+	display_all_perf_models(gnuplot_file, model, &first, options);
+	display_history_based_perf_models(gnuplot_file, model, &first, options);
 }
 
 int main(int argc, char **argv)
 {
-	int ret;
+	int ret = 0;
 	struct starpu_perfmodel model = {};
+	char gnuplot_file_name[256];
+	struct _perfmodel_plot_options options;
 
 #if defined(_WIN32) && !defined(__CYGWIN__)
 	WSADATA wsadata;
 	WSAStartup(MAKEWORD(1,0), &wsadata);
 #endif
 
-	parse_args(argc, argv);
+	parse_args(argc, argv, &options);
 
-        if (list)
+        if (options.list)
 	{
                 ret = starpu_perfmodel_list(stdout);
                 if (ret)
@@ -668,35 +450,47 @@ int main(int argc, char **argv)
         }
 
 	/* Load the performance model associated to the symbol */
-	ret = starpu_perfmodel_load_symbol(symbol, &model);
+	ret = starpu_perfmodel_load_symbol(options.symbol, &model);
 	if (ret == 1)
 	{
-		fprintf(stderr, "The performance model for the symbol <%s> could not be loaded\n", symbol);
+		fprintf(stderr, "The performance model for the symbol <%s> could not be loaded\n", options.symbol);
 		return 1;
 	}
 
+        if (options.list_combs)
+	{
+		ret = starpu_perfmodel_list_combs(stdout, &model);
+                if (ret)
+		{
+                        fprintf(stderr, "Error when listing combinations for model <%s>\n", options.symbol);
+                        return 1;
+                }
+		return 0;
+
+	}
+
 	/* If some FxT input was specified, we put the points on the graph */
 #ifdef STARPU_USE_FXT
-	if (!no_fxt_file)
+	if (options.with_fxt_file)
 	{
-		starpu_fxt_generate_trace(&options);
+		starpu_fxt_generate_trace(&options.fxt_options);
 
-		snprintf(data_file_name, 256, "starpu_%s.data", symbol);
+		snprintf(options.data_file_name, 256, "starpu_%s.data", options.symbol);
 
-		FILE *data_file = fopen(data_file_name, "w+");
+		FILE *data_file = fopen(options.data_file_name, "w+");
 		STARPU_ASSERT(data_file);
-		dump_data_file(data_file, &model);
+		dump_data_file(data_file, &options);
 		fclose(data_file);
 	}
 #endif
 
-	snprintf(gnuplot_file_name, 256, "starpu_%s.gp", symbol);
-
-	snprintf(avg_file_name, 256, "starpu_%s_avg.data", symbol);
+	snprintf(gnuplot_file_name, 256, "starpu_%s.gp", options.symbol);
+	snprintf(options.avg_file_name, 256, "starpu_%s_avg.data", options.symbol);
 
 	FILE *gnuplot_file = fopen(gnuplot_file_name, "w+");
 	STARPU_ASSERT(gnuplot_file);
-	display_selected_models(gnuplot_file, &model);
+	display_selected_models(gnuplot_file, &model, &options);
+	fprintf(gnuplot_file,"\n");
 	fclose(gnuplot_file);
 
 	/* Retrieve the current mode of the gnuplot executable */