Преглед изворни кода

merge branches/mic: update of perfmodel storage + update _STARPU_PERFMODEL_VERSION

Nathalie Furmento пре 11 година
родитељ
комит
497ce06798
55 измењених фајлова са 1292 додато и 841 уклоњено
  1. 11 3
      examples/cholesky/cholesky.h
  2. 14 0
      examples/cholesky/cholesky_grain_tag.c
  3. 13 0
      examples/cholesky/cholesky_implicit.c
  4. 18 37
      examples/cholesky/cholesky_models.c
  5. 15 0
      examples/cholesky/cholesky_tag.c
  6. 14 0
      examples/cholesky/cholesky_tile_tag.c
  7. 26 0
      examples/heat/dw_factolu.c
  8. 0 5
      examples/heat/dw_factolu.h
  9. 5 0
      examples/heat/dw_factolu_grain.c
  10. 5 0
      examples/heat/dw_factolu_tag.c
  11. 21 78
      examples/heat/lu_kernels_model.c
  12. 20 0
      examples/heat/lu_kernels_model.h
  13. 2 2
      include/starpu_fxt.h
  14. 21 37
      include/starpu_perfmodel.h
  15. 7 7
      include/starpu_scheduler.h
  16. 2 2
      include/starpu_worker.h
  17. 11 3
      sc_hypervisor/examples/cholesky/cholesky.h
  18. 14 0
      sc_hypervisor/examples/cholesky/cholesky_grain_tag.c
  19. 15 0
      sc_hypervisor/examples/cholesky/cholesky_implicit.c
  20. 16 37
      sc_hypervisor/examples/cholesky/cholesky_models.c
  21. 14 0
      sc_hypervisor/examples/cholesky/cholesky_tag.c
  22. 14 0
      sc_hypervisor/examples/cholesky/cholesky_tile_tag.c
  23. 1 1
      sc_hypervisor/src/policies_utils/policy_tools.c
  24. 5 12
      src/core/combined_workers.c
  25. 12 12
      src/core/detect_combined_workers.c
  26. 1 1
      src/core/jobs.h
  27. 45 29
      src/core/perfmodel/perfmodel.c
  28. 6 6
      src/core/perfmodel/perfmodel.h
  29. 465 366
      src/core/perfmodel/perfmodel_history.c
  30. 40 20
      src/core/perfmodel/perfmodel_print.c
  31. 1 1
      src/core/simgrid.c
  32. 1 1
      src/core/simgrid.h
  33. 5 1
      src/core/task.c
  34. 16 11
      src/core/topology.c
  35. 2 2
      src/core/workers.h
  36. 8 4
      src/datawizard/footprint.c
  37. 1 1
      src/datawizard/footprint.h
  38. 19 9
      src/debug/traces/starpu_fxt.c
  39. 4 4
      src/drivers/cpu/driver_cpu.c
  40. 3 3
      src/drivers/cuda/driver_cuda.c
  41. 3 3
      src/drivers/driver_common/driver_common.c
  42. 2 2
      src/drivers/driver_common/driver_common.h
  43. 0 5
      src/drivers/mp_common/sink_common.c
  44. 2 2
      src/drivers/mp_common/source_common.c
  45. 4 3
      src/drivers/opencl/driver_opencl.c
  46. 41 15
      src/profiling/bound.c
  47. 4 4
      src/sched_policies/deque_modeling_policy_data_aware.c
  48. 2 2
      src/sched_policies/parallel_heft.c
  49. 1 1
      src/sched_policies/random_policy.c
  50. 1 1
      src/starpu_parameters.h
  51. 13 4
      tests/perfmodels/feed.c
  52. 13 5
      tests/perfmodels/valid_model.c
  53. 42 13
      tests/sched_policies/simple_cpu_gpu_sched.c
  54. 1 1
      tools/Makefile.am
  55. 250 85
      tools/starpu_perfmodel_plot.c

+ 11 - 3
examples/cholesky/cholesky.h

@@ -132,15 +132,23 @@ void chol_cpu_codelet_update_u11(void **, void *);
 void chol_cpu_codelet_update_u21(void **, void *);
 void chol_cpu_codelet_update_u22(void **, void *);
 
+double cpu_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double cpu_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double cpu_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+
 #ifdef STARPU_USE_CUDA
 void chol_cublas_codelet_update_u11(void *descr[], void *_args);
 void chol_cublas_codelet_update_u21(void *descr[], void *_args);
 void chol_cublas_codelet_update_u22(void *descr[], void *_args);
+
+double cuda_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double cuda_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double cuda_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 #endif
 
-extern struct starpu_perfmodel chol_model_11;
-extern struct starpu_perfmodel chol_model_21;
-extern struct starpu_perfmodel chol_model_22;
+void initialize_chol_model(struct starpu_perfmodel* model, char* symbol, 
+		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned), 
+		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned));
 
 static void STARPU_ATTRIBUTE_UNUSED parse_args(int argc, char **argv)
 {

+ 14 - 0
examples/cholesky/cholesky_grain_tag.c

@@ -18,6 +18,10 @@
 
 #include "cholesky.h"
 
+struct starpu_perfmodel chol_model_11;
+struct starpu_perfmodel chol_model_21;
+struct starpu_perfmodel chol_model_22;
+
 /*
  *	Some useful functions
  */
@@ -292,6 +296,16 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 		exit(77);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+#ifdef STARPU_USE_CUDA
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
+#else
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
+#endif
+
 	starpu_cublas_init();
 
 #ifndef STARPU_SIMGRID

+ 13 - 0
examples/cholesky/cholesky_implicit.c

@@ -21,6 +21,9 @@
 /*
  *	Create the codelets
  */
+struct starpu_perfmodel chol_model_11;
+struct starpu_perfmodel chol_model_21;
+struct starpu_perfmodel chol_model_22;
 
 static struct starpu_codelet cl11 =
 {
@@ -346,6 +349,16 @@ int main(int argc, char **argv)
                 return 77;
         STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+#ifdef STARPU_USE_CUDA
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
+#else
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
+#endif
+
 	starpu_cublas_init();
 
 	if(with_ctxs)

+ 18 - 37
examples/cholesky/cholesky_models.c

@@ -26,6 +26,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_perfmodel.h>
 #include "cholesky.h"
 
 /* #define USE_PERTURBATION	1 */
@@ -36,7 +37,7 @@
 #define PERTURBATE(a)	(a)
 #endif
 
-static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double cpu_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -51,7 +52,7 @@ static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perfmo
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double cuda_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -66,7 +67,7 @@ static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perfm
 	return PERTURBATE(cost);
 }
 
-static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double cpu_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -81,7 +82,7 @@ static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perfmo
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double cuda_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -96,7 +97,7 @@ static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perfm
 	return PERTURBATE(cost);
 }
 
-static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double cpu_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -111,7 +112,7 @@ static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perfmo
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_22_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double cuda_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -126,35 +127,15 @@ static double cuda_chol_task_22_cost(struct starpu_task *task, enum starpu_perfm
 	return PERTURBATE(cost);
 }
 
-struct starpu_perfmodel chol_model_11 =
+void initialize_chol_model(struct starpu_perfmodel* model, char * symbol, 
+		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned), 
+		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
 {
-	.per_arch =
-	{
-		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_11_cost },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_11_cost }
-	},
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "chol_model_11"
-};
-
-struct starpu_perfmodel chol_model_21 =
-{
-	.per_arch =
-	{
-		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_21_cost },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_21_cost }
-	},
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "chol_model_21"
-};
-
-struct starpu_perfmodel chol_model_22 =
-{
-	.per_arch =
-	{
-		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_22_cost },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_22_cost }
-	},
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "chol_model_22"
-};
+	model->symbol = symbol;
+	model->type = STARPU_HISTORY_BASED;
+	starpu_initialize_model(model);
+	model->per_arch[STARPU_CPU_WORKER][0][0][0].cost_function = cpu_cost_function;
+	if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) != 0)
+		model->per_arch[STARPU_CUDA_WORKER][0][0][0].cost_function = cuda_cost_function;
+}
+

+ 15 - 0
examples/cholesky/cholesky_tag.c

@@ -17,6 +17,11 @@
  */
 
 #include "cholesky.h"
+#include <starpu_perfmodel.h>
+
+struct starpu_perfmodel chol_model_11;
+struct starpu_perfmodel chol_model_21;
+struct starpu_perfmodel chol_model_22;
 
 /*
  *	Some useful functions
@@ -258,6 +263,16 @@ static int initialize_system(float **A, unsigned dim, unsigned pinned)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+#ifdef STARPU_USE_CUDA
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
+#else
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
+#endif
+
 	starpu_cublas_init();
 
 #ifndef STARPU_SIMGRID

+ 14 - 0
examples/cholesky/cholesky_tile_tag.c

@@ -17,6 +17,10 @@
 
 #include "cholesky.h"
 
+struct starpu_perfmodel chol_model_11;
+struct starpu_perfmodel chol_model_21;
+struct starpu_perfmodel chol_model_22;
+
 /* A [ y ] [ x ] */
 float *A[NMAXBLOCKS][NMAXBLOCKS];
 starpu_data_handle_t A_state[NMAXBLOCKS][NMAXBLOCKS];
@@ -254,6 +258,16 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+#ifdef STARPU_USE_CUDA
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
+#else
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
+#endif
+
 	/* Disable sequential consistency */
 	starpu_data_set_default_sequential_consistency_flag(0);
 

+ 26 - 0
examples/heat/dw_factolu.c

@@ -25,6 +25,11 @@
 #define debug(fmt, ...)
 #endif
 
+struct starpu_perfmodel model_11;
+struct starpu_perfmodel model_12;
+struct starpu_perfmodel model_21;
+struct starpu_perfmodel model_22;
+
 unsigned *advance_11; /* size nblocks, whether the 11 task is done */
 unsigned *advance_12_21; /* size nblocks*nblocks */
 unsigned *advance_22; /* array of nblocks *nblocks*nblocks */
@@ -701,6 +706,27 @@ void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
 		exit(77);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+#ifdef STARPU_ATLAS
+	char * symbol_11 = "lu_model_11_atlas";
+	char * symbol_12 = "lu_model_12_atlas";
+	char * symbol_21 = "lu_model_21_atlas";
+	char * symbol_22 = "lu_model_22_atlas";
+#elif defined(STARPU_GOTO)
+	char * symbol_11 = "lu_model_11_goto";
+	char * symbol_12 = "lu_model_12_goto";
+	char * symbol_21 = "lu_model_21_goto";
+	char * symbol_22 = "lu_model_22_goto";
+#else
+	char * symbol_11 = "lu_model_11";
+	char * symbol_12 = "lu_model_12";
+	char * symbol_21 = "lu_model_21";
+	char * symbol_22 = "lu_model_22";
+#endif
+	initialize_lu_kernels_model(&model_11,symbol_11,task_11_cost,task_11_cost_cpu,task_11_cost_cuda);
+	initialize_lu_kernels_model(&model_12,symbol_12,task_12_cost,task_12_cost_cpu,task_12_cost_cuda);
+	initialize_lu_kernels_model(&model_21,symbol_21,task_21_cost,task_21_cost_cpu,task_21_cost_cuda);
+	initialize_lu_kernels_model(&model_22,symbol_22,task_22_cost,task_22_cost_cpu,task_22_cost_cuda);
+
 	starpu_cublas_init();
 
 	if (pinned)

+ 0 - 5
examples/heat/dw_factolu.h

@@ -216,9 +216,4 @@ void dw_callback_v2_codelet_update_u12(void *);
 void dw_callback_v2_codelet_update_u21(void *);
 void dw_callback_v2_codelet_update_u22(void *);
 
-extern struct starpu_perfmodel model_11;
-extern struct starpu_perfmodel model_12;
-extern struct starpu_perfmodel model_21;
-extern struct starpu_perfmodel model_22;
-
 #endif /* __DW_FACTO_LU_H__ */

+ 5 - 0
examples/heat/dw_factolu_grain.c

@@ -27,6 +27,11 @@
 					| ((unsigned long long)(i)<<16)	\
 					| (unsigned long long)(j))))
 
+struct starpu_perfmodel model_11;
+struct starpu_perfmodel model_12;
+struct starpu_perfmodel model_21;
+struct starpu_perfmodel model_22;
+
 /*
  *	Construct the DAG
  */

+ 5 - 0
examples/heat/dw_factolu_tag.c

@@ -27,6 +27,11 @@
 					| ((unsigned long long)(i)<<16)	\
 					| (unsigned long long)(j))))
 
+struct starpu_perfmodel model_11;
+struct starpu_perfmodel model_12;
+struct starpu_perfmodel model_21;
+struct starpu_perfmodel model_22;
+
 static unsigned no_prio = 0;
 
 /*

+ 21 - 78
examples/heat/lu_kernels_model.c

@@ -102,7 +102,7 @@ double task_22_cost(struct starpu_task *task, unsigned nimpl)
  */
 
 
-double task_11_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double task_11_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -114,7 +114,7 @@ double task_11_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtyp
 	return PERTURBATE(cost);
 }
 
-double task_12_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double task_12_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -127,7 +127,7 @@ double task_12_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtyp
 }
 
 
-double task_21_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double task_21_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -141,7 +141,7 @@ double task_21_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtyp
 
 
 
-double task_22_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double task_22_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t nx, ny, nz;
 
@@ -161,7 +161,7 @@ double task_22_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtyp
  *
  */
 
-double task_11_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double task_11_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -173,7 +173,7 @@ double task_11_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype
 	return PERTURBATE(cost);
 }
 
-double task_12_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double task_12_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -186,7 +186,7 @@ double task_12_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype
 }
 
 
-double task_21_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double task_21_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -200,7 +200,7 @@ double task_21_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype
 
 
 
-double task_22_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double task_22_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t nx, ny, nz;
 
@@ -214,74 +214,17 @@ double task_22_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype
 	return PERTURBATE(cost);
 }
 
-struct starpu_perfmodel model_11 =
-{
-	.cost_function = task_11_cost,
-	.per_arch =
-	{
-		[STARPU_CPU_DEFAULT][0] = { .cost_function = task_11_cost_cpu },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = task_11_cost_cuda }
-	},
-	.type = STARPU_HISTORY_BASED,
-#ifdef STARPU_ATLAS
-	.symbol = "lu_model_11_atlas"
-#elif defined(STARPU_GOTO)
-	.symbol = "lu_model_11_goto"
-#else
-	.symbol = "lu_model_11"
-#endif
-};
-
-struct starpu_perfmodel model_12 =
-{
-	.cost_function = task_12_cost,
-	.per_arch =
-	{
-		[STARPU_CPU_DEFAULT][0] = { .cost_function = task_12_cost_cpu },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = task_12_cost_cuda }
-	},
-	.type = STARPU_HISTORY_BASED,
-#ifdef STARPU_ATLAS
-	.symbol = "lu_model_12_atlas"
-#elif defined(STARPU_GOTO)
-	.symbol = "lu_model_12_goto"
-#else
-	.symbol = "lu_model_12"
-#endif
-};
-
-struct starpu_perfmodel model_21 =
-{
-	.cost_function = task_21_cost,
-	.per_arch =
-	{
-		[STARPU_CPU_DEFAULT][0] = { .cost_function = task_21_cost_cpu },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = task_21_cost_cuda }
-	},
-	.type = STARPU_HISTORY_BASED,
-#ifdef STARPU_ATLAS
-	.symbol = "lu_model_21_atlas"
-#elif defined(STARPU_GOTO)
-	.symbol = "lu_model_21_goto"
-#else
-	.symbol = "lu_model_21"
-#endif
-};
+void initialize_lu_kernels_model(struct starpu_perfmodel* model, char * symbol, 
+		double (*cost_function)(struct starpu_task *, unsigned), 
+		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned), 
+		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
+{
+	model->type = STARPU_HISTORY_BASED;
+	model->symbol = symbol;
+	starpu_initialize_model(model);
+	model->cost_function = cost_function;
+	model->per_arch[STARPU_CPU_WORKER][0][0][0].cost_function = cpu_cost_function;
+	if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) != 0)
+		model->per_arch[STARPU_CUDA_WORKER][0][0][0].cost_function = cuda_cost_function;
+}
 
-struct starpu_perfmodel model_22 =
-{
-	.cost_function = task_22_cost,
-	.per_arch =
-	{
-		[STARPU_CPU_DEFAULT][0] = { .cost_function = task_22_cost_cpu },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = task_22_cost_cuda }
-	},
-	.type = STARPU_HISTORY_BASED,
-#ifdef STARPU_ATLAS
-	.symbol = "lu_model_22_atlas"
-#elif defined(STARPU_GOTO)
-	.symbol = "lu_model_22_goto"
-#else
-	.symbol = "lu_model_22"
-#endif
-};

+ 20 - 0
examples/heat/lu_kernels_model.h

@@ -20,4 +20,24 @@
 
 #include <starpu.h>
 
+double task_11_cost(struct starpu_task *task, unsigned nimpl);
+double task_12_cost(struct starpu_task *task, unsigned nimpl);
+double task_21_cost(struct starpu_task *task, unsigned nimpl);
+double task_22_cost(struct starpu_task *task, unsigned nimpl);
+
+double task_11_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double task_12_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double task_21_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double task_22_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+
+double task_11_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double task_12_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double task_21_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double task_22_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+
+void initialize_lu_kernels_model(struct starpu_perfmodel* model, char * symbol, 
+		double (*cost_function)(struct starpu_task *, unsigned), 
+		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned), 
+		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned));
+
 #endif /* __LU_KERNELS_MODEL_H__ */

+ 2 - 2
include/starpu_fxt.h

@@ -31,7 +31,7 @@ struct starpu_fxt_codelet_event
 {
 	char symbol[256];
 	int workerid;
-	enum starpu_perfmodel_archtype archtype;
+	struct starpu_perfmodel_arch arch;
 	uint32_t hash;
 	size_t size;
 	float time;
@@ -54,7 +54,7 @@ struct starpu_fxt_options
 	int file_rank;
 
 	char worker_names[STARPU_NMAXWORKERS][256];
-	enum starpu_perfmodel_archtype worker_archtypes[STARPU_NMAXWORKERS];
+	struct starpu_perfmodel_arch worker_archtypes[STARPU_NMAXWORKERS];
 	int nworkers;
 
 	struct starpu_fxt_codelet_event **dumped_codelets;

+ 21 - 37
include/starpu_perfmodel.h

@@ -23,6 +23,7 @@
 #include <stdio.h>
 
 #include <starpu_util.h>
+#include <starpu_worker.h>
 
 #ifdef __cplusplus
 extern "C"
@@ -32,36 +33,15 @@ extern "C"
 struct starpu_task;
 struct starpu_data_descr;
 
-enum starpu_perfmodel_archtype
+#define STARPU_NARCH STARPU_ANY_WORKER
+
+struct starpu_perfmodel_arch
 {
-	STARPU_CPU_DEFAULT = 0,
-	STARPU_CUDA_DEFAULT = STARPU_MAXCPUS,
-	STARPU_OPENCL_DEFAULT = STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS,
-	STARPU_MIC_DEFAULT = STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS,
-	STARPU_SCC_DEFAULT = STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS //* STARPU_MAXMICCPUS
+	enum starpu_worker_archtype type;
+	int devid;
+	int ncore;
 };
 
-#ifdef __STDC_VERSION__
-#  if __STDC_VERSION__ > 199901L || STARPU_GNUC_PREREQ(4, 6)
-
-/* Make sure the following assertions hold, since StarPU relies on it.  */
-
-_Static_assert(STARPU_CPU_DEFAULT == 0,
-	       "invalid STARPU_CPU_DEFAULT value");
-_Static_assert(STARPU_CPU_DEFAULT < STARPU_CUDA_DEFAULT,
-	       "invalid STARPU_{CPU,CUDA}_DEFAULT values");
-_Static_assert(STARPU_CUDA_DEFAULT < STARPU_OPENCL_DEFAULT,
-	       "invalid STARPU_{CUDA,OPENCL}_DEFAULT values");
-_Static_assert(STARPU_OPENCL_DEFAULT < STARPU_MIC_DEFAULT,
-	       "invalid STARPU_{OPENCL,MIC}_DEFAULT values");
-_Static_assert(STARPU_MIC_DEFAULT < STARPU_SCC_DEFAULT,
-	       "invalid STARPU_{MIC,SCC}_DEFAULT values");
-
-#  endif
-#endif
-
-#define STARPU_NARCH_VARIATIONS	(STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS)
-
 struct starpu_perfmodel_history_entry
 {
 	double mean;
@@ -113,8 +93,8 @@ struct starpu_perfmodel_history_table;
 struct starpu_perfmodel_per_arch
 {
 	double (*cost_model)(struct starpu_data_descr *t) STARPU_DEPRECATED;
-	double (*cost_function)(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
-	size_t (*size_base)(struct starpu_task *, enum starpu_perfmodel_archtype arch, unsigned nimpl);
+	double (*cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+	size_t (*size_base)(struct starpu_task *, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 
 	struct starpu_perfmodel_history_table *history;
 	struct starpu_perfmodel_history_list *list;
@@ -142,32 +122,36 @@ struct starpu_perfmodel
 
 	size_t (*size_base)(struct starpu_task *, unsigned nimpl);
 
-	struct starpu_perfmodel_per_arch per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS];
+	struct starpu_perfmodel_per_arch**** per_arch; /*STARPU_MAXIMPLEMENTATIONS*/
 
 	const char *symbol;
 
+	unsigned is_init;
 	unsigned is_loaded;
 	unsigned benchmarking;
 	starpu_pthread_rwlock_t model_rwlock;
 };
 
-enum starpu_perfmodel_archtype starpu_worker_get_perf_archtype(int workerid);
+void starpu_initialize_model(struct starpu_perfmodel *model);
+void starpu_initialize_model_with_file(FILE*f, struct starpu_perfmodel *model);
+
+struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid);
 
 int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model);
 int starpu_perfmodel_unload_model(struct starpu_perfmodel *model);
 
-void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, char *path, size_t maxlen, unsigned nimpl);
-void starpu_perfmodel_get_arch_name(enum starpu_perfmodel_archtype arch, char *archname, size_t maxlen, unsigned nimpl);
+void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, char *path, size_t maxlen, unsigned nimpl);
+char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype);
+void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch* arch, char *archname, size_t maxlen, unsigned nimpl);
 
-double starpu_permodel_history_based_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, uint32_t footprint);
+double starpu_permodel_history_based_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, uint32_t footprint);
 int starpu_perfmodel_list(FILE *output);
-void starpu_perfmodel_print(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output);
+void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output);
 int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char *parameter, uint32_t *footprint, FILE *output);
 
+void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch * arch, unsigned cpuid, unsigned nimpl, double measured);
 void starpu_perfmodel_directory(FILE *output);
 
-void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned cpuid, unsigned nimpl, double measured);
-
 void starpu_bus_print_bandwidth(FILE *f);
 void starpu_bus_print_affinity(FILE *f);
 

+ 7 - 7
include/starpu_scheduler.h

@@ -68,17 +68,17 @@ int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_tas
 int starpu_get_prefetch_flag(void);
 int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node);
 
-uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
-double starpu_task_expected_length(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
-double starpu_worker_get_relative_speedup(enum starpu_perfmodel_archtype perf_archtype);
+uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch* perf_arch);
 double starpu_task_expected_data_transfer_time(unsigned memory_node, struct starpu_task *task);
 double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned memory_node, enum starpu_data_access_mode mode);
-double starpu_task_expected_power(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
-double starpu_task_expected_conversion_time(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
+double starpu_task_expected_power(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double starpu_task_expected_conversion_time(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 
-double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl);
+double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundle, unsigned memory_node);
-double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl);
+double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 
 void starpu_sched_ctx_worker_shares_tasks_lists(int workerid, int sched_ctx_id);
 #ifdef __cplusplus

+ 2 - 2
include/starpu_worker.h

@@ -28,12 +28,12 @@ extern "C"
 
 enum starpu_worker_archtype
 {
-	STARPU_ANY_WORKER,
 	STARPU_CPU_WORKER,
 	STARPU_CUDA_WORKER,
 	STARPU_OPENCL_WORKER,
 	STARPU_MIC_WORKER,
-	STARPU_SCC_WORKER
+	STARPU_SCC_WORKER,
+	STARPU_ANY_WORKER
 };
 
 struct starpu_sched_ctx_iterator

+ 11 - 3
sc_hypervisor/examples/cholesky/cholesky.h

@@ -73,15 +73,23 @@ void chol_cpu_codelet_update_u11(void **, void *);
 void chol_cpu_codelet_update_u21(void **, void *);
 void chol_cpu_codelet_update_u22(void **, void *);
 
+double cpu_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double cpu_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double cpu_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+
 #ifdef STARPU_USE_CUDA
 void chol_cublas_codelet_update_u11(void *descr[], void *_args);
 void chol_cublas_codelet_update_u21(void *descr[], void *_args);
 void chol_cublas_codelet_update_u22(void *descr[], void *_args);
+
+double cuda_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double cuda_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double cuda_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 #endif
 
-extern struct starpu_perfmodel chol_model_11;
-extern struct starpu_perfmodel chol_model_21;
-extern struct starpu_perfmodel chol_model_22;
+void initialize_chol_model(struct starpu_perfmodel* model, char* symbol, 
+		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned), 
+		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned));
 
 static void STARPU_ATTRIBUTE_UNUSED parse_args(int argc, char **argv)
 {

+ 14 - 0
sc_hypervisor/examples/cholesky/cholesky_grain_tag.c

@@ -18,6 +18,10 @@
 
 #include "cholesky.h"
 
+struct starpu_perfmodel chol_model_11;
+struct starpu_perfmodel chol_model_21;
+struct starpu_perfmodel chol_model_22;
+
 /*
  *	Some useful functions
  */
@@ -276,6 +280,16 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 		exit(77);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+#ifdef STARPU_USE_CUDA
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
+#else
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
+#endif
+
 	starpu_cublas_init();
 
 	if (pinned)

+ 15 - 0
sc_hypervisor/examples/cholesky/cholesky_implicit.c

@@ -19,6 +19,11 @@
 
 #include "cholesky.h"
 #include "../sched_ctx_utils/sched_ctx_utils.h"
+
+struct starpu_perfmodel chol_model_11;
+struct starpu_perfmodel chol_model_21;
+struct starpu_perfmodel chol_model_22;
+
 /*
  *	Create the codelets
  */
@@ -342,6 +347,16 @@ int main(int argc, char **argv)
 
 	starpu_init(NULL);
 
+#ifdef STARPU_USE_CUDA
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
+#else
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
+#endif
+
 	starpu_cublas_init();
 
 	if(with_ctxs)

+ 16 - 37
sc_hypervisor/examples/cholesky/cholesky_models.c

@@ -36,7 +36,7 @@
 #define PERTURBATE(a)	(a)
 #endif
 
-static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double cpu_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -51,7 +51,7 @@ static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perfmo
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double cuda_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -66,7 +66,7 @@ static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perfm
 	return PERTURBATE(cost);
 }
 
-static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double cpu_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -81,7 +81,7 @@ static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perfmo
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double cuda_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -96,7 +96,7 @@ static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perfm
 	return PERTURBATE(cost);
 }
 
-static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double cpu_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -111,7 +111,7 @@ static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perfmo
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_22_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double cuda_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -126,35 +126,14 @@ static double cuda_chol_task_22_cost(struct starpu_task *task, enum starpu_perfm
 	return PERTURBATE(cost);
 }
 
-struct starpu_perfmodel chol_model_11 =
+void initialize_chol_model(struct starpu_perfmodel* model, char * symbol, 
+		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned), 
+		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
 {
-	.per_arch =
-	{
-		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_11_cost },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_11_cost }
-	},
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "chol_model_11"
-};
-
-struct starpu_perfmodel chol_model_21 =
-{
-	.per_arch =
-	{
-		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_21_cost },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_21_cost }
-	},
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "chol_model_21"
-};
-
-struct starpu_perfmodel chol_model_22 =
-{
-	.per_arch =
-	{
-		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_22_cost },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_22_cost }
-	},
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "chol_model_22"
-};
+	starpu_initialize_model(model);
+	model->type = STARPU_HISTORY_BASED;
+	model->symbol = symbol;
+	model->per_arch[STARPU_CPU_WORKER][0][0][0].cost_function = cpu_cost_function;
+	model->per_arch[STARPU_CUDA_WORKER][0][0][0].cost_function = cuda_cost_function;
+}
+

+ 14 - 0
sc_hypervisor/examples/cholesky/cholesky_tag.c

@@ -18,6 +18,10 @@
 
 #include "cholesky.h"
 
+struct starpu_perfmodel chol_model_11;
+struct starpu_perfmodel chol_model_21;
+struct starpu_perfmodel chol_model_22;
+
 /*
  *	Some useful functions
  */
@@ -249,6 +253,16 @@ static int initialize_system(float **A, unsigned dim, unsigned pinned)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+#ifdef STARPU_USE_CUDA
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
+#else
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
+#endif
+
 	starpu_cublas_init();
 
 	if (pinned)

+ 14 - 0
sc_hypervisor/examples/cholesky/cholesky_tile_tag.c

@@ -17,6 +17,10 @@
 
 #include "cholesky.h"
 
+struct starpu_perfmodel chol_model_11;
+struct starpu_perfmodel chol_model_21;
+struct starpu_perfmodel chol_model_22;
+
 /* A [ y ] [ x ] */
 float *A[NMAXBLOCKS][NMAXBLOCKS];
 starpu_data_handle_t A_state[NMAXBLOCKS][NMAXBLOCKS];
@@ -239,6 +243,16 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+#ifdef STARPU_USE_CUDA
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
+#else
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
+#endif
+
 	/* Disable sequential consistency */
 	starpu_data_set_default_sequential_consistency_flag(0);
 

+ 1 - 1
sc_hypervisor/src/policies_utils/policy_tools.c

@@ -416,7 +416,7 @@ void sc_hypervisor_get_tasks_times(int nw, int nt, double times[nw][nt], int *wo
                 for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
                 {
 			int worker = workers == NULL ? w : workers[w];
-                        enum starpu_perfmodel_archtype arch = starpu_worker_get_perf_archtype(worker);
+                        struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(worker);
                         double length = starpu_permodel_history_based_expected_perf(tp->cl->model, arch, tp->footprint);
 
                         if (isnan(length))

+ 5 - 12
src/core/combined_workers.c

@@ -101,18 +101,11 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 
 	combined_worker->worker_size = nworkers;
 
-#ifdef STARPU_USE_MIC
-	if(config->workers[workerid_array[0]].worker_mask == STARPU_MIC)
-	{
-		combined_worker->perf_arch = (enum starpu_perfmodel_archtype) (STARPU_MIC_DEFAULT + config->workers[workerid_array[0]].mp_nodeid /* *STARPU_MAXMICCPUS + nworkers - 1*/);
-		combined_worker->worker_mask = STARPU_MIC;
-	}
-#endif
-	if(config->workers[workerid_array[0]].worker_mask == STARPU_CPU)
-	{
-		combined_worker->perf_arch = (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + nworkers - 1);
-		combined_worker->worker_mask = STARPU_CPU;
-	}
+	combined_worker->perf_arch.type = config->workers[workerid_array[0]].perf_arch.type;
+	combined_worker->perf_arch.devid = config->workers[workerid_array[0]].perf_arch.devid; 
+	combined_worker->perf_arch.ncore = nworkers - 1;
+	combined_worker->worker_mask = config->workers[workerid_array[0]].worker_mask;
+	
 #ifdef STARPU_USE_MP
 	combined_worker->count = nworkers -1;
 	pthread_mutex_init(&combined_worker->count_mutex,NULL);

+ 12 - 12
src/core/detect_combined_workers.c

@@ -41,7 +41,7 @@ static void find_workers(hwloc_obj_t obj, int cpu_workers[STARPU_NMAXWORKERS], u
 	/* Got to a PU leaf */
 	struct _starpu_worker *worker = obj->userdata;
 	/* is it a CPU worker? */
-	if (worker->perf_arch == STARPU_CPU_DEFAULT)
+	if (worker->perf_arch.type == STARPU_CPU_WORKER && worker->perf_arch.ncore == 0)
 	{
 		_STARPU_DEBUG("worker %d is part of it\n", worker->workerid);
 		/* Add it to the combined worker */
@@ -174,7 +174,7 @@ static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers
 	for (i = 0; i < nworkers; i++)
 	{
 		struct _starpu_worker *worker = _starpu_get_worker_struct(workerids[i]);
-		if (worker->perf_arch == STARPU_CPU_DEFAULT)
+		if (worker->perf_arch.type == STARPU_CPU_WORKER && worker->perf_arch.ncore == 0)
 		{
 			hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->cpu_depth, worker->bindid);
 			STARPU_ASSERT(obj->userdata == worker);
@@ -248,11 +248,11 @@ static void find_and_assign_combinations_without_hwloc(int *workerids, int nwork
 	mic_id = malloc(sizeof(int)*nb_mics);
 	nmics_table = malloc(sizeof(unsigned)*nb_mics);
 	mic_workers = malloc(sizeof(int*)*nb_mics);
-	for(i=0; i<nb_mics; i++)
+	for(j=0; j<nb_mics; j++)
 	{
-		mic_id[i] = -1;
-		nmics_table[i] = 0;
-		mic_workers[i] = malloc(sizeof(int)*STARPU_NMAXWORKERS);
+		mic_id[j] = -1;
+		nmics_table[j] = 0;
+		mic_workers[j] = malloc(sizeof(int)*STARPU_NMAXWORKERS);
 	}
 #endif /* STARPU_USE_MIC */
 
@@ -292,13 +292,13 @@ static void find_and_assign_combinations_without_hwloc(int *workerids, int nwork
 	mic_min = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
 	if (mic_min < 2)
 		mic_min = 2;
-	for(i=0; i<nb_mics; i++)
+	for(j=0; j<nb_mics; j++)
 	{
 		mic_max = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
-		if (mic_max == -1 || mic_max > (int) nmics_table[i])
-			mic_max = nmics_table[i];
-		assign_combinations_without_hwloc(workers,mic_workers[i],nmics_table[i],mic_min,mic_max);
-		free(mic_workers[i]);
+		if (mic_max == -1 || mic_max > (int) nmics_table[j])
+			mic_max = nmics_table[j];
+		assign_combinations_without_hwloc(workers,mic_workers[j],nmics_table[j],mic_min,mic_max);
+		free(mic_workers[j]);
 	}
 	free(mic_id);
 	free(nmics_table);
@@ -325,7 +325,7 @@ static void combine_all_cpu_workers(int *workerids, int nworkers)
 	{
 		worker = _starpu_get_worker_struct(workerids[i]);
 
-		if (worker->perf_arch == STARPU_CPU_DEFAULT)
+		if (worker->arch == STARPU_CPU_WORKER)
 			cpu_workers[ncpus++] = workerids[i];
 	}
 

+ 1 - 1
src/core/jobs.h

@@ -166,7 +166,7 @@ unsigned _starpu_enforce_deps_starting_from_task(struct _starpu_job *j);
 void _starpu_handle_job_termination(struct _starpu_job *j);
 
 /* Get the sum of the size of the data accessed by the job. */
-size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, struct _starpu_job *j);
+size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, struct _starpu_job *j);
 
 /* Get a task from the local pool of tasks that were explicitly attributed to
  * that worker. */

+ 45 - 29
src/core/perfmodel/perfmodel.c

@@ -48,7 +48,7 @@ unsigned _starpu_get_calibrate_flag(void)
 	return calibrate_flag;
 }
 
-enum starpu_perfmodel_archtype starpu_worker_get_perf_archtype(int workerid)
+struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid)
 {
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 
@@ -56,26 +56,26 @@ enum starpu_perfmodel_archtype starpu_worker_get_perf_archtype(int workerid)
 	unsigned nworkers = config->topology.nworkers;
 
 	if (workerid < (int)config->topology.nworkers)
-		return config->workers[workerid].perf_arch;
+		return &config->workers[workerid].perf_arch;
 
 	/* We have a combined worker */
 	unsigned ncombinedworkers = config->topology.ncombinedworkers;
 	STARPU_ASSERT(workerid < (int)(ncombinedworkers + nworkers));
-	return config->combined_workers[workerid - nworkers].perf_arch;
+	return &config->combined_workers[workerid - nworkers].perf_arch;
 }
 
 /*
  * PER ARCH model
  */
 
-static double per_arch_task_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, struct starpu_task *task, unsigned nimpl)
+static double per_arch_task_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, struct starpu_task *task, unsigned nimpl)
 {
 	double exp = NAN;
-	double (*per_arch_cost_function)(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
+	double (*per_arch_cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 	double (*per_arch_cost_model)(struct starpu_data_descr *);
 
-	per_arch_cost_function = model->per_arch[arch][nimpl].cost_function;
-	per_arch_cost_model = model->per_arch[arch][nimpl].cost_model;
+	per_arch_cost_function = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].cost_function;
+	per_arch_cost_model = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].cost_model;
 
 	if (per_arch_cost_function)
 		exp = per_arch_cost_function(task, arch, nimpl);
@@ -89,28 +89,31 @@ static double per_arch_task_expected_perf(struct starpu_perfmodel *model, enum s
  * Common model
  */
 
-double starpu_worker_get_relative_speedup(enum starpu_perfmodel_archtype perf_archtype)
+double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch* perf_arch)
 {
-	if (perf_archtype < STARPU_CUDA_DEFAULT)
+	if (perf_arch->type == STARPU_CPU_WORKER)
 	{
-		return _STARPU_CPU_ALPHA * (perf_archtype + 1);
+		return _STARPU_CPU_ALPHA * (perf_arch->ncore + 1);
 	}
-	else if (perf_archtype < STARPU_OPENCL_DEFAULT)
+	else if (perf_arch->type == STARPU_CUDA_WORKER)
 	{
 		return _STARPU_CUDA_ALPHA;
 	}
-	else if (perf_archtype < STARPU_NARCH_VARIATIONS)
+	else if (perf_arch->type == STARPU_OPENCL_WORKER)
 	{
 		return _STARPU_OPENCL_ALPHA;
 	}
-
+	else if (perf_arch->type == STARPU_MIC_WORKER)
+	{
+		return _STARPU_MIC_ALPHA * (perf_arch->ncore + 1);
+	}
 	STARPU_ABORT();
 
 	/* Never reached ! */
 	return NAN;
 }
 
-static double common_task_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, struct starpu_task *task, unsigned nimpl)
+static double common_task_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct starpu_task *task, unsigned nimpl)
 {
 	double exp;
 	double alpha;
@@ -143,7 +146,6 @@ void _starpu_load_perfmodel(struct starpu_perfmodel *model)
 		return;
 
 	int load_model = _starpu_register_model(model);
-
 	if (!load_model)
 		return;
 
@@ -170,7 +172,7 @@ void _starpu_load_perfmodel(struct starpu_perfmodel *model)
 	model->is_loaded = 1;
 }
 
-static double starpu_model_expected_perf(struct starpu_task *task, struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch,  unsigned nimpl)
+static double starpu_model_expected_perf(struct starpu_task *task, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch,  unsigned nimpl)
 {
 	if (model)
 	{
@@ -203,19 +205,19 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 	return 0.0;
 }
 
-double starpu_task_expected_length(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 
 	return starpu_model_expected_perf(task, task->cl->model, arch, nimpl);
 }
 
-double starpu_task_expected_power(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double starpu_task_expected_power(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	return starpu_model_expected_perf(task, task->cl->power_model, arch, nimpl);
 }
 
 double starpu_task_expected_conversion_time(struct starpu_task *task,
-					    enum starpu_perfmodel_archtype arch,
+					    struct starpu_perfmodel_arch* arch,
 					    unsigned nimpl)
 {
 	unsigned i;
@@ -230,14 +232,28 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 		handle = STARPU_TASK_GET_HANDLE(task, i);
 		if (!_starpu_data_is_multiformat_handle(handle))
 			continue;
-
-		if (arch < STARPU_CUDA_DEFAULT)
-			node_kind = STARPU_CPU_RAM;
-		else if (arch < STARPU_OPENCL_DEFAULT)
-			node_kind = STARPU_CUDA_RAM;
-		else
-			node_kind = STARPU_OPENCL_RAM;
-
+		
+		switch(arch->type)
+		{
+			case STARPU_CPU_WORKER:
+				node_kind = STARPU_CPU_RAM;
+				break;
+			case STARPU_CUDA_WORKER:
+				node_kind = STARPU_CUDA_RAM;
+				break;
+			case STARPU_OPENCL_WORKER:
+				node_kind = STARPU_OPENCL_RAM;
+				break;
+			case STARPU_MIC_WORKER:
+				node_kind = STARPU_MIC_RAM;
+				break;
+			case STARPU_SCC_WORKER:
+				node_kind = STARPU_SCC_RAM;
+				break;
+			default:
+				STARPU_ABORT();
+				break;
+		}
 		if (!_starpu_handle_needs_conversion_task_for_arch(handle, node_kind))
 			continue;
 
@@ -298,7 +314,7 @@ double starpu_task_expected_data_transfer_time(unsigned memory_node, struct star
 }
 
 /* Return the expected duration of the entire task bundle in µs */
-double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	double expected_length = 0.0;
 
@@ -329,7 +345,7 @@ double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum star
 }
 
 /* Return the expected power consumption of the entire task bundle in J */
-double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	double expected_power = 0.0;
 

+ 6 - 6
src/core/perfmodel/perfmodel.h

@@ -38,7 +38,7 @@ extern "C"
  * differents versions of StarPU having different performance model
  * formats.
  */
-#define _STARPU_PERFMODEL_VERSION 42
+#define _STARPU_PERFMODEL_VERSION 43
 
 struct _starpu_perfmodel_list
 {
@@ -48,14 +48,14 @@ struct _starpu_perfmodel_list
 
 struct starpu_data_descr;
 struct _starpu_job;
-enum starpu_perfmodel_archtype;
+struct starpu_perfmodel_arch;
 
 void _starpu_get_perf_model_dir(char *path, size_t maxlen);
 void _starpu_get_perf_model_dir_codelets(char *path, size_t maxlen);
 void _starpu_get_perf_model_dir_bus(char *path, size_t maxlen);
 void _starpu_get_perf_model_dir_debug(char *path, size_t maxlen);
 
-double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, struct _starpu_job *j, unsigned nimpl);
+double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
 int _starpu_register_model(struct starpu_perfmodel *model);
 void _starpu_load_per_arch_based_model(struct starpu_perfmodel *model);
 void _starpu_load_common_based_model(struct starpu_perfmodel *model);
@@ -65,10 +65,10 @@ void _starpu_initialize_registered_performance_models(void);
 void _starpu_deinitialize_registered_performance_models(void);
 
 double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model,
-					enum starpu_perfmodel_archtype arch, struct _starpu_job *j, unsigned nimpl);
+					struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
 double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel *model,
-					enum starpu_perfmodel_archtype arch, struct _starpu_job *j, unsigned nimpl);
-void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch,
+					struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
+void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch,
 				unsigned cpuid, double measured, unsigned nimpl);
 
 void _starpu_create_sampling_directory_if_needed(void);

Разлика између датотеке није приказан због своје велике величине
+ 465 - 366
src/core/perfmodel/perfmodel_history.c


+ 40 - 20
src/core/perfmodel/perfmodel_print.c

@@ -61,9 +61,9 @@ void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per
 	}
 }
 
-void starpu_perfmodel_print(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output)
+void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output)
 {
-	struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][nimpl];
+	struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
 	char archname[32];
 
 	if (arch_model->regression.nsample || arch_model->regression.valid || arch_model->regression.nl_valid || arch_model->list)
@@ -171,13 +171,22 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 	if (arch == NULL)
 	{
 		/* display all architectures */
-		unsigned archid;
-		unsigned implid;
-		for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++)
+		unsigned archtype, devid, ncore, implid;
+		struct starpu_perfmodel_arch perf_arch;
+		for (archtype = 0; archtype < STARPU_NARCH; archtype++)
 		{
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-			{ /* Display all codelets on each arch */
-				starpu_perfmodel_print(model, (enum starpu_perfmodel_archtype) archid, implid, parameter, footprint, output);
+			perf_arch.type = archtype;
+			for(devid = 0; model->per_arch[archtype][devid] != NULL; devid++)
+			{
+				perf_arch.devid = devid;
+				for(ncore = 0; model->per_arch[archtype][devid][ncore] != NULL; ncore++)
+				{
+					perf_arch.ncore = ncore;
+					for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+					{ /* Display all codelets on each arch */
+						starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
+					}
+				}
 			}
 		}
 	}
@@ -186,8 +195,12 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 		if (strcmp(arch, "cpu") == 0)
 		{
 			unsigned implid;
+			struct starpu_perfmodel_arch perf_arch;
+			perf_arch.type = STARPU_CPU_WORKER;
+			perf_arch.devid = 0;
+			perf_arch.ncore = 0;
 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-				starpu_perfmodel_print(model, STARPU_CPU_DEFAULT,implid, parameter, footprint, output); /* Display all codelets on cpu */
+				starpu_perfmodel_print(model, &perf_arch,implid, parameter, footprint, output); /* Display all codelets on cpu */
 			return 0;
 		}
 
@@ -202,24 +215,28 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 			}
 
 			unsigned implid;
+			struct starpu_perfmodel_arch perf_arch;
+			perf_arch.type = STARPU_CPU_WORKER;
+			perf_arch.devid = 0;
+			perf_arch.ncore = k-1;
 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-				starpu_perfmodel_print(model, (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + k - 1), implid, parameter, footprint, output);
+				starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
 			return 0;
 		}
 
 		if (strcmp(arch, "cuda") == 0)
 		{
-			unsigned archid;
+			unsigned devid;
 			unsigned implid;
-			for (archid = STARPU_CUDA_DEFAULT; archid < STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS; archid++)
+			struct starpu_perfmodel_arch perf_arch;
+			perf_arch.type = STARPU_CUDA_WORKER;
+			perf_arch.ncore = 0;
+
+			for (devid = 0; model->per_arch[STARPU_CUDA_WORKER] != NULL; devid++)
 			{
+				perf_arch.devid = devid;
 				for (implid = 0; implid <STARPU_MAXIMPLEMENTATIONS; implid ++)
-				{
-					char archname[32];
-					starpu_perfmodel_get_arch_name((enum starpu_perfmodel_archtype) archid, archname, 32, implid);
-					fprintf(output, "performance model for %s\n", archname);
-					starpu_perfmodel_print(model, (enum starpu_perfmodel_archtype) archid, implid, parameter, footprint, output);
-				}
+					starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
 			}
 			return 0;
 		}
@@ -230,10 +247,13 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 		nmatched = sscanf(arch, "cuda_%d", &gpuid);
 		if (nmatched == 1)
 		{
-			int archid = STARPU_CUDA_DEFAULT+ gpuid;
+			struct starpu_perfmodel_arch perf_arch;
+			perf_arch.type = STARPU_CUDA_WORKER;
+			perf_arch.devid = gpuid;
+			perf_arch.ncore = 0;
 			unsigned implid;
 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-				starpu_perfmodel_print(model, (enum starpu_perfmodel_archtype) archid, implid, parameter, footprint, output);
+				starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
 			return 0;
 		}
 

+ 1 - 1
src/core/simgrid.c

@@ -97,7 +97,7 @@ int main(int argc, char **argv)
 }
 
 /* Task execution submitted by StarPU */
-void _starpu_simgrid_execute_job(struct _starpu_job *j, enum starpu_perfmodel_archtype perf_arch, double length)
+void _starpu_simgrid_execute_job(struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length)
 {
 	struct starpu_task *task = j->task;
 	msg_task_t simgrid_task;

+ 1 - 1
src/core/simgrid.h

@@ -30,7 +30,7 @@ struct _starpu_pthread_args
 
 #define MAX_TSD 16
 
-void _starpu_simgrid_execute_job(struct _starpu_job *job, enum starpu_perfmodel_archtype perf_arch, double length);
+void _starpu_simgrid_execute_job(struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length);
 int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node, struct _starpu_data_request *req);
 /* Return the number of hosts prefixed by PREFIX */
 int _starpu_simgrid_get_nbhosts(const char *prefix);

+ 5 - 1
src/core/task.c

@@ -244,7 +244,11 @@ int _starpu_submit_job(struct _starpu_job *j)
 	if(sched_ctx != NULL && j->task->sched_ctx != _starpu_get_initial_sched_ctx()->id && j->task->sched_ctx != STARPU_NMAX_SCHED_CTXS
 	   && sched_ctx->perf_counters != NULL)
 	{
-		_starpu_compute_buffers_footprint(j->task->cl->model, STARPU_CPU_DEFAULT, 0, j);
+		struct starpu_perfmodel_arch arch;
+		arch.type = STARPU_CPU_WORKER;
+		arch.devid = 0;
+		arch.ncore = 0;
+		_starpu_compute_buffers_footprint(j->task->cl->model, &arch, 0, j);
 		int i;
 		size_t data_size = 0;
 		for(i = 0; i < STARPU_NMAXBUFS; i++)

+ 16 - 11
src/core/topology.c

@@ -633,8 +633,10 @@ _starpu_init_mic_config (struct _starpu_machine_config *config,
 	for (miccore_id = 0; miccore_id < topology->nmiccores[mic_idx]; miccore_id++)
 	{
 		int worker_idx = topology->nworkers + miccore_id;
-		enum starpu_perfmodel_archtype arch =
-			(enum starpu_perfmodel_archtype)((int)STARPU_MIC_DEFAULT + mic_idx);
+		struct starpu_perfmodel_arch arch;
+		arch.type = STARPU_MIC_WORKER;
+		arch.devid = mic_idx;
+		arch.ncore = 0; 
 		config->workers[worker_idx].arch = STARPU_MIC_WORKER;
 		config->workers[worker_idx].perf_arch = arch;
 		config->workers[worker_idx].mp_nodeid = mic_idx;
@@ -772,11 +774,11 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 		int worker_idx = topology->nworkers + cudagpu;
 		config->workers[worker_idx].arch = STARPU_CUDA_WORKER;
 		int devid = _starpu_get_next_cuda_gpuid(config);
-		enum starpu_perfmodel_archtype arch =
-			(enum starpu_perfmodel_archtype)((int)STARPU_CUDA_DEFAULT + devid);
+		config->workers[worker_idx].perf_arch.type = STARPU_CUDA_WORKER;
+		config->workers[worker_idx].perf_arch.devid = cudagpu;
+		config->workers[worker_idx].perf_arch.ncore = 0;
 		config->workers[worker_idx].mp_nodeid = -1;
 		config->workers[worker_idx].devid = devid;
-		config->workers[worker_idx].perf_arch = arch;
 		config->workers[worker_idx].worker_mask = STARPU_CUDA;
 		config->worker_mask |= STARPU_CUDA;
 
@@ -846,11 +848,11 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 			break;
 		}
 		config->workers[worker_idx].arch = STARPU_OPENCL_WORKER;
-		enum starpu_perfmodel_archtype arch =
-			(enum starpu_perfmodel_archtype)((int)STARPU_OPENCL_DEFAULT + devid);
+		config->workers[worker_idx].perf_arch.type = STARPU_OPENCL_WORKER;
+		config->workers[worker_idx].perf_arch.devid = openclgpu;
+		config->workers[worker_idx].perf_arch.ncore = 0;
 		config->workers[worker_idx].mp_nodeid = -1;
 		config->workers[worker_idx].devid = devid;
-		config->workers[worker_idx].perf_arch = arch;
 		config->workers[worker_idx].worker_mask = STARPU_OPENCL;
 		config->worker_mask |= STARPU_OPENCL;
 	}
@@ -908,10 +910,11 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 	{
 		config->workers[topology->nworkers + sccdev].arch = STARPU_SCC_WORKER;
 		int devid = _starpu_get_next_scc_deviceid(config);
-		enum starpu_perfmodel_archtype arch = (enum starpu_perfmodel_archtype)((int)STARPU_SCC_DEFAULT + devid);
+		config->workers[topology->nworkers + sccdev].perf_arch.type = STARPU_SCC_WORKER;
+		config->workers[topology->nworkers + sccdev].perf_arch.devid = sccdev;
+		config->workers[topology->nworkers + sccdev].perf_arch.ncore = 0;
 		config->workers[topology->nworkers + sccdev].mp_nodeid = -1;
 		config->workers[topology->nworkers + sccdev].devid = devid;
-		config->workers[topology->nworkers + sccdev].perf_arch = arch;
 		config->workers[topology->nworkers + sccdev].worker_mask = STARPU_SCC;
 		config->worker_mask |= STARPU_SCC;
 	}
@@ -971,7 +974,9 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 	{
 		int worker_idx = topology->nworkers + cpu;
 		config->workers[worker_idx].arch = STARPU_CPU_WORKER;
-		config->workers[worker_idx].perf_arch = STARPU_CPU_DEFAULT;
+		config->workers[worker_idx].perf_arch.type = STARPU_CPU_WORKER;
+		config->workers[worker_idx].perf_arch.devid = 0;
+		config->workers[worker_idx].perf_arch.ncore = 0;
 		config->workers[worker_idx].mp_nodeid = -1;
 		config->workers[worker_idx].devid = cpu;
 		config->workers[worker_idx].worker_mask = STARPU_CPU;

+ 2 - 2
src/core/workers.h

@@ -59,7 +59,7 @@ struct _starpu_worker
         starpu_pthread_mutex_t mutex;
 	enum starpu_worker_archtype arch; /* what is the type of worker ? */
 	uint32_t worker_mask; /* what is the type of worker ? */
-	enum starpu_perfmodel_archtype perf_arch; /* in case there are different models of the same arch */
+	struct starpu_perfmodel_arch perf_arch; /* in case there are different models of the same arch */
 	starpu_pthread_t worker_thread; /* the thread which runs the worker */
 	int mp_nodeid; /* which mp node hold the cpu/gpu/etc (-1 for this
 			* node) */
@@ -117,7 +117,7 @@ struct _starpu_worker
 
 struct _starpu_combined_worker
 {
-	enum starpu_perfmodel_archtype perf_arch; /* in case there are different models of the same arch */
+	struct starpu_perfmodel_arch perf_arch; /* in case there are different models of the same arch */
 	uint32_t worker_mask; /* what is the type of workers ? */
 	int worker_size;
 	unsigned memory_node; /* which memory node is associated that worker to ? */

+ 8 - 4
src/datawizard/footprint.c

@@ -19,7 +19,7 @@
 #include <starpu_hash.h>
 #include <core/task.h>
 
-uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, struct _starpu_job *j)
+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned nimpl, struct _starpu_job *j)
 {
 	if (j->footprint_is_computed)
 		return j->footprint;
@@ -29,9 +29,13 @@ uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum
 
 	struct starpu_task *task = j->task;
 
-	if (model && model->per_arch[arch][nimpl].size_base)
+	if (model != NULL && 
+			model->per_arch[arch->type] != NULL &&
+			model->per_arch[arch->type][arch->devid] != NULL &&
+			model->per_arch[arch->type][arch->devid][arch->ncore] != NULL &&
+			model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base)
 	{
-		size_t size = model->per_arch[arch][nimpl].size_base(task, arch, nimpl);
+		size_t size = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base(task, arch, nimpl);
 		footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
 	}
 	else if (model && model->size_base)
@@ -68,7 +72,7 @@ uint32_t _starpu_compute_data_footprint(starpu_data_handle_t handle)
 	return starpu_hash_crc32c_be(handle_footprint, interfaceid);
 }
 
-uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 	return _starpu_compute_buffers_footprint(model, arch, nimpl, j);

+ 1 - 1
src/datawizard/footprint.h

@@ -24,7 +24,7 @@
 
 /* Compute the footprint that characterizes the job and cache it into the job
  * structure. */
-uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, struct _starpu_job *j);
+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned nimpl, struct _starpu_job *j);
 
 /* Compute the footprint that characterizes the layout of the data handle. */
 uint32_t _starpu_compute_data_footprint(starpu_data_handle_t handle);

+ 19 - 9
src/debug/traces/starpu_fxt.c

@@ -337,7 +337,7 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 	register_worker_id(threadid, workerid);
 
 	char *kindstr = "";
-	enum starpu_perfmodel_archtype archtype = 0;
+	struct starpu_perfmodel_arch arch;
 
 	switch (ev->param[0])
 	{
@@ -348,27 +348,37 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 		case _STARPU_FUT_CPU_KEY:
 			set_next_cpu_worker_color(workerid);
 			kindstr = "CPU";
-			archtype = STARPU_CPU_DEFAULT;
+			arch.type = STARPU_CPU_WORKER;
+			arch.devid = 0;
+			arch.ncore = 0;
 			break;
 		case _STARPU_FUT_CUDA_KEY:
 			set_next_cuda_worker_color(workerid);
 			kindstr = "CUDA";
-			archtype = STARPU_CUDA_DEFAULT + devid;
+			arch.type = STARPU_CUDA_WORKER;
+			arch.devid = devid;
+			arch.ncore = 0;
 			break;
 		case _STARPU_FUT_OPENCL_KEY:
 			set_next_opencl_worker_color(workerid);
 			kindstr = "OPENCL";
-			archtype = STARPU_OPENCL_DEFAULT + devid;
+			arch.type = STARPU_OPENCL_WORKER;
+			arch.devid = devid;
+			arch.ncore = 0;
 			break;
 		case _STARPU_FUT_MIC_KEY:
 			set_next_mic_worker_color(workerid);
 			kindstr = "mic";
-			archtype = STARPU_MIC_DEFAULT + devid;
+			arch.type = STARPU_MIC_WORKER;
+			arch.devid = devid;
+			arch.ncore = 0;
 			break;
 		case _STARPU_FUT_SCC_KEY:
 			set_next_scc_worker_color(workerid);
 			kindstr = "scc";
-			archtype = STARPU_SCC_DEFAULT + devid;
+			arch.type = STARPU_SCC_WORKER;
+			arch.devid = devid;
+			arch.ncore = 0;
 			break;
 		default:
 			STARPU_ABORT();
@@ -405,7 +415,7 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 	fprintf(activity_file, "name\t%d\t%s %d\n", workerid, kindstr, devid);
 
 	snprintf(options->worker_names[workerid], 256, "%s %d", kindstr, devid);
-	options->worker_archtypes[workerid] = archtype;
+	options->worker_archtypes[workerid] = arch;
 }
 
 static void handle_worker_init_end(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
@@ -600,14 +610,14 @@ static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 
 	if (options->dumped_codelets)
 	{
-		enum starpu_perfmodel_archtype archtype = ev->param[3];
+		struct starpu_perfmodel_arch* arch = ev->param[3];
 
 		dumped_codelets_count++;
 		dumped_codelets = realloc(dumped_codelets, dumped_codelets_count*sizeof(struct starpu_fxt_codelet_event));
 
 		snprintf(dumped_codelets[dumped_codelets_count - 1].symbol, 256, "%s", last_codelet_symbol[worker]);
 		dumped_codelets[dumped_codelets_count - 1].workerid = worker;
-		dumped_codelets[dumped_codelets_count - 1].archtype = archtype;
+		dumped_codelets[dumped_codelets_count - 1].arch = *arch;
 
 		dumped_codelets[dumped_codelets_count - 1].size = codelet_size;
 		dumped_codelets[dumped_codelets_count - 1].hash = codelet_hash;

+ 4 - 4
src/drivers/cpu/driver_cpu.c

@@ -111,7 +111,7 @@ _starpu_cpu_discover_devices(struct _starpu_machine_config *config)
  * Handle binding CPUs on cores.
  * In the case of a combined worker WORKER_TASK != J->TASK */
 
-static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_task, struct _starpu_worker *cpu_args, int rank, enum starpu_perfmodel_archtype perf_arch)
+static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_task, struct _starpu_worker *cpu_args, int rank, struct starpu_perfmodel_arch* perf_arch)
 {
 	int ret;
 	int is_parallel_task = (j->task_size > 1);
@@ -292,7 +292,7 @@ int _starpu_cpu_driver_run_once(struct starpu_driver *d STARPU_ATTRIBUTE_UNUSED)
 	int rank = 0;
 	int is_parallel_task = (j->task_size > 1);
 
-	enum starpu_perfmodel_archtype perf_arch;
+	struct starpu_perfmodel_arch* perf_arch;
 
 	/* Get the rank in case it is a parallel task */
 	if (is_parallel_task)
@@ -307,14 +307,14 @@ int _starpu_cpu_driver_run_once(struct starpu_driver *d STARPU_ATTRIBUTE_UNUSED)
 		cpu_worker->combined_workerid = j->combined_workerid;
 		cpu_worker->worker_size = combined_worker->worker_size;
 		cpu_worker->current_rank = rank;
-		perf_arch = combined_worker->perf_arch;
+		perf_arch = &combined_worker->perf_arch;
 	}
 	else
 	{
 		cpu_worker->combined_workerid = cpu_worker->workerid;
 		cpu_worker->worker_size = 1;
 		cpu_worker->current_rank = 0;
-		perf_arch = cpu_worker->perf_arch;
+		perf_arch = &cpu_worker->perf_arch;
 	}
 
 	_starpu_set_current_task(j->task);

+ 3 - 3
src/drivers/cuda/driver_cuda.c

@@ -351,14 +351,14 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 	STARPU_ASSERT(func);
 
 #ifdef STARPU_SIMGRID
-	_starpu_simgrid_execute_job(j, args->perf_arch, NAN);
+	_starpu_simgrid_execute_job(j, &args->perf_arch, NAN);
 #else
 	func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 #endif
 
-	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0, profiling);
+	_starpu_driver_end_job(args, j, &args->perf_arch, &codelet_end, 0, profiling);
 
-	_starpu_driver_update_job_feedback(j, args, args->perf_arch, &codelet_start, &codelet_end, profiling);
+	_starpu_driver_update_job_feedback(j, args, &args->perf_arch, &codelet_start, &codelet_end, profiling);
 
 	_starpu_push_task_output(j, mask);
 

+ 3 - 3
src/drivers/driver_common/driver_common.c

@@ -75,7 +75,7 @@ void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j
 	_STARPU_TRACE_START_CODELET_BODY(j);
 }
 
-void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perfmodel_archtype perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
+void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
 {
 	struct starpu_task *task = j->task;
 	struct starpu_codelet *cl = task->cl;
@@ -104,7 +104,7 @@ void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j,
 	args->status = STATUS_UNKNOWN;
 }
 void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
-					enum starpu_perfmodel_archtype perf_arch,
+					struct starpu_perfmodel_arch* perf_arch,
 					struct timespec *codelet_start, struct timespec *codelet_end, int profiling)
 {
 	struct starpu_profiling_task_info *profiling_info = j->task->profiling_info;
@@ -150,7 +150,7 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 
 	if (profiling_info && profiling_info->power_consumed && cl->power_model && cl->power_model->benchmarking)
 	{
-		_starpu_update_perfmodel_history(j, j->task->cl->power_model,  perf_arch, worker_args->devid, profiling_info->power_consumed,j->nimpl);
+		_starpu_update_perfmodel_history(j, j->task->cl->power_model, perf_arch, worker_args->devid, profiling_info->power_consumed,j->nimpl);
 	}
 }
 

+ 2 - 2
src/drivers/driver_common/driver_common.h

@@ -25,10 +25,10 @@
 
 void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j,
 			      struct timespec *codelet_start, int rank, int profiling);
-void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perfmodel_archtype perf_arch,
+void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch,
 			    struct timespec *codelet_end, int rank, int profiling);
 void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
-					enum starpu_perfmodel_archtype perf_arch,
+					struct starpu_perfmodel_arch* perf_arch,
 					struct timespec *codelet_start, struct timespec *codelet_end, int profiling);
 
 struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int workerid, unsigned memnode);

+ 0 - 5
src/drivers/mp_common/sink_common.c

@@ -249,8 +249,6 @@ void _starpu_sink_common_worker(void)
 	starpu_pthread_key_t worker_key;
 	STARPU_PTHREAD_KEY_CREATE(&worker_key, NULL);
 
-
-	struct _starpu_machine_config *config;
 	while (!exit_starpu)
 	{
 		/* If we have received a message */
@@ -264,7 +262,6 @@ void _starpu_sink_common_worker(void)
 					exit_starpu = 1;
 					break;
 				case STARPU_EXECUTE:
-					config = _starpu_get_machine_config();
 					node->execute(node, arg, arg_size);
 					break;
 				case STARPU_SINK_NBCORES:
@@ -314,7 +311,6 @@ void _starpu_sink_common_worker(void)
 			struct mp_message * message = mp_message_list_pop_back(node->message_queue);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
 			//_STARPU_DEBUG("telling host that we have finished the task %p sur %d.\n", task->kernel, task->coreid);
-			config = _starpu_get_machine_config();
 			_starpu_mp_common_send_command(node, message->type, 
 					&message->buffer, message->size);
 			mp_message_delete(message);
@@ -378,7 +374,6 @@ static void _starpu_sink_common_erase_barrier(struct _starpu_mp_node * node, str
  */
 static void _starpu_sink_common_append_message(struct _starpu_mp_node *node, struct mp_message * message)
 {
-	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	STARPU_PTHREAD_MUTEX_LOCK(&node->message_queue_mutex);
 	mp_message_list_push_front(node->message_queue,message);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);

+ 2 - 2
src/drivers/mp_common/source_common.c

@@ -36,7 +36,7 @@ static int _starpu_src_common_finalize_job (struct _starpu_job *j, struct _starp
 	uint32_t mask = 0;
 	int profiling = starpu_profiling_status_get();
 	struct timespec codelet_end;
-	_starpu_driver_end_job(worker, j, worker->perf_arch, &codelet_end, 0,
+	_starpu_driver_end_job(worker, j, &worker->perf_arch, &codelet_end, 0,
 			profiling);
 	
 	int count = worker->current_rank;
@@ -57,7 +57,7 @@ static int _starpu_src_common_finalize_job (struct _starpu_job *j, struct _starp
 	if(count == 0)
 	{
 
-		_starpu_driver_update_job_feedback(j, worker, worker->perf_arch,
+		_starpu_driver_update_job_feedback(j, worker, &worker->perf_arch,
 				&j->cl_start, &codelet_end,
 				profiling);
 

+ 4 - 3
src/drivers/opencl/driver_opencl.c

@@ -656,6 +656,7 @@ int _starpu_opencl_driver_run_once(struct starpu_driver *d)
 
 	task = _starpu_get_worker_task(args, workerid, memnode);
 
+
 	if (task == NULL)
 		return 0;
 
@@ -841,14 +842,14 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 	STARPU_ASSERT_MSG(profiling_info->used_cycles, "Application kernel must call starpu_opencl_collect_stats to collect simulated time");
 	length = ((double) profiling_info->used_cycles)/MSG_get_host_speed(MSG_host_self());
   #endif
-	_starpu_simgrid_execute_job(j, args->perf_arch, length);
+	_starpu_simgrid_execute_job(j, &args->perf_arch, length);
 #else
 	func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 #endif
 
-	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0, profiling);
+	_starpu_driver_end_job(args, j, &args->perf_arch, &codelet_end, 0, profiling);
 
-	_starpu_driver_update_job_feedback(j, args, args->perf_arch,
+	_starpu_driver_update_job_feedback(j, args, &args->perf_arch,
 					   &codelet_start, &codelet_end, profiling);
 
 	_starpu_push_task_output(j, mask);

+ 41 - 15
src/profiling/bound.c

@@ -25,6 +25,7 @@
 #include <starpu_config.h>
 #include <profiling/bound.h>
 #include <core/jobs.h>
+#include <core/workers.h>
 
 #ifdef STARPU_HAVE_GLPK_H
 #include <glpk.h>
@@ -100,7 +101,7 @@ struct bound_task
 	int depsn;
 
 	/* Estimated duration */
-	double duration[STARPU_NARCH_VARIATIONS];
+	double** duration[STARPU_NARCH];
 
 	/* Other tasks */
 	struct bound_task *next;
@@ -186,7 +187,31 @@ static int good_job(struct _starpu_job *j)
 		return 0;
 	return 1;
 }
+static double** initialize_arch_duration(int maxdevid, unsigned* maxncore_table)
+{
+	int devid, maxncore;
+	double ** arch_model = malloc(sizeof(*arch_model)*(maxdevid+1));
+	arch_model[maxdevid] = NULL;
+	for(devid=0; devid<maxdevid; devid++)
+	{
+		if(maxncore_table != NULL)
+			maxncore = maxncore_table[devid];
+		else
+			maxncore = 1;
+		arch_model[devid] = malloc(sizeof(*arch_model[devid])*(maxncore+1));
+	}
+	return arch_model;
+}
 
+static void initialize_duration(struct bound_task *task)
+{
+	struct _starpu_machine_config *conf = _starpu_get_machine_config();
+	task->duration[STARPU_CPU_WORKER] = initialize_arch_duration(1,&conf->topology.ncpus); 
+	task->duration[STARPU_CUDA_WORKER] = initialize_arch_duration(conf->topology.ncudagpus,NULL); 
+	task->duration[STARPU_OPENCL_WORKER] = initialize_arch_duration(conf->topology.nopenclgpus,NULL); 
+	task->duration[STARPU_MIC_WORKER] = initialize_arch_duration(conf->topology.nmicdevices,conf->topology.nmiccores); 
+	task->duration[STARPU_SCC_WORKER] = initialize_arch_duration(conf->topology.nsccdevices,NULL); 
+}
 /* Create a new task (either because it has just been submitted, or a
  * dependency was added before submission) */
 static void new_task(struct _starpu_job *j)
@@ -202,10 +227,11 @@ static void new_task(struct _starpu_job *j)
 	t->tag_id = j->task->tag_id;
 	t->use_tag = j->task->use_tag;
 	t->cl = j->task->cl;
-	t->footprint = _starpu_compute_buffers_footprint(NULL, STARPU_CPU_DEFAULT, 0, j);
+	t->footprint = _starpu_compute_buffers_footprint(NULL, STARPU_CPU_WORKER, 0, j);
 	t->priority = j->task->priority;
 	t->deps = NULL;
 	t->depsn = 0;
+	initialize_duration(t);
 	t->next = tasks;
 	j->bound_task = t;
 	tasks = t;
@@ -236,7 +262,7 @@ void _starpu_bound_record(struct _starpu_job *j)
 	{
 		struct bound_task_pool *tp;
 
-		_starpu_compute_buffers_footprint(NULL, STARPU_CPU_DEFAULT, 0, j);
+		_starpu_compute_buffers_footprint(NULL, STARPU_CPU_WORKER, 0, j);
 
 		if (last && last->cl == j->task->cl && last->footprint == j->footprint)
 			tp = last;
@@ -400,7 +426,7 @@ static void _starpu_get_tasks_times(int nw, int nt, double *times)
 				.footprint = tp->footprint,
 				.footprint_is_computed = 1,
 			};
-			enum starpu_perfmodel_archtype arch = starpu_worker_get_perf_archtype(w);
+			struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
 			double length = _starpu_history_based_job_expected_perf(tp->cl->model, arch, &j, j.nimpl);
 			if (isnan(length))
 				times[w*nt+t] = NAN;
@@ -486,15 +512,15 @@ void starpu_bound_print_lp(FILE *output)
 			};
 			for (w = 0; w < nw; w++)
 			{
-				enum starpu_perfmodel_archtype arch = starpu_worker_get_perf_archtype(w);
-				if (_STARPU_IS_ZERO(t1->duration[arch]))
+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
+				if (_STARPU_IS_ZERO(t1->duration[arch->type][arch->devid][arch->ncore]))
 				{
 					double length = _starpu_history_based_job_expected_perf(t1->cl->model, arch, &j,j.nimpl);
 					if (isnan(length))
 						/* Avoid problems with binary coding of doubles */
-						t1->duration[arch] = NAN;
+						t1->duration[arch->type][arch->devid][arch->ncore] = NAN;
 					else
-						t1->duration[arch] = length / 1000.;
+						t1->duration[arch->type][arch->devid][arch->ncore] = length / 1000.;
 				}
 			}
 			nt++;
@@ -519,8 +545,8 @@ void starpu_bound_print_lp(FILE *output)
 		{
 			for (w = 0; w < nw; w++)
 			{
-				enum starpu_perfmodel_archtype arch = starpu_worker_get_perf_archtype(w);
-				if (!isnan(t1->duration[arch]))
+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
+				if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
 					fprintf(output, " +t%luw%d", t1->id, w);
 			}
 			fprintf(output, " = 1;\n");
@@ -533,9 +559,9 @@ void starpu_bound_print_lp(FILE *output)
 			fprintf(output, "/* %s %x */\tc%lu = s%lu", _starpu_codelet_get_model_name(t1->cl), (unsigned) t1->footprint, t1->id, t1->id);
 			for (w = 0; w < nw; w++)
 			{
-				enum starpu_perfmodel_archtype arch = starpu_worker_get_perf_archtype(w);
-				if (!isnan(t1->duration[arch]))
-					fprintf(output, " + %f t%luw%d", t1->duration[arch], t1->id, w);
+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
+				if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
+					fprintf(output, " + %f t%luw%d", t1->duration[arch->type][arch->devid][arch->ncore], t1->id, w);
 			}
 			fprintf(output, ";\n");
 		}
@@ -616,8 +642,8 @@ void starpu_bound_print_lp(FILE *output)
 				{
 					for (w = 0; w < nw; w++)
 					{
-						enum starpu_perfmodel_archtype arch = starpu_worker_get_perf_archtype(w);
-						if (!isnan(t1->duration[arch]))
+						struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
+						if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
 						{
 							fprintf(output, "s%lu - c%lu >= -3e5 + 1e5 t%luw%d + 1e5 t%luw%d + 1e5 t%luafter%lu;\n",
 									t1->id, t2->id, t1->id, w, t2->id, w, t1->id, t2->id);

+ 4 - 4
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -407,7 +407,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 		worker = workers->get_next(workers, &it);
 		struct _starpu_fifo_taskq *fifo  = dt->queue_array[worker];
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
-		enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
 
 		/* Sometimes workers didn't take the tasks as early as we expected */
 		double exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
@@ -543,7 +543,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 	{
 		worker = workers->get_next(workers, &it);
 		struct _starpu_fifo_taskq *fifo = dt->queue_array[worker];
-		enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 
 		/* Sometimes workers didn't take the tasks as early as we expected */
@@ -760,7 +760,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	}
 	else if (task->bundle)
 	{
-		enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(best_in_ctx);
+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(best_in_ctx);
 		unsigned memory_node = starpu_worker_get_memory_node(best);
 		model_best = starpu_task_expected_length(task, perf_arch, selected_impl);
 		transfer_model_best = starpu_task_expected_data_transfer_time(memory_node, task);
@@ -931,7 +931,7 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, unsign
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 	/* Compute the expected penality */
-	enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
+	struct starpu_perfmodel_arch *perf_arch = starpu_worker_get_perf_archtype(workerid);
 	unsigned memory_node = starpu_worker_get_memory_node(workerid);
 
 	double predicted = starpu_task_expected_length(task, perf_arch,

+ 2 - 2
src/sched_policies/parallel_heft.c

@@ -230,7 +230,7 @@ static double compute_expected_end(int workerid, double length)
 
 static double compute_ntasks_end(int workerid)
 {
-	enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
+	struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(workerid);
 	starpu_pthread_mutex_t *sched_mutex;
 	starpu_pthread_cond_t *sched_cond;
 
@@ -347,7 +347,7 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 			}
 
 
-			enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
+			struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
 
 			local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch,nimpl);
 

+ 1 - 1
src/sched_policies/random_policy.c

@@ -50,7 +50,7 @@ static int _random_push_task(struct starpu_task *task, unsigned prio)
 		{
 			if(starpu_worker_can_execute_task(worker, task, impl))
 			{
-				enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
+				struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
 				double speedup = starpu_worker_get_relative_speedup(perf_arch);
 				alpha_sum += speedup;
 				speedup_arr[size] = speedup;

+ 1 - 1
src/starpu_parameters.h

@@ -29,5 +29,5 @@
 #define _STARPU_CPU_ALPHA	1.0f
 #define _STARPU_CUDA_ALPHA	13.33f
 #define _STARPU_OPENCL_ALPHA	12.22f
-
+#define _STARPU_MIC_ALPHA	11.11f
 #endif /* _STARPU_PARAMETERS_H */

+ 13 - 4
tests/perfmodels/feed.c

@@ -50,6 +50,9 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+	 if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) < 2)
+		 return STARPU_TEST_SKIPPED;
+
 	starpu_task_init(&task);
 	task.cl = &cl;
 
@@ -66,12 +69,18 @@ int main(int argc, char **argv)
 		measured_fast = 0.002+size*0.00000001;
 		measured_slow = 0.001+size*0.0000001;
 
+		struct starpu_perfmodel_arch arch;
+		arch.type = STARPU_CUDA_WORKER;
+		arch.ncore = 0;
 		/* Simulate Fast GPU */
-		starpu_perfmodel_update_history(&model, &task, STARPU_CUDA_DEFAULT, 0, 0, measured_fast);
-		starpu_perfmodel_update_history(&nl_model, &task, STARPU_CUDA_DEFAULT, 0, 0, measured_fast);
+		arch.devid = 0;
+		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_fast);
+		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_fast);
+		
 		/* Simulate Slow GPU */
-		starpu_perfmodel_update_history(&model, &task, STARPU_CUDA_DEFAULT + 1, 0, 0, measured_slow);
-		starpu_perfmodel_update_history(&nl_model, &task, STARPU_CUDA_DEFAULT + 1, 0, 0, measured_slow);
+		arch.devid = 1;
+		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_slow);
+		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_slow);
 		starpu_task_clean(&task);
 		starpu_data_unregister(handle);
 	}

+ 13 - 5
tests/perfmodels/valid_model.c

@@ -60,12 +60,13 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 	int ret;
 	int old_nsamples, new_nsamples;
 	struct starpu_conf conf;
-	unsigned archid;
+	unsigned archid, archtype, devid, ncore;
 
 	starpu_conf_init(&conf);
 	conf.sched_policy_name = "eager";
 	conf.calibrate = 1;
 
+
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
@@ -73,10 +74,14 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 	codelet->model = model;
 
 	old_nsamples = 0;
+	lmodel.is_init=0;
 	ret = starpu_perfmodel_load_symbol(codelet->model->symbol, &lmodel);
 	if (ret != 1)
-		for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++)
-			old_nsamples += lmodel.per_arch[archid][0].regression.nsample;
+		for (archtype = 0; archtype < STARPU_NARCH; archtype++)
+			if(lmodel.per_arch[archtype] != NULL)
+				for(devid=0; lmodel.per_arch[archtype][devid] != NULL; devid++)
+					for(ncore=0; lmodel.per_arch[archtype][devid][ncore] != NULL; ncore++)
+						old_nsamples += lmodel.per_arch[archtype][devid][ncore][0].regression.nsample;
 
         starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, 100, sizeof(int));
 	for (loop = 0; loop < nloops; loop++)
@@ -97,8 +102,11 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 	}
 
 	new_nsamples = 0;
-	for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++)
-		new_nsamples += lmodel.per_arch[archid][0].regression.nsample;
+	for (archtype = 0; archtype < STARPU_NARCH; archtype++)
+		if(lmodel.per_arch[archtype] != NULL)
+			for(devid=0; lmodel.per_arch[archtype][devid] != NULL; devid++)
+				for(ncore=0; lmodel.per_arch[archtype][devid][ncore] != NULL; ncore++)
+					new_nsamples += lmodel.per_arch[archtype][devid][ncore][0].regression.nsample;
 
 	ret = starpu_perfmodel_unload_model(&lmodel);
 	if (ret == 1)

+ 42 - 13
tests/sched_policies/simple_cpu_gpu_sched.c

@@ -40,7 +40,7 @@ dummy(void *buffers[], void *args)
  */
 static double
 cpu_task_cpu(struct starpu_task *task,
-	     enum starpu_perfmodel_archtype arch,
+	     struct starpu_perfmodel_arch* arch,
 	     unsigned nimpl)
 {
 	(void) task;
@@ -51,7 +51,7 @@ cpu_task_cpu(struct starpu_task *task,
 
 static double
 cpu_task_gpu(struct starpu_task *task,
-	     enum starpu_perfmodel_archtype arch,
+	     struct starpu_perfmodel_arch* arch,
 	     unsigned nimpl)
 {
 	(void) task;
@@ -63,7 +63,7 @@ cpu_task_gpu(struct starpu_task *task,
 
 static double
 gpu_task_cpu(struct starpu_task *task,
-	     enum starpu_perfmodel_archtype arch,
+	     struct starpu_perfmodel_arch* arch,
 	     unsigned nimpl)
 {
 	(void) task;
@@ -75,7 +75,7 @@ gpu_task_cpu(struct starpu_task *task,
 
 static double
 gpu_task_gpu(struct starpu_task *task,
-	     enum starpu_perfmodel_archtype arch,
+	     struct starpu_perfmodel_arch* arch,
 	     unsigned nimpl)
 {
 	(void) task;
@@ -99,16 +99,45 @@ static struct starpu_perfmodel model_gpu_task =
 static void
 init_perfmodels(void)
 {
-	int i;
-	for (i = STARPU_CPU_DEFAULT; i < STARPU_CUDA_DEFAULT; i++)
+	unsigned devid, ncore;
+
+	starpu_initialize_model(&model_cpu_task);
+	starpu_initialize_model(&model_gpu_task);
+
+	if(model_cpu_task.per_arch[STARPU_CPU_WORKER] != NULL)
 	{
-		model_cpu_task.per_arch[i][0].cost_function = cpu_task_cpu;
-		model_gpu_task.per_arch[i][0].cost_function = gpu_task_cpu;
+		for(devid=0; model_cpu_task.per_arch[STARPU_CPU_WORKER][devid] != NULL; devid++)
+		{
+			for(ncore=0; model_cpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore] != NULL; ncore++)
+			{
+				model_cpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore][0].cost_function = cpu_task_cpu;
+				model_gpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore][0].cost_function = gpu_task_cpu;
+			}
+		}
 	}
-	for (i = STARPU_CUDA_DEFAULT; i < STARPU_NARCH_VARIATIONS; i++)
+
+	if(model_cpu_task.per_arch[STARPU_CUDA_WORKER] != NULL)
+	{
+		for(devid=0; model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid] != NULL; devid++)
+		{
+			for(ncore=0; model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore] != NULL; ncore++)
+			{
+				model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore][0].cost_function = cpu_task_gpu;
+				model_gpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore][0].cost_function = gpu_task_gpu;
+			}
+		}
+	}
+
+	if(model_cpu_task.per_arch[STARPU_OPENCL_WORKER] != NULL)
 	{
-		model_cpu_task.per_arch[i][0].cost_function = cpu_task_gpu;
-		model_gpu_task.per_arch[i][0].cost_function = gpu_task_gpu;
+		for(devid=0; model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid] != NULL; devid++)
+		{
+			for(ncore=0; model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore] != NULL; ncore++)
+			{
+				model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore][0].cost_function = cpu_task_gpu;
+				model_gpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore][0].cost_function = gpu_task_gpu;
+			}
+		}
 	}
 }
 
@@ -176,8 +205,8 @@ run(struct starpu_sched_policy *policy)
 	cpu_task_worker = starpu_worker_get_type(cpu_task->profiling_info->workerid);
 	gpu_task_worker = starpu_worker_get_type(gpu_task->profiling_info->workerid);
 	if (cpu_task_worker != STARPU_CPU_WORKER ||
-	    (gpu_task_worker != STARPU_CUDA_WORKER &&
-	     gpu_task_worker != STARPU_OPENCL_WORKER))
+			(gpu_task_worker != STARPU_CUDA_WORKER &&
+			 gpu_task_worker != STARPU_OPENCL_WORKER))
 		ret = 1;
 	else
 		ret = 0;

+ 1 - 1
tools/Makefile.am

@@ -16,7 +16,7 @@
 
 SUBDIRS =
 
-AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS)
+AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(STARPU_COI_CPPFLAGS) $(GLOBAL_AM_CFLAGS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/tools/ -I$(top_srcdir)/mpi/ -I$(top_builddir)/src -I$(top_srcdir)/src
 AM_LDFLAGS = $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)

+ 250 - 85
tools/starpu_perfmodel_plot.c

@@ -29,6 +29,7 @@
 
 #include <starpu.h>
 #include <core/perfmodel/perfmodel.h> // we need to browse the list associated to history-based models
+#include <core/workers.h>
 
 #ifdef __MINGW32__
 #include <windows.h>
@@ -52,7 +53,7 @@ static struct starpu_fxt_options options;
 #endif
 
 #ifdef STARPU_USE_FXT
-static int archtype_is_found[STARPU_NARCH_VARIATIONS];
+static int **archtype_is_found[STARPU_NARCH];
 
 static char data_file_name[256];
 #endif
@@ -181,22 +182,22 @@ static void print_comma(FILE *gnuplot_file, int *first)
 	}
 }
 
-static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, int *first, unsigned nimpl)
+static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, int *first, unsigned nimpl)
 {
 	char arch_name[256];
 	starpu_perfmodel_get_arch_name(arch, arch_name, 256, nimpl);
 
 	struct starpu_perfmodel_per_arch *arch_model =
-		&model->per_arch[arch][nimpl];
+		&model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
 
 	if (arch_model->regression.valid || arch_model->regression.nl_valid)
 		fprintf(stderr,"Arch: %s\n", arch_name);
 
 #ifdef STARPU_USE_FXT
-	if (!gflops && !no_fxt_file && archtype_is_found[arch] && nimpl == 0)
+	if (!gflops && !no_fxt_file && archtype_is_found[arch->type][arch->devid][arch->ncore] && nimpl == 0)
 	{
 		print_comma(gnuplot_file, first);
-		fprintf(gnuplot_file, "\"< grep -w \\^%d %s\" using 2:3 title \"Profiling %s\"", arch, data_file_name, arch_name);
+		fprintf(gnuplot_file, "\"< grep -w \\^%d_%d_%d %s\" using 2:3 title \"Profiling %s\"", arch->type, arch->devid, arch->ncore, data_file_name, arch_name);
 	}
 #endif
 
@@ -227,11 +228,10 @@ static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel *mode
 	}
 }
 
-static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch1, enum starpu_perfmodel_archtype arch2, int *first)
+static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_worker_archtype* type, int* devid, int* ncore, int *first)
 {
 	char *command;
 	FILE *datafile;
-	unsigned arch;
 	struct starpu_perfmodel_history_list *ptr;
 	char arch_name[32];
 	int col;
@@ -245,20 +245,74 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 
 	col = 2;
 	unsigned implid;
-	for (arch = arch1; arch < arch2; arch++)
+
+	unsigned archmin, archmax, devmin, devmax, coremin, coremax;
+	if(type != NULL)
 	{
-		for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+		archmin = *type;
+		archmax = *type +1;
+		if(devid != NULL)
 		{
-			struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][implid];
-			starpu_perfmodel_get_arch_name((enum starpu_perfmodel_archtype) arch, arch_name, 32, implid);
-
-			//ptrs[arch-arch1][implid] = ptr[arch-arch1][implid] = arch_model->list;
+			devmin = *devid;
+			devmax = *devid +1;
+			if(ncore != NULL)
+			{
+				coremin = *ncore;
+				coremax = *ncore +1;
+			}
+			else
+			{
+				coremin = 0;
+				coremax = 0;
+			}
+		}
+		else
+		{
+			devmin = 0;
+			devmax = 0;
+			coremin = 0;
+			coremax = 0;
+		}
+	}
+	else
+	{
+		archmin = 0;
+		archmax = STARPU_NARCH;
+		devmin = 0;
+		devmax = 0;
+		coremin = 0;
+		coremax = 0;
 
-			if (arch_model->list)
+	}
+	struct starpu_perfmodel_arch arch;
+	unsigned archtype, dev, core;
+	for (archtype = archmin; archtype < archmax; archtype++)
+	{
+		arch.type = archtype;
+		if(model->per_arch[archtype]!=NULL)
+		{
+			for(dev = devmin; model->per_arch[archtype][dev] != NULL && (devmax == 0 || dev < devmax);dev++)
 			{
-				print_comma(gnuplot_file, first);
-				fprintf(gnuplot_file, "\"%s\" using 1:%d:%d with errorlines title \"Average %s\"", avg_file_name, col, col+1, arch_name);
-				col += 2;
+				arch.devid = dev;
+
+				for(core = coremin; model->per_arch[archtype][dev][core] != NULL && (coremax == 0 || core < coremax); core++)
+				{
+					arch.ncore = core;
+					for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+					{
+						struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[archtype][dev][core][implid];
+						starpu_perfmodel_get_arch_name(&arch, arch_name, 32, implid);
+
+						//ptrs[arch-arch1][implid] = ptr[arch-arch1][implid] = arch_model->list;
+
+						if (arch_model->list)
+						{
+							print_comma(gnuplot_file, first);
+							fprintf(gnuplot_file, "\"%s\" using 1:%d:%d with errorlines title \"Average %s\"", avg_file_name, col, col+1, arch_name);
+							col += 2;
+						}
+					}
+				}
 			}
 		}
 	}
@@ -270,71 +324,159 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 
 		minimum = ULONG_MAX;
 		/* Get the next minimum */
-		for (arch = arch1; arch < arch2; arch++)
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+		for (archtype = archmin; archtype < archmax; archtype++)
+		{
+			if(model->per_arch[archtype]!=NULL)
 			{
-				struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][implid];
-				for (ptr = arch_model->list; ptr; ptr = ptr->next)
+				for(dev = devmin; model->per_arch[archtype][dev] != NULL && (devmax == 0 || dev < devmax);dev++)
 				{
-					unsigned long size = ptr->entry->size;
-					if (size > last && size < minimum)
-						minimum = size;
+					for(core = coremin; model->per_arch[archtype][dev][core] != NULL && (coremax == 0 || core < coremax); core++)
+				
+					{
+						for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+						{
+							struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[archtype][dev][core][implid];
+							for (ptr = arch_model->list; ptr; ptr = ptr->next)
+							{
+								unsigned long size = ptr->entry->size;
+								if (size > last && size < minimum)
+									minimum = size;
+							}
+						}
+					}
 				}
 			}
+		}
 		if (minimum == ULONG_MAX)
 			break;
 
 		fprintf(stderr, "%lu ", minimum);
 		fprintf(datafile, "%-15lu ", minimum);
-		for (arch = arch1; arch < arch2; arch++)
-		{
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-			{
-				struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][implid];
-				for (ptr = arch_model->list; ptr; ptr = ptr->next)
-				{
-					struct starpu_perfmodel_history_entry *entry = ptr->entry;
-					if (entry->size == minimum)
-					{
-						if (gflops)
-							fprintf(datafile, "\t%-15le\t%-15le", entry->flops / (entry->mean * 1000),
-									entry->flops / ((entry->mean + entry->deviation) * 1000) -
-									entry->flops / (entry->mean * 1000)
-									);
-						else
-							fprintf(datafile, "\t%-15le\t%-15le", 0.001*entry->mean, 0.001*entry->deviation);
-						break;
-					}
-				}
-				if (!ptr && arch_model->list)
-					/* No value for this arch. */
-					fprintf(datafile, "\t\"\"\t\"\"");
-			}
-		}
+		for (archtype = archmin; archtype < archmax; archtype++)
+			if(model->per_arch[archtype]!=NULL)
+				for(dev = devmin; model->per_arch[archtype][dev] != NULL && (devmax == 0 || dev < devmax);dev++)
+					for(core = coremin; model->per_arch[archtype][dev][core] != NULL && (coremax == 0 || core < coremax); core++)
+						for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+						{
+							struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[archtype][dev][core][implid];
+							for (ptr = arch_model->list; ptr; ptr = ptr->next)
+							{
+								struct starpu_perfmodel_history_entry *entry = ptr->entry;
+								if (entry->size == minimum)
+								{
+									if (gflops)
+										fprintf(datafile, "\t%-15le\t%-15le", entry->flops / (entry->mean * 1000),
+												entry->flops / ((entry->mean + entry->deviation) * 1000) -
+												entry->flops / (entry->mean * 1000)
+										       );
+									else
+										fprintf(datafile, "\t%-15le\t%-15le", 0.001*entry->mean, 0.001*entry->deviation);
+									break;
+								}
+							}
+							if (!ptr && arch_model->list)
+								/* No value for this arch. */
+								fprintf(datafile, "\t\"\"\t\"\"");
+						}
 		fprintf(datafile, "\n");
 	}
 	fprintf(stderr, "\n");
 	fclose(datafile);
 }
 
-static void display_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch1, enum starpu_perfmodel_archtype arch2, int *first)
+
+static void display_selected_arch_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, int *first)
 {
-	unsigned arch;
 	unsigned implid;
-	for (arch = arch1; arch < arch2; arch++)
+	for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+		display_perf_model(gnuplot_file, model, arch, first, implid);
+}
+
+static void display_selected_device_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_worker_archtype archtype, int devid, int *first)
+{
+	unsigned ncore;
+	struct starpu_perfmodel_arch arch;
+	arch.type = archtype;
+	arch.devid = devid;
+	for(ncore=0; model->per_arch[archtype][devid][ncore] != NULL; ncore++)
 	{
-		for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+		arch.ncore = ncore;
+		display_selected_arch_perf_models(gnuplot_file,model,&arch,first);
+	}
+}
+
+static void display_selected_archtype_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_worker_archtype archtype, int *first)
+{
+	unsigned devid;
+	for(devid=0; model->per_arch[archtype][devid] != NULL; devid++)
+		display_selected_device_perf_models(gnuplot_file,model,archtype,devid,first);
+}
+
+static void display_all_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, int *first)
+{
+	unsigned archtype;
+	for(archtype = 0; archtype < STARPU_NARCH; archtype++)
+		display_selected_archtype_perf_models(gnuplot_file,model,archtype,first);
+}
+
+#ifdef STARPU_USE_FXT
+static int ** init_archtype_is_found_per_arch(int maxdevid, unsigned* maxncore_table)
+{
+	int devid, ncore;
+	int ** archtype_is_found_per_arch = malloc(sizeof(*archtype_is_found_per_arch)*(maxdevid+1));
+	archtype_is_found_per_arch[maxdevid] = NULL;
+	for(devid=0; devid<maxdevid; devid++)
+	{
+		int maxncore;
+		if(maxncore_table != NULL)
+			maxncore = maxncore_table[devid];
+		else
+			maxncore = 1;
+		
+		archtype_is_found_per_arch[devid] = malloc(sizeof(*archtype_is_found_per_arch[devid])*(maxncore+1));
+		archtype_is_found_per_arch[devid][maxncore] = 0;
+		for(ncore=0; ncore<maxncore; ncore++)
+			archtype_is_found_per_arch[devid][ncore] = 0;
+	}
+	return archtype_is_found_per_arch;
+
+}
+
+
+static void init_archtype_is_found(struct starpu_perfmodel *model)
+{
+	unsigned archtype, devid, ndevice, ncore, *maxncore;
+
+	for(archtype = 0; archtype < STARPU_NARCH; archtype++)
+	{
+	
+		for(devid=0; model->per_arch[archtype][devid] != NULL; devid++);
+		ndevice = devid;
+		if(ndevice != 0)
 		{
-			display_perf_model(gnuplot_file, model, (enum starpu_perfmodel_archtype) arch, first, implid);
+			maxncore = malloc(sizeof(*maxncore)*ndevice);
+			for(devid=0; devid < ndevice; devid++);
+			{
+			
+				for(ncore=0; model->per_arch[archtype][devid][ncore] != NULL; ncore++);
+				maxncore[devid] = ncore;
+			}
 		}
+		else
+		{
+			maxncore = NULL;
+		}
+
+		archtype_is_found[archtype] = init_archtype_is_found_per_arch(ndevice,maxncore);
+		if(maxncore != NULL)
+			free(maxncore);
 	}
-	display_history_based_perf_models(gnuplot_file, model, arch1, arch2, first);
 }
 
-#ifdef STARPU_USE_FXT
-static void dump_data_file(FILE *data_file)
+
+static void dump_data_file(FILE *data_file, struct starpu_perfmodel *model)
 {
-	memset(archtype_is_found, 0, STARPU_NARCH_VARIATIONS*sizeof(int));
+	init_archtype_is_found(model);
 
 	int i;
 	for (i = 0; i < options.dumped_codelets_count; i++)
@@ -342,13 +484,13 @@ static void dump_data_file(FILE *data_file)
 		/* Dump only if the symbol matches user's request */
 		if (strncmp(dumped_codelets[i].symbol, symbol, (FXT_MAX_PARAMS - 4)*sizeof(unsigned long)-1) == 0)
 		{
-			enum starpu_perfmodel_archtype archtype = dumped_codelets[i].archtype;
-			archtype_is_found[archtype] = 1;
+			struct starpu_perfmodel_arch* arch = &dumped_codelets[i].arch;
+			archtype_is_found[arch->type][arch->devid][arch->ncore] = 1;
 
 			size_t size = dumped_codelets[i].size;
 			float time = dumped_codelets[i].time;
 
-			fprintf(data_file, "%d	%f	%f\n", archtype, (float)size, time);
+			fprintf(data_file, "%d_%d_%d	%f	%f\n", arch->type, arch->devid, arch->ncore, (float)size, time);
 		}
 	}
 }
@@ -380,55 +522,72 @@ static void display_selected_models(FILE *gnuplot_file, struct starpu_perfmodel
 	int first = 1;
 	fprintf(gnuplot_file, "plot\t");
 
+	struct starpu_perfmodel_arch arch;
+	struct _starpu_machine_config *conf = _starpu_get_machine_config();
+
+
+
 	if (archname == NULL)
 	{
 		/* display all architectures */
-		display_perf_models(gnuplot_file, model, (enum starpu_perfmodel_archtype) 0, (enum starpu_perfmodel_archtype) STARPU_NARCH_VARIATIONS, &first);
+		display_all_perf_models(gnuplot_file, model, &first);
+		display_history_based_perf_models(gnuplot_file, model, NULL, NULL, NULL, &first);
 	}
 	else
 	{
 		if (strcmp(archname, "cpu") == 0)
 		{
-			unsigned impl;
-			for (impl = 0; impl < STARPU_MAXIMPLEMENTATIONS; impl++)
-			{
-				display_perf_model(gnuplot_file, model,
-						   STARPU_CPU_DEFAULT,
-						   &first, impl);
-			}
+			
+			arch.type = STARPU_CPU_WORKER;
+			arch.devid = 1;
+			arch.ncore = 0;
+
+			display_selected_arch_perf_models(gnuplot_file, model, &arch, &first); 
+			display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
 			return;
 		}
 
-		int k;
-		if (sscanf(archname, "cpu:%d", &k) == 1)
+		unsigned k;
+		if (sscanf(archname, "cpu:%u", &k) == 1)
 		{
 			/* For combined CPU workers */
-			if ((k < 1) || (k > STARPU_MAXCPUS))
+			if ((k < 1) || (k > conf->topology.ncpus))
 			{
 				fprintf(stderr, "Invalid CPU size\n");
 				exit(-1);
 			}
 
-			display_perf_models(gnuplot_file, model, (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + k - 1), (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + k), &first);
+			arch.type = STARPU_CPU_WORKER;
+			arch.devid = 1;
+			arch.ncore = k - 1;
+
+			display_selected_arch_perf_models(gnuplot_file, model, &arch, &first); 
+			display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
 			return;
 		}
 
 		if (strcmp(archname, "cuda") == 0)
 		{
-			display_perf_models(gnuplot_file, model, STARPU_CUDA_DEFAULT, (enum starpu_perfmodel_archtype) (STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS), &first);
+			unsigned archtype = STARPU_CUDA_WORKER;
+			display_selected_archtype_perf_models(gnuplot_file, model, archtype, &first);
+			display_history_based_perf_models(gnuplot_file, model,  &archtype, NULL, NULL, &first);
 			return;
 		}
 
 		/* There must be a cleaner way ! */
-		int gpuid;
+		unsigned gpuid;
 		int nmatched;
-		nmatched = sscanf(archname, "cuda_%d", &gpuid);
+		nmatched = sscanf(archname, "cuda_%u", &gpuid);
 		if (nmatched == 1)
 		{
-			int archid = STARPU_CUDA_DEFAULT+ gpuid;
-			if (archid < STARPU_OPENCL_DEFAULT)
+			if (gpuid < conf->topology.ncudagpus)
 			{
-				display_perf_models(gnuplot_file, model, (enum starpu_perfmodel_archtype) archid, (enum starpu_perfmodel_archtype) (archid + 1), &first);
+				arch.type = STARPU_CUDA_WORKER;
+				arch.devid = gpuid;
+				arch.ncore = 0;
+
+				display_selected_arch_perf_models(gnuplot_file, model, &arch, &first);
+				display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
 				return;
 			}
 			else
@@ -440,18 +599,24 @@ static void display_selected_models(FILE *gnuplot_file, struct starpu_perfmodel
 
 		if (strcmp(archname, "opencl") == 0)
 		{
-			display_perf_models(gnuplot_file, model, STARPU_OPENCL_DEFAULT, (enum starpu_perfmodel_archtype) (STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS), &first);
+			unsigned archtype = STARPU_OPENCL_WORKER;
+			display_selected_archtype_perf_models(gnuplot_file, model, archtype, &first);
+			display_history_based_perf_models(gnuplot_file, model,  &archtype, NULL, NULL, &first);
 			return;
 		}
 
 		/* There must be a cleaner way ! */
-		nmatched = sscanf(archname, "opencl_%d", &gpuid);
+		nmatched = sscanf(archname, "opencl_%u", &gpuid);
 		if (nmatched == 1)
 		{
-			int archid = STARPU_OPENCL_DEFAULT+ gpuid;
-			if (archid < STARPU_NARCH_VARIATIONS)
+			if (gpuid < conf->topology.nopenclgpus)
 			{
-				display_perf_models(gnuplot_file, model, (enum starpu_perfmodel_archtype) archid, (enum starpu_perfmodel_archtype) (archid + 1), &first);
+				arch.type = STARPU_OPENCL_WORKER;
+				arch.devid = gpuid;
+				arch.ncore = 0;
+		
+				display_selected_arch_perf_models(gnuplot_file, model, &arch, &first);
+				display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
 				return;
 			}
 			else
@@ -507,7 +672,7 @@ int main(int argc, char **argv)
 
 		FILE *data_file = fopen(data_file_name, "w+");
 		STARPU_ASSERT(data_file);
-		dump_data_file(data_file);
+		dump_data_file(data_file, &model);
 		fclose(data_file);
 	}
 #endif