пре 11 година · 497ce06798
--- a/examples/cholesky/cholesky.h
+++ b/examples/cholesky/cholesky.h
@@ -132,15 +132,23 @@ void chol_cpu_codelet_update_u11(void **, void *);
 
				 void chol_cpu_codelet_update_u21(void **, void *);
			
 
				 void chol_cpu_codelet_update_u22(void **, void *);
			
 
				 
			
 
				+double cpu_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+double cpu_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+double cpu_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 void chol_cublas_codelet_update_u11(void *descr[], void *_args);
			
 
				 void chol_cublas_codelet_update_u21(void *descr[], void *_args);
			
 
				 void chol_cublas_codelet_update_u22(void *descr[], void *_args);
			
 
				+
			
 
				+double cuda_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+double cuda_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+double cuda_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				 #endif
			
 
				 
			
 
				-extern struct starpu_perfmodel chol_model_11;
			
 
				-extern struct starpu_perfmodel chol_model_21;
			
 
				-extern struct starpu_perfmodel chol_model_22;
			
 
				+void initialize_chol_model(struct starpu_perfmodel* model, char* symbol, 
			
 
				+		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned), 
			
 
				+		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned));
			
 
				 
			
 
				 static void STARPU_ATTRIBUTE_UNUSED parse_args(int argc, char **argv)
			
 
				 {
			
--- a/examples/cholesky/cholesky_grain_tag.c
+++ b/examples/cholesky/cholesky_grain_tag.c
@@ -18,6 +18,10 @@
 
				 
			
 
				 #include "cholesky.h"
			
 
				 
			
 
				+struct starpu_perfmodel chol_model_11;
			
 
				+struct starpu_perfmodel chol_model_21;
			
 
				+struct starpu_perfmodel chol_model_22;
			
 
				+
			
 
				 /*
			
 
				  *	Some useful functions
			
 
				  */
			
@@ -292,6 +296,16 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 
				 		exit(77);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
			
 
				+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
			
 
				+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
			
 
				+#else
			
 
				+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
			
 
				+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
			
 
				+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
			
 
				+#endif
			
 
				+
			
 
				 	starpu_cublas_init();
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
--- a/examples/cholesky/cholesky_implicit.c
+++ b/examples/cholesky/cholesky_implicit.c
@@ -21,6 +21,9 @@
 
				 /*
			
 
				  *	Create the codelets
			
 
				  */
			
 
				+struct starpu_perfmodel chol_model_11;
			
 
				+struct starpu_perfmodel chol_model_21;
			
 
				+struct starpu_perfmodel chol_model_22;
			
 
				 
			
 
				 static struct starpu_codelet cl11 =
			
 
				 {
			
@@ -346,6 +349,16 @@ int main(int argc, char **argv)
 
				                 return 77;
			
 
				         STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
			
 
				+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
			
 
				+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
			
 
				+#else
			
 
				+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
			
 
				+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
			
 
				+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
			
 
				+#endif
			
 
				+
			
 
				 	starpu_cublas_init();
			
 
				 
			
 
				 	if(with_ctxs)
			
--- a/examples/cholesky/cholesky_models.c
+++ b/examples/cholesky/cholesky_models.c
@@ -26,6 +26,7 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_perfmodel.h>
			
 
				 #include "cholesky.h"
			
 
				 
			
 
				 /* #define USE_PERTURBATION	1 */
			
@@ -36,7 +37,7 @@
 
				 #define PERTURBATE(a)	(a)
			
 
				 #endif
			
 
				 
			
 
				-static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double cpu_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
@@ -51,7 +52,7 @@ static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perfmo
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double cuda_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
@@ -66,7 +67,7 @@ static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perfm
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double cpu_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
@@ -81,7 +82,7 @@ static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perfmo
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double cuda_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
@@ -96,7 +97,7 @@ static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perfm
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double cpu_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
@@ -111,7 +112,7 @@ static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perfmo
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-static double cuda_chol_task_22_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double cuda_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
@@ -126,35 +127,15 @@ static double cuda_chol_task_22_cost(struct starpu_task *task, enum starpu_perfm
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-struct starpu_perfmodel chol_model_11 =
			
 
				+void initialize_chol_model(struct starpu_perfmodel* model, char * symbol, 
			
 
				+		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned), 
			
 
				+		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
			
 
				 {
			
 
				-	.per_arch =
			
 
				-	{
			
 
				-		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_11_cost },
			
 
				-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_11_cost }
			
 
				-	},
			
 
				-	.type = STARPU_HISTORY_BASED,
			
 
				-	.symbol = "chol_model_11"
			
 
				-};
			
 
				-
			
 
				-struct starpu_perfmodel chol_model_21 =
			
 
				-{
			
 
				-	.per_arch =
			
 
				-	{
			
 
				-		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_21_cost },
			
 
				-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_21_cost }
			
 
				-	},
			
 
				-	.type = STARPU_HISTORY_BASED,
			
 
				-	.symbol = "chol_model_21"
			
 
				-};
			
 
				-
			
 
				-struct starpu_perfmodel chol_model_22 =
			
 
				-{
			
 
				-	.per_arch =
			
 
				-	{
			
 
				-		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_22_cost },
			
 
				-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_22_cost }
			
 
				-	},
			
 
				-	.type = STARPU_HISTORY_BASED,
			
 
				-	.symbol = "chol_model_22"
			
 
				-};
			
 
				+	model->symbol = symbol;
			
 
				+	model->type = STARPU_HISTORY_BASED;
			
 
				+	starpu_initialize_model(model);
			
 
				+	model->per_arch[STARPU_CPU_WORKER][0][0][0].cost_function = cpu_cost_function;
			
 
				+	if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) != 0)
			
 
				+		model->per_arch[STARPU_CUDA_WORKER][0][0][0].cost_function = cuda_cost_function;
			
 
				+}
			
 
				+
			
--- a/examples/cholesky/cholesky_tag.c
+++ b/examples/cholesky/cholesky_tag.c
@@ -17,6 +17,11 @@
 
				  */
			
 
				 
			
 
				 #include "cholesky.h"
			
 
				+#include <starpu_perfmodel.h>
			
 
				+
			
 
				+struct starpu_perfmodel chol_model_11;
			
 
				+struct starpu_perfmodel chol_model_21;
			
 
				+struct starpu_perfmodel chol_model_22;
			
 
				 
			
 
				 /*
			
 
				  *	Some useful functions
			
@@ -258,6 +263,16 @@ static int initialize_system(float **A, unsigned dim, unsigned pinned)
 
				 		return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
			
 
				+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
			
 
				+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
			
 
				+#else
			
 
				+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
			
 
				+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
			
 
				+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
			
 
				+#endif
			
 
				+
			
 
				 	starpu_cublas_init();
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
--- a/examples/cholesky/cholesky_tile_tag.c
+++ b/examples/cholesky/cholesky_tile_tag.c
@@ -17,6 +17,10 @@
 
				 
			
 
				 #include "cholesky.h"
			
 
				 
			
 
				+struct starpu_perfmodel chol_model_11;
			
 
				+struct starpu_perfmodel chol_model_21;
			
 
				+struct starpu_perfmodel chol_model_22;
			
 
				+
			
 
				 /* A [ y ] [ x ] */
			
 
				 float *A[NMAXBLOCKS][NMAXBLOCKS];
			
 
				 starpu_data_handle_t A_state[NMAXBLOCKS][NMAXBLOCKS];
			
@@ -254,6 +258,16 @@ int main(int argc, char **argv)
 
				 		return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
			
 
				+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
			
 
				+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
			
 
				+#else
			
 
				+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
			
 
				+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
			
 
				+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
			
 
				+#endif
			
 
				+
			
 
				 	/* Disable sequential consistency */
			
 
				 	starpu_data_set_default_sequential_consistency_flag(0);
			
 
				 
			
--- a/examples/heat/dw_factolu.c
+++ b/examples/heat/dw_factolu.c
@@ -25,6 +25,11 @@
 
				 #define debug(fmt, ...)
			
 
				 #endif
			
 
				 
			
 
				+struct starpu_perfmodel model_11;
			
 
				+struct starpu_perfmodel model_12;
			
 
				+struct starpu_perfmodel model_21;
			
 
				+struct starpu_perfmodel model_22;
			
 
				+
			
 
				 unsigned *advance_11; /* size nblocks, whether the 11 task is done */
			
 
				 unsigned *advance_12_21; /* size nblocks*nblocks */
			
 
				 unsigned *advance_22; /* array of nblocks *nblocks*nblocks */
			
@@ -701,6 +706,27 @@ void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
 
				 		exit(77);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				+#ifdef STARPU_ATLAS
			
 
				+	char * symbol_11 = "lu_model_11_atlas";
			
 
				+	char * symbol_12 = "lu_model_12_atlas";
			
 
				+	char * symbol_21 = "lu_model_21_atlas";
			
 
				+	char * symbol_22 = "lu_model_22_atlas";
			
 
				+#elif defined(STARPU_GOTO)
			
 
				+	char * symbol_11 = "lu_model_11_goto";
			
 
				+	char * symbol_12 = "lu_model_12_goto";
			
 
				+	char * symbol_21 = "lu_model_21_goto";
			
 
				+	char * symbol_22 = "lu_model_22_goto";
			
 
				+#else
			
 
				+	char * symbol_11 = "lu_model_11";
			
 
				+	char * symbol_12 = "lu_model_12";
			
 
				+	char * symbol_21 = "lu_model_21";
			
 
				+	char * symbol_22 = "lu_model_22";
			
 
				+#endif
			
 
				+	initialize_lu_kernels_model(&model_11,symbol_11,task_11_cost,task_11_cost_cpu,task_11_cost_cuda);
			
 
				+	initialize_lu_kernels_model(&model_12,symbol_12,task_12_cost,task_12_cost_cpu,task_12_cost_cuda);
			
 
				+	initialize_lu_kernels_model(&model_21,symbol_21,task_21_cost,task_21_cost_cpu,task_21_cost_cuda);
			
 
				+	initialize_lu_kernels_model(&model_22,symbol_22,task_22_cost,task_22_cost_cpu,task_22_cost_cuda);
			
 
				+
			
 
				 	starpu_cublas_init();
			
 
				 
			
 
				 	if (pinned)
			
--- a/examples/heat/dw_factolu.h
+++ b/examples/heat/dw_factolu.h
@@ -216,9 +216,4 @@ void dw_callback_v2_codelet_update_u12(void *);
 
				 void dw_callback_v2_codelet_update_u21(void *);
			
 
				 void dw_callback_v2_codelet_update_u22(void *);
			
 
				 
			
 
				-extern struct starpu_perfmodel model_11;
			
 
				-extern struct starpu_perfmodel model_12;
			
 
				-extern struct starpu_perfmodel model_21;
			
 
				-extern struct starpu_perfmodel model_22;
			
 
				-
			
 
				 #endif /* __DW_FACTO_LU_H__ */
			
--- a/examples/heat/dw_factolu_grain.c
+++ b/examples/heat/dw_factolu_grain.c
@@ -27,6 +27,11 @@
 
				 					| ((unsigned long long)(i)<<16)	\
			
 
				 					| (unsigned long long)(j))))
			
 
				 
			
 
				+struct starpu_perfmodel model_11;
			
 
				+struct starpu_perfmodel model_12;
			
 
				+struct starpu_perfmodel model_21;
			
 
				+struct starpu_perfmodel model_22;
			
 
				+
			
 
				 /*
			
 
				  *	Construct the DAG
			
 
				  */
			
--- a/examples/heat/dw_factolu_tag.c
+++ b/examples/heat/dw_factolu_tag.c
@@ -27,6 +27,11 @@
 
				 					| ((unsigned long long)(i)<<16)	\
			
 
				 					| (unsigned long long)(j))))
			
 
				 
			
 
				+struct starpu_perfmodel model_11;
			
 
				+struct starpu_perfmodel model_12;
			
 
				+struct starpu_perfmodel model_21;
			
 
				+struct starpu_perfmodel model_22;
			
 
				+
			
 
				 static unsigned no_prio = 0;
			
 
				 
			
 
				 /*
			
--- a/examples/heat/lu_kernels_model.c
+++ b/examples/heat/lu_kernels_model.c
@@ -102,7 +102,7 @@ double task_22_cost(struct starpu_task *task, unsigned nimpl)
 
				  */
			
 
				 
			
 
				 
			
 
				-double task_11_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double task_11_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
@@ -114,7 +114,7 @@ double task_11_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtyp
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-double task_12_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double task_12_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
@@ -127,7 +127,7 @@ double task_12_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtyp
 
				 }
			
 
				 
			
 
				 
			
 
				-double task_21_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double task_21_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
@@ -141,7 +141,7 @@ double task_21_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtyp
 
				 
			
 
				 
			
 
				 
			
 
				-double task_22_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double task_22_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t nx, ny, nz;
			
 
				 
			
@@ -161,7 +161,7 @@ double task_22_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtyp
 
				  *
			
 
				  */
			
 
				 
			
 
				-double task_11_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double task_11_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
@@ -173,7 +173,7 @@ double task_11_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-double task_12_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double task_12_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
@@ -186,7 +186,7 @@ double task_12_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype
 
				 }
			
 
				 
			
 
				 
			
 
				-double task_21_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double task_21_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
@@ -200,7 +200,7 @@ double task_21_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype
 
				 
			
 
				 
			
 
				 
			
 
				-double task_22_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double task_22_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t nx, ny, nz;
			
 
				 
			
@@ -214,74 +214,17 @@ double task_22_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-struct starpu_perfmodel model_11 =
			
 
				-{
			
 
				-	.cost_function = task_11_cost,
			
 
				-	.per_arch =
			
 
				-	{
			
 
				-		[STARPU_CPU_DEFAULT][0] = { .cost_function = task_11_cost_cpu },
			
 
				-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = task_11_cost_cuda }
			
 
				-	},
			
 
				-	.type = STARPU_HISTORY_BASED,
			
 
				-#ifdef STARPU_ATLAS
			
 
				-	.symbol = "lu_model_11_atlas"
			
 
				-#elif defined(STARPU_GOTO)
			
 
				-	.symbol = "lu_model_11_goto"
			
 
				-#else
			
 
				-	.symbol = "lu_model_11"
			
 
				-#endif
			
 
				-};
			
 
				-
			
 
				-struct starpu_perfmodel model_12 =
			
 
				-{
			
 
				-	.cost_function = task_12_cost,
			
 
				-	.per_arch =
			
 
				-	{
			
 
				-		[STARPU_CPU_DEFAULT][0] = { .cost_function = task_12_cost_cpu },
			
 
				-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = task_12_cost_cuda }
			
 
				-	},
			
 
				-	.type = STARPU_HISTORY_BASED,
			
 
				-#ifdef STARPU_ATLAS
			
 
				-	.symbol = "lu_model_12_atlas"
			
 
				-#elif defined(STARPU_GOTO)
			
 
				-	.symbol = "lu_model_12_goto"
			
 
				-#else
			
 
				-	.symbol = "lu_model_12"
			
 
				-#endif
			
 
				-};
			
 
				-
			
 
				-struct starpu_perfmodel model_21 =
			
 
				-{
			
 
				-	.cost_function = task_21_cost,
			
 
				-	.per_arch =
			
 
				-	{
			
 
				-		[STARPU_CPU_DEFAULT][0] = { .cost_function = task_21_cost_cpu },
			
 
				-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = task_21_cost_cuda }
			
 
				-	},
			
 
				-	.type = STARPU_HISTORY_BASED,
			
 
				-#ifdef STARPU_ATLAS
			
 
				-	.symbol = "lu_model_21_atlas"
			
 
				-#elif defined(STARPU_GOTO)
			
 
				-	.symbol = "lu_model_21_goto"
			
 
				-#else
			
 
				-	.symbol = "lu_model_21"
			
 
				-#endif
			
 
				-};
			
 
				+void initialize_lu_kernels_model(struct starpu_perfmodel* model, char * symbol, 
			
 
				+		double (*cost_function)(struct starpu_task *, unsigned), 
			
 
				+		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned), 
			
 
				+		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
			
 
				+{
			
 
				+	model->type = STARPU_HISTORY_BASED;
			
 
				+	model->symbol = symbol;
			
 
				+	starpu_initialize_model(model);
			
 
				+	model->cost_function = cost_function;
			
 
				+	model->per_arch[STARPU_CPU_WORKER][0][0][0].cost_function = cpu_cost_function;
			
 
				+	if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) != 0)
			
 
				+		model->per_arch[STARPU_CUDA_WORKER][0][0][0].cost_function = cuda_cost_function;
			
 
				+}
			
 
				 
			
 
				-struct starpu_perfmodel model_22 =
			
 
				-{
			
 
				-	.cost_function = task_22_cost,
			
 
				-	.per_arch =
			
 
				-	{
			
 
				-		[STARPU_CPU_DEFAULT][0] = { .cost_function = task_22_cost_cpu },
			
 
				-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = task_22_cost_cuda }
			
 
				-	},
			
 
				-	.type = STARPU_HISTORY_BASED,
			
 
				-#ifdef STARPU_ATLAS
			
 
				-	.symbol = "lu_model_22_atlas"
			
 
				-#elif defined(STARPU_GOTO)
			
 
				-	.symbol = "lu_model_22_goto"
			
 
				-#else
			
 
				-	.symbol = "lu_model_22"
			
 
				-#endif
			
 
				-};
			
--- a/examples/heat/lu_kernels_model.h
+++ b/examples/heat/lu_kernels_model.h
@@ -20,4 +20,24 @@
 
				 
			
 
				 #include <starpu.h>
			
 
				 
			
 
				+double task_11_cost(struct starpu_task *task, unsigned nimpl);
			
 
				+double task_12_cost(struct starpu_task *task, unsigned nimpl);
			
 
				+double task_21_cost(struct starpu_task *task, unsigned nimpl);
			
 
				+double task_22_cost(struct starpu_task *task, unsigned nimpl);
			
 
				+
			
 
				+double task_11_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+double task_12_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+double task_21_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+double task_22_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+
			
 
				+double task_11_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+double task_12_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+double task_21_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+double task_22_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+
			
 
				+void initialize_lu_kernels_model(struct starpu_perfmodel* model, char * symbol, 
			
 
				+		double (*cost_function)(struct starpu_task *, unsigned), 
			
 
				+		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned), 
			
 
				+		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned));
			
 
				+
			
 
				 #endif /* __LU_KERNELS_MODEL_H__ */
			
--- a/include/starpu_fxt.h
+++ b/include/starpu_fxt.h
@@ -31,7 +31,7 @@ struct starpu_fxt_codelet_event
 
				 {
			
 
				 	char symbol[256];
			
 
				 	int workerid;
			
 
				-	enum starpu_perfmodel_archtype archtype;
			
 
				+	struct starpu_perfmodel_arch arch;
			
 
				 	uint32_t hash;
			
 
				 	size_t size;
			
 
				 	float time;
			
@@ -54,7 +54,7 @@ struct starpu_fxt_options
 
				 	int file_rank;
			
 
				 
			
 
				 	char worker_names[STARPU_NMAXWORKERS][256];
			
 
				-	enum starpu_perfmodel_archtype worker_archtypes[STARPU_NMAXWORKERS];
			
 
				+	struct starpu_perfmodel_arch worker_archtypes[STARPU_NMAXWORKERS];
			
 
				 	int nworkers;
			
 
				 
			
 
				 	struct starpu_fxt_codelet_event **dumped_codelets;
			
--- a/include/starpu_perfmodel.h
+++ b/include/starpu_perfmodel.h
@@ -23,6 +23,7 @@
 
				 #include <stdio.h>
			
 
				 
			
 
				 #include <starpu_util.h>
			
 
				+#include <starpu_worker.h>
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 extern "C"
			
@@ -32,36 +33,15 @@ extern "C"
 
				 struct starpu_task;
			
 
				 struct starpu_data_descr;
			
 
				 
			
 
				-enum starpu_perfmodel_archtype
			
 
				+#define STARPU_NARCH STARPU_ANY_WORKER
			
 
				+
			
 
				+struct starpu_perfmodel_arch
			
 
				 {
			
 
				-	STARPU_CPU_DEFAULT = 0,
			
 
				-	STARPU_CUDA_DEFAULT = STARPU_MAXCPUS,
			
 
				-	STARPU_OPENCL_DEFAULT = STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS,
			
 
				-	STARPU_MIC_DEFAULT = STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS,
			
 
				-	STARPU_SCC_DEFAULT = STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS //* STARPU_MAXMICCPUS
			
 
				+	enum starpu_worker_archtype type;
			
 
				+	int devid;
			
 
				+	int ncore;
			
 
				 };
			
 
				 
			
 
				-#ifdef __STDC_VERSION__
			
 
				-#  if __STDC_VERSION__ > 199901L || STARPU_GNUC_PREREQ(4, 6)
			
 
				-
			
 
				-/* Make sure the following assertions hold, since StarPU relies on it.  */
			
 
				-
			
 
				-_Static_assert(STARPU_CPU_DEFAULT == 0,
			
 
				-	       "invalid STARPU_CPU_DEFAULT value");
			
 
				-_Static_assert(STARPU_CPU_DEFAULT < STARPU_CUDA_DEFAULT,
			
 
				-	       "invalid STARPU_{CPU,CUDA}_DEFAULT values");
			
 
				-_Static_assert(STARPU_CUDA_DEFAULT < STARPU_OPENCL_DEFAULT,
			
 
				-	       "invalid STARPU_{CUDA,OPENCL}_DEFAULT values");
			
 
				-_Static_assert(STARPU_OPENCL_DEFAULT < STARPU_MIC_DEFAULT,
			
 
				-	       "invalid STARPU_{OPENCL,MIC}_DEFAULT values");
			
 
				-_Static_assert(STARPU_MIC_DEFAULT < STARPU_SCC_DEFAULT,
			
 
				-	       "invalid STARPU_{MIC,SCC}_DEFAULT values");
			
 
				-
			
 
				-#  endif
			
 
				-#endif
			
 
				-
			
 
				-#define STARPU_NARCH_VARIATIONS	(STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS)
			
 
				-
			
 
				 struct starpu_perfmodel_history_entry
			
 
				 {
			
 
				 	double mean;
			
@@ -113,8 +93,8 @@ struct starpu_perfmodel_history_table;
 
				 struct starpu_perfmodel_per_arch
			
 
				 {
			
 
				 	double (*cost_model)(struct starpu_data_descr *t) STARPU_DEPRECATED;
			
 
				-	double (*cost_function)(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
			
 
				-	size_t (*size_base)(struct starpu_task *, enum starpu_perfmodel_archtype arch, unsigned nimpl);
			
 
				+	double (*cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+	size_t (*size_base)(struct starpu_task *, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				 
			
 
				 	struct starpu_perfmodel_history_table *history;
			
 
				 	struct starpu_perfmodel_history_list *list;
			
@@ -142,32 +122,36 @@ struct starpu_perfmodel
 
				 
			
 
				 	size_t (*size_base)(struct starpu_task *, unsigned nimpl);
			
 
				 
			
 
				-	struct starpu_perfmodel_per_arch per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS];
			
 
				+	struct starpu_perfmodel_per_arch**** per_arch; /*STARPU_MAXIMPLEMENTATIONS*/
			
 
				 
			
 
				 	const char *symbol;
			
 
				 
			
 
				+	unsigned is_init;
			
 
				 	unsigned is_loaded;
			
 
				 	unsigned benchmarking;
			
 
				 	starpu_pthread_rwlock_t model_rwlock;
			
 
				 };
			
 
				 
			
 
				-enum starpu_perfmodel_archtype starpu_worker_get_perf_archtype(int workerid);
			
 
				+void starpu_initialize_model(struct starpu_perfmodel *model);
			
 
				+void starpu_initialize_model_with_file(FILE*f, struct starpu_perfmodel *model);
			
 
				+
			
 
				+struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid);
			
 
				 
			
 
				 int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model);
			
 
				 int starpu_perfmodel_unload_model(struct starpu_perfmodel *model);
			
 
				 
			
 
				-void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, char *path, size_t maxlen, unsigned nimpl);
			
 
				-void starpu_perfmodel_get_arch_name(enum starpu_perfmodel_archtype arch, char *archname, size_t maxlen, unsigned nimpl);
			
 
				+void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, char *path, size_t maxlen, unsigned nimpl);
			
 
				+char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype);
			
 
				+void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch* arch, char *archname, size_t maxlen, unsigned nimpl);
			
 
				 
			
 
				-double starpu_permodel_history_based_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, uint32_t footprint);
			
 
				+double starpu_permodel_history_based_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, uint32_t footprint);
			
 
				 int starpu_perfmodel_list(FILE *output);
			
 
				-void starpu_perfmodel_print(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output);
			
 
				+void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output);
			
 
				 int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char *parameter, uint32_t *footprint, FILE *output);
			
 
				 
			
 
				+void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch * arch, unsigned cpuid, unsigned nimpl, double measured);
			
 
				 void starpu_perfmodel_directory(FILE *output);
			
 
				 
			
 
				-void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned cpuid, unsigned nimpl, double measured);
			
 
				-
			
 
				 void starpu_bus_print_bandwidth(FILE *f);
			
 
				 void starpu_bus_print_affinity(FILE *f);
			
 
				 
			
--- a/include/starpu_scheduler.h
+++ b/include/starpu_scheduler.h
@@ -68,17 +68,17 @@ int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_tas
 
				 int starpu_get_prefetch_flag(void);
			
 
				 int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node);
			
 
				 
			
 
				-uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
			
 
				-double starpu_task_expected_length(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
			
 
				-double starpu_worker_get_relative_speedup(enum starpu_perfmodel_archtype perf_archtype);
			
 
				+uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch* perf_arch);
			
 
				 double starpu_task_expected_data_transfer_time(unsigned memory_node, struct starpu_task *task);
			
 
				 double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned memory_node, enum starpu_data_access_mode mode);
			
 
				-double starpu_task_expected_power(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
			
 
				-double starpu_task_expected_conversion_time(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
			
 
				+double starpu_task_expected_power(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+double starpu_task_expected_conversion_time(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				 
			
 
				-double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl);
			
 
				+double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				 double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundle, unsigned memory_node);
			
 
				-double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl);
			
 
				+double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				 
			
 
				 void starpu_sched_ctx_worker_shares_tasks_lists(int workerid, int sched_ctx_id);
			
 
				 #ifdef __cplusplus
			
--- a/include/starpu_worker.h
+++ b/include/starpu_worker.h
@@ -28,12 +28,12 @@ extern "C"
 
				 
			
 
				 enum starpu_worker_archtype
			
 
				 {
			
 
				-	STARPU_ANY_WORKER,
			
 
				 	STARPU_CPU_WORKER,
			
 
				 	STARPU_CUDA_WORKER,
			
 
				 	STARPU_OPENCL_WORKER,
			
 
				 	STARPU_MIC_WORKER,
			
 
				-	STARPU_SCC_WORKER
			
 
				+	STARPU_SCC_WORKER,
			
 
				+	STARPU_ANY_WORKER
			
 
				 };
			
 
				 
			
 
				 struct starpu_sched_ctx_iterator
			
--- a/sc_hypervisor/examples/cholesky/cholesky.h
+++ b/sc_hypervisor/examples/cholesky/cholesky.h
@@ -73,15 +73,23 @@ void chol_cpu_codelet_update_u11(void **, void *);
 
				 void chol_cpu_codelet_update_u21(void **, void *);
			
 
				 void chol_cpu_codelet_update_u22(void **, void *);
			
 
				 
			
 
				+double cpu_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+double cpu_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+double cpu_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 void chol_cublas_codelet_update_u11(void *descr[], void *_args);
			
 
				 void chol_cublas_codelet_update_u21(void *descr[], void *_args);
			
 
				 void chol_cublas_codelet_update_u22(void *descr[], void *_args);
			
 
				+
			
 
				+double cuda_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+double cuda_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				+double cuda_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				 #endif
			
 
				 
			
 
				-extern struct starpu_perfmodel chol_model_11;
			
 
				-extern struct starpu_perfmodel chol_model_21;
			
 
				-extern struct starpu_perfmodel chol_model_22;
			
 
				+void initialize_chol_model(struct starpu_perfmodel* model, char* symbol, 
			
 
				+		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned), 
			
 
				+		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned));
			
 
				 
			
 
				 static void STARPU_ATTRIBUTE_UNUSED parse_args(int argc, char **argv)
			
 
				 {
			
--- a/sc_hypervisor/examples/cholesky/cholesky_grain_tag.c
+++ b/sc_hypervisor/examples/cholesky/cholesky_grain_tag.c
@@ -18,6 +18,10 @@
 
				 
			
 
				 #include "cholesky.h"
			
 
				 
			
 
				+struct starpu_perfmodel chol_model_11;
			
 
				+struct starpu_perfmodel chol_model_21;
			
 
				+struct starpu_perfmodel chol_model_22;
			
 
				+
			
 
				 /*
			
 
				  *	Some useful functions
			
 
				  */
			
@@ -276,6 +280,16 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 
				 		exit(77);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
			
 
				+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
			
 
				+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
			
 
				+#else
			
 
				+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
			
 
				+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
			
 
				+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
			
 
				+#endif
			
 
				+
			
 
				 	starpu_cublas_init();
			
 
				 
			
 
				 	if (pinned)
			
--- a/sc_hypervisor/examples/cholesky/cholesky_implicit.c
+++ b/sc_hypervisor/examples/cholesky/cholesky_implicit.c
@@ -19,6 +19,11 @@
 
				 
			
 
				 #include "cholesky.h"
			
 
				 #include "../sched_ctx_utils/sched_ctx_utils.h"
			
 
				+
			
 
				+struct starpu_perfmodel chol_model_11;
			
 
				+struct starpu_perfmodel chol_model_21;
			
 
				+struct starpu_perfmodel chol_model_22;
			
 
				+
			
 
				 /*
			
 
				  *	Create the codelets
			
 
				  */
			
@@ -342,6 +347,16 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_init(NULL);
			
 
				 
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
			
 
				+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
			
 
				+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
			
 
				+#else
			
 
				+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
			
 
				+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
			
 
				+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
			
 
				+#endif
			
 
				+
			
 
				 	starpu_cublas_init();
			
 
				 
			
 
				 	if(with_ctxs)
			
--- a/sc_hypervisor/examples/cholesky/cholesky_models.c
+++ b/sc_hypervisor/examples/cholesky/cholesky_models.c
@@ -36,7 +36,7 @@
 
				 #define PERTURBATE(a)	(a)
			
 
				 #endif
			
 
				 
			
 
				-static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double cpu_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
@@ -51,7 +51,7 @@ static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perfmo
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double cuda_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
@@ -66,7 +66,7 @@ static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perfm
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double cpu_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
@@ -81,7 +81,7 @@ static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perfmo
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double cuda_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
@@ -96,7 +96,7 @@ static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perfm
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double cpu_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
@@ -111,7 +111,7 @@ static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perfmo
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-static double cuda_chol_task_22_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double cuda_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	uint32_t n;
			
 
				 
			
@@ -126,35 +126,14 @@ static double cuda_chol_task_22_cost(struct starpu_task *task, enum starpu_perfm
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
 
				-struct starpu_perfmodel chol_model_11 =
			
 
				+void initialize_chol_model(struct starpu_perfmodel* model, char * symbol, 
			
 
				+		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned), 
			
 
				+		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
			
 
				 {
			
 
				-	.per_arch =
			
 
				-	{
			
 
				-		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_11_cost },
			
 
				-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_11_cost }
			
 
				-	},
			
 
				-	.type = STARPU_HISTORY_BASED,
			
 
				-	.symbol = "chol_model_11"
			
 
				-};
			
 
				-
			
 
				-struct starpu_perfmodel chol_model_21 =
			
 
				-{
			
 
				-	.per_arch =
			
 
				-	{
			
 
				-		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_21_cost },
			
 
				-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_21_cost }
			
 
				-	},
			
 
				-	.type = STARPU_HISTORY_BASED,
			
 
				-	.symbol = "chol_model_21"
			
 
				-};
			
 
				-
			
 
				-struct starpu_perfmodel chol_model_22 =
			
 
				-{
			
 
				-	.per_arch =
			
 
				-	{
			
 
				-		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_22_cost },
			
 
				-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_22_cost }
			
 
				-	},
			
 
				-	.type = STARPU_HISTORY_BASED,
			
 
				-	.symbol = "chol_model_22"
			
 
				-};
			
 
				+	starpu_initialize_model(model);
			
 
				+	model->type = STARPU_HISTORY_BASED;
			
 
				+	model->symbol = symbol;
			
 
				+	model->per_arch[STARPU_CPU_WORKER][0][0][0].cost_function = cpu_cost_function;
			
 
				+	model->per_arch[STARPU_CUDA_WORKER][0][0][0].cost_function = cuda_cost_function;
			
 
				+}
			
 
				+
			
--- a/sc_hypervisor/examples/cholesky/cholesky_tag.c
+++ b/sc_hypervisor/examples/cholesky/cholesky_tag.c
@@ -18,6 +18,10 @@
 
				 
			
 
				 #include "cholesky.h"
			
 
				 
			
 
				+struct starpu_perfmodel chol_model_11;
			
 
				+struct starpu_perfmodel chol_model_21;
			
 
				+struct starpu_perfmodel chol_model_22;
			
 
				+
			
 
				 /*
			
 
				  *	Some useful functions
			
 
				  */
			
@@ -249,6 +253,16 @@ static int initialize_system(float **A, unsigned dim, unsigned pinned)
 
				 		return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
			
 
				+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
			
 
				+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
			
 
				+#else
			
 
				+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
			
 
				+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
			
 
				+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
			
 
				+#endif
			
 
				+
			
 
				 	starpu_cublas_init();
			
 
				 
			
 
				 	if (pinned)
			
--- a/sc_hypervisor/examples/cholesky/cholesky_tile_tag.c
+++ b/sc_hypervisor/examples/cholesky/cholesky_tile_tag.c
@@ -17,6 +17,10 @@
 
				 
			
 
				 #include "cholesky.h"
			
 
				 
			
 
				+struct starpu_perfmodel chol_model_11;
			
 
				+struct starpu_perfmodel chol_model_21;
			
 
				+struct starpu_perfmodel chol_model_22;
			
 
				+
			
 
				 /* A [ y ] [ x ] */
			
 
				 float *A[NMAXBLOCKS][NMAXBLOCKS];
			
 
				 starpu_data_handle_t A_state[NMAXBLOCKS][NMAXBLOCKS];
			
@@ -239,6 +243,16 @@ int main(int argc, char **argv)
 
				 		return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
			
 
				+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
			
 
				+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
			
 
				+#else
			
 
				+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
			
 
				+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
			
 
				+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
			
 
				+#endif
			
 
				+
			
 
				 	/* Disable sequential consistency */
			
 
				 	starpu_data_set_default_sequential_consistency_flag(0);
			
 
				 
			
--- a/sc_hypervisor/src/policies_utils/policy_tools.c
+++ b/sc_hypervisor/src/policies_utils/policy_tools.c
@@ -416,7 +416,7 @@ void sc_hypervisor_get_tasks_times(int nw, int nt, double times[nw][nt], int *wo
 
				                 for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
			
 
				                 {
			
 
				 			int worker = workers == NULL ? w : workers[w];
			
 
				-                        enum starpu_perfmodel_archtype arch = starpu_worker_get_perf_archtype(worker);
			
 
				+                        struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(worker);
			
 
				                         double length = starpu_permodel_history_based_expected_perf(tp->cl->model, arch, tp->footprint);
			
 
				 
			
 
				                         if (isnan(length))
			
--- a/src/core/combined_workers.c
+++ b/src/core/combined_workers.c
@@ -101,18 +101,11 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 
				 
			
 
				 	combined_worker->worker_size = nworkers;
			
 
				 
			
 
				-#ifdef STARPU_USE_MIC
			
 
				-	if(config->workers[workerid_array[0]].worker_mask == STARPU_MIC)
			
 
				-	{
			
 
				-		combined_worker->perf_arch = (enum starpu_perfmodel_archtype) (STARPU_MIC_DEFAULT + config->workers[workerid_array[0]].mp_nodeid /* *STARPU_MAXMICCPUS + nworkers - 1*/);
			
 
				-		combined_worker->worker_mask = STARPU_MIC;
			
 
				-	}
			
 
				-#endif
			
 
				-	if(config->workers[workerid_array[0]].worker_mask == STARPU_CPU)
			
 
				-	{
			
 
				-		combined_worker->perf_arch = (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + nworkers - 1);
			
 
				-		combined_worker->worker_mask = STARPU_CPU;
			
 
				-	}
			
 
				+	combined_worker->perf_arch.type = config->workers[workerid_array[0]].perf_arch.type;
			
 
				+	combined_worker->perf_arch.devid = config->workers[workerid_array[0]].perf_arch.devid; 
			
 
				+	combined_worker->perf_arch.ncore = nworkers - 1;
			
 
				+	combined_worker->worker_mask = config->workers[workerid_array[0]].worker_mask;
			
 
				+	
			
 
				 #ifdef STARPU_USE_MP
			
 
				 	combined_worker->count = nworkers -1;
			
 
				 	pthread_mutex_init(&combined_worker->count_mutex,NULL);
			
--- a/src/core/detect_combined_workers.c
+++ b/src/core/detect_combined_workers.c
@@ -41,7 +41,7 @@ static void find_workers(hwloc_obj_t obj, int cpu_workers[STARPU_NMAXWORKERS], u
 
				 	/* Got to a PU leaf */
			
 
				 	struct _starpu_worker *worker = obj->userdata;
			
 
				 	/* is it a CPU worker? */
			
 
				-	if (worker->perf_arch == STARPU_CPU_DEFAULT)
			
 
				+	if (worker->perf_arch.type == STARPU_CPU_WORKER && worker->perf_arch.ncore == 0)
			
 
				 	{
			
 
				 		_STARPU_DEBUG("worker %d is part of it\n", worker->workerid);
			
 
				 		/* Add it to the combined worker */
			
@@ -174,7 +174,7 @@ static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers
 
				 	for (i = 0; i < nworkers; i++)
			
 
				 	{
			
 
				 		struct _starpu_worker *worker = _starpu_get_worker_struct(workerids[i]);
			
 
				-		if (worker->perf_arch == STARPU_CPU_DEFAULT)
			
 
				+		if (worker->perf_arch.type == STARPU_CPU_WORKER && worker->perf_arch.ncore == 0)
			
 
				 		{
			
 
				 			hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->cpu_depth, worker->bindid);
			
 
				 			STARPU_ASSERT(obj->userdata == worker);
			
@@ -248,11 +248,11 @@ static void find_and_assign_combinations_without_hwloc(int *workerids, int nwork
 
				 	mic_id = malloc(sizeof(int)*nb_mics);
			
 
				 	nmics_table = malloc(sizeof(unsigned)*nb_mics);
			
 
				 	mic_workers = malloc(sizeof(int*)*nb_mics);
			
 
				-	for(i=0; i<nb_mics; i++)
			
 
				+	for(j=0; j<nb_mics; j++)
			
 
				 	{
			
 
				-		mic_id[i] = -1;
			
 
				-		nmics_table[i] = 0;
			
 
				-		mic_workers[i] = malloc(sizeof(int)*STARPU_NMAXWORKERS);
			
 
				+		mic_id[j] = -1;
			
 
				+		nmics_table[j] = 0;
			
 
				+		mic_workers[j] = malloc(sizeof(int)*STARPU_NMAXWORKERS);
			
 
				 	}
			
 
				 #endif /* STARPU_USE_MIC */
			
 
				 
			
@@ -292,13 +292,13 @@ static void find_and_assign_combinations_without_hwloc(int *workerids, int nwork
 
				 	mic_min = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
			
 
				 	if (mic_min < 2)
			
 
				 		mic_min = 2;
			
 
				-	for(i=0; i<nb_mics; i++)
			
 
				+	for(j=0; j<nb_mics; j++)
			
 
				 	{
			
 
				 		mic_max = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
			
 
				-		if (mic_max == -1 || mic_max > (int) nmics_table[i])
			
 
				-			mic_max = nmics_table[i];
			
 
				-		assign_combinations_without_hwloc(workers,mic_workers[i],nmics_table[i],mic_min,mic_max);
			
 
				-		free(mic_workers[i]);
			
 
				+		if (mic_max == -1 || mic_max > (int) nmics_table[j])
			
 
				+			mic_max = nmics_table[j];
			
 
				+		assign_combinations_without_hwloc(workers,mic_workers[j],nmics_table[j],mic_min,mic_max);
			
 
				+		free(mic_workers[j]);
			
 
				 	}
			
 
				 	free(mic_id);
			
 
				 	free(nmics_table);
			
@@ -325,7 +325,7 @@ static void combine_all_cpu_workers(int *workerids, int nworkers)
 
				 	{
			
 
				 		worker = _starpu_get_worker_struct(workerids[i]);
			
 
				 
			
 
				-		if (worker->perf_arch == STARPU_CPU_DEFAULT)
			
 
				+		if (worker->arch == STARPU_CPU_WORKER)
			
 
				 			cpu_workers[ncpus++] = workerids[i];
			
 
				 	}
			
 
				 
			
--- a/src/core/jobs.h
+++ b/src/core/jobs.h
@@ -166,7 +166,7 @@ unsigned _starpu_enforce_deps_starting_from_task(struct _starpu_job *j);
 
				 void _starpu_handle_job_termination(struct _starpu_job *j);
			
 
				 
			
 
				 /* Get the sum of the size of the data accessed by the job. */
			
 
				-size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, struct _starpu_job *j);
			
 
				+size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, struct _starpu_job *j);
			
 
				 
			
 
				 /* Get a task from the local pool of tasks that were explicitly attributed to
			
 
				  * that worker. */
			
--- a/src/core/perfmodel/perfmodel.c
+++ b/src/core/perfmodel/perfmodel.c
@@ -48,7 +48,7 @@ unsigned _starpu_get_calibrate_flag(void)
 
				 	return calibrate_flag;
			
 
				 }
			
 
				 
			
 
				-enum starpu_perfmodel_archtype starpu_worker_get_perf_archtype(int workerid)
			
 
				+struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid)
			
 
				 {
			
 
				 	struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				 
			
@@ -56,26 +56,26 @@ enum starpu_perfmodel_archtype starpu_worker_get_perf_archtype(int workerid)
 
				 	unsigned nworkers = config->topology.nworkers;
			
 
				 
			
 
				 	if (workerid < (int)config->topology.nworkers)
			
 
				-		return config->workers[workerid].perf_arch;
			
 
				+		return &config->workers[workerid].perf_arch;
			
 
				 
			
 
				 	/* We have a combined worker */
			
 
				 	unsigned ncombinedworkers = config->topology.ncombinedworkers;
			
 
				 	STARPU_ASSERT(workerid < (int)(ncombinedworkers + nworkers));
			
 
				-	return config->combined_workers[workerid - nworkers].perf_arch;
			
 
				+	return &config->combined_workers[workerid - nworkers].perf_arch;
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				  * PER ARCH model
			
 
				  */
			
 
				 
			
 
				-static double per_arch_task_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, struct starpu_task *task, unsigned nimpl)
			
 
				+static double per_arch_task_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, struct starpu_task *task, unsigned nimpl)
			
 
				 {
			
 
				 	double exp = NAN;
			
 
				-	double (*per_arch_cost_function)(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
			
 
				+	double (*per_arch_cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
			
 
				 	double (*per_arch_cost_model)(struct starpu_data_descr *);
			
 
				 
			
 
				-	per_arch_cost_function = model->per_arch[arch][nimpl].cost_function;
			
 
				-	per_arch_cost_model = model->per_arch[arch][nimpl].cost_model;
			
 
				+	per_arch_cost_function = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].cost_function;
			
 
				+	per_arch_cost_model = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].cost_model;
			
 
				 
			
 
				 	if (per_arch_cost_function)
			
 
				 		exp = per_arch_cost_function(task, arch, nimpl);
			
@@ -89,28 +89,31 @@ static double per_arch_task_expected_perf(struct starpu_perfmodel *model, enum s
 
				  * Common model
			
 
				  */
			
 
				 
			
 
				-double starpu_worker_get_relative_speedup(enum starpu_perfmodel_archtype perf_archtype)
			
 
				+double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch* perf_arch)
			
 
				 {
			
 
				-	if (perf_archtype < STARPU_CUDA_DEFAULT)
			
 
				+	if (perf_arch->type == STARPU_CPU_WORKER)
			
 
				 	{
			
 
				-		return _STARPU_CPU_ALPHA * (perf_archtype + 1);
			
 
				+		return _STARPU_CPU_ALPHA * (perf_arch->ncore + 1);
			
 
				 	}
			
 
				-	else if (perf_archtype < STARPU_OPENCL_DEFAULT)
			
 
				+	else if (perf_arch->type == STARPU_CUDA_WORKER)
			
 
				 	{
			
 
				 		return _STARPU_CUDA_ALPHA;
			
 
				 	}
			
 
				-	else if (perf_archtype < STARPU_NARCH_VARIATIONS)
			
 
				+	else if (perf_arch->type == STARPU_OPENCL_WORKER)
			
 
				 	{
			
 
				 		return _STARPU_OPENCL_ALPHA;
			
 
				 	}
			
 
				-
			
 
				+	else if (perf_arch->type == STARPU_MIC_WORKER)
			
 
				+	{
			
 
				+		return _STARPU_MIC_ALPHA * (perf_arch->ncore + 1);
			
 
				+	}
			
 
				 	STARPU_ABORT();
			
 
				 
			
 
				 	/* Never reached ! */
			
 
				 	return NAN;
			
 
				 }
			
 
				 
			
 
				-static double common_task_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, struct starpu_task *task, unsigned nimpl)
			
 
				+static double common_task_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct starpu_task *task, unsigned nimpl)
			
 
				 {
			
 
				 	double exp;
			
 
				 	double alpha;
			
@@ -143,7 +146,6 @@ void _starpu_load_perfmodel(struct starpu_perfmodel *model)
 
				 		return;
			
 
				 
			
 
				 	int load_model = _starpu_register_model(model);
			
 
				-
			
 
				 	if (!load_model)
			
 
				 		return;
			
 
				 
			
@@ -170,7 +172,7 @@ void _starpu_load_perfmodel(struct starpu_perfmodel *model)
 
				 	model->is_loaded = 1;
			
 
				 }
			
 
				 
			
 
				-static double starpu_model_expected_perf(struct starpu_task *task, struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch,  unsigned nimpl)
			
 
				+static double starpu_model_expected_perf(struct starpu_task *task, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch,  unsigned nimpl)
			
 
				 {
			
 
				 	if (model)
			
 
				 	{
			
@@ -203,19 +205,19 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 
				 	return 0.0;
			
 
				 }
			
 
				 
			
 
				-double starpu_task_expected_length(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 
			
 
				 	return starpu_model_expected_perf(task, task->cl->model, arch, nimpl);
			
 
				 }
			
 
				 
			
 
				-double starpu_task_expected_power(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double starpu_task_expected_power(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	return starpu_model_expected_perf(task, task->cl->power_model, arch, nimpl);
			
 
				 }
			
 
				 
			
 
				 double starpu_task_expected_conversion_time(struct starpu_task *task,
			
 
				-					    enum starpu_perfmodel_archtype arch,
			
 
				+					    struct starpu_perfmodel_arch* arch,
			
 
				 					    unsigned nimpl)
			
 
				 {
			
 
				 	unsigned i;
			
@@ -230,14 +232,28 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 
				 		handle = STARPU_TASK_GET_HANDLE(task, i);
			
 
				 		if (!_starpu_data_is_multiformat_handle(handle))
			
 
				 			continue;
			
 
				-
			
 
				-		if (arch < STARPU_CUDA_DEFAULT)
			
 
				-			node_kind = STARPU_CPU_RAM;
			
 
				-		else if (arch < STARPU_OPENCL_DEFAULT)
			
 
				-			node_kind = STARPU_CUDA_RAM;
			
 
				-		else
			
 
				-			node_kind = STARPU_OPENCL_RAM;
			
 
				-
			
 
				+		
			
 
				+		switch(arch->type)
			
 
				+		{
			
 
				+			case STARPU_CPU_WORKER:
			
 
				+				node_kind = STARPU_CPU_RAM;
			
 
				+				break;
			
 
				+			case STARPU_CUDA_WORKER:
			
 
				+				node_kind = STARPU_CUDA_RAM;
			
 
				+				break;
			
 
				+			case STARPU_OPENCL_WORKER:
			
 
				+				node_kind = STARPU_OPENCL_RAM;
			
 
				+				break;
			
 
				+			case STARPU_MIC_WORKER:
			
 
				+				node_kind = STARPU_MIC_RAM;
			
 
				+				break;
			
 
				+			case STARPU_SCC_WORKER:
			
 
				+				node_kind = STARPU_SCC_RAM;
			
 
				+				break;
			
 
				+			default:
			
 
				+				STARPU_ABORT();
			
 
				+				break;
			
 
				+		}
			
 
				 		if (!_starpu_handle_needs_conversion_task_for_arch(handle, node_kind))
			
 
				 			continue;
			
 
				 
			
@@ -298,7 +314,7 @@ double starpu_task_expected_data_transfer_time(unsigned memory_node, struct star
 
				 }
			
 
				 
			
 
				 /* Return the expected duration of the entire task bundle in µs */
			
 
				-double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	double expected_length = 0.0;
			
 
				 
			
@@ -329,7 +345,7 @@ double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum star
 
				 }
			
 
				 
			
 
				 /* Return the expected power consumption of the entire task bundle in J */
			
 
				-double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	double expected_power = 0.0;
			
 
				 
			
--- a/src/core/perfmodel/perfmodel.h
+++ b/src/core/perfmodel/perfmodel.h
@@ -38,7 +38,7 @@ extern "C"
 
				  * differents versions of StarPU having different performance model
			
 
				  * formats.
			
 
				  */
			
 
				-#define _STARPU_PERFMODEL_VERSION 42
			
 
				+#define _STARPU_PERFMODEL_VERSION 43
			
 
				 
			
 
				 struct _starpu_perfmodel_list
			
 
				 {
			
@@ -48,14 +48,14 @@ struct _starpu_perfmodel_list
 
				 
			
 
				 struct starpu_data_descr;
			
 
				 struct _starpu_job;
			
 
				-enum starpu_perfmodel_archtype;
			
 
				+struct starpu_perfmodel_arch;
			
 
				 
			
 
				 void _starpu_get_perf_model_dir(char *path, size_t maxlen);
			
 
				 void _starpu_get_perf_model_dir_codelets(char *path, size_t maxlen);
			
 
				 void _starpu_get_perf_model_dir_bus(char *path, size_t maxlen);
			
 
				 void _starpu_get_perf_model_dir_debug(char *path, size_t maxlen);
			
 
				 
			
 
				-double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, struct _starpu_job *j, unsigned nimpl);
			
 
				+double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
			
 
				 int _starpu_register_model(struct starpu_perfmodel *model);
			
 
				 void _starpu_load_per_arch_based_model(struct starpu_perfmodel *model);
			
 
				 void _starpu_load_common_based_model(struct starpu_perfmodel *model);
			
@@ -65,10 +65,10 @@ void _starpu_initialize_registered_performance_models(void);
 
				 void _starpu_deinitialize_registered_performance_models(void);
			
 
				 
			
 
				 double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model,
			
 
				-					enum starpu_perfmodel_archtype arch, struct _starpu_job *j, unsigned nimpl);
			
 
				+					struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
			
 
				 double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel *model,
			
 
				-					enum starpu_perfmodel_archtype arch, struct _starpu_job *j, unsigned nimpl);
			
 
				-void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch,
			
 
				+					struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
			
 
				+void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch,
			
 
				 				unsigned cpuid, double measured, unsigned nimpl);
			
 
				 
			
 
				 void _starpu_create_sampling_directory_if_needed(void);
			
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
--- a/src/core/perfmodel/perfmodel_print.c
+++ b/src/core/perfmodel/perfmodel_print.c
@@ -61,9 +61,9 @@ void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per
 
				 	}
			
 
				 }
			
 
				 
			
 
				-void starpu_perfmodel_print(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output)
			
 
				+void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output)
			
 
				 {
			
 
				-	struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][nimpl];
			
 
				+	struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
			
 
				 	char archname[32];
			
 
				 
			
 
				 	if (arch_model->regression.nsample || arch_model->regression.valid || arch_model->regression.nl_valid || arch_model->list)
			
@@ -171,13 +171,22 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 
				 	if (arch == NULL)
			
 
				 	{
			
 
				 		/* display all architectures */
			
 
				-		unsigned archid;
			
 
				-		unsigned implid;
			
 
				-		for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++)
			
 
				+		unsigned archtype, devid, ncore, implid;
			
 
				+		struct starpu_perfmodel_arch perf_arch;
			
 
				+		for (archtype = 0; archtype < STARPU_NARCH; archtype++)
			
 
				 		{
			
 
				-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				-			{ /* Display all codelets on each arch */
			
 
				-				starpu_perfmodel_print(model, (enum starpu_perfmodel_archtype) archid, implid, parameter, footprint, output);
			
 
				+			perf_arch.type = archtype;
			
 
				+			for(devid = 0; model->per_arch[archtype][devid] != NULL; devid++)
			
 
				+			{
			
 
				+				perf_arch.devid = devid;
			
 
				+				for(ncore = 0; model->per_arch[archtype][devid][ncore] != NULL; ncore++)
			
 
				+				{
			
 
				+					perf_arch.ncore = ncore;
			
 
				+					for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				+					{ /* Display all codelets on each arch */
			
 
				+						starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
			
 
				+					}
			
 
				+				}
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
@@ -186,8 +195,12 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 
				 		if (strcmp(arch, "cpu") == 0)
			
 
				 		{
			
 
				 			unsigned implid;
			
 
				+			struct starpu_perfmodel_arch perf_arch;
			
 
				+			perf_arch.type = STARPU_CPU_WORKER;
			
 
				+			perf_arch.devid = 0;
			
 
				+			perf_arch.ncore = 0;
			
 
				 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				-				starpu_perfmodel_print(model, STARPU_CPU_DEFAULT,implid, parameter, footprint, output); /* Display all codelets on cpu */
			
 
				+				starpu_perfmodel_print(model, &perf_arch,implid, parameter, footprint, output); /* Display all codelets on cpu */
			
 
				 			return 0;
			
 
				 		}
			
 
				 
			
@@ -202,24 +215,28 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 
				 			}
			
 
				 
			
 
				 			unsigned implid;
			
 
				+			struct starpu_perfmodel_arch perf_arch;
			
 
				+			perf_arch.type = STARPU_CPU_WORKER;
			
 
				+			perf_arch.devid = 0;
			
 
				+			perf_arch.ncore = k-1;
			
 
				 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				-				starpu_perfmodel_print(model, (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + k - 1), implid, parameter, footprint, output);
			
 
				+				starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
			
 
				 			return 0;
			
 
				 		}
			
 
				 
			
 
				 		if (strcmp(arch, "cuda") == 0)
			
 
				 		{
			
 
				-			unsigned archid;
			
 
				+			unsigned devid;
			
 
				 			unsigned implid;
			
 
				-			for (archid = STARPU_CUDA_DEFAULT; archid < STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS; archid++)
			
 
				+			struct starpu_perfmodel_arch perf_arch;
			
 
				+			perf_arch.type = STARPU_CUDA_WORKER;
			
 
				+			perf_arch.ncore = 0;
			
 
				+
			
 
				+			for (devid = 0; model->per_arch[STARPU_CUDA_WORKER] != NULL; devid++)
			
 
				 			{
			
 
				+				perf_arch.devid = devid;
			
 
				 				for (implid = 0; implid <STARPU_MAXIMPLEMENTATIONS; implid ++)
			
 
				-				{
			
 
				-					char archname[32];
			
 
				-					starpu_perfmodel_get_arch_name((enum starpu_perfmodel_archtype) archid, archname, 32, implid);
			
 
				-					fprintf(output, "performance model for %s\n", archname);
			
 
				-					starpu_perfmodel_print(model, (enum starpu_perfmodel_archtype) archid, implid, parameter, footprint, output);
			
 
				-				}
			
 
				+					starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
			
 
				 			}
			
 
				 			return 0;
			
 
				 		}
			
@@ -230,10 +247,13 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 
				 		nmatched = sscanf(arch, "cuda_%d", &gpuid);
			
 
				 		if (nmatched == 1)
			
 
				 		{
			
 
				-			int archid = STARPU_CUDA_DEFAULT+ gpuid;
			
 
				+			struct starpu_perfmodel_arch perf_arch;
			
 
				+			perf_arch.type = STARPU_CUDA_WORKER;
			
 
				+			perf_arch.devid = gpuid;
			
 
				+			perf_arch.ncore = 0;
			
 
				 			unsigned implid;
			
 
				 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				-				starpu_perfmodel_print(model, (enum starpu_perfmodel_archtype) archid, implid, parameter, footprint, output);
			
 
				+				starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
			
 
				 			return 0;
			
 
				 		}
			
 
				 
			
--- a/src/core/simgrid.c
+++ b/src/core/simgrid.c
@@ -97,7 +97,7 @@ int main(int argc, char **argv)
 
				 }
			
 
				 
			
 
				 /* Task execution submitted by StarPU */
			
 
				-void _starpu_simgrid_execute_job(struct _starpu_job *j, enum starpu_perfmodel_archtype perf_arch, double length)
			
 
				+void _starpu_simgrid_execute_job(struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length)
			
 
				 {
			
 
				 	struct starpu_task *task = j->task;
			
 
				 	msg_task_t simgrid_task;
			
--- a/src/core/simgrid.h
+++ b/src/core/simgrid.h
@@ -30,7 +30,7 @@ struct _starpu_pthread_args
 
				 
			
 
				 #define MAX_TSD 16
			
 
				 
			
 
				-void _starpu_simgrid_execute_job(struct _starpu_job *job, enum starpu_perfmodel_archtype perf_arch, double length);
			
 
				+void _starpu_simgrid_execute_job(struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length);
			
 
				 int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node, struct _starpu_data_request *req);
			
 
				 /* Return the number of hosts prefixed by PREFIX */
			
 
				 int _starpu_simgrid_get_nbhosts(const char *prefix);
			
--- a/src/core/task.c
+++ b/src/core/task.c
@@ -244,7 +244,11 @@ int _starpu_submit_job(struct _starpu_job *j)
 
				 	if(sched_ctx != NULL && j->task->sched_ctx != _starpu_get_initial_sched_ctx()->id && j->task->sched_ctx != STARPU_NMAX_SCHED_CTXS
			
 
				 	   && sched_ctx->perf_counters != NULL)
			
 
				 	{
			
 
				-		_starpu_compute_buffers_footprint(j->task->cl->model, STARPU_CPU_DEFAULT, 0, j);
			
 
				+		struct starpu_perfmodel_arch arch;
			
 
				+		arch.type = STARPU_CPU_WORKER;
			
 
				+		arch.devid = 0;
			
 
				+		arch.ncore = 0;
			
 
				+		_starpu_compute_buffers_footprint(j->task->cl->model, &arch, 0, j);
			
 
				 		int i;
			
 
				 		size_t data_size = 0;
			
 
				 		for(i = 0; i < STARPU_NMAXBUFS; i++)
			
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -633,8 +633,10 @@ _starpu_init_mic_config (struct _starpu_machine_config *config,
 
				 	for (miccore_id = 0; miccore_id < topology->nmiccores[mic_idx]; miccore_id++)
			
 
				 	{
			
 
				 		int worker_idx = topology->nworkers + miccore_id;
			
 
				-		enum starpu_perfmodel_archtype arch =
			
 
				-			(enum starpu_perfmodel_archtype)((int)STARPU_MIC_DEFAULT + mic_idx);
			
 
				+		struct starpu_perfmodel_arch arch;
			
 
				+		arch.type = STARPU_MIC_WORKER;
			
 
				+		arch.devid = mic_idx;
			
 
				+		arch.ncore = 0; 
			
 
				 		config->workers[worker_idx].arch = STARPU_MIC_WORKER;
			
 
				 		config->workers[worker_idx].perf_arch = arch;
			
 
				 		config->workers[worker_idx].mp_nodeid = mic_idx;
			
@@ -772,11 +774,11 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 
				 		int worker_idx = topology->nworkers + cudagpu;
			
 
				 		config->workers[worker_idx].arch = STARPU_CUDA_WORKER;
			
 
				 		int devid = _starpu_get_next_cuda_gpuid(config);
			
 
				-		enum starpu_perfmodel_archtype arch =
			
 
				-			(enum starpu_perfmodel_archtype)((int)STARPU_CUDA_DEFAULT + devid);
			
 
				+		config->workers[worker_idx].perf_arch.type = STARPU_CUDA_WORKER;
			
 
				+		config->workers[worker_idx].perf_arch.devid = cudagpu;
			
 
				+		config->workers[worker_idx].perf_arch.ncore = 0;
			
 
				 		config->workers[worker_idx].mp_nodeid = -1;
			
 
				 		config->workers[worker_idx].devid = devid;
			
 
				-		config->workers[worker_idx].perf_arch = arch;
			
 
				 		config->workers[worker_idx].worker_mask = STARPU_CUDA;
			
 
				 		config->worker_mask |= STARPU_CUDA;
			
 
				 
			
@@ -846,11 +848,11 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 
				 			break;
			
 
				 		}
			
 
				 		config->workers[worker_idx].arch = STARPU_OPENCL_WORKER;
			
 
				-		enum starpu_perfmodel_archtype arch =
			
 
				-			(enum starpu_perfmodel_archtype)((int)STARPU_OPENCL_DEFAULT + devid);
			
 
				+		config->workers[worker_idx].perf_arch.type = STARPU_OPENCL_WORKER;
			
 
				+		config->workers[worker_idx].perf_arch.devid = openclgpu;
			
 
				+		config->workers[worker_idx].perf_arch.ncore = 0;
			
 
				 		config->workers[worker_idx].mp_nodeid = -1;
			
 
				 		config->workers[worker_idx].devid = devid;
			
 
				-		config->workers[worker_idx].perf_arch = arch;
			
 
				 		config->workers[worker_idx].worker_mask = STARPU_OPENCL;
			
 
				 		config->worker_mask |= STARPU_OPENCL;
			
 
				 	}
			
@@ -908,10 +910,11 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 
				 	{
			
 
				 		config->workers[topology->nworkers + sccdev].arch = STARPU_SCC_WORKER;
			
 
				 		int devid = _starpu_get_next_scc_deviceid(config);
			
 
				-		enum starpu_perfmodel_archtype arch = (enum starpu_perfmodel_archtype)((int)STARPU_SCC_DEFAULT + devid);
			
 
				+		config->workers[topology->nworkers + sccdev].perf_arch.type = STARPU_SCC_WORKER;
			
 
				+		config->workers[topology->nworkers + sccdev].perf_arch.devid = sccdev;
			
 
				+		config->workers[topology->nworkers + sccdev].perf_arch.ncore = 0;
			
 
				 		config->workers[topology->nworkers + sccdev].mp_nodeid = -1;
			
 
				 		config->workers[topology->nworkers + sccdev].devid = devid;
			
 
				-		config->workers[topology->nworkers + sccdev].perf_arch = arch;
			
 
				 		config->workers[topology->nworkers + sccdev].worker_mask = STARPU_SCC;
			
 
				 		config->worker_mask |= STARPU_SCC;
			
 
				 	}
			
@@ -971,7 +974,9 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 
				 	{
			
 
				 		int worker_idx = topology->nworkers + cpu;
			
 
				 		config->workers[worker_idx].arch = STARPU_CPU_WORKER;
			
 
				-		config->workers[worker_idx].perf_arch = STARPU_CPU_DEFAULT;
			
 
				+		config->workers[worker_idx].perf_arch.type = STARPU_CPU_WORKER;
			
 
				+		config->workers[worker_idx].perf_arch.devid = 0;
			
 
				+		config->workers[worker_idx].perf_arch.ncore = 0;
			
 
				 		config->workers[worker_idx].mp_nodeid = -1;
			
 
				 		config->workers[worker_idx].devid = cpu;
			
 
				 		config->workers[worker_idx].worker_mask = STARPU_CPU;
			
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -59,7 +59,7 @@ struct _starpu_worker
 
				         starpu_pthread_mutex_t mutex;
			
 
				 	enum starpu_worker_archtype arch; /* what is the type of worker ? */
			
 
				 	uint32_t worker_mask; /* what is the type of worker ? */
			
 
				-	enum starpu_perfmodel_archtype perf_arch; /* in case there are different models of the same arch */
			
 
				+	struct starpu_perfmodel_arch perf_arch; /* in case there are different models of the same arch */
			
 
				 	starpu_pthread_t worker_thread; /* the thread which runs the worker */
			
 
				 	int mp_nodeid; /* which mp node hold the cpu/gpu/etc (-1 for this
			
 
				 			* node) */
			
@@ -117,7 +117,7 @@ struct _starpu_worker
 
				 
			
 
				 struct _starpu_combined_worker
			
 
				 {
			
 
				-	enum starpu_perfmodel_archtype perf_arch; /* in case there are different models of the same arch */
			
 
				+	struct starpu_perfmodel_arch perf_arch; /* in case there are different models of the same arch */
			
 
				 	uint32_t worker_mask; /* what is the type of workers ? */
			
 
				 	int worker_size;
			
 
				 	unsigned memory_node; /* which memory node is associated that worker to ? */
			
--- a/src/datawizard/footprint.c
+++ b/src/datawizard/footprint.c
@@ -19,7 +19,7 @@
 
				 #include <starpu_hash.h>
			
 
				 #include <core/task.h>
			
 
				 
			
 
				-uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, struct _starpu_job *j)
			
 
				+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned nimpl, struct _starpu_job *j)
			
 
				 {
			
 
				 	if (j->footprint_is_computed)
			
 
				 		return j->footprint;
			
@@ -29,9 +29,13 @@ uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum
 
				 
			
 
				 	struct starpu_task *task = j->task;
			
 
				 
			
 
				-	if (model && model->per_arch[arch][nimpl].size_base)
			
 
				+	if (model != NULL && 
			
 
				+			model->per_arch[arch->type] != NULL &&
			
 
				+			model->per_arch[arch->type][arch->devid] != NULL &&
			
 
				+			model->per_arch[arch->type][arch->devid][arch->ncore] != NULL &&
			
 
				+			model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base)
			
 
				 	{
			
 
				-		size_t size = model->per_arch[arch][nimpl].size_base(task, arch, nimpl);
			
 
				+		size_t size = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base(task, arch, nimpl);
			
 
				 		footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
			
 
				 	}
			
 
				 	else if (model && model->size_base)
			
@@ -68,7 +72,7 @@ uint32_t _starpu_compute_data_footprint(starpu_data_handle_t handle)
 
				 	return starpu_hash_crc32c_be(handle_footprint, interfaceid);
			
 
				 }
			
 
				 
			
 
				-uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
			
 
				+uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
 
				 {
			
 
				 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
			
 
				 	return _starpu_compute_buffers_footprint(model, arch, nimpl, j);
			
--- a/src/datawizard/footprint.h
+++ b/src/datawizard/footprint.h
@@ -24,7 +24,7 @@
 
				 
			
 
				 /* Compute the footprint that characterizes the job and cache it into the job
			
 
				  * structure. */
			
 
				-uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, struct _starpu_job *j);
			
 
				+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned nimpl, struct _starpu_job *j);
			
 
				 
			
 
				 /* Compute the footprint that characterizes the layout of the data handle. */
			
 
				 uint32_t _starpu_compute_data_footprint(starpu_data_handle_t handle);
			
--- a/src/debug/traces/starpu_fxt.c
+++ b/src/debug/traces/starpu_fxt.c
@@ -337,7 +337,7 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
				 	register_worker_id(threadid, workerid);
			
 
				 
			
 
				 	char *kindstr = "";
			
 
				-	enum starpu_perfmodel_archtype archtype = 0;
			
 
				+	struct starpu_perfmodel_arch arch;
			
 
				 
			
 
				 	switch (ev->param[0])
			
 
				 	{
			
@@ -348,27 +348,37 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
				 		case _STARPU_FUT_CPU_KEY:
			
 
				 			set_next_cpu_worker_color(workerid);
			
 
				 			kindstr = "CPU";
			
 
				-			archtype = STARPU_CPU_DEFAULT;
			
 
				+			arch.type = STARPU_CPU_WORKER;
			
 
				+			arch.devid = 0;
			
 
				+			arch.ncore = 0;
			
 
				 			break;
			
 
				 		case _STARPU_FUT_CUDA_KEY:
			
 
				 			set_next_cuda_worker_color(workerid);
			
 
				 			kindstr = "CUDA";
			
 
				-			archtype = STARPU_CUDA_DEFAULT + devid;
			
 
				+			arch.type = STARPU_CUDA_WORKER;
			
 
				+			arch.devid = devid;
			
 
				+			arch.ncore = 0;
			
 
				 			break;
			
 
				 		case _STARPU_FUT_OPENCL_KEY:
			
 
				 			set_next_opencl_worker_color(workerid);
			
 
				 			kindstr = "OPENCL";
			
 
				-			archtype = STARPU_OPENCL_DEFAULT + devid;
			
 
				+			arch.type = STARPU_OPENCL_WORKER;
			
 
				+			arch.devid = devid;
			
 
				+			arch.ncore = 0;
			
 
				 			break;
			
 
				 		case _STARPU_FUT_MIC_KEY:
			
 
				 			set_next_mic_worker_color(workerid);
			
 
				 			kindstr = "mic";
			
 
				-			archtype = STARPU_MIC_DEFAULT + devid;
			
 
				+			arch.type = STARPU_MIC_WORKER;
			
 
				+			arch.devid = devid;
			
 
				+			arch.ncore = 0;
			
 
				 			break;
			
 
				 		case _STARPU_FUT_SCC_KEY:
			
 
				 			set_next_scc_worker_color(workerid);
			
 
				 			kindstr = "scc";
			
 
				-			archtype = STARPU_SCC_DEFAULT + devid;
			
 
				+			arch.type = STARPU_SCC_WORKER;
			
 
				+			arch.devid = devid;
			
 
				+			arch.ncore = 0;
			
 
				 			break;
			
 
				 		default:
			
 
				 			STARPU_ABORT();
			
@@ -405,7 +415,7 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
				 	fprintf(activity_file, "name\t%d\t%s %d\n", workerid, kindstr, devid);
			
 
				 
			
 
				 	snprintf(options->worker_names[workerid], 256, "%s %d", kindstr, devid);
			
 
				-	options->worker_archtypes[workerid] = archtype;
			
 
				+	options->worker_archtypes[workerid] = arch;
			
 
				 }
			
 
				 
			
 
				 static void handle_worker_init_end(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
			
@@ -600,14 +610,14 @@ static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 
				 
			
 
				 	if (options->dumped_codelets)
			
 
				 	{
			
 
				-		enum starpu_perfmodel_archtype archtype = ev->param[3];
			
 
				+		struct starpu_perfmodel_arch* arch = ev->param[3];
			
 
				 
			
 
				 		dumped_codelets_count++;
			
 
				 		dumped_codelets = realloc(dumped_codelets, dumped_codelets_count*sizeof(struct starpu_fxt_codelet_event));
			
 
				 
			
 
				 		snprintf(dumped_codelets[dumped_codelets_count - 1].symbol, 256, "%s", last_codelet_symbol[worker]);
			
 
				 		dumped_codelets[dumped_codelets_count - 1].workerid = worker;
			
 
				-		dumped_codelets[dumped_codelets_count - 1].archtype = archtype;
			
 
				+		dumped_codelets[dumped_codelets_count - 1].arch = *arch;
			
 
				 
			
 
				 		dumped_codelets[dumped_codelets_count - 1].size = codelet_size;
			
 
				 		dumped_codelets[dumped_codelets_count - 1].hash = codelet_hash;
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -111,7 +111,7 @@ _starpu_cpu_discover_devices(struct _starpu_machine_config *config)
 
				  * Handle binding CPUs on cores.
			
 
				  * In the case of a combined worker WORKER_TASK != J->TASK */
			
 
				 
			
 
				-static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_task, struct _starpu_worker *cpu_args, int rank, enum starpu_perfmodel_archtype perf_arch)
			
 
				+static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_task, struct _starpu_worker *cpu_args, int rank, struct starpu_perfmodel_arch* perf_arch)
			
 
				 {
			
 
				 	int ret;
			
 
				 	int is_parallel_task = (j->task_size > 1);
			
@@ -292,7 +292,7 @@ int _starpu_cpu_driver_run_once(struct starpu_driver *d STARPU_ATTRIBUTE_UNUSED)
 
				 	int rank = 0;
			
 
				 	int is_parallel_task = (j->task_size > 1);
			
 
				 
			
 
				-	enum starpu_perfmodel_archtype perf_arch;
			
 
				+	struct starpu_perfmodel_arch* perf_arch;
			
 
				 
			
 
				 	/* Get the rank in case it is a parallel task */
			
 
				 	if (is_parallel_task)
			
@@ -307,14 +307,14 @@ int _starpu_cpu_driver_run_once(struct starpu_driver *d STARPU_ATTRIBUTE_UNUSED)
 
				 		cpu_worker->combined_workerid = j->combined_workerid;
			
 
				 		cpu_worker->worker_size = combined_worker->worker_size;
			
 
				 		cpu_worker->current_rank = rank;
			
 
				-		perf_arch = combined_worker->perf_arch;
			
 
				+		perf_arch = &combined_worker->perf_arch;
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				 		cpu_worker->combined_workerid = cpu_worker->workerid;
			
 
				 		cpu_worker->worker_size = 1;
			
 
				 		cpu_worker->current_rank = 0;
			
 
				-		perf_arch = cpu_worker->perf_arch;
			
 
				+		perf_arch = &cpu_worker->perf_arch;
			
 
				 	}
			
 
				 
			
 
				 	_starpu_set_current_task(j->task);
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -351,14 +351,14 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 
				 	STARPU_ASSERT(func);
			
 
				 
			
 
				 #ifdef STARPU_SIMGRID
			
 
				-	_starpu_simgrid_execute_job(j, args->perf_arch, NAN);
			
 
				+	_starpu_simgrid_execute_job(j, &args->perf_arch, NAN);
			
 
				 #else
			
 
				 	func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
			
 
				 #endif
			
 
				 
			
 
				-	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0, profiling);
			
 
				+	_starpu_driver_end_job(args, j, &args->perf_arch, &codelet_end, 0, profiling);
			
 
				 
			
 
				-	_starpu_driver_update_job_feedback(j, args, args->perf_arch, &codelet_start, &codelet_end, profiling);
			
 
				+	_starpu_driver_update_job_feedback(j, args, &args->perf_arch, &codelet_start, &codelet_end, profiling);
			
 
				 
			
 
				 	_starpu_push_task_output(j, mask);
			
 
				 
			
--- a/src/drivers/driver_common/driver_common.c
+++ b/src/drivers/driver_common/driver_common.c
@@ -75,7 +75,7 @@ void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j
 
				 	_STARPU_TRACE_START_CODELET_BODY(j);
			
 
				 }
			
 
				 
			
 
				-void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perfmodel_archtype perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
			
 
				+void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
			
 
				 {
			
 
				 	struct starpu_task *task = j->task;
			
 
				 	struct starpu_codelet *cl = task->cl;
			
@@ -104,7 +104,7 @@ void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j,
 
				 	args->status = STATUS_UNKNOWN;
			
 
				 }
			
 
				 void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
			
 
				-					enum starpu_perfmodel_archtype perf_arch,
			
 
				+					struct starpu_perfmodel_arch* perf_arch,
			
 
				 					struct timespec *codelet_start, struct timespec *codelet_end, int profiling)
			
 
				 {
			
 
				 	struct starpu_profiling_task_info *profiling_info = j->task->profiling_info;
			
@@ -150,7 +150,7 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 
				 
			
 
				 	if (profiling_info && profiling_info->power_consumed && cl->power_model && cl->power_model->benchmarking)
			
 
				 	{
			
 
				-		_starpu_update_perfmodel_history(j, j->task->cl->power_model,  perf_arch, worker_args->devid, profiling_info->power_consumed,j->nimpl);
			
 
				+		_starpu_update_perfmodel_history(j, j->task->cl->power_model, perf_arch, worker_args->devid, profiling_info->power_consumed,j->nimpl);
			
 
				 	}
			
 
				 }
			
 
				 
			
--- a/src/drivers/driver_common/driver_common.h
+++ b/src/drivers/driver_common/driver_common.h
@@ -25,10 +25,10 @@
 
				 
			
 
				 void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j,
			
 
				 			      struct timespec *codelet_start, int rank, int profiling);
			
 
				-void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perfmodel_archtype perf_arch,
			
 
				+void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch,
			
 
				 			    struct timespec *codelet_end, int rank, int profiling);
			
 
				 void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
			
 
				-					enum starpu_perfmodel_archtype perf_arch,
			
 
				+					struct starpu_perfmodel_arch* perf_arch,
			
 
				 					struct timespec *codelet_start, struct timespec *codelet_end, int profiling);
			
 
				 
			
 
				 struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int workerid, unsigned memnode);
			
--- a/src/drivers/mp_common/sink_common.c
+++ b/src/drivers/mp_common/sink_common.c
@@ -249,8 +249,6 @@ void _starpu_sink_common_worker(void)
 
				 	starpu_pthread_key_t worker_key;
			
 
				 	STARPU_PTHREAD_KEY_CREATE(&worker_key, NULL);
			
 
				 
			
 
				-
			
 
				-	struct _starpu_machine_config *config;
			
 
				 	while (!exit_starpu)
			
 
				 	{
			
 
				 		/* If we have received a message */
			
@@ -264,7 +262,6 @@ void _starpu_sink_common_worker(void)
 
				 					exit_starpu = 1;
			
 
				 					break;
			
 
				 				case STARPU_EXECUTE:
			
 
				-					config = _starpu_get_machine_config();
			
 
				 					node->execute(node, arg, arg_size);
			
 
				 					break;
			
 
				 				case STARPU_SINK_NBCORES:
			
@@ -314,7 +311,6 @@ void _starpu_sink_common_worker(void)
 
				 			struct mp_message * message = mp_message_list_pop_back(node->message_queue);
			
 
				 			STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
			
 
				 			//_STARPU_DEBUG("telling host that we have finished the task %p sur %d.\n", task->kernel, task->coreid);
			
 
				-			config = _starpu_get_machine_config();
			
 
				 			_starpu_mp_common_send_command(node, message->type, 
			
 
				 					&message->buffer, message->size);
			
 
				 			mp_message_delete(message);
			
@@ -378,7 +374,6 @@ static void _starpu_sink_common_erase_barrier(struct _starpu_mp_node * node, str
 
				  */
			
 
				 static void _starpu_sink_common_append_message(struct _starpu_mp_node *node, struct mp_message * message)
			
 
				 {
			
 
				-	struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&node->message_queue_mutex);
			
 
				 	mp_message_list_push_front(node->message_queue,message);
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
			
--- a/src/drivers/mp_common/source_common.c
+++ b/src/drivers/mp_common/source_common.c
@@ -36,7 +36,7 @@ static int _starpu_src_common_finalize_job (struct _starpu_job *j, struct _starp
 
				 	uint32_t mask = 0;
			
 
				 	int profiling = starpu_profiling_status_get();
			
 
				 	struct timespec codelet_end;
			
 
				-	_starpu_driver_end_job(worker, j, worker->perf_arch, &codelet_end, 0,
			
 
				+	_starpu_driver_end_job(worker, j, &worker->perf_arch, &codelet_end, 0,
			
 
				 			profiling);
			
 
				 	
			
 
				 	int count = worker->current_rank;
			
@@ -57,7 +57,7 @@ static int _starpu_src_common_finalize_job (struct _starpu_job *j, struct _starp
 
				 	if(count == 0)
			
 
				 	{
			
 
				 
			
 
				-		_starpu_driver_update_job_feedback(j, worker, worker->perf_arch,
			
 
				+		_starpu_driver_update_job_feedback(j, worker, &worker->perf_arch,
			
 
				 				&j->cl_start, &codelet_end,
			
 
				 				profiling);
			
 
				 
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -656,6 +656,7 @@ int _starpu_opencl_driver_run_once(struct starpu_driver *d)
 
				 
			
 
				 	task = _starpu_get_worker_task(args, workerid, memnode);
			
 
				 
			
 
				+
			
 
				 	if (task == NULL)
			
 
				 		return 0;
			
 
				 
			
@@ -841,14 +842,14 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 
				 	STARPU_ASSERT_MSG(profiling_info->used_cycles, "Application kernel must call starpu_opencl_collect_stats to collect simulated time");
			
 
				 	length = ((double) profiling_info->used_cycles)/MSG_get_host_speed(MSG_host_self());
			
 
				   #endif
			
 
				-	_starpu_simgrid_execute_job(j, args->perf_arch, length);
			
 
				+	_starpu_simgrid_execute_job(j, &args->perf_arch, length);
			
 
				 #else
			
 
				 	func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
			
 
				 #endif
			
 
				 
			
 
				-	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0, profiling);
			
 
				+	_starpu_driver_end_job(args, j, &args->perf_arch, &codelet_end, 0, profiling);
			
 
				 
			
 
				-	_starpu_driver_update_job_feedback(j, args, args->perf_arch,
			
 
				+	_starpu_driver_update_job_feedback(j, args, &args->perf_arch,
			
 
				 					   &codelet_start, &codelet_end, profiling);
			
 
				 
			
 
				 	_starpu_push_task_output(j, mask);
			
--- a/src/profiling/bound.c
+++ b/src/profiling/bound.c
@@ -25,6 +25,7 @@
 
				 #include <starpu_config.h>
			
 
				 #include <profiling/bound.h>
			
 
				 #include <core/jobs.h>
			
 
				+#include <core/workers.h>
			
 
				 
			
 
				 #ifdef STARPU_HAVE_GLPK_H
			
 
				 #include <glpk.h>
			
@@ -100,7 +101,7 @@ struct bound_task
 
				 	int depsn;
			
 
				 
			
 
				 	/* Estimated duration */
			
 
				-	double duration[STARPU_NARCH_VARIATIONS];
			
 
				+	double** duration[STARPU_NARCH];
			
 
				 
			
 
				 	/* Other tasks */
			
 
				 	struct bound_task *next;
			
@@ -186,7 +187,31 @@ static int good_job(struct _starpu_job *j)
 
				 		return 0;
			
 
				 	return 1;
			
 
				 }
			
 
				+static double** initialize_arch_duration(int maxdevid, unsigned* maxncore_table)
			
 
				+{
			
 
				+	int devid, maxncore;
			
 
				+	double ** arch_model = malloc(sizeof(*arch_model)*(maxdevid+1));
			
 
				+	arch_model[maxdevid] = NULL;
			
 
				+	for(devid=0; devid<maxdevid; devid++)
			
 
				+	{
			
 
				+		if(maxncore_table != NULL)
			
 
				+			maxncore = maxncore_table[devid];
			
 
				+		else
			
 
				+			maxncore = 1;
			
 
				+		arch_model[devid] = malloc(sizeof(*arch_model[devid])*(maxncore+1));
			
 
				+	}
			
 
				+	return arch_model;
			
 
				+}
			
 
				 
			
 
				+static void initialize_duration(struct bound_task *task)
			
 
				+{
			
 
				+	struct _starpu_machine_config *conf = _starpu_get_machine_config();
			
 
				+	task->duration[STARPU_CPU_WORKER] = initialize_arch_duration(1,&conf->topology.ncpus); 
			
 
				+	task->duration[STARPU_CUDA_WORKER] = initialize_arch_duration(conf->topology.ncudagpus,NULL); 
			
 
				+	task->duration[STARPU_OPENCL_WORKER] = initialize_arch_duration(conf->topology.nopenclgpus,NULL); 
			
 
				+	task->duration[STARPU_MIC_WORKER] = initialize_arch_duration(conf->topology.nmicdevices,conf->topology.nmiccores); 
			
 
				+	task->duration[STARPU_SCC_WORKER] = initialize_arch_duration(conf->topology.nsccdevices,NULL); 
			
 
				+}
			
 
				 /* Create a new task (either because it has just been submitted, or a
			
 
				  * dependency was added before submission) */
			
 
				 static void new_task(struct _starpu_job *j)
			
@@ -202,10 +227,11 @@ static void new_task(struct _starpu_job *j)
 
				 	t->tag_id = j->task->tag_id;
			
 
				 	t->use_tag = j->task->use_tag;
			
 
				 	t->cl = j->task->cl;
			
 
				-	t->footprint = _starpu_compute_buffers_footprint(NULL, STARPU_CPU_DEFAULT, 0, j);
			
 
				+	t->footprint = _starpu_compute_buffers_footprint(NULL, STARPU_CPU_WORKER, 0, j);
			
 
				 	t->priority = j->task->priority;
			
 
				 	t->deps = NULL;
			
 
				 	t->depsn = 0;
			
 
				+	initialize_duration(t);
			
 
				 	t->next = tasks;
			
 
				 	j->bound_task = t;
			
 
				 	tasks = t;
			
@@ -236,7 +262,7 @@ void _starpu_bound_record(struct _starpu_job *j)
 
				 	{
			
 
				 		struct bound_task_pool *tp;
			
 
				 
			
 
				-		_starpu_compute_buffers_footprint(NULL, STARPU_CPU_DEFAULT, 0, j);
			
 
				+		_starpu_compute_buffers_footprint(NULL, STARPU_CPU_WORKER, 0, j);
			
 
				 
			
 
				 		if (last && last->cl == j->task->cl && last->footprint == j->footprint)
			
 
				 			tp = last;
			
@@ -400,7 +426,7 @@ static void _starpu_get_tasks_times(int nw, int nt, double *times)
 
				 				.footprint = tp->footprint,
			
 
				 				.footprint_is_computed = 1,
			
 
				 			};
			
 
				-			enum starpu_perfmodel_archtype arch = starpu_worker_get_perf_archtype(w);
			
 
				+			struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
			
 
				 			double length = _starpu_history_based_job_expected_perf(tp->cl->model, arch, &j, j.nimpl);
			
 
				 			if (isnan(length))
			
 
				 				times[w*nt+t] = NAN;
			
@@ -486,15 +512,15 @@ void starpu_bound_print_lp(FILE *output)
 
				 			};
			
 
				 			for (w = 0; w < nw; w++)
			
 
				 			{
			
 
				-				enum starpu_perfmodel_archtype arch = starpu_worker_get_perf_archtype(w);
			
 
				-				if (_STARPU_IS_ZERO(t1->duration[arch]))
			
 
				+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
			
 
				+				if (_STARPU_IS_ZERO(t1->duration[arch->type][arch->devid][arch->ncore]))
			
 
				 				{
			
 
				 					double length = _starpu_history_based_job_expected_perf(t1->cl->model, arch, &j,j.nimpl);
			
 
				 					if (isnan(length))
			
 
				 						/* Avoid problems with binary coding of doubles */
			
 
				-						t1->duration[arch] = NAN;
			
 
				+						t1->duration[arch->type][arch->devid][arch->ncore] = NAN;
			
 
				 					else
			
 
				-						t1->duration[arch] = length / 1000.;
			
 
				+						t1->duration[arch->type][arch->devid][arch->ncore] = length / 1000.;
			
 
				 				}
			
 
				 			}
			
 
				 			nt++;
			
@@ -519,8 +545,8 @@ void starpu_bound_print_lp(FILE *output)
 
				 		{
			
 
				 			for (w = 0; w < nw; w++)
			
 
				 			{
			
 
				-				enum starpu_perfmodel_archtype arch = starpu_worker_get_perf_archtype(w);
			
 
				-				if (!isnan(t1->duration[arch]))
			
 
				+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
			
 
				+				if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
			
 
				 					fprintf(output, " +t%luw%d", t1->id, w);
			
 
				 			}
			
 
				 			fprintf(output, " = 1;\n");
			
@@ -533,9 +559,9 @@ void starpu_bound_print_lp(FILE *output)
 
				 			fprintf(output, "/* %s %x */\tc%lu = s%lu", _starpu_codelet_get_model_name(t1->cl), (unsigned) t1->footprint, t1->id, t1->id);
			
 
				 			for (w = 0; w < nw; w++)
			
 
				 			{
			
 
				-				enum starpu_perfmodel_archtype arch = starpu_worker_get_perf_archtype(w);
			
 
				-				if (!isnan(t1->duration[arch]))
			
 
				-					fprintf(output, " + %f t%luw%d", t1->duration[arch], t1->id, w);
			
 
				+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
			
 
				+				if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
			
 
				+					fprintf(output, " + %f t%luw%d", t1->duration[arch->type][arch->devid][arch->ncore], t1->id, w);
			
 
				 			}
			
 
				 			fprintf(output, ";\n");
			
 
				 		}
			
@@ -616,8 +642,8 @@ void starpu_bound_print_lp(FILE *output)
 
				 				{
			
 
				 					for (w = 0; w < nw; w++)
			
 
				 					{
			
 
				-						enum starpu_perfmodel_archtype arch = starpu_worker_get_perf_archtype(w);
			
 
				-						if (!isnan(t1->duration[arch]))
			
 
				+						struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
			
 
				+						if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
			
 
				 						{
			
 
				 							fprintf(output, "s%lu - c%lu >= -3e5 + 1e5 t%luw%d + 1e5 t%luw%d + 1e5 t%luafter%lu;\n",
			
 
				 									t1->id, t2->id, t1->id, w, t2->id, w, t1->id, t2->id);
			
--- a/src/sched_policies/deque_modeling_policy_data_aware.c
+++ b/src/sched_policies/deque_modeling_policy_data_aware.c
@@ -407,7 +407,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 
				 		worker = workers->get_next(workers, &it);
			
 
				 		struct _starpu_fifo_taskq *fifo  = dt->queue_array[worker];
			
 
				 		unsigned memory_node = starpu_worker_get_memory_node(worker);
			
 
				-		enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				 
			
 
				 		/* Sometimes workers didn't take the tasks as early as we expected */
			
 
				 		double exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
			
@@ -543,7 +543,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
				 	{
			
 
				 		worker = workers->get_next(workers, &it);
			
 
				 		struct _starpu_fifo_taskq *fifo = dt->queue_array[worker];
			
 
				-		enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				 		unsigned memory_node = starpu_worker_get_memory_node(worker);
			
 
				 
			
 
				 		/* Sometimes workers didn't take the tasks as early as we expected */
			
@@ -760,7 +760,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
				 	}
			
 
				 	else if (task->bundle)
			
 
				 	{
			
 
				-		enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(best_in_ctx);
			
 
				+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(best_in_ctx);
			
 
				 		unsigned memory_node = starpu_worker_get_memory_node(best);
			
 
				 		model_best = starpu_task_expected_length(task, perf_arch, selected_impl);
			
 
				 		transfer_model_best = starpu_task_expected_data_transfer_time(memory_node, task);
			
@@ -931,7 +931,7 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, unsign
 
				 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
			
 
				 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
			
 
				 	/* Compute the expected penality */
			
 
				-	enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
			
 
				+	struct starpu_perfmodel_arch *perf_arch = starpu_worker_get_perf_archtype(workerid);
			
 
				 	unsigned memory_node = starpu_worker_get_memory_node(workerid);
			
 
				 
			
 
				 	double predicted = starpu_task_expected_length(task, perf_arch,
			
--- a/src/sched_policies/parallel_heft.c
+++ b/src/sched_policies/parallel_heft.c
@@ -230,7 +230,7 @@ static double compute_expected_end(int workerid, double length)
 
				 
			
 
				 static double compute_ntasks_end(int workerid)
			
 
				 {
			
 
				-	enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
			
 
				+	struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(workerid);
			
 
				 	starpu_pthread_mutex_t *sched_mutex;
			
 
				 	starpu_pthread_cond_t *sched_cond;
			
 
				 
			
@@ -347,7 +347,7 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 
				 			}
			
 
				 
			
 
				 
			
 
				-			enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				+			struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				 
			
 
				 			local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch,nimpl);
			
 
				 
			
--- a/src/sched_policies/random_policy.c
+++ b/src/sched_policies/random_policy.c
@@ -50,7 +50,7 @@ static int _random_push_task(struct starpu_task *task, unsigned prio)
 
				 		{
			
 
				 			if(starpu_worker_can_execute_task(worker, task, impl))
			
 
				 			{
			
 
				-				enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				+				struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				 				double speedup = starpu_worker_get_relative_speedup(perf_arch);
			
 
				 				alpha_sum += speedup;
			
 
				 				speedup_arr[size] = speedup;
			
--- a/src/starpu_parameters.h
+++ b/src/starpu_parameters.h
@@ -29,5 +29,5 @@
 
				 #define _STARPU_CPU_ALPHA	1.0f
			
 
				 #define _STARPU_CUDA_ALPHA	13.33f
			
 
				 #define _STARPU_OPENCL_ALPHA	12.22f
			
 
				-
			
 
				+#define _STARPU_MIC_ALPHA	11.11f
			
 
				 #endif /* _STARPU_PARAMETERS_H */
			
--- a/tests/perfmodels/feed.c
+++ b/tests/perfmodels/feed.c
@@ -50,6 +50,9 @@ int main(int argc, char **argv)
 
				 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				+	 if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) < 2)
			
 
				+		 return STARPU_TEST_SKIPPED;
			
 
				+
			
 
				 	starpu_task_init(&task);
			
 
				 	task.cl = &cl;
			
 
				 
			
@@ -66,12 +69,18 @@ int main(int argc, char **argv)
 
				 		measured_fast = 0.002+size*0.00000001;
			
 
				 		measured_slow = 0.001+size*0.0000001;
			
 
				 
			
 
				+		struct starpu_perfmodel_arch arch;
			
 
				+		arch.type = STARPU_CUDA_WORKER;
			
 
				+		arch.ncore = 0;
			
 
				 		/* Simulate Fast GPU */
			
 
				-		starpu_perfmodel_update_history(&model, &task, STARPU_CUDA_DEFAULT, 0, 0, measured_fast);
			
 
				-		starpu_perfmodel_update_history(&nl_model, &task, STARPU_CUDA_DEFAULT, 0, 0, measured_fast);
			
 
				+		arch.devid = 0;
			
 
				+		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_fast);
			
 
				+		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_fast);
			
 
				+		
			
 
				 		/* Simulate Slow GPU */
			
 
				-		starpu_perfmodel_update_history(&model, &task, STARPU_CUDA_DEFAULT + 1, 0, 0, measured_slow);
			
 
				-		starpu_perfmodel_update_history(&nl_model, &task, STARPU_CUDA_DEFAULT + 1, 0, 0, measured_slow);
			
 
				+		arch.devid = 1;
			
 
				+		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_slow);
			
 
				+		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_slow);
			
 
				 		starpu_task_clean(&task);
			
 
				 		starpu_data_unregister(handle);
			
 
				 	}
			
--- a/tests/perfmodels/valid_model.c
+++ b/tests/perfmodels/valid_model.c
@@ -60,12 +60,13 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 
				 	int ret;
			
 
				 	int old_nsamples, new_nsamples;
			
 
				 	struct starpu_conf conf;
			
 
				-	unsigned archid;
			
 
				+	unsigned archid, archtype, devid, ncore;
			
 
				 
			
 
				 	starpu_conf_init(&conf);
			
 
				 	conf.sched_policy_name = "eager";
			
 
				 	conf.calibrate = 1;
			
 
				 
			
 
				+
			
 
				 	ret = starpu_init(&conf);
			
 
				 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
@@ -73,10 +74,14 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 
				 	codelet->model = model;
			
 
				 
			
 
				 	old_nsamples = 0;
			
 
				+	lmodel.is_init=0;
			
 
				 	ret = starpu_perfmodel_load_symbol(codelet->model->symbol, &lmodel);
			
 
				 	if (ret != 1)
			
 
				-		for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++)
			
 
				-			old_nsamples += lmodel.per_arch[archid][0].regression.nsample;
			
 
				+		for (archtype = 0; archtype < STARPU_NARCH; archtype++)
			
 
				+			if(lmodel.per_arch[archtype] != NULL)
			
 
				+				for(devid=0; lmodel.per_arch[archtype][devid] != NULL; devid++)
			
 
				+					for(ncore=0; lmodel.per_arch[archtype][devid][ncore] != NULL; ncore++)
			
 
				+						old_nsamples += lmodel.per_arch[archtype][devid][ncore][0].regression.nsample;
			
 
				 
			
 
				         starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, 100, sizeof(int));
			
 
				 	for (loop = 0; loop < nloops; loop++)
			
@@ -97,8 +102,11 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 
				 	}
			
 
				 
			
 
				 	new_nsamples = 0;
			
 
				-	for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++)
			
 
				-		new_nsamples += lmodel.per_arch[archid][0].regression.nsample;
			
 
				+	for (archtype = 0; archtype < STARPU_NARCH; archtype++)
			
 
				+		if(lmodel.per_arch[archtype] != NULL)
			
 
				+			for(devid=0; lmodel.per_arch[archtype][devid] != NULL; devid++)
			
 
				+				for(ncore=0; lmodel.per_arch[archtype][devid][ncore] != NULL; ncore++)
			
 
				+					new_nsamples += lmodel.per_arch[archtype][devid][ncore][0].regression.nsample;
			
 
				 
			
 
				 	ret = starpu_perfmodel_unload_model(&lmodel);
			
 
				 	if (ret == 1)
			
--- a/tests/sched_policies/simple_cpu_gpu_sched.c
+++ b/tests/sched_policies/simple_cpu_gpu_sched.c
@@ -40,7 +40,7 @@ dummy(void *buffers[], void *args)
 
				  */
			
 
				 static double
			
 
				 cpu_task_cpu(struct starpu_task *task,
			
 
				-	     enum starpu_perfmodel_archtype arch,
			
 
				+	     struct starpu_perfmodel_arch* arch,
			
 
				 	     unsigned nimpl)
			
 
				 {
			
 
				 	(void) task;
			
@@ -51,7 +51,7 @@ cpu_task_cpu(struct starpu_task *task,
 
				 
			
 
				 static double
			
 
				 cpu_task_gpu(struct starpu_task *task,
			
 
				-	     enum starpu_perfmodel_archtype arch,
			
 
				+	     struct starpu_perfmodel_arch* arch,
			
 
				 	     unsigned nimpl)
			
 
				 {
			
 
				 	(void) task;
			
@@ -63,7 +63,7 @@ cpu_task_gpu(struct starpu_task *task,
 
				 
			
 
				 static double
			
 
				 gpu_task_cpu(struct starpu_task *task,
			
 
				-	     enum starpu_perfmodel_archtype arch,
			
 
				+	     struct starpu_perfmodel_arch* arch,
			
 
				 	     unsigned nimpl)
			
 
				 {
			
 
				 	(void) task;
			
@@ -75,7 +75,7 @@ gpu_task_cpu(struct starpu_task *task,
 
				 
			
 
				 static double
			
 
				 gpu_task_gpu(struct starpu_task *task,
			
 
				-	     enum starpu_perfmodel_archtype arch,
			
 
				+	     struct starpu_perfmodel_arch* arch,
			
 
				 	     unsigned nimpl)
			
 
				 {
			
 
				 	(void) task;
			
@@ -99,16 +99,45 @@ static struct starpu_perfmodel model_gpu_task =
 
				 static void
			
 
				 init_perfmodels(void)
			
 
				 {
			
 
				-	int i;
			
 
				-	for (i = STARPU_CPU_DEFAULT; i < STARPU_CUDA_DEFAULT; i++)
			
 
				+	unsigned devid, ncore;
			
 
				+
			
 
				+	starpu_initialize_model(&model_cpu_task);
			
 
				+	starpu_initialize_model(&model_gpu_task);
			
 
				+
			
 
				+	if(model_cpu_task.per_arch[STARPU_CPU_WORKER] != NULL)
			
 
				 	{
			
 
				-		model_cpu_task.per_arch[i][0].cost_function = cpu_task_cpu;
			
 
				-		model_gpu_task.per_arch[i][0].cost_function = gpu_task_cpu;
			
 
				+		for(devid=0; model_cpu_task.per_arch[STARPU_CPU_WORKER][devid] != NULL; devid++)
			
 
				+		{
			
 
				+			for(ncore=0; model_cpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore] != NULL; ncore++)
			
 
				+			{
			
 
				+				model_cpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore][0].cost_function = cpu_task_cpu;
			
 
				+				model_gpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore][0].cost_function = gpu_task_cpu;
			
 
				+			}
			
 
				+		}
			
 
				 	}
			
 
				-	for (i = STARPU_CUDA_DEFAULT; i < STARPU_NARCH_VARIATIONS; i++)
			
 
				+
			
 
				+	if(model_cpu_task.per_arch[STARPU_CUDA_WORKER] != NULL)
			
 
				+	{
			
 
				+		for(devid=0; model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid] != NULL; devid++)
			
 
				+		{
			
 
				+			for(ncore=0; model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore] != NULL; ncore++)
			
 
				+			{
			
 
				+				model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore][0].cost_function = cpu_task_gpu;
			
 
				+				model_gpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore][0].cost_function = gpu_task_gpu;
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	if(model_cpu_task.per_arch[STARPU_OPENCL_WORKER] != NULL)
			
 
				 	{
			
 
				-		model_cpu_task.per_arch[i][0].cost_function = cpu_task_gpu;
			
 
				-		model_gpu_task.per_arch[i][0].cost_function = gpu_task_gpu;
			
 
				+		for(devid=0; model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid] != NULL; devid++)
			
 
				+		{
			
 
				+			for(ncore=0; model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore] != NULL; ncore++)
			
 
				+			{
			
 
				+				model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore][0].cost_function = cpu_task_gpu;
			
 
				+				model_gpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore][0].cost_function = gpu_task_gpu;
			
 
				+			}
			
 
				+		}
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -176,8 +205,8 @@ run(struct starpu_sched_policy *policy)
 
				 	cpu_task_worker = starpu_worker_get_type(cpu_task->profiling_info->workerid);
			
 
				 	gpu_task_worker = starpu_worker_get_type(gpu_task->profiling_info->workerid);
			
 
				 	if (cpu_task_worker != STARPU_CPU_WORKER ||
			
 
				-	    (gpu_task_worker != STARPU_CUDA_WORKER &&
			
 
				-	     gpu_task_worker != STARPU_OPENCL_WORKER))
			
 
				+			(gpu_task_worker != STARPU_CUDA_WORKER &&
			
 
				+			 gpu_task_worker != STARPU_OPENCL_WORKER))
			
 
				 		ret = 1;
			
 
				 	else
			
 
				 		ret = 0;
			
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -16,7 +16,7 @@
 
				 
			
 
				 SUBDIRS =
			
 
				 
			
 
				-AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS)
			
 
				+AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(STARPU_COI_CPPFLAGS) $(GLOBAL_AM_CFLAGS)
			
 
				 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@
			
 
				 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/tools/ -I$(top_srcdir)/mpi/ -I$(top_builddir)/src -I$(top_srcdir)/src
			
 
				 AM_LDFLAGS = $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
			
--- a/tools/starpu_perfmodel_plot.c
+++ b/tools/starpu_perfmodel_plot.c
@@ -29,6 +29,7 @@
 
				 
			
 
				 #include <starpu.h>
			
 
				 #include <core/perfmodel/perfmodel.h> // we need to browse the list associated to history-based models
			
 
				+#include <core/workers.h>
			
 
				 
			
 
				 #ifdef __MINGW32__
			
 
				 #include <windows.h>
			
@@ -52,7 +53,7 @@ static struct starpu_fxt_options options;
 
				 #endif
			
 
				 
			
 
				 #ifdef STARPU_USE_FXT
			
 
				-static int archtype_is_found[STARPU_NARCH_VARIATIONS];
			
 
				+static int **archtype_is_found[STARPU_NARCH];
			
 
				 
			
 
				 static char data_file_name[256];
			
 
				 #endif
			
@@ -181,22 +182,22 @@ static void print_comma(FILE *gnuplot_file, int *first)
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, int *first, unsigned nimpl)
			
 
				+static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, int *first, unsigned nimpl)
			
 
				 {
			
 
				 	char arch_name[256];
			
 
				 	starpu_perfmodel_get_arch_name(arch, arch_name, 256, nimpl);
			
 
				 
			
 
				 	struct starpu_perfmodel_per_arch *arch_model =
			
 
				-		&model->per_arch[arch][nimpl];
			
 
				+		&model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
			
 
				 
			
 
				 	if (arch_model->regression.valid || arch_model->regression.nl_valid)
			
 
				 		fprintf(stderr,"Arch: %s\n", arch_name);
			
 
				 
			
 
				 #ifdef STARPU_USE_FXT
			
 
				-	if (!gflops && !no_fxt_file && archtype_is_found[arch] && nimpl == 0)
			
 
				+	if (!gflops && !no_fxt_file && archtype_is_found[arch->type][arch->devid][arch->ncore] && nimpl == 0)
			
 
				 	{
			
 
				 		print_comma(gnuplot_file, first);
			
 
				-		fprintf(gnuplot_file, "\"< grep -w \\^%d %s\" using 2:3 title \"Profiling %s\"", arch, data_file_name, arch_name);
			
 
				+		fprintf(gnuplot_file, "\"< grep -w \\^%d_%d_%d %s\" using 2:3 title \"Profiling %s\"", arch->type, arch->devid, arch->ncore, data_file_name, arch_name);
			
 
				 	}
			
 
				 #endif
			
 
				 
			
@@ -227,11 +228,10 @@ static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel *mode
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch1, enum starpu_perfmodel_archtype arch2, int *first)
			
 
				+static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_worker_archtype* type, int* devid, int* ncore, int *first)
			
 
				 {
			
 
				 	char *command;
			
 
				 	FILE *datafile;
			
 
				-	unsigned arch;
			
 
				 	struct starpu_perfmodel_history_list *ptr;
			
 
				 	char arch_name[32];
			
 
				 	int col;
			
@@ -245,20 +245,74 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 
				 
			
 
				 	col = 2;
			
 
				 	unsigned implid;
			
 
				-	for (arch = arch1; arch < arch2; arch++)
			
 
				+
			
 
				+	unsigned archmin, archmax, devmin, devmax, coremin, coremax;
			
 
				+	if(type != NULL)
			
 
				 	{
			
 
				-		for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				+		archmin = *type;
			
 
				+		archmax = *type +1;
			
 
				+		if(devid != NULL)
			
 
				 		{
			
 
				-			struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][implid];
			
 
				-			starpu_perfmodel_get_arch_name((enum starpu_perfmodel_archtype) arch, arch_name, 32, implid);
			
 
				-
			
 
				-			//ptrs[arch-arch1][implid] = ptr[arch-arch1][implid] = arch_model->list;
			
 
				+			devmin = *devid;
			
 
				+			devmax = *devid +1;
			
 
				+			if(ncore != NULL)
			
 
				+			{
			
 
				+				coremin = *ncore;
			
 
				+				coremax = *ncore +1;
			
 
				+			}
			
 
				+			else
			
 
				+			{
			
 
				+				coremin = 0;
			
 
				+				coremax = 0;
			
 
				+			}
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			devmin = 0;
			
 
				+			devmax = 0;
			
 
				+			coremin = 0;
			
 
				+			coremax = 0;
			
 
				+		}
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		archmin = 0;
			
 
				+		archmax = STARPU_NARCH;
			
 
				+		devmin = 0;
			
 
				+		devmax = 0;
			
 
				+		coremin = 0;
			
 
				+		coremax = 0;
			
 
				 
			
 
				-			if (arch_model->list)
			
 
				+	}
			
 
				+	struct starpu_perfmodel_arch arch;
			
 
				+	unsigned archtype, dev, core;
			
 
				+	for (archtype = archmin; archtype < archmax; archtype++)
			
 
				+	{
			
 
				+		arch.type = archtype;
			
 
				+		if(model->per_arch[archtype]!=NULL)
			
 
				+		{
			
 
				+			for(dev = devmin; model->per_arch[archtype][dev] != NULL && (devmax == 0 || dev < devmax);dev++)
			
 
				 			{
			
 
				-				print_comma(gnuplot_file, first);
			
 
				-				fprintf(gnuplot_file, "\"%s\" using 1:%d:%d with errorlines title \"Average %s\"", avg_file_name, col, col+1, arch_name);
			
 
				-				col += 2;
			
 
				+				arch.devid = dev;
			
 
				+
			
 
				+				for(core = coremin; model->per_arch[archtype][dev][core] != NULL && (coremax == 0 || core < coremax); core++)
			
 
				+				{
			
 
				+					arch.ncore = core;
			
 
				+					for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				+					{
			
 
				+						struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[archtype][dev][core][implid];
			
 
				+						starpu_perfmodel_get_arch_name(&arch, arch_name, 32, implid);
			
 
				+
			
 
				+						//ptrs[arch-arch1][implid] = ptr[arch-arch1][implid] = arch_model->list;
			
 
				+
			
 
				+						if (arch_model->list)
			
 
				+						{
			
 
				+							print_comma(gnuplot_file, first);
			
 
				+							fprintf(gnuplot_file, "\"%s\" using 1:%d:%d with errorlines title \"Average %s\"", avg_file_name, col, col+1, arch_name);
			
 
				+							col += 2;
			
 
				+						}
			
 
				+					}
			
 
				+				}
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
@@ -270,71 +324,159 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 
				 
			
 
				 		minimum = ULONG_MAX;
			
 
				 		/* Get the next minimum */
			
 
				-		for (arch = arch1; arch < arch2; arch++)
			
 
				-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				+		for (archtype = archmin; archtype < archmax; archtype++)
			
 
				+		{
			
 
				+			if(model->per_arch[archtype]!=NULL)
			
 
				 			{
			
 
				-				struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][implid];
			
 
				-				for (ptr = arch_model->list; ptr; ptr = ptr->next)
			
 
				+				for(dev = devmin; model->per_arch[archtype][dev] != NULL && (devmax == 0 || dev < devmax);dev++)
			
 
				 				{
			
 
				-					unsigned long size = ptr->entry->size;
			
 
				-					if (size > last && size < minimum)
			
 
				-						minimum = size;
			
 
				+					for(core = coremin; model->per_arch[archtype][dev][core] != NULL && (coremax == 0 || core < coremax); core++)
			
 
				+				
			
 
				+					{
			
 
				+						for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				+						{
			
 
				+							struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[archtype][dev][core][implid];
			
 
				+							for (ptr = arch_model->list; ptr; ptr = ptr->next)
			
 
				+							{
			
 
				+								unsigned long size = ptr->entry->size;
			
 
				+								if (size > last && size < minimum)
			
 
				+									minimum = size;
			
 
				+							}
			
 
				+						}
			
 
				+					}
			
 
				 				}
			
 
				 			}
			
 
				+		}
			
 
				 		if (minimum == ULONG_MAX)
			
 
				 			break;
			
 
				 
			
 
				 		fprintf(stderr, "%lu ", minimum);
			
 
				 		fprintf(datafile, "%-15lu ", minimum);
			
 
				-		for (arch = arch1; arch < arch2; arch++)
			
 
				-		{
			
 
				-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				-			{
			
 
				-				struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][implid];
			
 
				-				for (ptr = arch_model->list; ptr; ptr = ptr->next)
			
 
				-				{
			
 
				-					struct starpu_perfmodel_history_entry *entry = ptr->entry;
			
 
				-					if (entry->size == minimum)
			
 
				-					{
			
 
				-						if (gflops)
			
 
				-							fprintf(datafile, "\t%-15le\t%-15le", entry->flops / (entry->mean * 1000),
			
 
				-									entry->flops / ((entry->mean + entry->deviation) * 1000) -
			
 
				-									entry->flops / (entry->mean * 1000)
			
 
				-									);
			
 
				-						else
			
 
				-							fprintf(datafile, "\t%-15le\t%-15le", 0.001*entry->mean, 0.001*entry->deviation);
			
 
				-						break;
			
 
				-					}
			
 
				-				}
			
 
				-				if (!ptr && arch_model->list)
			
 
				-					/* No value for this arch. */
			
 
				-					fprintf(datafile, "\t\"\"\t\"\"");
			
 
				-			}
			
 
				-		}
			
 
				+		for (archtype = archmin; archtype < archmax; archtype++)
			
 
				+			if(model->per_arch[archtype]!=NULL)
			
 
				+				for(dev = devmin; model->per_arch[archtype][dev] != NULL && (devmax == 0 || dev < devmax);dev++)
			
 
				+					for(core = coremin; model->per_arch[archtype][dev][core] != NULL && (coremax == 0 || core < coremax); core++)
			
 
				+						for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				+						{
			
 
				+							struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[archtype][dev][core][implid];
			
 
				+							for (ptr = arch_model->list; ptr; ptr = ptr->next)
			
 
				+							{
			
 
				+								struct starpu_perfmodel_history_entry *entry = ptr->entry;
			
 
				+								if (entry->size == minimum)
			
 
				+								{
			
 
				+									if (gflops)
			
 
				+										fprintf(datafile, "\t%-15le\t%-15le", entry->flops / (entry->mean * 1000),
			
 
				+												entry->flops / ((entry->mean + entry->deviation) * 1000) -
			
 
				+												entry->flops / (entry->mean * 1000)
			
 
				+										       );
			
 
				+									else
			
 
				+										fprintf(datafile, "\t%-15le\t%-15le", 0.001*entry->mean, 0.001*entry->deviation);
			
 
				+									break;
			
 
				+								}
			
 
				+							}
			
 
				+							if (!ptr && arch_model->list)
			
 
				+								/* No value for this arch. */
			
 
				+								fprintf(datafile, "\t\"\"\t\"\"");
			
 
				+						}
			
 
				 		fprintf(datafile, "\n");
			
 
				 	}
			
 
				 	fprintf(stderr, "\n");
			
 
				 	fclose(datafile);
			
 
				 }
			
 
				 
			
 
				-static void display_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch1, enum starpu_perfmodel_archtype arch2, int *first)
			
 
				+
			
 
				+static void display_selected_arch_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, int *first)
			
 
				 {
			
 
				-	unsigned arch;
			
 
				 	unsigned implid;
			
 
				-	for (arch = arch1; arch < arch2; arch++)
			
 
				+	for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				+		display_perf_model(gnuplot_file, model, arch, first, implid);
			
 
				+}
			
 
				+
			
 
				+static void display_selected_device_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_worker_archtype archtype, int devid, int *first)
			
 
				+{
			
 
				+	unsigned ncore;
			
 
				+	struct starpu_perfmodel_arch arch;
			
 
				+	arch.type = archtype;
			
 
				+	arch.devid = devid;
			
 
				+	for(ncore=0; model->per_arch[archtype][devid][ncore] != NULL; ncore++)
			
 
				 	{
			
 
				-		for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				+		arch.ncore = ncore;
			
 
				+		display_selected_arch_perf_models(gnuplot_file,model,&arch,first);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void display_selected_archtype_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_worker_archtype archtype, int *first)
			
 
				+{
			
 
				+	unsigned devid;
			
 
				+	for(devid=0; model->per_arch[archtype][devid] != NULL; devid++)
			
 
				+		display_selected_device_perf_models(gnuplot_file,model,archtype,devid,first);
			
 
				+}
			
 
				+
			
 
				+static void display_all_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, int *first)
			
 
				+{
			
 
				+	unsigned archtype;
			
 
				+	for(archtype = 0; archtype < STARPU_NARCH; archtype++)
			
 
				+		display_selected_archtype_perf_models(gnuplot_file,model,archtype,first);
			
 
				+}
			
 
				+
			
 
				+#ifdef STARPU_USE_FXT
			
 
				+static int ** init_archtype_is_found_per_arch(int maxdevid, unsigned* maxncore_table)
			
 
				+{
			
 
				+	int devid, ncore;
			
 
				+	int ** archtype_is_found_per_arch = malloc(sizeof(*archtype_is_found_per_arch)*(maxdevid+1));
			
 
				+	archtype_is_found_per_arch[maxdevid] = NULL;
			
 
				+	for(devid=0; devid<maxdevid; devid++)
			
 
				+	{
			
 
				+		int maxncore;
			
 
				+		if(maxncore_table != NULL)
			
 
				+			maxncore = maxncore_table[devid];
			
 
				+		else
			
 
				+			maxncore = 1;
			
 
				+		
			
 
				+		archtype_is_found_per_arch[devid] = malloc(sizeof(*archtype_is_found_per_arch[devid])*(maxncore+1));
			
 
				+		archtype_is_found_per_arch[devid][maxncore] = 0;
			
 
				+		for(ncore=0; ncore<maxncore; ncore++)
			
 
				+			archtype_is_found_per_arch[devid][ncore] = 0;
			
 
				+	}
			
 
				+	return archtype_is_found_per_arch;
			
 
				+
			
 
				+}
			
 
				+
			
 
				+
			
 
				+static void init_archtype_is_found(struct starpu_perfmodel *model)
			
 
				+{
			
 
				+	unsigned archtype, devid, ndevice, ncore, *maxncore;
			
 
				+
			
 
				+	for(archtype = 0; archtype < STARPU_NARCH; archtype++)
			
 
				+	{
			
 
				+	
			
 
				+		for(devid=0; model->per_arch[archtype][devid] != NULL; devid++);
			
 
				+		ndevice = devid;
			
 
				+		if(ndevice != 0)
			
 
				 		{
			
 
				-			display_perf_model(gnuplot_file, model, (enum starpu_perfmodel_archtype) arch, first, implid);
			
 
				+			maxncore = malloc(sizeof(*maxncore)*ndevice);
			
 
				+			for(devid=0; devid < ndevice; devid++);
			
 
				+			{
			
 
				+			
			
 
				+				for(ncore=0; model->per_arch[archtype][devid][ncore] != NULL; ncore++);
			
 
				+				maxncore[devid] = ncore;
			
 
				+			}
			
 
				 		}
			
 
				+		else
			
 
				+		{
			
 
				+			maxncore = NULL;
			
 
				+		}
			
 
				+
			
 
				+		archtype_is_found[archtype] = init_archtype_is_found_per_arch(ndevice,maxncore);
			
 
				+		if(maxncore != NULL)
			
 
				+			free(maxncore);
			
 
				 	}
			
 
				-	display_history_based_perf_models(gnuplot_file, model, arch1, arch2, first);
			
 
				 }
			
 
				 
			
 
				-#ifdef STARPU_USE_FXT
			
 
				-static void dump_data_file(FILE *data_file)
			
 
				+
			
 
				+static void dump_data_file(FILE *data_file, struct starpu_perfmodel *model)
			
 
				 {
			
 
				-	memset(archtype_is_found, 0, STARPU_NARCH_VARIATIONS*sizeof(int));
			
 
				+	init_archtype_is_found(model);
			
 
				 
			
 
				 	int i;
			
 
				 	for (i = 0; i < options.dumped_codelets_count; i++)
			
@@ -342,13 +484,13 @@ static void dump_data_file(FILE *data_file)
 
				 		/* Dump only if the symbol matches user's request */
			
 
				 		if (strncmp(dumped_codelets[i].symbol, symbol, (FXT_MAX_PARAMS - 4)*sizeof(unsigned long)-1) == 0)
			
 
				 		{
			
 
				-			enum starpu_perfmodel_archtype archtype = dumped_codelets[i].archtype;
			
 
				-			archtype_is_found[archtype] = 1;
			
 
				+			struct starpu_perfmodel_arch* arch = &dumped_codelets[i].arch;
			
 
				+			archtype_is_found[arch->type][arch->devid][arch->ncore] = 1;
			
 
				 
			
 
				 			size_t size = dumped_codelets[i].size;
			
 
				 			float time = dumped_codelets[i].time;
			
 
				 
			
 
				-			fprintf(data_file, "%d	%f	%f\n", archtype, (float)size, time);
			
 
				+			fprintf(data_file, "%d_%d_%d	%f	%f\n", arch->type, arch->devid, arch->ncore, (float)size, time);
			
 
				 		}
			
 
				 	}
			
 
				 }
			
@@ -380,55 +522,72 @@ static void display_selected_models(FILE *gnuplot_file, struct starpu_perfmodel
 
				 	int first = 1;
			
 
				 	fprintf(gnuplot_file, "plot\t");
			
 
				 
			
 
				+	struct starpu_perfmodel_arch arch;
			
 
				+	struct _starpu_machine_config *conf = _starpu_get_machine_config();
			
 
				+
			
 
				+
			
 
				+
			
 
				 	if (archname == NULL)
			
 
				 	{
			
 
				 		/* display all architectures */
			
 
				-		display_perf_models(gnuplot_file, model, (enum starpu_perfmodel_archtype) 0, (enum starpu_perfmodel_archtype) STARPU_NARCH_VARIATIONS, &first);
			
 
				+		display_all_perf_models(gnuplot_file, model, &first);
			
 
				+		display_history_based_perf_models(gnuplot_file, model, NULL, NULL, NULL, &first);
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				 		if (strcmp(archname, "cpu") == 0)
			
 
				 		{
			
 
				-			unsigned impl;
			
 
				-			for (impl = 0; impl < STARPU_MAXIMPLEMENTATIONS; impl++)
			
 
				-			{
			
 
				-				display_perf_model(gnuplot_file, model,
			
 
				-						   STARPU_CPU_DEFAULT,
			
 
				-						   &first, impl);
			
 
				-			}
			
 
				+			
			
 
				+			arch.type = STARPU_CPU_WORKER;
			
 
				+			arch.devid = 1;
			
 
				+			arch.ncore = 0;
			
 
				+
			
 
				+			display_selected_arch_perf_models(gnuplot_file, model, &arch, &first); 
			
 
				+			display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
 
				-		int k;
			
 
				-		if (sscanf(archname, "cpu:%d", &k) == 1)
			
 
				+		unsigned k;
			
 
				+		if (sscanf(archname, "cpu:%u", &k) == 1)
			
 
				 		{
			
 
				 			/* For combined CPU workers */
			
 
				-			if ((k < 1) || (k > STARPU_MAXCPUS))
			
 
				+			if ((k < 1) || (k > conf->topology.ncpus))
			
 
				 			{
			
 
				 				fprintf(stderr, "Invalid CPU size\n");
			
 
				 				exit(-1);
			
 
				 			}
			
 
				 
			
 
				-			display_perf_models(gnuplot_file, model, (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + k - 1), (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + k), &first);
			
 
				+			arch.type = STARPU_CPU_WORKER;
			
 
				+			arch.devid = 1;
			
 
				+			arch.ncore = k - 1;
			
 
				+
			
 
				+			display_selected_arch_perf_models(gnuplot_file, model, &arch, &first); 
			
 
				+			display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
 
				 		if (strcmp(archname, "cuda") == 0)
			
 
				 		{
			
 
				-			display_perf_models(gnuplot_file, model, STARPU_CUDA_DEFAULT, (enum starpu_perfmodel_archtype) (STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS), &first);
			
 
				+			unsigned archtype = STARPU_CUDA_WORKER;
			
 
				+			display_selected_archtype_perf_models(gnuplot_file, model, archtype, &first);
			
 
				+			display_history_based_perf_models(gnuplot_file, model,  &archtype, NULL, NULL, &first);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
 
				 		/* There must be a cleaner way ! */
			
 
				-		int gpuid;
			
 
				+		unsigned gpuid;
			
 
				 		int nmatched;
			
 
				-		nmatched = sscanf(archname, "cuda_%d", &gpuid);
			
 
				+		nmatched = sscanf(archname, "cuda_%u", &gpuid);
			
 
				 		if (nmatched == 1)
			
 
				 		{
			
 
				-			int archid = STARPU_CUDA_DEFAULT+ gpuid;
			
 
				-			if (archid < STARPU_OPENCL_DEFAULT)
			
 
				+			if (gpuid < conf->topology.ncudagpus)
			
 
				 			{
			
 
				-				display_perf_models(gnuplot_file, model, (enum starpu_perfmodel_archtype) archid, (enum starpu_perfmodel_archtype) (archid + 1), &first);
			
 
				+				arch.type = STARPU_CUDA_WORKER;
			
 
				+				arch.devid = gpuid;
			
 
				+				arch.ncore = 0;
			
 
				+
			
 
				+				display_selected_arch_perf_models(gnuplot_file, model, &arch, &first);
			
 
				+				display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
			
 
				 				return;
			
 
				 			}
			
 
				 			else
			
@@ -440,18 +599,24 @@ static void display_selected_models(FILE *gnuplot_file, struct starpu_perfmodel
 
				 
			
 
				 		if (strcmp(archname, "opencl") == 0)
			
 
				 		{
			
 
				-			display_perf_models(gnuplot_file, model, STARPU_OPENCL_DEFAULT, (enum starpu_perfmodel_archtype) (STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS), &first);
			
 
				+			unsigned archtype = STARPU_OPENCL_WORKER;
			
 
				+			display_selected_archtype_perf_models(gnuplot_file, model, archtype, &first);
			
 
				+			display_history_based_perf_models(gnuplot_file, model,  &archtype, NULL, NULL, &first);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
 
				 		/* There must be a cleaner way ! */
			
 
				-		nmatched = sscanf(archname, "opencl_%d", &gpuid);
			
 
				+		nmatched = sscanf(archname, "opencl_%u", &gpuid);
			
 
				 		if (nmatched == 1)
			
 
				 		{
			
 
				-			int archid = STARPU_OPENCL_DEFAULT+ gpuid;
			
 
				-			if (archid < STARPU_NARCH_VARIATIONS)
			
 
				+			if (gpuid < conf->topology.nopenclgpus)
			
 
				 			{
			
 
				-				display_perf_models(gnuplot_file, model, (enum starpu_perfmodel_archtype) archid, (enum starpu_perfmodel_archtype) (archid + 1), &first);
			
 
				+				arch.type = STARPU_OPENCL_WORKER;
			
 
				+				arch.devid = gpuid;
			
 
				+				arch.ncore = 0;
			
 
				+		
			
 
				+				display_selected_arch_perf_models(gnuplot_file, model, &arch, &first);
			
 
				+				display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
			
 
				 				return;
			
 
				 			}
			
 
				 			else
			
@@ -507,7 +672,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 		FILE *data_file = fopen(data_file_name, "w+");
			
 
				 		STARPU_ASSERT(data_file);
			
 
				-		dump_data_file(data_file);
			
 
				+		dump_data_file(data_file, &model);
			
 
				 		fclose(data_file);
			
 
				 	}
			
 
				 #endif