Browse Source

Change the perf_model structure: an arch now contains several devices, so we can have, e.g., one STARPU_CPU_WORKER device with 2 cores and one STARPU_CUDA_WORKER device with 1 core.
The tools folder is not yet updated (temporarily removed from the Makefile).
fxt only considers the first device of the structure (so that old programs whose arch is composed of, e.g., a single CPU or a single GPU keep working).
tests/perfmodel does not work very well yet (it is unclear what these tests should actually check).

Andra Hugo 11 years ago
parent
commit
5593439277

+ 1 - 1
Makefile.am

@@ -18,7 +18,7 @@ ACLOCAL_AMFLAGS=-I m4
 CLEANFILES = *.gcno *.gcda *.linkinfo
 
 SUBDIRS = src
-SUBDIRS += tools tests
+SUBDIRS += tests
 SUBDIRS += doc
 
 if USE_MPI

+ 20 - 6
include/starpu_perfmodel.h

@@ -35,13 +35,20 @@ struct starpu_data_descr;
 
 #define STARPU_NARCH STARPU_ANY_WORKER
 
-struct starpu_perfmodel_arch
+struct starpu_perfmodel_device
 {
 	enum starpu_worker_archtype type;
 	int devid;	/* identifier of the precise device */
-	int ncore;	/* number of execution in parallel, minus 1 */
+	int ncores;	/* number of execution in parallel, minus 1 */	
 };
 
+struct starpu_perfmodel_arch
+{
+	int ndevices;
+	struct starpu_perfmodel_device *devices;
+};
+
+
 struct starpu_perfmodel_history_entry
 {
 	double mean;
@@ -125,7 +132,7 @@ struct starpu_perfmodel
 	size_t (*size_base)(struct starpu_task *, unsigned nimpl);
 	uint32_t (*footprint)(struct starpu_task *);
 
-	struct starpu_perfmodel_per_arch**** per_arch; /*STARPU_MAXIMPLEMENTATIONS*/
+	struct starpu_perfmodel_per_arch** per_arch; /*STARPU_MAXIMPLEMENTATIONS*/
 
 	const char *symbol;
 
@@ -133,15 +140,22 @@ struct starpu_perfmodel
 	unsigned is_loaded;
 	unsigned benchmarking;
 	starpu_pthread_rwlock_t model_rwlock;
+	int *nimpls;
+	int ncombs;
 };
 
-void starpu_perfmodel_init(struct starpu_perfmodel *model);
-void starpu_perfmodel_init_with_file(FILE*f, struct starpu_perfmodel *model);
+void starpu_perfmodel_init(FILE *f, struct starpu_perfmodel *model);
+//void starpu_perfmodel_init_with_file(FILE*f, struct starpu_perfmodel *model);
+
+struct starpu_perfmodel_arch *starpu_worker_get_perf_archtype(int workerid, unsigned sched_ctx_id);
 
-struct starpu_perfmodel_arch *starpu_worker_get_perf_archtype(int workerid);
+struct starpu_perfmodel_per_arch *starpu_perfmodel_get_model_per_arch(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, unsigned impl);
 
 int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model);
 int starpu_perfmodel_unload_model(struct starpu_perfmodel *model);
+int starpu_get_narch_combs();
+int starpu_add_arch_comb(int ndevices, struct starpu_perfmodel_device* devices);
+int starpu_get_arch_comb(int ndevices, struct starpu_perfmodel_device *devices);
 
 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, char *path, size_t maxlen, unsigned nimpl);
 char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype);

+ 1 - 1
sc_hypervisor/src/policies_utils/policy_tools.c

@@ -416,7 +416,7 @@ void sc_hypervisor_get_tasks_times(int nw, int nt, double times[nw][nt], int *wo
                 for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
                 {
 			int worker = workers == NULL ? w : workers[w];
-                        struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(worker);
+                        struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(worker, STARPU_NMAX_SCHED_CTXS);
                         double length = starpu_permodel_history_based_expected_perf(tp->cl->model, arch, tp->footprint);
 
                         if (isnan(length))

+ 1 - 1
src/common/fxt.h

@@ -431,7 +431,7 @@ do {									\
 do {									\
 	const size_t job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));	\
 	const uint32_t job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));\
-	FUT_DO_PROBE7(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), (archtype)->type, (archtype)->devid, (archtype)->ncore, workerid);	\
+	FUT_DO_PROBE7(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), (archtype->devices[0]).type, (archtype->devices[0]).devid, (archtype->devices[0]).ncores, workerid);	\
 } while(0);
 
 #define _STARPU_TRACE_START_EXECUTING()				\

+ 3 - 3
src/core/combined_workers.c

@@ -101,9 +101,9 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 
 	combined_worker->worker_size = nworkers;
 
-	combined_worker->perf_arch.type = config->workers[workerid_array[0]].perf_arch.type;
-	combined_worker->perf_arch.devid = config->workers[workerid_array[0]].perf_arch.devid; 
-	combined_worker->perf_arch.ncore = nworkers - 1;
+	combined_worker->perf_arch.devices[0].type = config->workers[workerid_array[0]].perf_arch.devices[0].type;
+	combined_worker->perf_arch.devices[0].devid = config->workers[workerid_array[0]].perf_arch.devices[0].devid; 
+	combined_worker->perf_arch.devices[0].ncores = nworkers - 1;
 	combined_worker->worker_mask = config->workers[workerid_array[0]].worker_mask;
 	
 #ifdef STARPU_USE_MP

+ 2 - 2
src/core/detect_combined_workers.c

@@ -44,7 +44,7 @@ static void find_workers(hwloc_obj_t obj, int cpu_workers[STARPU_NMAXWORKERS], u
 	for(worker = _starpu_worker_list_begin(workers); worker != _starpu_worker_list_end(workers); worker = _starpu_worker_list_next(worker))
 	{
 		/* is it a CPU worker? */
-		if (worker->perf_arch.type == STARPU_CPU_WORKER && worker->perf_arch.ncore == 0)
+		if (worker->perf_arch.devices[0].type == STARPU_CPU_WORKER && worker->perf_arch.devices[0].ncores == 0)
 		{
 			_STARPU_DEBUG("worker %d is part of it\n", worker->workerid);
 			/* Add it to the combined worker */
@@ -178,7 +178,7 @@ static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers
 	for (i = 0; i < nworkers; i++)
 	{
 		struct _starpu_worker *worker = _starpu_get_worker_struct(workerids[i]);
-		if (worker->perf_arch.type == STARPU_CPU_WORKER && worker->perf_arch.ncore == 0)
+		if (worker->perf_arch.devices[0].type == STARPU_CPU_WORKER && worker->perf_arch.devices[0].ncores == 0)
 		{
 			hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->pu_depth, worker->bindid);
 			obj = obj->parent;

+ 33 - 30
src/core/perfmodel/perfmodel.c

@@ -37,7 +37,6 @@
  *	2: models must be calibrated, existing models are overwritten.
  */
 static unsigned calibrate_flag = 0;
-
 void _starpu_set_calibrate_flag(unsigned val)
 {
 	calibrate_flag = val;
@@ -48,8 +47,15 @@ unsigned _starpu_get_calibrate_flag(void)
 	return calibrate_flag;
 }
 
-struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid)
+struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid, unsigned sched_ctx_id)
 {
+	if(sched_ctx_id != STARPU_NMAX_SCHED_CTXS)
+	{
+		unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
+		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+			return _starpu_sched_ctx_get_perf_archtype(child_sched_ctx);
+	}
+
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 
 	/* This workerid may either be a basic worker or a combined worker */
@@ -57,6 +63,7 @@ struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid)
 
 	if (workerid < (int)config->topology.nworkers)
 		return &config->workers[workerid].perf_arch;
+	
 
 	/* We have a combined worker */
 	unsigned ncombinedworkers = config->topology.ncombinedworkers;
@@ -71,11 +78,14 @@ struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid)
 static double per_arch_task_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, struct starpu_task *task, unsigned nimpl)
 {
 	double exp = NAN;
+	int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
+	if(comb == -1) return exp;
+		
 	double (*per_arch_cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 	double (*per_arch_cost_model)(struct starpu_data_descr *);
 
-	per_arch_cost_function = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].cost_function;
-	per_arch_cost_model = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].cost_model;
+	per_arch_cost_function = model->per_arch[comb][nimpl].cost_function;
+	per_arch_cost_model = model->per_arch[comb][nimpl].cost_model;
 
 	if (per_arch_cost_function)
 		exp = per_arch_cost_function(task, arch, nimpl);
@@ -91,26 +101,23 @@ static double per_arch_task_expected_perf(struct starpu_perfmodel *model, struct
 
 double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch* perf_arch)
 {
-	if (perf_arch->type == STARPU_CPU_WORKER)
-	{
-		return _STARPU_CPU_ALPHA * (perf_arch->ncore + 1);
-	}
-	else if (perf_arch->type == STARPU_CUDA_WORKER)
-	{
-		return _STARPU_CUDA_ALPHA;
-	}
-	else if (perf_arch->type == STARPU_OPENCL_WORKER)
-	{
-		return _STARPU_OPENCL_ALPHA;
-	}
-	else if (perf_arch->type == STARPU_MIC_WORKER)
+	double speedup = 0;
+	int dev;
+	for(dev = 0; dev < perf_arch->ndevices; dev++)
 	{
-		return _STARPU_MIC_ALPHA * (perf_arch->ncore + 1);
+		double coef = 0.0;
+		if (perf_arch->devices[dev].type == STARPU_CPU_WORKER)
+			coef = _STARPU_CPU_ALPHA;
+		else if (perf_arch->devices[dev].type == STARPU_CUDA_WORKER)
+			coef = _STARPU_CUDA_ALPHA;
+		else if (perf_arch->devices[dev].type == STARPU_OPENCL_WORKER)
+			coef = _STARPU_OPENCL_ALPHA;
+		else if (perf_arch->devices[dev].type == STARPU_MIC_WORKER)
+			coef =  _STARPU_MIC_ALPHA;
+		
+		speedup += coef * (perf_arch->devices[dev].ncores + 1);
 	}
-	STARPU_ABORT();
-
-	/* Never reached ! */
-	return NAN;
+	return speedup == 0 ? NAN : speedup;
 }
 
 static double common_task_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct starpu_task *task, unsigned nimpl)
@@ -184,22 +191,15 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 		switch (model->type)
 		{
 			case STARPU_PER_ARCH:
-
 				return per_arch_task_expected_perf(model, arch, task, nimpl);
 			case STARPU_COMMON:
 				return common_task_expected_perf(model, arch, task, nimpl);
-
 			case STARPU_HISTORY_BASED:
-
 				return _starpu_history_based_job_expected_perf(model, arch, j, nimpl);
 			case STARPU_REGRESSION_BASED:
-
 				return _starpu_regression_based_job_expected_perf(model, arch, j, nimpl);
-
 			case STARPU_NL_REGRESSION_BASED:
-
 				return _starpu_non_linear_regression_based_job_expected_perf(model, arch, j,nimpl);
-
 			default:
 				STARPU_ABORT();
 		}
@@ -223,6 +223,8 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 					    struct starpu_perfmodel_arch* arch,
 					    unsigned nimpl)
 {
+	if(arch->ndevices > 1)
+		return -1.0;
 	unsigned i;
 	double sum = 0.0;
 	enum starpu_node_kind node_kind;
@@ -236,7 +238,7 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 		if (!_starpu_data_is_multiformat_handle(handle))
 			continue;
 		
-		switch(arch->type)
+		switch(arch->devices[0].type)
 		{
 			case STARPU_CPU_WORKER:
 				node_kind = STARPU_CPU_RAM;
@@ -503,3 +505,4 @@ void _starpu_create_sampling_directory_if_needed(void)
 		directory_existence_was_tested = 1;
 	}
 }
+

+ 4 - 1
src/core/perfmodel/perfmodel.h

@@ -38,7 +38,10 @@ extern "C"
  * differents versions of StarPU having different performance model
  * formats.
  */
-#define _STARPU_PERFMODEL_VERSION 43
+#define _STARPU_PERFMODEL_VERSION 44
+
+struct starpu_perfmodel_arch **arch_combs;
+int narch_combs;
 
 struct _starpu_perfmodel_list
 {

+ 275 - 388
src/core/perfmodel/perfmodel_history.c

@@ -52,17 +52,79 @@ struct starpu_perfmodel_history_table
 static starpu_pthread_rwlock_t registered_models_rwlock;
 static struct _starpu_perfmodel_list *registered_models = NULL;
 
-size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, struct _starpu_job *j)
+int starpu_add_arch_comb(int ndevices, struct starpu_perfmodel_device* devices)
+{
+	arch_combs[narch_combs] = (struct starpu_perfmodel_arch*)malloc(sizeof(struct starpu_perfmodel_arch));
+	arch_combs[narch_combs]->devices = (struct starpu_perfmodel_device*)malloc(ndevices*sizeof(struct starpu_perfmodel_device));
+	arch_combs[narch_combs]->ndevices = ndevices;
+	int dev;
+	for(dev = 0; dev < ndevices; dev++)
+	{
+		arch_combs[narch_combs]->devices[dev].type = devices[dev].type;
+		arch_combs[narch_combs]->devices[dev].devid = devices[dev].devid;
+		arch_combs[narch_combs]->devices[dev].ncores = devices[dev].ncores;
+	}
+	narch_combs++;
+	return narch_combs-1;
+}
+
+int starpu_get_arch_comb(int ndevices, struct starpu_perfmodel_device *devices)
+{
+	int nfounded = 0;
+	unsigned found = 0;
+	int comb;
+	for(comb = 0; comb < narch_combs; comb++)
+	{
+		if(arch_combs[comb]->ndevices == ndevices)
+		{
+			int dev1, dev2;
+			for(dev1 = 0; dev1 < arch_combs[comb]->ndevices; dev1++)
+			{
+				for(dev2 = 0; dev2 < ndevices; dev2++)
+				{
+					if(arch_combs[comb]->devices[dev1].type == devices[dev2].type && 
+					   arch_combs[comb]->devices[dev1].devid == devices[dev2].devid && 
+					   arch_combs[comb]->devices[dev1].ncores == devices[dev2].ncores)
+						nfounded++;
+				}
+			}
+			if(nfounded == ndevices)
+				found = 1;
+		}
+		if(found)
+			return comb;
+	}	
+	return -1;
+}
+
+static 	void _free_arch_combs(void)
+{
+	int i;
+	for(i = 0; i < narch_combs; i++)
+	{
+		free(arch_combs[i]->devices);
+		free(arch_combs[i]);
+	}
+	narch_combs = 0;
+}
+
+int starpu_get_narch_combs()
+{
+	return narch_combs;
+}
+
+size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned impl, struct _starpu_job *j)
 {
 	struct starpu_task *task = j->task;
+	int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
 
-	if (model && model->per_arch && model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base)
+	if (model && model->per_arch && comb != -1 && model->per_arch[comb][impl].size_base)
 	{
-		return model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base(task, arch, nimpl);
+		return model->per_arch[comb][impl].size_base(task, arch, impl);
 	}
 	else if (model && model->size_base)
 	{
-		return model->size_base(task, nimpl);
+		return model->size_base(task, impl);
 	}
 	else
 	{
@@ -103,11 +165,11 @@ static void insert_history_entry(struct starpu_perfmodel_history_entry *entry, s
 	HASH_ADD_UINT32_T(*history_ptr, footprint, table);
 }
 
-static void dump_reg_model(FILE *f, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl)
+static void dump_reg_model(FILE *f, struct starpu_perfmodel *model, int comb, int impl)
 {
 	struct starpu_perfmodel_per_arch *per_arch_model;
 
-	per_arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
+	per_arch_model = &model->per_arch[comb][impl];
 	struct starpu_perfmodel_regression_model *reg_model;
 	reg_model = &per_arch_model->regression;
 
@@ -275,14 +337,10 @@ static void parse_per_arch_model_file(FILE *f, struct starpu_perfmodel_per_arch
 }
 
 
-static void parse_arch(FILE *f, struct starpu_perfmodel *model, unsigned scan_history,struct starpu_perfmodel_arch* arch)
+static void parse_arch(FILE *f, struct starpu_perfmodel *model, unsigned scan_history, int comb)
 {
 	struct starpu_perfmodel_per_arch dummy;
 	unsigned nimpls, implmax, impl, i, ret;
-	//_STARPU_DEBUG("Parsing %s_%u_parallel_%u\n",
-	//		starpu_perfmodel_get_archtype_name(arch->type),
-	//		arch->devid,
-	//		arch->ncore + 1);
 
 	/* Parsing number of implementation */
 	_starpu_drop_comments(f);
@@ -293,8 +351,14 @@ static void parse_arch(FILE *f, struct starpu_perfmodel *model, unsigned scan_hi
 	{
 		/* Parsing each implementation */
 		implmax = STARPU_MIN(nimpls, STARPU_MAXIMPLEMENTATIONS);
+		if(implmax > 0)
+		{
+			model->nimpls[comb] = implmax;
+			model->per_arch[comb] = (struct starpu_perfmodel_per_arch*)malloc(implmax*sizeof(struct starpu_perfmodel_per_arch));
+		}
+
 		for (impl = 0; impl < implmax; impl++)
-			parse_per_arch_model_file(f, &model->per_arch[arch->type][arch->devid][arch->ncore][impl], scan_history);
+			parse_per_arch_model_file(f, &model->per_arch[comb][impl], scan_history);
 	}
 	else
 	{
@@ -308,99 +372,89 @@ static void parse_arch(FILE *f, struct starpu_perfmodel *model, unsigned scan_hi
 
 }
 
-static void parse_device(FILE *f, struct starpu_perfmodel *model, unsigned scan_history, enum starpu_worker_archtype archtype, unsigned devid)
+static enum starpu_worker_archtype _get_enum_type(int type)
 {
-	unsigned maxncore, ncore, ret, i;
-	struct starpu_perfmodel_arch arch;
-	arch.type = archtype;
-	arch.devid = devid;
-	//_STARPU_DEBUG("Parsing device %s_%u arch\n",
-	//		starpu_perfmodel_get_archtype_name(archtype),
-	//		devid);
-
-	/* Parsing maximun number of worker for this device */
-	_starpu_drop_comments(f);
-	ret = fscanf(f, "%u\n", &maxncore);
-	STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
-
-	/* Parsing each arch */
-	if(model !=NULL)
-	{
-		for(ncore=0; ncore < maxncore && model->per_arch[archtype][devid][ncore] != NULL; ncore++)
-		{
-			arch.ncore = ncore;
-			parse_arch(f,model,scan_history,&arch);
-		}
-	}
-	else
+	switch(type)
 	{
-		ncore=0;
+	        case 0:
+			return STARPU_CPU_WORKER;
+        	case 1:
+			return STARPU_CUDA_WORKER;
+	        case 2: 
+			return STARPU_OPENCL_WORKER;
+        	case 3:
+			return STARPU_MIC_WORKER;
+        	case 4:
+			return STARPU_SCC_WORKER;
+		default:
+			STARPU_ABORT();
 	}
 
-	for(i=ncore; i < maxncore; i++)
-	{
-		arch.ncore = i;
-		parse_arch(f,NULL,scan_history,&arch);
-	}
 }
-
-
-static void parse_archtype(FILE *f, struct starpu_perfmodel *model, unsigned scan_history, enum starpu_worker_archtype archtype)
+static void parse_comb(FILE *f, struct starpu_perfmodel *model, unsigned scan_history, int comb)
 {
-	unsigned ndevice, devid, ret, i;
-	//_STARPU_DEBUG("Parsing %s arch\n", starpu_perfmodel_get_archtype_name(archtype));
-
-	/* Parsing number of device for this archtype */
+	int ndevices = 0;
 	_starpu_drop_comments(f);
-	ret = fscanf(f, "%u\n", &ndevice);
+	int ret = fscanf(f, "%d\n", &ndevices );
 	STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
 
-	/* Parsing each device for this archtype*/
-	if(model != NULL)
-	{
-		for(devid=0; devid < ndevice && model->per_arch[archtype][devid] != NULL; devid++)
-		{
-				parse_device(f,model,scan_history,archtype,devid);
-		}
-	}
-	else
+	struct starpu_perfmodel_device devices[ndevices];
+	
+	int dev;
+	for(dev = 0; dev < ndevices; dev++)
 	{
-		devid=0;
+		enum starpu_worker_archtype dev_type;
+		_starpu_drop_comments(f);
+		int type;
+		ret = fscanf(f, "%d\n", &type);
+		STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
+		dev_type = _get_enum_type(type);
+		int dev_id;
+		_starpu_drop_comments(f);
+		ret = fscanf(f, "%d\n", &dev_id);
+		STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
+		int ncores;
+		_starpu_drop_comments(f);
+		ret = fscanf(f, "%d\n", &ncores);
+		STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
+		devices[dev].type = dev_type;	
+		devices[dev].devid = dev_id;
+		devices[dev].ncores = ncores;
 	}
+	starpu_add_arch_comb(ndevices, devices);
 
-	for(i=devid; i < ndevice; i++)
-	{
-		parse_device(f,NULL,scan_history,archtype,i);
-	}
+	parse_arch(f, model, scan_history, comb);
 }
 
 static void parse_model_file(FILE *f, struct starpu_perfmodel *model, unsigned scan_history)
 {
-	unsigned archtype;
 	int ret, version;
 
-	//_STARPU_DEBUG("Start parsing\n");
-
 	/* Parsing performance model version */
 	_starpu_drop_comments(f);
 	ret = fscanf(f, "%d\n", &version);
 	STARPU_ASSERT_MSG(version == _STARPU_PERFMODEL_VERSION, "Incorrect performance model file with a model version %d not being the current model version (%d)\n",
 			  version, _STARPU_PERFMODEL_VERSION);
 	STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
+	
+	
+	int ncombs = 0;
+	_starpu_drop_comments(f);
+	ret = fscanf(f, "%d\n", &ncombs);
+	STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
+	if(ncombs > 0)
+		model->ncombs = ncombs;
 
-	/* Parsing each kind of archtype */
-	for(archtype=0; archtype<STARPU_NARCH; archtype++)
-	{
-		parse_archtype(f, model, scan_history, archtype);
-	}
+	int comb;
+	for(comb = 0; comb < ncombs; comb++)
+		parse_comb(f, model, scan_history, comb);
 }
 
-
-static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned nimpl)
+static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, int comb, unsigned impl)
 {
 	struct starpu_perfmodel_per_arch *per_arch_model;
 
-	per_arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
+	per_arch_model = &model->per_arch[comb][impl];
 	/* count the number of elements in the lists */
 	struct starpu_perfmodel_history_list *ptr = NULL;
 	unsigned nentries = 0;
@@ -418,12 +472,12 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, st
 
 	/* header */
 	char archname[32];
-	starpu_perfmodel_get_arch_name(arch, archname, 32, nimpl);
+	starpu_perfmodel_get_arch_name(arch_combs[comb], archname,  32, impl);
 	fprintf(f, "#####\n");
 	fprintf(f, "# Model for %s\n", archname);
 	fprintf(f, "# number of entries\n%u\n", nentries);
 
-	dump_reg_model(f, model, arch, nimpl);
+	dump_reg_model(f, model, comb, impl);
 
 	/* Dump the history into the model file in case it is necessary */
 	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
@@ -440,218 +494,58 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, st
 	fprintf(f, "\n");
 }
 
-static unsigned get_n_entries(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned impl)
-{
-	struct starpu_perfmodel_per_arch *per_arch_model;
-	per_arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][impl];
-	/* count the number of elements in the lists */
-	struct starpu_perfmodel_history_list *ptr = NULL;
-	unsigned nentries = 0;
-
-	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
-	{
-		/* Dump the list of all entries in the history */
-		ptr = per_arch_model->list;
-		while(ptr)
-		{
-			nentries++;
-			ptr = ptr->next;
-		}
-	}
-	return nentries;
-}
-
 static void dump_model_file(FILE *f, struct starpu_perfmodel *model)
 {
-	struct _starpu_machine_config *conf = _starpu_get_machine_config();
-	char *name = "unknown";
-	unsigned archtype, ndevice, *ncore, devid, nc, nimpl;
-	struct starpu_perfmodel_arch arch;
-
 	fprintf(f, "##################\n");
 	fprintf(f, "# Performance Model Version\n");
 	fprintf(f, "%d\n\n", _STARPU_PERFMODEL_VERSION);
 
-	for(archtype=0; archtype<STARPU_NARCH; archtype++)
-	{
-		arch.type = archtype;
-		switch (archtype)
-		{
-			case STARPU_CPU_WORKER:
-				ndevice = 1;
-				ncore = &conf->topology.nhwcpus;
-				name = "CPU";
-				break;
-			case STARPU_CUDA_WORKER:
-				ndevice = conf->topology.nhwcudagpus;
-				ncore = NULL;
-				name = "CUDA";
-				break;
-			case STARPU_OPENCL_WORKER:
-				ndevice = conf->topology.nhwopenclgpus;
-				ncore = NULL;
-				name = "OPENCL";
-				break;
-			case STARPU_MIC_WORKER:
-				ndevice = conf->topology.nhwmicdevices;
-				ncore = conf->topology.nhwmiccores;
-				name = "MIC";
-				break;
-			case STARPU_SCC_WORKER:
-				ndevice = conf->topology.nhwscc;
-				ncore = NULL;
-				name = "SCC";
-				break;
-			default:
-				/* Unknown arch */
-				STARPU_ABORT();
-				break;
-		}
-
-		fprintf(f, "####################\n");
-		fprintf(f, "# %ss\n", name);
-		fprintf(f, "# number of %s devices\n", name);
-		fprintf(f, "%u\n", ndevice);
-
-
-		for(devid=0; devid<ndevice; devid++)
-		{
-			arch.devid = devid;
-			fprintf(f, "###############\n");
-			fprintf(f, "# %s_%u\n", name, devid);
-			fprintf(f, "# number of workers on device %s_%d\n", name, devid);
-			if(ncore != NULL)
-				fprintf(f, "%u\n", ncore[devid]);
-			else
-				fprintf(f, "1\n");
-			for(nc=0; model->per_arch[archtype][devid][nc] != NULL; nc++)
-			{
-
-				arch.ncore = nc;
-				unsigned max_impl = 0;
-				if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
-				{
-					for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-						if (get_n_entries(model, &arch, nimpl))
-							max_impl = nimpl + 1;
-				}
-				else if (model->type == STARPU_REGRESSION_BASED || model->type == STARPU_PER_ARCH || model->type == STARPU_COMMON)
-				{
-					for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-						if (model->per_arch[archtype][devid][nc][nimpl].regression.nsample)
-							max_impl = nimpl + 1;
-				}
-				else
-					STARPU_ASSERT_MSG(0, "Unknown history-based performance model %u", model->type);
+	int ncombs = model->ncombs;
 
-				fprintf(f, "##########\n");
-				fprintf(f, "# %u worker(s) in parallel\n", nc+1);
+	fprintf(f, "####################\n");
+	fprintf(f, "# COMBs\n");
+	fprintf(f, "# number of combinations\n");
+	fprintf(f, "%u\n", ncombs);
 
-				fprintf(f, "# number of implementations\n");
-				fprintf(f, "%u\n", max_impl);
-				for (nimpl = 0; nimpl < max_impl; nimpl++)
-				{
-					dump_per_arch_model_file(f, model, &arch, nimpl);
-				}
-			}
-		}
-	}
-}
-
-static void initialize_per_arch_model(struct starpu_perfmodel_per_arch *per_arch_model)
-{
-	memset(per_arch_model, 0, sizeof(struct starpu_perfmodel_per_arch));
-}
-
-static struct starpu_perfmodel_per_arch*** initialize_arch_model(int maxdevid, unsigned* maxncore_table)
-{
-	int devid, ncore, nimpl;
-	struct starpu_perfmodel_per_arch *** arch_model = malloc(sizeof(*arch_model)*(maxdevid+1));
-	arch_model[maxdevid] = NULL;
-	for(devid=0; devid<maxdevid; devid++)
+	int comb, impl, dev;
+	for(comb = 0; comb < ncombs; comb++)
 	{
-		int maxncore;
-		if(maxncore_table != NULL)
-			maxncore = maxncore_table[devid];
-		else
-			maxncore = 1;
-
-		arch_model[devid] = malloc(sizeof(*arch_model[devid])*(maxncore+1));
-		arch_model[devid][maxncore] = NULL;
-		for(ncore=0; ncore<maxncore; ncore++)
+		int ndevices = arch_combs[comb]->ndevices;
+		fprintf(f, "####################\n");
+		fprintf(f, "# COMB_%d\n", comb);
+		fprintf(f, "# number of types devices\n");
+		fprintf(f, "%u\n", ndevices);
+		
+		for(dev = 0; dev < ndevices; dev++)
 		{
-			arch_model[devid][ncore] = malloc(sizeof(*arch_model[devid][ncore])*STARPU_MAXIMPLEMENTATIONS);
-			for(nimpl=0; nimpl<STARPU_MAXIMPLEMENTATIONS; nimpl++)
-			{
-				initialize_per_arch_model(&arch_model[devid][ncore][nimpl]);
-			}
+			fprintf(f, "####################\n");
+			fprintf(f, "# DEV_%d\n", dev);
+			fprintf(f, "# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3, SCC - 4)\n");
+			fprintf(f, "%u\n", arch_combs[comb]->devices[dev].type);
+
+			fprintf(f, "####################\n");
+			fprintf(f, "# DEV_%d\n", dev);
+			fprintf(f, "# device id \n");
+			fprintf(f, "%u\n", arch_combs[comb]->devices[dev].devid);
+
+			fprintf(f, "####################\n");
+			fprintf(f, "# DEV_%d\n", dev);
+			fprintf(f, "# number of cores \n");
+			fprintf(f, "%u\n", arch_combs[comb]->devices[dev].ncores);
 		}
-	}
-	return arch_model;
-}
-
-static void initialize_model(struct starpu_perfmodel *model)
-{
-	struct _starpu_machine_config *conf = _starpu_get_machine_config();
-	model->per_arch = malloc(sizeof(*model->per_arch)*(STARPU_NARCH));
-
-	model->per_arch[STARPU_CPU_WORKER] = initialize_arch_model(1,&conf->topology.nhwcpus);
-	model->per_arch[STARPU_CUDA_WORKER] = initialize_arch_model(conf->topology.nhwcudagpus,NULL);
-	model->per_arch[STARPU_OPENCL_WORKER] = initialize_arch_model(conf->topology.nhwopenclgpus,NULL);
-	model->per_arch[STARPU_MIC_WORKER] = initialize_arch_model(conf->topology.nhwmicdevices,conf->topology.nhwmiccores);
-	model->per_arch[STARPU_SCC_WORKER] = initialize_arch_model(conf->topology.nhwscc,NULL);
-}
-
-static void initialize_model_with_file(FILE*f, struct starpu_perfmodel *model)
-{
-	unsigned ret, archtype, devid, i, ndevice, * maxncore;
-	struct starpu_perfmodel_arch arch;
-	int version;
-
-	/* Parsing performance model version */
-	_starpu_drop_comments(f);
-	ret = fscanf(f, "%d\n", &version);
-	STARPU_ASSERT_MSG(version == _STARPU_PERFMODEL_VERSION, "Incorrect performance model file with a model version %d not being the current model version (%d)\n",
-			version, _STARPU_PERFMODEL_VERSION);
-	STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
-
-	model->per_arch = malloc(sizeof(*model->per_arch)*(STARPU_NARCH));
-	for(archtype=0; archtype<STARPU_NARCH; archtype++)
-	{
-		arch.type = archtype;
-
-		_starpu_drop_comments(f);
-		ret = fscanf(f, "%u\n", &ndevice);
-		STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
-
-		if(ndevice != 0)
-			maxncore = malloc(sizeof(*maxncore)*ndevice);
-		else
-			maxncore = NULL;
-
-		for(devid=0; devid < ndevice; devid++)
+		
+		int nimpls = model->nimpls[comb];
+		fprintf(f, "##########\n");
+		fprintf(f, "# number of implementations\n");
+		fprintf(f, "%u\n", nimpls);
+		for (impl = 0; impl < nimpls; impl++)
 		{
-			arch.devid = devid;
-
-			_starpu_drop_comments(f);
-			ret = fscanf(f, "%u\n", &maxncore[devid]);
-			STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
-
-			for(i=0; i<maxncore[devid]; i++)
-			{
-				arch.ncore = i;
-
-				parse_arch(f,NULL,0,&arch);
-			}
+			dump_per_arch_model_file(f, model, comb, impl);
 		}
-
-		model->per_arch[archtype] = initialize_arch_model(ndevice,maxncore);
-		if(maxncore != NULL)
-			free(maxncore);
 	}
 }
 
-void starpu_perfmodel_init(struct starpu_perfmodel *model)
+void starpu_perfmodel_init(FILE *f, struct starpu_perfmodel *model)
 {
 	STARPU_ASSERT(model && model->symbol);
 
@@ -677,43 +571,35 @@ void starpu_perfmodel_init(struct starpu_perfmodel *model)
 
 	STARPU_PTHREAD_RWLOCK_INIT(&model->model_rwlock, NULL);
 	if(model->type != STARPU_COMMON)
-		initialize_model(model);
-	model->is_init = 1;
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&registered_models_rwlock);
-}
-
-void starpu_perfmodel_init_with_file(FILE*f, struct starpu_perfmodel *model)
-{
-	STARPU_ASSERT(model && model->symbol);
-
-	int already_init;
-
-	STARPU_PTHREAD_RWLOCK_RDLOCK(&registered_models_rwlock);
-	already_init = model->is_init;
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&registered_models_rwlock);
-
-	if (already_init)
-		return;
-
-	/* The model is still not loaded so we grab the lock in write mode, and
-	 * if it's not loaded once we have the lock, we do load it. */
-	STARPU_PTHREAD_RWLOCK_WRLOCK(&registered_models_rwlock);
-
-	/* Was the model initialized since the previous test ? */
-	if (model->is_init)
 	{
-		STARPU_PTHREAD_RWLOCK_UNLOCK(&registered_models_rwlock);
-		return;
+		struct _starpu_machine_config *conf = _starpu_get_machine_config();
+		unsigned ncores = conf->topology.nhwcpus;
+		unsigned ncuda =  conf->topology.nhwcudagpus;
+		unsigned nopencl = conf->topology.nhwopenclgpus;
+		unsigned nmic = 0;
+		unsigned i;
+		for(i = 0; i < conf->topology.nhwmicdevices; i++)
+			nmic += conf->topology.nhwmiccores[i];
+		unsigned nscc = conf->topology.nhwscc;
+		unsigned npossible_combs= pow(2, (ncores + ncuda + nopencl + nmic + nscc));
+		arch_combs = (struct starpu_perfmodel_arch**) malloc(npossible_combs*sizeof(struct starpu_perfmodel_arch*));
+		narch_combs = 0;
+		model->per_arch = (struct starpu_perfmodel_per_arch**) malloc(npossible_combs*sizeof(struct starpu_perfmodel_per_arch*));
+		model->nimpls = (int *)malloc(npossible_combs*sizeof(int));
+
+		for(i = 0; i < npossible_combs; i++)
+		{
+			model->per_arch[i] = NULL;
+			model->nimpls[i] = 0;
+		}
+		if(f)
+			parse_model_file(f, model, 0);
 	}
 
-	STARPU_PTHREAD_RWLOCK_INIT(&model->model_rwlock, NULL);
-	if(model->type != STARPU_COMMON)
-		initialize_model_with_file(f,model);
 	model->is_init = 1;
 	STARPU_PTHREAD_RWLOCK_UNLOCK(&registered_models_rwlock);
 }
 
-
 static void get_model_debug_path(struct starpu_perfmodel *model, const char *arch, char *path, size_t maxlen)
 {
 	STARPU_ASSERT(path);
@@ -731,11 +617,11 @@ static void get_model_debug_path(struct starpu_perfmodel *model, const char *arc
 }
 
 /*
- * Returns 0 is the model was already loaded, 1 otherwise.
+ * Returns 0 if the model was already loaded, 1 otherwise.
  */
 int _starpu_register_model(struct starpu_perfmodel *model)
 {
-	starpu_perfmodel_init(model);
+	starpu_perfmodel_init(NULL, model);
 
 	/* If the model has already been loaded, there is nothing to do */
 	STARPU_PTHREAD_RWLOCK_RDLOCK(&registered_models_rwlock);
@@ -768,34 +654,19 @@ int _starpu_register_model(struct starpu_perfmodel *model)
 #ifdef STARPU_MODEL_DEBUG
 	_starpu_create_sampling_directory_if_needed();
 
-	unsigned archtype, devid, ncore, nimpl;
-	struct starpu_perfmodel_arch arch;
-
 	_STARPU_DEBUG("\n\n ###\nHere\n ###\n\n");
 
 	if(model->is_init)
 	{
 		_STARPU_DEBUG("Init\n");
-		for (archtype = 0; archtype < STARPU_NARCH; archtype++)
+		/* walk every recorded arch combination; the per-comb impl count
+		 * lives in model->nimpls[comb] (there is no scalar model->nimpl) */
+		int ncombs = model->ncombs;
+		int comb, impl;
+		for(comb = 0; comb < ncombs; comb++)
 		{
-			_STARPU_DEBUG("Archtype\n");
-			arch.type = archtype;
-			if(model->per_arch[archtype] != NULL)
+			int nimpls = model->nimpls[comb];
+			for(impl = 0; impl < nimpls; impl++)
 			{
-				for(devid=0; model->per_arch[archtype][devid] != NULL; devid++)
-				{
-					_STARPU_DEBUG("Devid\n");
-					arch.devid = devid;
-					for(ncore=0; model->per_arch[archtype][devid][ncore] != NULL; ncore++)
-					{
-						_STARPU_DEBUG("Ncore\n");
-						arch.ncore = ncore;
-						for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-						{
-							starpu_perfmodel_debugfilepath(model, &arch, model->per_arch[archtype][devid][ncore][nimpl].debug_path, 256, nimpl);
-						}
-					}
-				}
+				/* debugfilepath expects the arch combination itself,
+				 * not the per-arch model entry */
+				starpu_perfmodel_debugfilepath(model, arch_combs[comb], model->per_arch[comb][impl].debug_path, 256, impl);
 			}
 		}
 	}
@@ -868,53 +739,44 @@ void _starpu_initialize_registered_performance_models(void)
 
 void _starpu_deinitialize_performance_model(struct starpu_perfmodel *model)
 {
-	unsigned arch, devid, ncore, nimpl;
-
 	if(model->is_init && model->per_arch != NULL)
 	{
-		for (arch = 0; arch < STARPU_NARCH; arch++)
+		int ncombs = model->ncombs;
+		int comb, impl;
+		for(comb = 0; comb < ncombs; comb++)
 		{
-			if( model->per_arch[arch] != NULL)
+			int nimpls = model->nimpls[comb];
+			for(impl = 0; impl < nimpls; impl++)
 			{
-				for(devid=0; model->per_arch[arch][devid] != NULL; devid++)
+				struct starpu_perfmodel_per_arch *archmodel = &model->per_arch[comb][impl];
+				struct starpu_perfmodel_history_list *list, *plist;
+				struct starpu_perfmodel_history_table *entry, *tmp;
+				
+				HASH_ITER(hh, archmodel->history, entry, tmp)
 				{
-					for(ncore=0; model->per_arch[arch][devid][ncore] != NULL; ncore++)
-					{
-						for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-						{
-							struct starpu_perfmodel_per_arch *archmodel = &model->per_arch[arch][devid][ncore][nimpl];
-							struct starpu_perfmodel_history_list *list, *plist;
-							struct starpu_perfmodel_history_table *entry, *tmp;
-
-							HASH_ITER(hh, archmodel->history, entry, tmp)
-							{
-								HASH_DEL(archmodel->history, entry);
-								free(entry);
-							}
-							archmodel->history = NULL;
-
-							list = archmodel->list;
-							while (list)
-							{
-								free(list->entry);
-								plist = list;
-								list = list->next;
-								free(plist);
-							}
-							archmodel->list = NULL;
-						}
-						free(model->per_arch[arch][devid][ncore]);
-						model->per_arch[arch][devid][ncore] = NULL;
-					}
-					free(model->per_arch[arch][devid]);
-					model->per_arch[arch][devid] = NULL;
+					HASH_DEL(archmodel->history, entry);
+					free(entry);
 				}
-				free(model->per_arch[arch]);
-				model->per_arch[arch] = NULL;
+				archmodel->history = NULL;
+				
+				list = archmodel->list;
+				while (list)
+				{
+					free(list->entry);
+					plist = list;
+					list = list->next;
+					free(plist);
+				}
+				archmodel->list = NULL;
 			}
-		}
+			free(model->per_arch[comb]);
+			model->per_arch[comb] = NULL;
+		}		
 		free(model->per_arch);
 		model->per_arch = NULL;
+		free(model->nimpls);
+		model->nimpls = NULL;
+
 	}
 
 	model->is_init = 0;
@@ -949,6 +811,7 @@ void _starpu_deinitialize_registered_performance_models(void)
 
 	STARPU_PTHREAD_RWLOCK_UNLOCK(&registered_models_rwlock);
 	STARPU_PTHREAD_RWLOCK_DESTROY(&registered_models_rwlock);
+	_free_arch_combs();
 }
 
 /*
@@ -958,12 +821,12 @@ void _starpu_deinitialize_registered_performance_models(void)
  */
 void _starpu_load_per_arch_based_model(struct starpu_perfmodel *model)
 {
-	starpu_perfmodel_init(model);
+	starpu_perfmodel_init(NULL, model);
 }
 
 void _starpu_load_common_based_model(struct starpu_perfmodel *model)
 {
-	starpu_perfmodel_init(model);
+	starpu_perfmodel_init(NULL, model);
 }
 
 /* We first try to grab the global lock in read mode to check whether the model
@@ -972,7 +835,7 @@ void _starpu_load_common_based_model(struct starpu_perfmodel *model)
  * is still not loaded once we have the lock, we do load it.  */
 void _starpu_load_history_based_model(struct starpu_perfmodel *model, unsigned scan_history)
 {
-	starpu_perfmodel_init(model);
+	starpu_perfmodel_init(NULL, model);
 
 	STARPU_PTHREAD_RWLOCK_WRLOCK(&model->model_rwlock);
 
@@ -1099,9 +962,6 @@ int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *mo
 	FILE *f = fopen(path, "r");
 	STARPU_ASSERT(f);
 
-	starpu_perfmodel_init_with_file(f, model);
-	rewind(f);
-
+	/* allocate per_arch/nimpls before parsing, otherwise parse_model_file
+	 * dereferences NULL arrays on a freshly declared model */
+	starpu_perfmodel_init(NULL, model);
+
 	parse_model_file(f, model, 1);
 
 	STARPU_ASSERT(fclose(f) == 0);
@@ -1141,18 +1001,19 @@ char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype)
 	}
 }
 
-void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch* arch, char *archname, size_t maxlen,unsigned nimpl)
+void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch* arch, char *archname, size_t maxlen,unsigned impl)
 {
-	snprintf(archname, maxlen, "%s%d_parallel%d_impl%u",
-			starpu_perfmodel_get_archtype_name(arch->type),
-			arch->devid,
-			arch->ncore + 1,
-			nimpl);
+	int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
+	STARPU_ASSERT(comb != -1);
+
+	snprintf(archname, maxlen, "%d_impl%u", comb, impl);
 }
 
 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model,
 				    struct starpu_perfmodel_arch* arch, char *path, size_t maxlen, unsigned nimpl)
 {
+	int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
+	STARPU_ASSERT(comb != -1);
 	char archname[32];
 	starpu_perfmodel_get_arch_name(arch, archname, 32, nimpl);
 
@@ -1163,11 +1024,13 @@ void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model,
 
 double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl)
 {
+	int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
 	double exp = NAN;
+	if(comb == -1) return exp;
 	size_t size = _starpu_job_get_data_size(model, arch, nimpl, j);
 	struct starpu_perfmodel_regression_model *regmodel;
 
-	regmodel = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].regression;
+	regmodel = &model->per_arch[comb][nimpl].regression;
 
 	if (regmodel->valid && size >= regmodel->minx * 0.9 && size <= regmodel->maxx * 1.1)
                 exp = regmodel->alpha*pow((double)size, regmodel->beta);
@@ -1177,18 +1040,20 @@ double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model
 
 double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j,unsigned nimpl)
 {
+	int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
 	double exp = NAN;
+	if(comb == -1) return exp;
 	size_t size = _starpu_job_get_data_size(model, arch, nimpl, j);
 	struct starpu_perfmodel_regression_model *regmodel;
 
-	regmodel = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].regression;
+	regmodel = &model->per_arch[comb][nimpl].regression;
 
 	if (regmodel->nl_valid && size >= regmodel->minx * 0.9 && size <= regmodel->maxx * 1.1)
 		exp = regmodel->a*pow((double)size, regmodel->b) + regmodel->c;
 	else
 	{
 		uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
-		struct starpu_perfmodel_per_arch *per_arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
+		struct starpu_perfmodel_per_arch *per_arch_model = &model->per_arch[comb][nimpl];
 		struct starpu_perfmodel_history_table *history;
 		struct starpu_perfmodel_history_table *entry;
 
@@ -1221,14 +1086,16 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 
 double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j,unsigned nimpl)
 {
+	int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
 	double exp = NAN;
+	if(comb == -1) return exp;
 	struct starpu_perfmodel_per_arch *per_arch_model;
 	struct starpu_perfmodel_history_entry *entry;
 	struct starpu_perfmodel_history_table *history, *elt;
 
 	uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
 
-	per_arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
+	per_arch_model = &model->per_arch[comb][nimpl];
 
 	STARPU_PTHREAD_RWLOCK_RDLOCK(&model->model_rwlock);
 	history = per_arch_model->history;
@@ -1270,20 +1137,31 @@ double starpu_permodel_history_based_expected_perf(struct starpu_perfmodel *mode
 	return _starpu_history_based_job_expected_perf(model, arch, &j, j.nimpl);
 }
 
-void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured, unsigned nimpl)
+void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured, unsigned impl)
 {
 	if (model)
 	{
+		int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
+		if(comb == -1)
+			comb = starpu_add_arch_comb(arch->ndevices, arch->devices);
 		STARPU_PTHREAD_RWLOCK_WRLOCK(&model->model_rwlock);
 
-		struct starpu_perfmodel_per_arch *per_arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
+		if(!model->per_arch[comb])
+		{
+			model->per_arch[comb] = (struct starpu_perfmodel_per_arch*)malloc(STARPU_MAXIMPLEMENTATIONS*sizeof(struct starpu_perfmodel_per_arch));
+			int i;
+			for(i = 0; i < STARPU_MAXIMPLEMENTATIONS; i++)
+				memset(&model->per_arch[comb][i], 0, sizeof(struct starpu_perfmodel_per_arch));
+		}
+
+		struct starpu_perfmodel_per_arch *per_arch_model = &model->per_arch[comb][impl];
 
 		if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
 		{
 			struct starpu_perfmodel_history_entry *entry;
 			struct starpu_perfmodel_history_table *elt;
 			struct starpu_perfmodel_history_list **list;
-			uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
+			uint32_t key = _starpu_compute_buffers_footprint(model, arch, impl, j);
 
 			list = &per_arch_model->list;
 
@@ -1310,7 +1188,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 				entry->deviation = 0.0;
 				entry->sum2 = 0;
 
-				entry->size = _starpu_job_get_data_size(model, arch, nimpl, j);
+				entry->size = _starpu_job_get_data_size(model, arch, impl, j);
 				entry->flops = j->task->flops;
 
 				entry->footprint = key;
@@ -1318,6 +1196,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 				entry->nerror = 0;
 
 				insert_history_entry(entry, list, &per_arch_model->history);
+				/* nimpls[comb] counts implementations, not history entries:
+				 * record the highest impl index seen rather than incrementing
+				 * per footprint (which overflows past STARPU_MAXIMPLEMENTATIONS) */
+				if (model->nimpls[comb] < (int) impl + 1) model->nimpls[comb] = impl + 1;
 			}
 			else
 			{
@@ -1336,7 +1215,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 					if (entry->nerror >= entry->nsample)
 					{
 						char archname[32];
-						starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
+						starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), impl);
 						_STARPU_DISP("Too big deviation for model %s on %s: %f vs average %f, %u such errors against %u samples (%+f%%), flushing the performance model. Use the STARPU_HISTORY_MAX_ERROR environement variable to control the threshold (currently %d%%)\n", model->symbol, archname, measured, entry->mean, entry->nerror, entry->nsample, measured * 100. / entry->mean - 100, historymaxerror);
 						entry->sum = 0.0;
 						entry->sum2 = 0.0;
@@ -1376,7 +1255,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 			reg_model = &per_arch_model->regression;
 
 			/* update the regression model */
-			size_t job_size = _starpu_job_get_data_size(model, arch, nimpl, j);
+			size_t job_size = _starpu_job_get_data_size(model, arch, impl, j);
 			double logy, logx;
 			logx = log((double)job_size);
 			logy = log(measured);
@@ -1414,11 +1293,11 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 		}
 
 		if (!j->footprint_is_computed)
-			(void) _starpu_compute_buffers_footprint(model, arch, nimpl, j);
+			/* _starpu_compute_buffers_footprint takes (model, arch, nimpl, j);
+			 * it does not take a comb argument */
+			(void) _starpu_compute_buffers_footprint(model, arch, impl, j);
 
 		STARPU_ASSERT(j->footprint_is_computed);
 
-		fprintf(f, "0x%x\t%lu\t%f\t%f\t%f\t%d\t\t", j->footprint, (unsigned long) _starpu_job_get_data_size(model, arch, nimpl, j), measured, task->predicted, task->predicted_transfer, cpuid);
+		fprintf(f, "0x%x\t%lu\t%f\t%f\t%f\t%d\t\t", j->footprint, (unsigned long) _starpu_job_get_data_size(model, arch, impl, j), measured, task->predicted, task->predicted_transfer, cpuid);
 		unsigned i;
 
 		for (i = 0; i < task->cl->nbuffers; i++)
@@ -1450,3 +1329,11 @@ void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct star
 	/* and save perfmodel on termination */
 	_starpu_set_calibrate_flag(1);
 }
+
+struct starpu_perfmodel_per_arch *starpu_perfmodel_get_model_per_arch(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, unsigned impl)
+{
+	int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
+	if(comb == -1) return NULL;
+	
+	return &model->per_arch[comb][impl];
+}

+ 57 - 37
src/core/perfmodel/perfmodel_print.c

@@ -19,7 +19,7 @@
 #include <starpu.h>
 #include <starpu_perfmodel.h>
 #include <common/config.h>
-
+#include "perfmodel.h"
 static
 void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per_arch_model, char *parameter, uint32_t *footprint, FILE *output)
 {
@@ -63,7 +63,9 @@ void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per
 
 void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output)
 {
-	struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
+	int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
+	STARPU_ASSERT(comb != -1);
+	struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[comb][nimpl];
 	char archname[32];
 
 	if (arch_model->regression.nsample || arch_model->regression.valid || arch_model->regression.nl_valid || arch_model->list)
@@ -170,24 +172,12 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 {
 	if (arch == NULL)
 	{
-		/* display all architectures */
-		unsigned archtype, devid, ncore, implid;
-		struct starpu_perfmodel_arch perf_arch;
-		for (archtype = 0; archtype < STARPU_NARCH; archtype++)
+		int comb, impl;
+		for(comb = 0; comb < narch_combs; comb++)
 		{
-			perf_arch.type = archtype;
-			for(devid = 0; model->per_arch[archtype][devid] != NULL; devid++)
-			{
-				perf_arch.devid = devid;
-				for(ncore = 0; model->per_arch[archtype][devid][ncore] != NULL; ncore++)
-				{
-					perf_arch.ncore = ncore;
-					for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-					{ /* Display all codelets on each arch */
-						starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
-					}
-				}
-			}
+			int nimpls = model->nimpls[comb];
+			for(impl = 0; impl < nimpls; impl++)
+				starpu_perfmodel_print(model, arch_combs[comb], impl, parameter, footprint, output);
 		}
 	}
 	else
@@ -196,11 +186,17 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 		{
 			unsigned implid;
 			struct starpu_perfmodel_arch perf_arch;
-			perf_arch.type = STARPU_CPU_WORKER;
-			perf_arch.devid = 0;
-			perf_arch.ncore = 0;
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+			perf_arch.ndevices = 1;
+			perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+			perf_arch.devices[0].type = STARPU_CPU_WORKER;
+			perf_arch.devices[0].devid = 0;
+			perf_arch.devices[0].ncores = 1;
+			int comb = starpu_get_arch_comb(perf_arch.ndevices, perf_arch.devices);
+			STARPU_ASSERT(comb != -1);
+			int nimpls = model->nimpls[comb];
+			for (implid = 0; implid < nimpls; implid++)
 				starpu_perfmodel_print(model, &perf_arch,implid, parameter, footprint, output); /* Display all codelets on cpu */
+			free(perf_arch.devices);
 			return 0;
 		}
 
@@ -216,11 +212,18 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 
 			unsigned implid;
 			struct starpu_perfmodel_arch perf_arch;
-			perf_arch.type = STARPU_CPU_WORKER;
-			perf_arch.devid = 0;
-			perf_arch.ncore = k-1;
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+			perf_arch.ndevices = 1;
+			perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+			perf_arch.devices[0].type = STARPU_CPU_WORKER;
+			perf_arch.devices[0].devid = 0;
+			perf_arch.devices[0].ncores = k-1;
+			int comb = starpu_get_arch_comb(perf_arch.ndevices, perf_arch.devices);
+			STARPU_ASSERT(comb != -1);
+			int nimpls = model->nimpls[comb];
+
+			for (implid = 0; implid < nimpls; implid++)
 				starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
+			free(perf_arch.devices);
 			return 0;
 		}
 
@@ -229,15 +232,24 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 			unsigned devid;
 			unsigned implid;
 			struct starpu_perfmodel_arch perf_arch;
-			perf_arch.type = STARPU_CUDA_WORKER;
-			perf_arch.ncore = 0;
 
-			for (devid = 0; model->per_arch[STARPU_CUDA_WORKER] != NULL; devid++)
+			perf_arch.ndevices = 1;
+			perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+			perf_arch.devices[0].type = STARPU_CUDA_WORKER;
+			perf_arch.devices[0].ncores = 1;
+			int comb;
+			for(comb = 0; comb < narch_combs; comb++)
 			{
-				perf_arch.devid = devid;
-				for (implid = 0; implid <STARPU_MAXIMPLEMENTATIONS; implid ++)
-					starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
+				if(arch_combs[comb]->ndevices == 1 && arch_combs[comb]->devices[0].type == STARPU_CUDA_WORKER)
+				{
+					perf_arch.devices[0].devid = arch_combs[comb]->devices[0].devid;
+					int nimpls = model->nimpls[comb];
+
+					for (implid = 0; implid < nimpls; implid++)
+						starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
+				}
 			}
+			free(perf_arch.devices);
 			return 0;
 		}
 
@@ -248,11 +260,19 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 		if (nmatched == 1)
 		{
 			struct starpu_perfmodel_arch perf_arch;
-			perf_arch.type = STARPU_CUDA_WORKER;
-			perf_arch.devid = gpuid;
-			perf_arch.ncore = 0;
+			perf_arch.ndevices = 1;
+			perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+
+			perf_arch.devices[0].type = STARPU_CUDA_WORKER;
+			perf_arch.devices[0].devid = gpuid;
+			perf_arch.devices[0].ncores = 1;
+
+			int comb = starpu_get_arch_comb(perf_arch.ndevices, perf_arch.devices);
+			STARPU_ASSERT(comb != -1);
+			int nimpls = model->nimpls[comb];
+
 			unsigned implid;
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+			for (implid = 0; implid < nimpls; implid++)
 				starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
 			return 0;
 		}

+ 6 - 0
src/core/sched_ctx.c

@@ -1951,3 +1951,9 @@ void starpu_sched_ctx_unbook_workers_for_task(unsigned sched_ctx_id, int master)
 	/* wake up starpu workers */
 	_starpu_sched_ctx_wake_up_workers(sched_ctx_id, master);
 }
+
+struct starpu_perfmodel_arch * _starpu_sched_ctx_get_perf_archtype(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	return &sched_ctx->perf_arch;
+}

+ 3 - 0
src/core/sched_ctx.h

@@ -150,6 +150,8 @@ struct _starpu_sched_ctx
 	/* ctx nesting the current ctx */
 	unsigned nesting_sched_ctx;
 
+	/* perf model for the device comb of the ctx */
+	struct starpu_perfmodel_arch perf_arch;
 };
 
 struct _starpu_machine_config;
@@ -224,6 +226,7 @@ void _starpu_fetch_tasks_from_empty_ctx_list(struct _starpu_sched_ctx *sched_ctx
 
 unsigned _starpu_sched_ctx_allow_hypervisor(unsigned sched_ctx_id);
 
+struct starpu_perfmodel_arch * _starpu_sched_ctx_get_perf_archtype(unsigned sched_ctx);
 #ifdef STARPU_USE_SC_HYPERVISOR
 /* Notifies the hypervisor that a tasks was poped from the workers' list */
 void _starpu_sched_ctx_post_exec_task_cb(int workerid, struct starpu_task *task, size_t data_size, uint32_t footprint);

+ 6 - 3
src/core/task.c

@@ -255,10 +255,13 @@ int _starpu_submit_job(struct _starpu_job *j)
 	   && sched_ctx->perf_counters != NULL)
 	{
 		struct starpu_perfmodel_arch arch;
-		arch.type = STARPU_CPU_WORKER;
-		arch.devid = 0;
-		arch.ncore = 0;
+		arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+		arch.ndevices = 1;
+		arch.devices[0].type = STARPU_CPU_WORKER;
+		arch.devices[0].devid = 0;
+		arch.devices[0].ncores = 1;
 		_starpu_compute_buffers_footprint(j->task->cl->model, &arch, 0, j);
+		free(arch.devices);
 		int i;
 		size_t data_size = 0;
 		for(i = 0; i < STARPU_NMAXBUFS; i++)

+ 22 - 12
src/core/topology.c

@@ -862,12 +862,15 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 		for (i = 0; i < nworker_per_cuda; i++)
 		{
 			int worker_idx = topology->nworkers + cudagpu * nworker_per_cuda + i;
+
 			config->workers[worker_idx].arch = STARPU_CUDA_WORKER;
-			config->workers[worker_idx].perf_arch.type = STARPU_CUDA_WORKER;
-			config->workers[worker_idx].perf_arch.devid = devid;
+			config->workers[worker_idx].perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+			config->workers[worker_idx].perf_arch.ndevices = 1;
+			config->workers[worker_idx].perf_arch.devices[0].type = STARPU_CUDA_WORKER;
+			config->workers[worker_idx].perf_arch.devices[0].devid = devid;
 			// TODO: fix perfmodels etc.
 			//config->workers[worker_idx].perf_arch.ncore = nworker_per_cuda - 1;
-			config->workers[worker_idx].perf_arch.ncore = 0;
+			config->workers[worker_idx].perf_arch.devices[0].ncores = 1;
 			config->workers[worker_idx].devid = devid;
 			config->workers[worker_idx].subworkerid = i;
 			config->workers[worker_idx].worker_mask = STARPU_CUDA;
@@ -940,9 +943,11 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 			break;
 		}
 		config->workers[worker_idx].arch = STARPU_OPENCL_WORKER;
-		config->workers[worker_idx].perf_arch.type = STARPU_OPENCL_WORKER;
-		config->workers[worker_idx].perf_arch.devid = devid;
-		config->workers[worker_idx].perf_arch.ncore = 0;
+		config->workers[worker_idx].perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+		config->workers[worker_idx].perf_arch.ndevices = 1;
+		config->workers[worker_idx].perf_arch.devices[0].type = STARPU_OPENCL_WORKER;
+		config->workers[worker_idx].perf_arch.devices[0].devid = devid;
+		config->workers[worker_idx].perf_arch.devices[0].ncores = 1;
 		config->workers[worker_idx].subworkerid = 0;
 		config->workers[worker_idx].devid = devid;
 		config->workers[worker_idx].worker_mask = STARPU_OPENCL;
@@ -1002,9 +1007,12 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 	{
 		config->workers[topology->nworkers + sccdev].arch = STARPU_SCC_WORKER;
 		int devid = _starpu_get_next_scc_deviceid(config);
-		config->workers[topology->nworkers + sccdev].perf_arch.type = STARPU_SCC_WORKER;
-		config->workers[topology->nworkers + sccdev].perf_arch.devid = sccdev;
-		config->workers[topology->nworkers + sccdev].perf_arch.ncore = 0;
+		config->workers[topology->nworkers + sccdev].perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+		config->workers[topology->nworkers + sccdev].perf_arch.ndevices = 1;
+
+		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].type = STARPU_SCC_WORKER;
+		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].devid = sccdev;
+		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].ncores = 1;
 		config->workers[topology->nworkers + sccdev].subworkerid = 0;
 		config->workers[topology->nworkers + sccdev].devid = devid;
 		config->workers[topology->nworkers + sccdev].worker_mask = STARPU_SCC;
@@ -1068,9 +1076,11 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 	{
 		int worker_idx = topology->nworkers + cpu;
 		config->workers[worker_idx].arch = STARPU_CPU_WORKER;
-		config->workers[worker_idx].perf_arch.type = STARPU_CPU_WORKER;
-		config->workers[worker_idx].perf_arch.devid = 0;
-		config->workers[worker_idx].perf_arch.ncore = 0;
+		config->workers[worker_idx].perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+		config->workers[worker_idx].perf_arch.ndevices = 1;
+		config->workers[worker_idx].perf_arch.devices[0].type = STARPU_CPU_WORKER;
+		config->workers[worker_idx].perf_arch.devices[0].devid = 0;
+		config->workers[worker_idx].perf_arch.devices[0].ncores = 1;
 		config->workers[worker_idx].subworkerid = 0;
 		config->workers[worker_idx].devid = cpu;
 		config->workers[worker_idx].worker_mask = STARPU_CPU;

+ 20 - 17
src/datawizard/footprint.c

@@ -37,7 +37,7 @@ uint32_t starpu_task_data_footprint(struct starpu_task *task)
 	return footprint;
 }
 
-uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned nimpl, struct _starpu_job *j)
+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, struct _starpu_job *j)
 {
 	if (j->footprint_is_computed)
 		return j->footprint;
@@ -50,23 +50,26 @@ uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struc
 	{
 		footprint = model->footprint(task);
 	}
-	else if (model != NULL && model->per_arch &&
-			model->per_arch[arch->type] != NULL &&
-			model->per_arch[arch->type][arch->devid] != NULL &&
-			model->per_arch[arch->type][arch->devid][arch->ncore] != NULL &&
-			model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base)
-	{
-		size_t size = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base(task, arch, nimpl);
-		footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
-	}
-	else if (model && model->size_base)
-	{
-		size_t size = model->size_base(task, nimpl);
-		footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
-	}
 	else
-	{
-		footprint = starpu_task_data_footprint(task);
+	{ 
+		if (model != NULL && model->per_arch)
+		{
+			struct starpu_perfmodel_per_arch *per_arch = starpu_perfmodel_get_model_per_arch(model, arch, nimpl);
+			if(per_arch != NULL && per_arch->size_base)
+			{
+				size_t size = per_arch->size_base(task, arch, nimpl);
+				footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
+			}
+		}
+		else if (model && model->size_base)
+		{
+			size_t size = model->size_base(task, nimpl);
+			footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
+		}
+		else
+		{
+			footprint = starpu_task_data_footprint(task);
+		}
 	}
 
 	j->footprint = footprint;

+ 22 - 18
src/debug/traces/starpu_fxt.c

@@ -388,6 +388,8 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
 	char *kindstr = "";
 	struct starpu_perfmodel_arch arch;
+	arch.ndevices = 1;
+	arch.devices = (struct starpu_perfmodel_device *)malloc(sizeof(struct starpu_perfmodel_device));
 
 	switch (ev->param[0])
 	{
@@ -398,37 +400,37 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 		case _STARPU_FUT_CPU_KEY:
 			set_next_cpu_worker_color(workerid);
 			kindstr = "CPU";
-			arch.type = STARPU_CPU_WORKER;
-			arch.devid = 0;
-			arch.ncore = 0;
+			arch.devices[0].type = STARPU_CPU_WORKER;
+			arch.devices[0].devid = 0;
+			arch.devices[0].ncores = 1;
 			break;
 		case _STARPU_FUT_CUDA_KEY:
 			set_next_cuda_worker_color(workerid);
 			kindstr = "CUDA";
-			arch.type = STARPU_CUDA_WORKER;
-			arch.devid = devid;
-			arch.ncore = 0;
+			arch.devices[0].type = STARPU_CUDA_WORKER;
+			arch.devices[0].devid = devid;
+			arch.devices[0].ncores = 1;
 			break;
 		case _STARPU_FUT_OPENCL_KEY:
 			set_next_opencl_worker_color(workerid);
 			kindstr = "OPENCL";
-			arch.type = STARPU_OPENCL_WORKER;
-			arch.devid = devid;
-			arch.ncore = 0;
+			arch.devices[0].type = STARPU_OPENCL_WORKER;
+			arch.devices[0].devid = devid;
+			arch.devices[0].ncores = 1;
 			break;
 		case _STARPU_FUT_MIC_KEY:
 			set_next_mic_worker_color(workerid);
 			kindstr = "mic";
-			arch.type = STARPU_MIC_WORKER;
-			arch.devid = devid;
-			arch.ncore = 0;
+			arch.devices[0].type = STARPU_MIC_WORKER;
+			arch.devices[0].devid = devid;
+			arch.devices[0].ncores = 1;
 			break;
 		case _STARPU_FUT_SCC_KEY:
 			set_next_scc_worker_color(workerid);
 			kindstr = "scc";
-			arch.type = STARPU_SCC_WORKER;
-			arch.devid = devid;
-			arch.ncore = 0;
+			arch.devices[0].type = STARPU_SCC_WORKER;
+			arch.devices[0].devid = devid;
+			arch.devices[0].ncores = 1;
 			break;
 		default:
 			STARPU_ABORT();
@@ -757,9 +759,11 @@ static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 
 		snprintf(dumped_codelets[dumped_codelets_count - 1].symbol, 256, "%s", last_codelet_symbol[worker]);
 		dumped_codelets[dumped_codelets_count - 1].workerid = worker;
-		dumped_codelets[dumped_codelets_count - 1].arch.type = ev->param[3];
-		dumped_codelets[dumped_codelets_count - 1].arch.devid = ev->param[4];
-		dumped_codelets[dumped_codelets_count - 1].arch.ncore = ev->param[5];
+		dumped_codelets[dumped_codelets_count - 1].arch.ndevices = 1;
+		dumped_codelets[dumped_codelets_count - 1].arch.devices = (struct starpu_perfmodel_device *)malloc(sizeof(struct starpu_perfmodel_device));
+		dumped_codelets[dumped_codelets_count - 1].arch.devices[0].type = ev->param[3];
+		dumped_codelets[dumped_codelets_count - 1].arch.devices[0].devid = ev->param[4];
+		dumped_codelets[dumped_codelets_count - 1].arch.devices[0].ncores = ev->param[5];
 
 		dumped_codelets[dumped_codelets_count - 1].size = codelet_size;
 		dumped_codelets[dumped_codelets_count - 1].hash = codelet_hash;

+ 12 - 12
src/profiling/bound.c

@@ -426,7 +426,7 @@ static void _starpu_get_tasks_times(int nw, int nt, double *times)
 				.footprint = tp->footprint,
 				.footprint_is_computed = 1,
 			};
-			struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
+			struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
 			double length = _starpu_history_based_job_expected_perf(tp->cl->model, arch, &j, j.nimpl);
 			if (isnan(length))
 				times[w*nt+t] = NAN;
@@ -512,15 +512,15 @@ void starpu_bound_print_lp(FILE *output)
 			};
 			for (w = 0; w < nw; w++)
 			{
-				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
-				if (_STARPU_IS_ZERO(t1->duration[arch->type][arch->devid][arch->ncore]))
+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
+				if (_STARPU_IS_ZERO(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
 				{
 					double length = _starpu_history_based_job_expected_perf(t1->cl->model, arch, &j,j.nimpl);
 					if (isnan(length))
 						/* Avoid problems with binary coding of doubles */
-						t1->duration[arch->type][arch->devid][arch->ncore] = NAN;
+						t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores] = NAN;
 					else
-						t1->duration[arch->type][arch->devid][arch->ncore] = length / 1000.;
+						t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores] = length / 1000.;
 				}
 			}
 			nt++;
@@ -545,8 +545,8 @@ void starpu_bound_print_lp(FILE *output)
 		{
 			for (w = 0; w < nw; w++)
 			{
-				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
-				if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
+				if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
 					fprintf(output, " +t%luw%d", t1->id, w);
 			}
 			fprintf(output, " = 1;\n");
@@ -559,9 +559,9 @@ void starpu_bound_print_lp(FILE *output)
 			fprintf(output, "/* %s %x */\tc%lu = s%lu", _starpu_codelet_get_model_name(t1->cl), (unsigned) t1->footprint, t1->id, t1->id);
 			for (w = 0; w < nw; w++)
 			{
-				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
-				if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
-					fprintf(output, " + %f t%luw%d", t1->duration[arch->type][arch->devid][arch->ncore], t1->id, w);
+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
+				if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
+					fprintf(output, " + %f t%luw%d", t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores], t1->id, w);
 			}
 			fprintf(output, ";\n");
 		}
@@ -642,8 +642,8 @@ void starpu_bound_print_lp(FILE *output)
 				{
 					for (w = 0; w < nw; w++)
 					{
-						struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
-						if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
+						struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
+						if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
 						{
 							fprintf(output, "s%lu - c%lu >= -3e5 + 1e5 t%luw%d + 1e5 t%luw%d + 1e5 t%luafter%lu;\n",
 									t1->id, t2->id, t1->id, w, t2->id, w, t1->id, t2->id);

+ 4 - 4
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -417,7 +417,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 		worker = workers->get_next_master(workers, &it);
 		struct _starpu_fifo_taskq *fifo  = dt->queue_array[worker];
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
-		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
 
 		/* Sometimes workers didn't take the tasks as early as we expected */
 		double exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
@@ -555,7 +555,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 		worker = workers->get_next_master(workers, &it);
 
 		struct _starpu_fifo_taskq *fifo = dt->queue_array[worker];
-		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 
 		/* Sometimes workers didn't take the tasks as early as we expected */
@@ -770,7 +770,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	}
 	else if (task->bundle)
 	{
-		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(best_in_ctx);
+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(best_in_ctx, sched_ctx_id);
 		unsigned memory_node = starpu_worker_get_memory_node(best);
 		model_best = starpu_task_expected_length(task, perf_arch, selected_impl);
 		transfer_model_best = starpu_task_expected_data_transfer_time(memory_node, task);
@@ -943,7 +943,7 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 	/* Compute the expected penality */
-	struct starpu_perfmodel_arch *perf_arch = starpu_worker_get_perf_archtype(perf_workerid);
+	struct starpu_perfmodel_arch *perf_arch = starpu_worker_get_perf_archtype(perf_workerid, sched_ctx_id);
 	unsigned memory_node = starpu_worker_get_memory_node(workerid);
 
 	double predicted = starpu_task_expected_length(task, perf_arch,

+ 4 - 4
src/sched_policies/parallel_heft.c

@@ -232,9 +232,9 @@ static double compute_expected_end(int workerid, double length)
 	}
 }
 
-static double compute_ntasks_end(int workerid)
+static double compute_ntasks_end(int workerid, unsigned sched_ctx_id)
 {
-	struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(workerid);
+	struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
 	starpu_pthread_mutex_t *sched_mutex;
 	starpu_pthread_cond_t *sched_cond;
 
@@ -351,14 +351,14 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 			}
 
 
-			struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
+			struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
 
 			local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch,nimpl);
 
 			unsigned memory_node = starpu_worker_get_memory_node(worker);
 			local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time(memory_node, task);
 
-			double ntasks_end = compute_ntasks_end(worker);
+			double ntasks_end = compute_ntasks_end(worker, sched_ctx_id);
 
 			if (ntasks_best == -1
 			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */

+ 1 - 1
src/sched_policies/random_policy.c

@@ -50,7 +50,7 @@ static int _random_push_task(struct starpu_task *task, unsigned prio)
 		{
 			if(starpu_worker_can_execute_task(worker, task, impl))
 			{
-				struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
+				struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
 				double speedup = starpu_worker_get_relative_speedup(perf_arch);
 				alpha_sum += speedup;
 				speedup_arr[size] = speedup;

+ 6 - 4
tests/perfmodels/feed.c

@@ -73,15 +73,17 @@ int main(int argc, char **argv)
 		measured_slow = 0.001+size*0.0000001;
 
 		struct starpu_perfmodel_arch arch;
-		arch.type = STARPU_CUDA_WORKER;
-		arch.ncore = 0;
+		arch.ndevices = 1;
+		arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+		arch.devices[0].type = STARPU_CUDA_WORKER;
+		arch.devices[0].ncores = 0;
 		/* Simulate Fast GPU */
-		arch.devid = 0;
+		arch.devices[0].devid = 0;
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_fast);
 		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_fast);
 
 		/* Simulate Slow GPU */
-		arch.devid = 1;
+		arch.devices[0].devid = 1;
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_slow);
 		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_slow);
 		starpu_task_clean(&task);

+ 1 - 1
tests/perfmodels/regression_based.c

@@ -128,7 +128,7 @@ static void show_task_perfs(int size, struct starpu_task *task)
 		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
 			FPRINTF(stdout, "Expected time for %d on %s (impl %d):\t%f\n",
-				size, name, nimpl, starpu_task_expected_length(task, starpu_worker_get_perf_archtype(workerid), nimpl));
+				size, name, nimpl, starpu_task_expected_length(task, starpu_worker_get_perf_archtype(workerid, task->sched_ctx), nimpl));
 		}
 	}
 }

+ 6 - 10
tests/perfmodels/valid_model.c

@@ -77,12 +77,11 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 	lmodel.is_init=0;
 	lmodel.type = model->type;
 	ret = starpu_perfmodel_load_symbol(codelet->model->symbol, &lmodel);
+	int narch_combs = starpu_get_narch_combs();
+	int comb;
 	if (ret != 1)
-		for (archtype = 0; archtype < STARPU_NARCH; archtype++)
-			if(lmodel.per_arch[archtype] != NULL)
-				for(devid=0; lmodel.per_arch[archtype][devid] != NULL; devid++)
-					for(ncore=0; lmodel.per_arch[archtype][devid][ncore] != NULL; ncore++)
-						old_nsamples += lmodel.per_arch[archtype][devid][ncore][0].regression.nsample;
+		for(comb = 0; comb < narch_combs; comb++)
+			old_nsamples += lmodel.per_arch[comb][0].regression.nsample;
 
         starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, 100, sizeof(int));
 	for (loop = 0; loop < nloops; loop++)
@@ -107,11 +106,8 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 	}
 
 	new_nsamples = 0;
-	for (archtype = 0; archtype < STARPU_NARCH; archtype++)
-		if(lmodel.per_arch[archtype] != NULL)
-			for(devid=0; lmodel.per_arch[archtype][devid] != NULL; devid++)
-				for(ncore=0; lmodel.per_arch[archtype][devid][ncore] != NULL; ncore++)
-					new_nsamples += lmodel.per_arch[archtype][devid][ncore][0].regression.nsample;
+	for(comb = 0; comb < narch_combs; comb++)
+		new_nsamples += lmodel.per_arch[comb][0].regression.nsample;
 
 	ret = starpu_perfmodel_unload_model(&lmodel);
 	starpu_shutdown();

+ 86 - 38
tests/sched_policies/simple_cpu_gpu_sched.c

@@ -99,44 +99,92 @@ init_perfmodels(void)
 {
 	unsigned devid, ncore;
 
-	starpu_perfmodel_init(&model_cpu_task);
-	starpu_perfmodel_init(&model_gpu_task);
-
-	if(model_cpu_task.per_arch[STARPU_CPU_WORKER] != NULL)
-	{
-		for(devid=0; model_cpu_task.per_arch[STARPU_CPU_WORKER][devid] != NULL; devid++)
-		{
-			for(ncore=0; model_cpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore] != NULL; ncore++)
-			{
-				model_cpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore][0].cost_function = cpu_task_cpu;
-				model_gpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore][0].cost_function = gpu_task_cpu;
-			}
-		}
-	}
-
-	if(model_cpu_task.per_arch[STARPU_CUDA_WORKER] != NULL)
-	{
-		for(devid=0; model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid] != NULL; devid++)
-		{
-			for(ncore=0; model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore] != NULL; ncore++)
-			{
-				model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore][0].cost_function = cpu_task_gpu;
-				model_gpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore][0].cost_function = gpu_task_gpu;
-			}
-		}
-	}
-
-	if(model_cpu_task.per_arch[STARPU_OPENCL_WORKER] != NULL)
-	{
-		for(devid=0; model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid] != NULL; devid++)
-		{
-			for(ncore=0; model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore] != NULL; ncore++)
-			{
-				model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore][0].cost_function = cpu_task_gpu;
-				model_gpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore][0].cost_function = gpu_task_gpu;
-			}
-		}
-	}
+	starpu_perfmodel_init(NULL, &model_cpu_task);
+	starpu_perfmodel_init(NULL, &model_gpu_task);
+
+	struct starpu_perfmodel_arch arch_cpu;
+	arch_cpu.ndevices = 1;
+	arch_cpu.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+	arch_cpu.devices[0].type = STARPU_CPU_WORKER;	
+	arch_cpu.devices[0].devid = 0;
+	arch_cpu.devices[0].ncores = 1;
+
+	int comb_cpu = starpu_get_arch_comb(arch_cpu.ndevices, arch_cpu.devices);
+	if(comb_cpu == -1)
+		comb_cpu = starpu_add_arch_comb(arch_cpu.ndevices, arch_cpu.devices);
+
+
+	model_cpu_task.per_arch[comb_cpu] = (struct starpu_perfmodel_per_arch*)malloc(sizeof(struct starpu_perfmodel_per_arch));
+	memset(&model_cpu_task.per_arch[comb_cpu][0], 0, sizeof(struct starpu_perfmodel_per_arch));
+	model_cpu_task.nimpls[comb_cpu] = 1;
+	model_cpu_task.per_arch[comb_cpu][0].cost_function = cpu_task_cpu;
+
+	model_gpu_task.per_arch[comb_cpu] = (struct starpu_perfmodel_per_arch*)malloc(sizeof(struct starpu_perfmodel_per_arch));
+	memset(&model_gpu_task.per_arch[comb_cpu][0], 0, sizeof(struct starpu_perfmodel_per_arch));
+	model_gpu_task.nimpls[comb_cpu] = 1;
+	model_gpu_task.per_arch[comb_cpu][0].cost_function = gpu_task_cpu;
+
+
+
+	struct starpu_perfmodel_arch arch_cuda;
+	arch_cuda.ndevices = 1;
+	arch_cuda.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+	arch_cuda.devices[0].type = STARPU_CUDA_WORKER;	
+	arch_cuda.devices[0].devid = 0;
+	arch_cuda.devices[0].ncores = 1;
+	
+
+
+	int comb_cuda = starpu_get_arch_comb(arch_cuda.ndevices, arch_cuda.devices);
+	if(comb_cuda == -1)
+		comb_cuda = starpu_add_arch_comb(arch_cuda.ndevices, arch_cuda.devices);
+
+	model_cpu_task.per_arch[comb_cuda] = (struct starpu_perfmodel_per_arch*)malloc(sizeof(struct starpu_perfmodel_per_arch));
+	memset(&model_cpu_task.per_arch[comb_cuda][0], 0, sizeof(struct starpu_perfmodel_per_arch));
+	model_cpu_task.nimpls[comb_cuda] = 1;
+	model_cpu_task.per_arch[comb_cuda][0].cost_function = cpu_task_gpu;
+
+	model_gpu_task.per_arch[comb_cuda] = (struct starpu_perfmodel_per_arch*)malloc(sizeof(struct starpu_perfmodel_per_arch));
+	memset(&model_gpu_task.per_arch[comb_cuda][0], 0, sizeof(struct starpu_perfmodel_per_arch));
+	model_gpu_task.nimpls[comb_cuda] = 1;
+	model_gpu_task.per_arch[comb_cuda][0].cost_function = gpu_task_gpu;
+
+
+/* 	if(model_cpu_task.per_arch[STARPU_CPU_WORKER] != NULL) */
+/* 	{ */
+/* 		for(devid=0; model_cpu_task.per_arch[STARPU_CPU_WORKER][devid] != NULL; devid++) */
+/* 		{ */
+/* 			for(ncore=0; model_cpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore] != NULL; ncore++) */
+/* 			{ */
+/* 				model_cpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore][0].cost_function = cpu_task_cpu; */
+/* 				model_gpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore][0].cost_function = gpu_task_cpu; */
+/* 			} */
+/* 		} */
+/* 	} */
+
+/* 	if(model_cpu_task.per_arch[STARPU_CUDA_WORKER] != NULL) */
+/* 	{ */
+/* 		for(devid=0; model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid] != NULL; devid++) */
+/* 		{ */
+/* 			for(ncore=0; model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore] != NULL; ncore++) */
+/* 			{ */
+/* 				model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore][0].cost_function = cpu_task_gpu; */
+/* 				model_gpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore][0].cost_function = gpu_task_gpu; */
+/* 			} */
+/* 		} */
+/* 	} */
+
+/* 	if(model_cpu_task.per_arch[STARPU_OPENCL_WORKER] != NULL) */
+/* 	{ */
+/* 		for(devid=0; model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid] != NULL; devid++) */
+/* 		{ */
+/* 			for(ncore=0; model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore] != NULL; ncore++) */
+/* 			{ */
+/* 				model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore][0].cost_function = cpu_task_gpu; */
+/* 				model_gpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore][0].cost_function = gpu_task_gpu; */
+/* 			} */
+/* 		} */
+/* 	} */
 }
 
 /*