Browse Source

Change the perf_model structure: an arch now contains several devices, so we can have, e.g., one STARPU_CPU_WORKER device with 2 cores and one STARPU_CUDA_WORKER device with 1 core.
The tools folder is not yet updated (temporarily removed from the Makefile).
fxt only considers the first device of the structure (so that old programs whose arch is composed of, e.g., a single CPU or a single GPU keep working).
tests/perfmodel does not work very well yet (it is unclear what these tests should actually check).

Andra Hugo 11 years ago
parent
commit
5593439277

+ 1 - 1
Makefile.am

@@ -18,7 +18,7 @@ ACLOCAL_AMFLAGS=-I m4
 CLEANFILES = *.gcno *.gcda *.linkinfo
 
 SUBDIRS = src
-SUBDIRS += tools tests
+SUBDIRS += tests
 SUBDIRS += doc
 
 if USE_MPI

+ 20 - 6
include/starpu_perfmodel.h

@@ -35,13 +35,20 @@ struct starpu_data_descr;
 
 #define STARPU_NARCH STARPU_ANY_WORKER
 
-struct starpu_perfmodel_arch
+struct starpu_perfmodel_device
 {
 	enum starpu_worker_archtype type;
 	int devid;	/* identifier of the precise device */
-	int ncore;	/* number of execution in parallel, minus 1 */
+	int ncores;	/* number of execution in parallel, minus 1 */	
 };
 
+struct starpu_perfmodel_arch
+{
+	int ndevices;
+	struct starpu_perfmodel_device *devices;
+};
+
+
 struct starpu_perfmodel_history_entry
 {
 	double mean;
@@ -125,7 +132,7 @@ struct starpu_perfmodel
 	size_t (*size_base)(struct starpu_task *, unsigned nimpl);
 	uint32_t (*footprint)(struct starpu_task *);
 
-	struct starpu_perfmodel_per_arch**** per_arch; /*STARPU_MAXIMPLEMENTATIONS*/
+	struct starpu_perfmodel_per_arch** per_arch; /*STARPU_MAXIMPLEMENTATIONS*/
 
 	const char *symbol;
 
@@ -133,15 +140,22 @@ struct starpu_perfmodel
 	unsigned is_loaded;
 	unsigned benchmarking;
 	starpu_pthread_rwlock_t model_rwlock;
+	int *nimpls;
+	int ncombs;
 };
 
-void starpu_perfmodel_init(struct starpu_perfmodel *model);
-void starpu_perfmodel_init_with_file(FILE*f, struct starpu_perfmodel *model);
+void starpu_perfmodel_init(FILE *f, struct starpu_perfmodel *model);
+//void starpu_perfmodel_init_with_file(FILE*f, struct starpu_perfmodel *model);
+
+struct starpu_perfmodel_arch *starpu_worker_get_perf_archtype(int workerid, unsigned sched_ctx_id);
 
-struct starpu_perfmodel_arch *starpu_worker_get_perf_archtype(int workerid);
+struct starpu_perfmodel_per_arch *starpu_perfmodel_get_model_per_arch(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, unsigned impl);
 
 int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model);
 int starpu_perfmodel_unload_model(struct starpu_perfmodel *model);
+int starpu_get_narch_combs();
+int starpu_add_arch_comb(int ndevices, struct starpu_perfmodel_device* devices);
+int starpu_get_arch_comb(int ndevices, struct starpu_perfmodel_device *devices);
 
 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, char *path, size_t maxlen, unsigned nimpl);
 char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype);

+ 1 - 1
sc_hypervisor/src/policies_utils/policy_tools.c

@@ -416,7 +416,7 @@ void sc_hypervisor_get_tasks_times(int nw, int nt, double times[nw][nt], int *wo
                 for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
                 {
 			int worker = workers == NULL ? w : workers[w];
-                        struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(worker);
+                        struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(worker, STARPU_NMAX_SCHED_CTXS);
                         double length = starpu_permodel_history_based_expected_perf(tp->cl->model, arch, tp->footprint);
 
                         if (isnan(length))

+ 1 - 1
src/common/fxt.h

@@ -431,7 +431,7 @@ do {									\
 do {									\
 	const size_t job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));	\
 	const uint32_t job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));\
-	FUT_DO_PROBE7(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), (archtype)->type, (archtype)->devid, (archtype)->ncore, workerid);	\
+	FUT_DO_PROBE7(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), (archtype->devices[0]).type, (archtype->devices[0]).devid, (archtype->devices[0]).ncores, workerid);	\
 } while(0);
 
 #define _STARPU_TRACE_START_EXECUTING()				\

+ 3 - 3
src/core/combined_workers.c

@@ -101,9 +101,9 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 
 	combined_worker->worker_size = nworkers;
 
-	combined_worker->perf_arch.type = config->workers[workerid_array[0]].perf_arch.type;
-	combined_worker->perf_arch.devid = config->workers[workerid_array[0]].perf_arch.devid; 
-	combined_worker->perf_arch.ncore = nworkers - 1;
+	combined_worker->perf_arch.devices[0].type = config->workers[workerid_array[0]].perf_arch.devices[0].type;
+	combined_worker->perf_arch.devices[0].devid = config->workers[workerid_array[0]].perf_arch.devices[0].devid; 
+	combined_worker->perf_arch.devices[0].ncores = nworkers - 1;
 	combined_worker->worker_mask = config->workers[workerid_array[0]].worker_mask;
 	
 #ifdef STARPU_USE_MP

+ 2 - 2
src/core/detect_combined_workers.c

@@ -44,7 +44,7 @@ static void find_workers(hwloc_obj_t obj, int cpu_workers[STARPU_NMAXWORKERS], u
 	for(worker = _starpu_worker_list_begin(workers); worker != _starpu_worker_list_end(workers); worker = _starpu_worker_list_next(worker))
 	{
 		/* is it a CPU worker? */
-		if (worker->perf_arch.type == STARPU_CPU_WORKER && worker->perf_arch.ncore == 0)
+		if (worker->perf_arch.devices[0].type == STARPU_CPU_WORKER && worker->perf_arch.devices[0].ncores == 0)
 		{
 			_STARPU_DEBUG("worker %d is part of it\n", worker->workerid);
 			/* Add it to the combined worker */
@@ -178,7 +178,7 @@ static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers
 	for (i = 0; i < nworkers; i++)
 	{
 		struct _starpu_worker *worker = _starpu_get_worker_struct(workerids[i]);
-		if (worker->perf_arch.type == STARPU_CPU_WORKER && worker->perf_arch.ncore == 0)
+		if (worker->perf_arch.devices[0].type == STARPU_CPU_WORKER && worker->perf_arch.devices[0].ncores == 0)
 		{
 			hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->pu_depth, worker->bindid);
 			obj = obj->parent;

+ 33 - 30
src/core/perfmodel/perfmodel.c

@@ -37,7 +37,6 @@
  *	2: models must be calibrated, existing models are overwritten.
  */
 static unsigned calibrate_flag = 0;
-
 void _starpu_set_calibrate_flag(unsigned val)
 {
 	calibrate_flag = val;
@@ -48,8 +47,15 @@ unsigned _starpu_get_calibrate_flag(void)
 	return calibrate_flag;
 }
 
-struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid)
+struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid, unsigned sched_ctx_id)
 {
+	if(sched_ctx_id != STARPU_NMAX_SCHED_CTXS)
+	{
+		unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
+		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+			return _starpu_sched_ctx_get_perf_archtype(child_sched_ctx);
+	}
+
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 
 	/* This workerid may either be a basic worker or a combined worker */
@@ -57,6 +63,7 @@ struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid)
 
 	if (workerid < (int)config->topology.nworkers)
 		return &config->workers[workerid].perf_arch;
+	
 
 	/* We have a combined worker */
 	unsigned ncombinedworkers = config->topology.ncombinedworkers;
@@ -71,11 +78,14 @@ struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid)
 static double per_arch_task_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, struct starpu_task *task, unsigned nimpl)
 {
 	double exp = NAN;
+	int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
+	if(comb == -1) return exp;
+		
 	double (*per_arch_cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 	double (*per_arch_cost_model)(struct starpu_data_descr *);
 
-	per_arch_cost_function = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].cost_function;
-	per_arch_cost_model = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].cost_model;
+	per_arch_cost_function = model->per_arch[comb][nimpl].cost_function;
+	per_arch_cost_model = model->per_arch[comb][nimpl].cost_model;
 
 	if (per_arch_cost_function)
 		exp = per_arch_cost_function(task, arch, nimpl);
@@ -91,26 +101,23 @@ static double per_arch_task_expected_perf(struct starpu_perfmodel *model, struct
 
 double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch* perf_arch)
 {
-	if (perf_arch->type == STARPU_CPU_WORKER)
-	{
-		return _STARPU_CPU_ALPHA * (perf_arch->ncore + 1);
-	}
-	else if (perf_arch->type == STARPU_CUDA_WORKER)
-	{
-		return _STARPU_CUDA_ALPHA;
-	}
-	else if (perf_arch->type == STARPU_OPENCL_WORKER)
-	{
-		return _STARPU_OPENCL_ALPHA;
-	}
-	else if (perf_arch->type == STARPU_MIC_WORKER)
+	double speedup = 0;
+	int dev;
+	for(dev = 0; dev < perf_arch->ndevices; dev++)
 	{
-		return _STARPU_MIC_ALPHA * (perf_arch->ncore + 1);
+		double coef = 0.0;
+		if (perf_arch->devices[dev].type == STARPU_CPU_WORKER)
+			coef = _STARPU_CPU_ALPHA;
+		else if (perf_arch->devices[dev].type == STARPU_CUDA_WORKER)
+			coef = _STARPU_CUDA_ALPHA;
+		else if (perf_arch->devices[dev].type == STARPU_OPENCL_WORKER)
+			coef = _STARPU_OPENCL_ALPHA;
+		else if (perf_arch->devices[dev].type == STARPU_MIC_WORKER)
+			coef =  _STARPU_MIC_ALPHA;
+		
+		speedup += coef * (perf_arch->devices[dev].ncores + 1);
 	}
-	STARPU_ABORT();
-
-	/* Never reached ! */
-	return NAN;
+	return speedup == 0 ? NAN : speedup;
 }
 
 static double common_task_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct starpu_task *task, unsigned nimpl)
@@ -184,22 +191,15 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 		switch (model->type)
 		{
 			case STARPU_PER_ARCH:
-
 				return per_arch_task_expected_perf(model, arch, task, nimpl);
 			case STARPU_COMMON:
 				return common_task_expected_perf(model, arch, task, nimpl);
-
 			case STARPU_HISTORY_BASED:
-
 				return _starpu_history_based_job_expected_perf(model, arch, j, nimpl);
 			case STARPU_REGRESSION_BASED:
-
 				return _starpu_regression_based_job_expected_perf(model, arch, j, nimpl);
-
 			case STARPU_NL_REGRESSION_BASED:
-
 				return _starpu_non_linear_regression_based_job_expected_perf(model, arch, j,nimpl);
-
 			default:
 				STARPU_ABORT();
 		}
@@ -223,6 +223,8 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 					    struct starpu_perfmodel_arch* arch,
 					    unsigned nimpl)
 {
+	if(arch->ndevices > 1)
+		return -1.0;
 	unsigned i;
 	double sum = 0.0;
 	enum starpu_node_kind node_kind;
@@ -236,7 +238,7 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 		if (!_starpu_data_is_multiformat_handle(handle))
 			continue;
 		
-		switch(arch->type)
+		switch(arch->devices[0].type)
 		{
 			case STARPU_CPU_WORKER:
 				node_kind = STARPU_CPU_RAM;
@@ -503,3 +505,4 @@ void _starpu_create_sampling_directory_if_needed(void)
 		directory_existence_was_tested = 1;
 	}
 }
+

+ 4 - 1
src/core/perfmodel/perfmodel.h

@@ -38,7 +38,10 @@ extern "C"
  * differents versions of StarPU having different performance model
  * formats.
  */
-#define _STARPU_PERFMODEL_VERSION 43
+#define _STARPU_PERFMODEL_VERSION 44
+
+struct starpu_perfmodel_arch **arch_combs;
+int narch_combs;
 
 struct _starpu_perfmodel_list
 {

+ 275 - 388
src/core/perfmodel/perfmodel_history.c

@@ -52,17 +52,79 @@ struct starpu_perfmodel_history_table
 static starpu_pthread_rwlock_t registered_models_rwlock;
 static struct _starpu_perfmodel_list *registered_models = NULL;
 
-size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, struct _starpu_job *j)
+int starpu_add_arch_comb(int ndevices, struct starpu_perfmodel_device* devices)
+{
+	arch_combs[narch_combs] = (struct starpu_perfmodel_arch*)malloc(sizeof(struct starpu_perfmodel_arch));
+	arch_combs[narch_combs]->devices = (struct starpu_perfmodel_device*)malloc(ndevices*sizeof(struct starpu_perfmodel_device));
+	arch_combs[narch_combs]->ndevices = ndevices;
+	int dev;
+	for(dev = 0; dev < ndevices; dev++)
+	{
+		arch_combs[narch_combs]->devices[dev].type = devices[dev].type;
+		arch_combs[narch_combs]->devices[dev].devid = devices[dev].devid;
+		arch_combs[narch_combs]->devices[dev].ncores = devices[dev].ncores;
+	}
+	narch_combs++;
+	return narch_combs-1;
+}
+
+int starpu_get_arch_comb(int ndevices, struct starpu_perfmodel_device *devices)
+{
+	int nfounded = 0;
+	unsigned found = 0;
+	int comb;
+	for(comb = 0; comb < narch_combs; comb++)
+	{
+		if(arch_combs[comb]->ndevices == ndevices)
+		{
+			int dev1, dev2;
+			for(dev1 = 0; dev1 < arch_combs[comb]->ndevices; dev1++)
+			{
+				for(dev2 = 0; dev2 < ndevices; dev2++)
+				{
+					if(arch_combs[comb]->devices[dev1].type == devices[dev2].type && 
+					   arch_combs[comb]->devices[dev1].devid == devices[dev2].devid && 
+					   arch_combs[comb]->devices[dev1].ncores == devices[dev2].ncores)
+						nfounded++;
+				}
+			}
+			if(nfounded == ndevices)
+				found = 1;
+		}
+		if(found)
+			return comb;
+	}	
+	return -1;
+}
+
+static 	void _free_arch_combs(void)
+{
+	int i;
+	for(i = 0; i < narch_combs; i++)
+	{
+		free(arch_combs[i]->devices);
+		free(arch_combs[i]);
+	}
+	narch_combs = 0;
+}
+
+int starpu_get_narch_combs()
+{
+	return narch_combs;
+}
+
+size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned impl, struct _starpu_job *j)
 {
 	struct starpu_task *task = j->task;
+	int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
 
-	if (model && model->per_arch && model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base)
+	if (model && model->per_arch && comb != -1 && model->per_arch[comb][impl].size_base)
 	{
-		return model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base(task, arch, nimpl);
+		return model->per_arch[comb][impl].size_base(task, arch, impl);
 	}
 	else if (model && model->size_base)
 	{
-		return model->size_base(task, nimpl);
+		return model->size_base(task, impl);
 	}
 	else
 	{
@@ -103,11 +165,11 @@ static void insert_history_entry(struct starpu_perfmodel_history_entry *entry, s
 	HASH_ADD_UINT32_T(*history_ptr, footprint, table);
 }
 
-static void dump_reg_model(FILE *f, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl)
+static void dump_reg_model(FILE *f, struct starpu_perfmodel *model, int comb, int impl)
 {
 	struct starpu_perfmodel_per_arch *per_arch_model;
 
-	per_arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
+	per_arch_model = &model->per_arch[comb][impl];
 	struct starpu_perfmodel_regression_model *reg_model;
 	reg_model = &per_arch_model->regression;
 
@@ -275,14 +337,10 @@ static void parse_per_arch_model_file(FILE *f, struct starpu_perfmodel_per_arch
 }
 
 
-static void parse_arch(FILE *f, struct starpu_perfmodel *model, unsigned scan_history,struct starpu_perfmodel_arch* arch)
+static void parse_arch(FILE *f, struct starpu_perfmodel *model, unsigned scan_history, int comb)
 {
 	struct starpu_perfmodel_per_arch dummy;
 	unsigned nimpls, implmax, impl, i, ret;
-	//_STARPU_DEBUG("Parsing %s_%u_parallel_%u\n",
-	//		starpu_perfmodel_get_archtype_name(arch->type),
-	//		arch->devid,
-	//		arch->ncore + 1);
 
 	/* Parsing number of implementation */
 	_starpu_drop_comments(f);
@@ -293,8 +351,14 @@ static void parse_arch(FILE *f, struct starpu_perfmodel *model, unsigned scan_hi
 	{
 		/* Parsing each implementation */
 		implmax = STARPU_MIN(nimpls, STARPU_MAXIMPLEMENTATIONS);
+		if(implmax > 0)
+		{
+			model->nimpls[comb] = implmax;
+			model->per_arch[comb] = (struct starpu_perfmodel_per_arch*)malloc(implmax*sizeof(struct starpu_perfmodel_per_arch));
+		}
+
 		for (impl = 0; impl < implmax; impl++)
-			parse_per_arch_model_file(f, &model->per_arch[arch->type][arch->devid][arch->ncore][impl], scan_history);
+			parse_per_arch_model_file(f, &model->per_arch[comb][impl], scan_history);
 	}
 	else
 	{
@@ -308,99 +372,89 @@ static void parse_arch(FILE *f, struct starpu_perfmodel *model, unsigned scan_hi
 
 }
 
-static void parse_device(FILE *f, struct starpu_perfmodel *model, unsigned scan_history, enum starpu_worker_archtype archtype, unsigned devid)
+static enum starpu_worker_archtype _get_enum_type(int type)
 {
-	unsigned maxncore, ncore, ret, i;
-	struct starpu_perfmodel_arch arch;
-	arch.type = archtype;
-	arch.devid = devid;
-	//_STARPU_DEBUG("Parsing device %s_%u arch\n",
-	//		starpu_perfmodel_get_archtype_name(archtype),
-	//		devid);
-
-	/* Parsing maximun number of worker for this device */
-	_starpu_drop_comments(f);
-	ret = fscanf(f, "%u\n", &maxncore);
-	STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
-
-	/* Parsing each arch */
-	if(model !=NULL)
-	{
-		for(ncore=0; ncore < maxncore && model->per_arch[archtype][devid][ncore] != NULL; ncore++)
-		{
-			arch.ncore = ncore;
-			parse_arch(f,model,scan_history,&arch);
-		}
-	}
-	else
+	switch(type)
 	{
-		ncore=0;
+	        case 0:
+			return STARPU_CPU_WORKER;
+        	case 1:
+			return STARPU_CUDA_WORKER;
+	        case 2: 
+			return STARPU_OPENCL_WORKER;
+        	case 3:
+			return STARPU_MIC_WORKER;
+        	case 4:
+			return STARPU_SCC_WORKER;
+		default:
+			STARPU_ABORT();
 	}
 
-	for(i=ncore; i < maxncore; i++)
-	{
-		arch.ncore = i;
-		parse_arch(f,NULL,scan_history,&arch);
-	}
 }
-
-
-static void parse_archtype(FILE *f, struct starpu_perfmodel *model, unsigned scan_history, enum starpu_worker_archtype archtype)
+static void parse_comb(FILE *f, struct starpu_perfmodel *model, unsigned scan_history, int comb)
 {
-	unsigned ndevice, devid, ret, i;
-	//_STARPU_DEBUG("Parsing %s arch\n", starpu_perfmodel_get_archtype_name(archtype));
-
-	/* Parsing number of device for this archtype */
+	int ndevices = 0;
 	_starpu_drop_comments(f);
-	ret = fscanf(f, "%u\n", &ndevice);
+	int ret = fscanf(f, "%d\n", &ndevices );
 	STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
 
-	/* Parsing each device for this archtype*/
-	if(model != NULL)
-	{
-		for(devid=0; devid < ndevice && model->per_arch[archtype][devid] != NULL; devid++)
-		{
-				parse_device(f,model,scan_history,archtype,devid);
-		}
-	}
-	else
+	struct starpu_perfmodel_device devices[ndevices];
+	
+	int dev;
+	for(dev = 0; dev < ndevices; dev++)
 	{
-		devid=0;
+		enum starpu_worker_archtype dev_type;
+		_starpu_drop_comments(f);
+		int type;
+		ret = fscanf(f, "%d\n", &type);
+		STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
+		dev_type = _get_enum_type(type);
+		int dev_id;
+		_starpu_drop_comments(f);
+		ret = fscanf(f, "%d\n", &dev_id);
+		STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
+		int ncores;
+		_starpu_drop_comments(f);
+		ret = fscanf(f, "%d\n", &ncores);
+		STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
+		devices[dev].type = dev_type;	
+		devices[dev].devid = dev_id;
+		devices[dev].ncores = ncores;
 	}
+	starpu_add_arch_comb(ndevices, devices);
 
-	for(i=devid; i < ndevice; i++)
-	{
-		parse_device(f,NULL,scan_history,archtype,i);
-	}
+	parse_arch(f, model, scan_history, comb);
 }
 
 static void parse_model_file(FILE *f, struct starpu_perfmodel *model, unsigned scan_history)
 {
-	unsigned archtype;
 	int ret, version;
 
-	//_STARPU_DEBUG("Start parsing\n");
-
 	/* Parsing performance model version */
 	_starpu_drop_comments(f);
 	ret = fscanf(f, "%d\n", &version);
 	STARPU_ASSERT_MSG(version == _STARPU_PERFMODEL_VERSION, "Incorrect performance model file with a model version %d not being the current model version (%d)\n",
 			  version, _STARPU_PERFMODEL_VERSION);
 	STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
+	
+	
+	int ncombs = 0;
+	_starpu_drop_comments(f);
+	ret = fscanf(f, "%d\n", &ncombs);
+	STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
+	if(ncombs > 0)
+		model->ncombs = ncombs;
 
-	/* Parsing each kind of archtype */
-	for(archtype=0; archtype<STARPU_NARCH; archtype++)
-	{
-		parse_archtype(f, model, scan_history, archtype);
-	}
+	int comb;
+	for(comb = 0; comb < ncombs; comb++)
+		parse_comb(f, model, scan_history, comb);
 }
 
-
-static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned nimpl)
+static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, int comb, unsigned impl)
 {
 	struct starpu_perfmodel_per_arch *per_arch_model;
 
-	per_arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
+	per_arch_model = &model->per_arch[comb][impl];
 	/* count the number of elements in the lists */
 	struct starpu_perfmodel_history_list *ptr = NULL;
 	unsigned nentries = 0;
@@ -418,12 +472,12 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, st
 
 	/* header */
 	char archname[32];
-	starpu_perfmodel_get_arch_name(arch, archname, 32, nimpl);
+	starpu_perfmodel_get_arch_name(arch_combs[comb], archname,  32, impl);
 	fprintf(f, "#####\n");
 	fprintf(f, "# Model for %s\n", archname);
 	fprintf(f, "# number of entries\n%u\n", nentries);
 
-	dump_reg_model(f, model, arch, nimpl);
+	dump_reg_model(f, model, comb, impl);
 
 	/* Dump the history into the model file in case it is necessary */
 	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
@@ -440,218 +494,58 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, st
 	fprintf(f, "\n");
 }
 
-static unsigned get_n_entries(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned impl)
-{
-	struct starpu_perfmodel_per_arch *per_arch_model;
-	per_arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][impl];
-	/* count the number of elements in the lists */
-	struct starpu_perfmodel_history_list *ptr = NULL;
-	unsigned nentries = 0;
-
-	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
-	{
-		/* Dump the list of all entries in the history */
-		ptr = per_arch_model->list;
-		while(ptr)
-		{
-			nentries++;
-			ptr = ptr->next;
-		}
-	}
-	return nentries;
-}
-
 static void dump_model_file(FILE *f, struct starpu_perfmodel *model)
 {
-	struct _starpu_machine_config *conf = _starpu_get_machine_config();
-	char *name = "unknown";
-	unsigned archtype, ndevice, *ncore, devid, nc, nimpl;
-	struct starpu_perfmodel_arch arch;
-
 	fprintf(f, "##################\n");
 	fprintf(f, "# Performance Model Version\n");
 	fprintf(f, "%d\n\n", _STARPU_PERFMODEL_VERSION);
 
-	for(archtype=0; archtype<STARPU_NARCH; archtype++)
-	{
-		arch.type = archtype;
-		switch (archtype)
-		{
-			case STARPU_CPU_WORKER:
-				ndevice = 1;
-				ncore = &conf->topology.nhwcpus;
-				name = "CPU";
-				break;
-			case STARPU_CUDA_WORKER:
-				ndevice = conf->topology.nhwcudagpus;
-				ncore = NULL;
-				name = "CUDA";
-				break;
-			case STARPU_OPENCL_WORKER:
-				ndevice = conf->topology.nhwopenclgpus;
-				ncore = NULL;
-				name = "OPENCL";
-				break;
-			case STARPU_MIC_WORKER:
-				ndevice = conf->topology.nhwmicdevices;
-				ncore = conf->topology.nhwmiccores;
-				name = "MIC";
-				break;
-			case STARPU_SCC_WORKER:
-				ndevice = conf->topology.nhwscc;
-				ncore = NULL;
-				name = "SCC";
-				break;
-			default:
-				/* Unknown arch */
-				STARPU_ABORT();
-				break;
-		}
-
-		fprintf(f, "####################\n");
-		fprintf(f, "# %ss\n", name);
-		fprintf(f, "# number of %s devices\n", name);
-		fprintf(f, "%u\n", ndevice);
-
-
-		for(devid=0; devid<ndevice; devid++)
-		{
-			arch.devid = devid;
-			fprintf(f, "###############\n");
-			fprintf(f, "# %s_%u\n", name, devid);
-			fprintf(f, "# number of workers on device %s_%d\n", name, devid);
-			if(ncore != NULL)
-				fprintf(f, "%u\n", ncore[devid]);
-			else
-				fprintf(f, "1\n");
-			for(nc=0; model->per_arch[archtype][devid][nc] != NULL; nc++)
-			{
-
-				arch.ncore = nc;
-				unsigned max_impl = 0;
-				if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
-				{
-					for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-						if (get_n_entries(model, &arch, nimpl))
-							max_impl = nimpl + 1;
-				}
-				else if (model->type == STARPU_REGRESSION_BASED || model->type == STARPU_PER_ARCH || model->type == STARPU_COMMON)
-				{
-					for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-						if (model->per_arch[archtype][devid][nc][nimpl].regression.nsample)
-							max_impl = nimpl + 1;
-				}
-				else
-					STARPU_ASSERT_MSG(0, "Unknown history-based performance model %u", model->type);
+	int ncombs = model->ncombs;
 
-				fprintf(f, "##########\n");
-				fprintf(f, "# %u worker(s) in parallel\n", nc+1);
+	fprintf(f, "####################\n");
+	fprintf(f, "# COMBs\n");
+	fprintf(f, "# number of combinations\n");
+	fprintf(f, "%u\n", ncombs);
 
-				fprintf(f, "# number of implementations\n");
-				fprintf(f, "%u\n", max_impl);
-				for (nimpl = 0; nimpl < max_impl; nimpl++)
-				{
-					dump_per_arch_model_file(f, model, &arch, nimpl);
-				}
-			}
-		}
-	}
-}
-
-static void initialize_per_arch_model(struct starpu_perfmodel_per_arch *per_arch_model)
-{
-	memset(per_arch_model, 0, sizeof(struct starpu_perfmodel_per_arch));
-}
-
-static struct starpu_perfmodel_per_arch*** initialize_arch_model(int maxdevid, unsigned* maxncore_table)
-{
-	int devid, ncore, nimpl;
-	struct starpu_perfmodel_per_arch *** arch_model = malloc(sizeof(*arch_model)*(maxdevid+1));
-	arch_model[maxdevid] = NULL;
-	for(devid=0; devid<maxdevid; devid++)
+	int comb, impl, dev;
+	for(comb = 0; comb < ncombs; comb++)
 	{
-		int maxncore;
-		if(maxncore_table != NULL)
-			maxncore = maxncore_table[devid];
-		else
-			maxncore = 1;
-
-		arch_model[devid] = malloc(sizeof(*arch_model[devid])*(maxncore+1));
-		arch_model[devid][maxncore] = NULL;
-		for(ncore=0; ncore<maxncore; ncore++)
+		int ndevices = arch_combs[comb]->ndevices;
+		fprintf(f, "####################\n");
+		fprintf(f, "# COMB_%d\n", comb);
+		fprintf(f, "# number of types devices\n");
+		fprintf(f, "%u\n", ndevices);
+		
+		for(dev = 0; dev < ndevices; dev++)
 		{
-			arch_model[devid][ncore] = malloc(sizeof(*arch_model[devid][ncore])*STARPU_MAXIMPLEMENTATIONS);
-			for(nimpl=0; nimpl<STARPU_MAXIMPLEMENTATIONS; nimpl++)
-			{
-				initialize_per_arch_model(&arch_model[devid][ncore][nimpl]);
-			}
+			fprintf(f, "####################\n");
+			fprintf(f, "# DEV_%d\n", dev);
+			fprintf(f, "# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3, SCC - 4)\n");
+			fprintf(f, "%u\n", arch_combs[comb]->devices[dev].type);
+
+			fprintf(f, "####################\n");
+			fprintf(f, "# DEV_%d\n", dev);
+			fprintf(f, "# device id \n");
+			fprintf(f, "%u\n", arch_combs[comb]->devices[dev].devid);
+
+			fprintf(f, "####################\n");
+			fprintf(f, "# DEV_%d\n", dev);
+			fprintf(f, "# number of cores \n");
+			fprintf(f, "%u\n", arch_combs[comb]->devices[dev].ncores);
 		}
-	}
-	return arch_model;
-}
-
-static void initialize_model(struct starpu_perfmodel *model)
-{
-	struct _starpu_machine_config *conf = _starpu_get_machine_config();
-	model->per_arch = malloc(sizeof(*model->per_arch)*(STARPU_NARCH));
-
-	model->per_arch[STARPU_CPU_WORKER] = initialize_arch_model(1,&conf->topology.nhwcpus);
-	model->per_arch[STARPU_CUDA_WORKER] = initialize_arch_model(conf->topology.nhwcudagpus,NULL);
-	model->per_arch[STARPU_OPENCL_WORKER] = initialize_arch_model(conf->topology.nhwopenclgpus,NULL);
-	model->per_arch[STARPU_MIC_WORKER] = initialize_arch_model(conf->topology.nhwmicdevices,conf->topology.nhwmiccores);
-	model->per_arch[STARPU_SCC_WORKER] = initialize_arch_model(conf->topology.nhwscc,NULL);
-}
-
-static void initialize_model_with_file(FILE*f, struct starpu_perfmodel *model)
-{
-	unsigned ret, archtype, devid, i, ndevice, * maxncore;
-	struct starpu_perfmodel_arch arch;
-	int version;
-
-	/* Parsing performance model version */
-	_starpu_drop_comments(f);
-	ret = fscanf(f, "%d\n", &version);
-	STARPU_ASSERT_MSG(version == _STARPU_PERFMODEL_VERSION, "Incorrect performance model file with a model version %d not being the current model version (%d)\n",
-			version, _STARPU_PERFMODEL_VERSION);
-	STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
-
-	model->per_arch = malloc(sizeof(*model->per_arch)*(STARPU_NARCH));
-	for(archtype=0; archtype<STARPU_NARCH; archtype++)
-	{
-		arch.type = archtype;
-
-		_starpu_drop_comments(f);
-		ret = fscanf(f, "%u\n", &ndevice);
-		STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
-
-		if(ndevice != 0)
-			maxncore = malloc(sizeof(*maxncore)*ndevice);
-		else
-			maxncore = NULL;
-
-		for(devid=0; devid < ndevice; devid++)
+		
+		int nimpls = model->nimpls[comb];
+		fprintf(f, "##########\n");
+		fprintf(f, "# number of implementations\n");
+		fprintf(f, "%u\n", nimpls);
+		for (impl = 0; impl < nimpls; impl++)
 		{
-			arch.devid = devid;
-
-			_starpu_drop_comments(f);
-			ret = fscanf(f, "%u\n", &maxncore[devid]);
-			STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
-
-			for(i=0; i<maxncore[devid]; i++)
-			{
-				arch.ncore = i;
-
-				parse_arch(f,NULL,0,&arch);
-			}
+			dump_per_arch_model_file(f, model, comb, impl);
 		}
-
-		model->per_arch[archtype] = initialize_arch_model(ndevice,maxncore);
-		if(maxncore != NULL)
-			free(maxncore);
 	}
 }
 
-void starpu_perfmodel_init(struct starpu_perfmodel *model)
+void starpu_perfmodel_init(FILE *f, struct starpu_perfmodel *model)
 {
 	STARPU_ASSERT(model && model->symbol);
 
@@ -677,43 +571,35 @@ void starpu_perfmodel_init(struct starpu_perfmodel *model)
 
 	STARPU_PTHREAD_RWLOCK_INIT(&model->model_rwlock, NULL);
 	if(model->type != STARPU_COMMON)
-		initialize_model(model);
-	model->is_init = 1;
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&registered_models_rwlock);
-}
-
-void starpu_perfmodel_init_with_file(FILE*f, struct starpu_perfmodel *model)
-{
-	STARPU_ASSERT(model && model->symbol);
-
-	int already_init;
-
-	STARPU_PTHREAD_RWLOCK_RDLOCK(&registered_models_rwlock);
-	already_init = model->is_init;
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&registered_models_rwlock);
-
-	if (already_init)
-		return;
-
-	/* The model is still not loaded so we grab the lock in write mode, and
-	 * if it's not loaded once we have the lock, we do load it. */
-	STARPU_PTHREAD_RWLOCK_WRLOCK(&registered_models_rwlock);
-
-	/* Was the model initialized since the previous test ? */
-	if (model->is_init)
 	{
-		STARPU_PTHREAD_RWLOCK_UNLOCK(&registered_models_rwlock);
-		return;
+		struct _starpu_machine_config *conf = _starpu_get_machine_config();
+		unsigned ncores = conf->topology.nhwcpus;
+		unsigned ncuda =  conf->topology.nhwcudagpus;
+		unsigned nopencl = conf->topology.nhwopenclgpus;
+		unsigned nmic = 0;
+		unsigned i;
+		for(i = 0; i < conf->topology.nhwmicdevices; i++)
+			nmic += conf->topology.nhwmiccores[i];
+		unsigned nscc = conf->topology.nhwscc;
+		unsigned npossible_combs= pow(2, (ncores + ncuda + nopencl + nmic + nscc));
+		arch_combs = (struct starpu_perfmodel_arch**) malloc(npossible_combs*sizeof(struct starpu_perfmodel_arch*));
+		narch_combs = 0;
+		model->per_arch = (struct starpu_perfmodel_per_arch**) malloc(npossible_combs*sizeof(struct starpu_perfmodel_per_arch*));
+		model->nimpls = (int *)malloc(npossible_combs*sizeof(int));
+
+		for(i = 0; i < npossible_combs; i++)
+		{
+			model->per_arch[i] = NULL;
+			model->nimpls[i] = 0;
+		}
+		if(f)
+			parse_model_file(f, model, 0);
 	}
 
-	STARPU_PTHREAD_RWLOCK_INIT(&model->model_rwlock, NULL);
-	if(model->type != STARPU_COMMON)
-		initialize_model_with_file(f,model);
 	model->is_init = 1;
 	STARPU_PTHREAD_RWLOCK_UNLOCK(&registered_models_rwlock);
 }
 
-
 static void get_model_debug_path(struct starpu_perfmodel *model, const char *arch, char *path, size_t maxlen)
 {
 	STARPU_ASSERT(path);
@@ -731,11 +617,11 @@ static void get_model_debug_path(struct starpu_perfmodel *model, const char *arc
 }
 
 /*
- * Returns 0 is the model was already loaded, 1 otherwise.
+ * Returns 0 if the model was already loaded, 1 otherwise.
  */
 int _starpu_register_model(struct starpu_perfmodel *model)
 {
-	starpu_perfmodel_init(model);
+	starpu_perfmodel_init(NULL, model);
 
 	/* If the model has already been loaded, there is nothing to do */
 	STARPU_PTHREAD_RWLOCK_RDLOCK(&registered_models_rwlock);
@@ -768,34 +654,19 @@ int _starpu_register_model(struct starpu_perfmodel *model)
 #ifdef STARPU_MODEL_DEBUG
 	_starpu_create_sampling_directory_if_needed();
 
-	unsigned archtype, devid, ncore, nimpl;
-	struct starpu_perfmodel_arch arch;
-
 	_STARPU_DEBUG("\n\n ###\nHere\n ###\n\n");
 
 	if(model->is_init)
 	{
 		_STARPU_DEBUG("Init\n");
-		for (archtype = 0; archtype < STARPU_NARCH; archtype++)
+		/* walk every recorded arch combination; the per-comb impl count
+		 * lives in model->nimpls[comb] (there is no scalar model->nimpl) */
+		int ncombs = model->ncombs;
+		int comb, impl;
+		for(comb = 0; comb < ncombs; comb++)
 		{
-			_STARPU_DEBUG("Archtype\n");
-			arch.type = archtype;
-			if(model->per_arch[archtype] != NULL)
+			int nimpls = model->nimpls[comb];
+			for(impl = 0; impl < nimpls; impl++)
 			{
-				for(devid=0; model->per_arch[archtype][devid] != NULL; devid++)
-				{
-					_STARPU_DEBUG("Devid\n");
-					arch.devid = devid;
-					for(ncore=0; model->per_arch[archtype][devid][ncore] != NULL; ncore++)
-					{
-						_STARPU_DEBUG("Ncore\n");
-						arch.ncore = ncore;
-						for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-						{
-							starpu_perfmodel_debugfilepath(model, &arch, model->per_arch[archtype][devid][ncore][nimpl].debug_path, 256, nimpl);
-						}
-					}
-				}
+				/* debugfilepath expects the arch combination itself,
+				 * not the per-arch model entry */
+				starpu_perfmodel_debugfilepath(model, arch_combs[comb], model->per_arch[comb][impl].debug_path, 256, impl);
 			}
 		}
 	}
@@ -868,53 +739,44 @@ void _starpu_initialize_registered_performance_models(void)
 
 void _starpu_deinitialize_performance_model(struct starpu_perfmodel *model)
 {
-	unsigned arch, devid, ncore, nimpl;
-
 	if(model->is_init && model->per_arch != NULL)
 	{
-		for (arch = 0; arch < STARPU_NARCH; arch++)
+		int ncombs = model->ncombs;
+		int comb, impl;
+		for(comb = 0; comb < ncombs; comb++)
 		{
-			if( model->per_arch[arch] != NULL)
+			int nimpls = model->nimpls[comb];
+			for(impl = 0; impl < nimpls; impl++)
 			{
-				for(devid=0; model->per_arch[arch][devid] != NULL; devid++)
+				struct starpu_perfmodel_per_arch *archmodel = &model->per_arch[comb][impl];
+				struct starpu_perfmodel_history_list *list, *plist;
+				struct starpu_perfmodel_history_table *entry, *tmp;
+				
+				HASH_ITER(hh, archmodel->history, entry, tmp)
 				{
-					for(ncore=0; model->per_arch[arch][devid][ncore] != NULL; ncore++)
-					{
-						for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-						{
-							struct starpu_perfmodel_per_arch *archmodel = &model->per_arch[arch][devid][ncore][nimpl];
-							struct starpu_perfmodel_history_list *list, *plist;
-							struct starpu_perfmodel_history_table *entry, *tmp;
-
-							HASH_ITER(hh, archmodel->history, entry, tmp)
-							{
-								HASH_DEL(archmodel->history, entry);
-								free(entry);
-							}
-							archmodel->history = NULL;
-
-							list = archmodel->list;
-							while (list)
-							{
-								free(list->entry);
-								plist = list;
-								list = list->next;
-								free(plist);
-							}
-							archmodel->list = NULL;
-						}
-						free(model->per_arch[arch][devid][ncore]);
-						model->per_arch[arch][devid][ncore] = NULL;
-					}
-					free(model->per_arch[arch][devid]);
-					model->per_arch[arch][devid] = NULL;
+					HASH_DEL(archmodel->history, entry);
+					free(entry);
 				}
-				free(model->per_arch[arch]);
-				model->per_arch[arch] = NULL;
+				archmodel->history = NULL;
+				
+				list = archmodel->list;
+				while (list)
+				{
+					free(list->entry);
+					plist = list;
+					list = list->next;
+					free(plist);
+				}
+				archmodel->list = NULL;
 			}
-		}
+			free(model->per_arch[comb]);
+			model->per_arch[comb] = NULL;
+		}		
 		free(model->per_arch);
 		model->per_arch = NULL;
+		free(model->nimpls);
+		model->nimpls = NULL;
+
 	}
 
 	model->is_init = 0;
@@ -949,6 +811,7 @@ void _starpu_deinitialize_registered_performance_models(void)
 
 	STARPU_PTHREAD_RWLOCK_UNLOCK(&registered_models_rwlock);
 	STARPU_PTHREAD_RWLOCK_DESTROY(&registered_models_rwlock);
+	_free_arch_combs();
 }
 
 /*
@@ -958,12 +821,12 @@ void _starpu_deinitialize_registered_performance_models(void)
  */
 void _starpu_load_per_arch_based_model(struct starpu_perfmodel *model)
 {
-	starpu_perfmodel_init(model);
+	starpu_perfmodel_init(NULL, model);
 }
 
 void _starpu_load_common_based_model(struct starpu_perfmodel *model)
 {
-	starpu_perfmodel_init(model);
+	starpu_perfmodel_init(NULL, model);
 }
 
 /* We first try to grab the global lock in read mode to check whether the model
@@ -972,7 +835,7 @@ void _starpu_load_common_based_model(struct starpu_perfmodel *model)
  * is still not loaded once we have the lock, we do load it.  */
 void _starpu_load_history_based_model(struct starpu_perfmodel *model, unsigned scan_history)
 {
-	starpu_perfmodel_init(model);
+	starpu_perfmodel_init(NULL, model);
 
 	STARPU_PTHREAD_RWLOCK_WRLOCK(&model->model_rwlock);
 
@@ -1099,9 +962,6 @@ int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *mo
 	FILE *f = fopen(path, "r");
 	STARPU_ASSERT(f);
 
-	starpu_perfmodel_init_with_file(f, model);
-	rewind(f);
-
+	/* allocate per_arch/nimpls before parsing, otherwise parse_model_file
+	 * dereferences NULL arrays on a freshly declared model */
+	starpu_perfmodel_init(NULL, model);
+
 	parse_model_file(f, model, 1);
 
 	STARPU_ASSERT(fclose(f) == 0);
@@ -1141,18 +1001,19 @@ char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype)
 	}
 }
 
-void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch* arch, char *archname, size_t maxlen,unsigned nimpl)
+void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch* arch, char *archname, size_t maxlen,unsigned impl)
 {
-	snprintf(archname, maxlen, "%s%d_parallel%d_impl%u",
-			starpu_perfmodel_get_archtype_name(arch->type),
-			arch->devid,
-			arch->ncore + 1,
-			nimpl);
+	int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
+	STARPU_ASSERT(comb != -1);
+
+	snprintf(archname, maxlen, "%d_impl%u", comb, impl);
 }
 
 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model,
 				    struct starpu_perfmodel_arch* arch, char *path, size_t maxlen, unsigned nimpl)
 {
+	int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
+	STARPU_ASSERT(comb != -1);
 	char archname[32];
 	starpu_perfmodel_get_arch_name(arch, archname, 32, nimpl);
 
@@ -1163,11 +1024,13 @@ void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model,
 
 double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl)
 {
+	int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
 	double exp = NAN;
+	if(comb == -1) return exp;
 	size_t size = _starpu_job_get_data_size(model, arch, nimpl, j);
 	struct starpu_perfmodel_regression_model *regmodel;
 
-	regmodel = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].regression;
+	regmodel = &model->per_arch[comb][nimpl].regression;
 
 	if (regmodel->valid && size >= regmodel->minx * 0.9 && size <= regmodel->maxx * 1.1)
                 exp = regmodel->alpha*pow((double)size, regmodel->beta);
@@ -1177,18 +1040,20 @@ double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model
 
 double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j,unsigned nimpl)
 {
+	int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
 	double exp = NAN;
+	if(comb == -1) return exp;
 	size_t size = _starpu_job_get_data_size(model, arch, nimpl, j);
 	struct starpu_perfmodel_regression_model *regmodel;
 
-	regmodel = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].regression;
+	regmodel = &model->per_arch[comb][nimpl].regression;
 
 	if (regmodel->nl_valid && size >= regmodel->minx * 0.9 && size <= regmodel->maxx * 1.1)
 		exp = regmodel->a*pow((double)size, regmodel->b) + regmodel->c;
 	else
 	{
 		uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
-		struct starpu_perfmodel_per_arch *per_arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
+		struct starpu_perfmodel_per_arch *per_arch_model = &model->per_arch[comb][nimpl];
 		struct starpu_perfmodel_history_table *history;
 		struct starpu_perfmodel_history_table *entry;
 
@@ -1221,14 +1086,16 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 
 double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j,unsigned nimpl)
 {
+	int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
 	double exp = NAN;
+	if(comb == -1) return exp;
 	struct starpu_perfmodel_per_arch *per_arch_model;
 	struct starpu_perfmodel_history_entry *entry;
 	struct starpu_perfmodel_history_table *history, *elt;
 
 	uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
 
-	per_arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
+	per_arch_model = &model->per_arch[comb][nimpl];
 
 	STARPU_PTHREAD_RWLOCK_RDLOCK(&model->model_rwlock);
 	history = per_arch_model->history;
@@ -1270,20 +1137,31 @@ double starpu_permodel_history_based_expected_perf(struct starpu_perfmodel *mode
 	return _starpu_history_based_job_expected_perf(model, arch, &j, j.nimpl);
 }
 
-void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured, unsigned nimpl)
+void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured, unsigned impl)
 {
 	if (model)
 	{
+		int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
+		if(comb == -1)
+			comb = starpu_add_arch_comb(arch->ndevices, arch->devices);
 		STARPU_PTHREAD_RWLOCK_WRLOCK(&model->model_rwlock);
 
-		struct starpu_perfmodel_per_arch *per_arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
+		if(!model->per_arch[comb])
+		{
+			model->per_arch[comb] = (struct starpu_perfmodel_per_arch*)malloc(STARPU_MAXIMPLEMENTATIONS*sizeof(struct starpu_perfmodel_per_arch));
+			int i;
+			for(i = 0; i < STARPU_MAXIMPLEMENTATIONS; i++)
+				memset(&model->per_arch[comb][i], 0, sizeof(struct starpu_perfmodel_per_arch));
+		}
+
+		struct starpu_perfmodel_per_arch *per_arch_model = &model->per_arch[comb][impl];
 
 		if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
 		{
 			struct starpu_perfmodel_history_entry *entry;
 			struct starpu_perfmodel_history_table *elt;
 			struct starpu_perfmodel_history_list **list;
-			uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
+			uint32_t key = _starpu_compute_buffers_footprint(model, arch, impl, j);
 
 			list = &per_arch_model->list;
 
@@ -1310,7 +1188,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 				entry->deviation = 0.0;
 				entry->sum2 = 0;
 
-				entry->size = _starpu_job_get_data_size(model, arch, nimpl, j);
+				entry->size = _starpu_job_get_data_size(model, arch, impl, j);
 				entry->flops = j->task->flops;
 
 				entry->footprint = key;
@@ -1318,6 +1196,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 				entry->nerror = 0;
 
 				insert_history_entry(entry, list, &per_arch_model->history);
+				/* nimpls[comb] counts implementations, not history entries:
+				 * record the highest impl index seen rather than incrementing
+				 * per footprint (which overflows past STARPU_MAXIMPLEMENTATIONS) */
+				if (model->nimpls[comb] < (int) impl + 1) model->nimpls[comb] = impl + 1;
 			}
 			else
 			{
@@ -1336,7 +1215,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 					if (entry->nerror >= entry->nsample)
 					{
 						char archname[32];
-						starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
+						starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), impl);
 						_STARPU_DISP("Too big deviation for model %s on %s: %f vs average %f, %u such errors against %u samples (%+f%%), flushing the performance model. Use the STARPU_HISTORY_MAX_ERROR environement variable to control the threshold (currently %d%%)\n", model->symbol, archname, measured, entry->mean, entry->nerror, entry->nsample, measured * 100. / entry->mean - 100, historymaxerror);
 						entry->sum = 0.0;
 						entry->sum2 = 0.0;
@@ -1376,7 +1255,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 			reg_model = &per_arch_model->regression;
 
 			/* update the regression model */
-			size_t job_size = _starpu_job_get_data_size(model, arch, nimpl, j);
+			size_t job_size = _starpu_job_get_data_size(model, arch, impl, j);
 			double logy, logx;
 			logx = log((double)job_size);
 			logy = log(measured);
@@ -1414,11 +1293,11 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 		}
 
 		if (!j->footprint_is_computed)
-			(void) _starpu_compute_buffers_footprint(model, arch, nimpl, j);
+			/* _starpu_compute_buffers_footprint takes (model, arch, nimpl, j);
+			 * it does not take a comb argument */
+			(void) _starpu_compute_buffers_footprint(model, arch, impl, j);
 
 		STARPU_ASSERT(j->footprint_is_computed);
 
-		fprintf(f, "0x%x\t%lu\t%f\t%f\t%f\t%d\t\t", j->footprint, (unsigned long) _starpu_job_get_data_size(model, arch, nimpl, j), measured, task->predicted, task->predicted_transfer, cpuid);
+		fprintf(f, "0x%x\t%lu\t%f\t%f\t%f\t%d\t\t", j->footprint, (unsigned long) _starpu_job_get_data_size(model, arch, impl, j), measured, task->predicted, task->predicted_transfer, cpuid);
 		unsigned i;
 
 		for (i = 0; i < task->cl->nbuffers; i++)
@@ -1450,3 +1329,11 @@ void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct star
 	/* and save perfmodel on termination */
 	_starpu_set_calibrate_flag(1);
 }
+
+struct starpu_perfmodel_per_arch *starpu_perfmodel_get_model_per_arch(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, unsigned impl)
+{
+	int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
+	if(comb == -1) return NULL;
+	
+	return &model->per_arch[comb][impl];
+}

+ 57 - 37
src/core/perfmodel/perfmodel_print.c

@@ -19,7 +19,7 @@
 #include <starpu.h>
 #include <starpu_perfmodel.h>
 #include <common/config.h>
-
+#include "perfmodel.h"
 static
 void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per_arch_model, char *parameter, uint32_t *footprint, FILE *output)
 {
@@ -63,7 +63,9 @@ void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per
 
 void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output)
 {
-	struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
+	int comb = starpu_get_arch_comb(arch->ndevices, arch->devices);
+	STARPU_ASSERT(comb != -1);
+	struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[comb][nimpl];
 	char archname[32];
 
 	if (arch_model->regression.nsample || arch_model->regression.valid || arch_model->regression.nl_valid || arch_model->list)
@@ -170,24 +172,12 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 {
 	if (arch == NULL)
 	{
-		/* display all architectures */
-		unsigned archtype, devid, ncore, implid;
-		struct starpu_perfmodel_arch perf_arch;
-		for (archtype = 0; archtype < STARPU_NARCH; archtype++)
+		int comb, impl;
+		for(comb = 0; comb < narch_combs; comb++)
 		{
-			perf_arch.type = archtype;
-			for(devid = 0; model->per_arch[archtype][devid] != NULL; devid++)
-			{
-				perf_arch.devid = devid;
-				for(ncore = 0; model->per_arch[archtype][devid][ncore] != NULL; ncore++)
-				{
-					perf_arch.ncore = ncore;
-					for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-					{ /* Display all codelets on each arch */
-						starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
-					}
-				}
-			}
+			int nimpls = model->nimpls[comb];
+			for(impl = 0; impl < nimpls; impl++)
+				starpu_perfmodel_print(model, arch_combs[comb], impl, parameter, footprint, output);
 		}
 	}
 	else
@@ -196,11 +186,17 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 		{
 			unsigned implid;
 			struct starpu_perfmodel_arch perf_arch;
-			perf_arch.type = STARPU_CPU_WORKER;
-			perf_arch.devid = 0;
-			perf_arch.ncore = 0;
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+			perf_arch.ndevices = 1;
+			perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+			perf_arch.devices[0].type = STARPU_CPU_WORKER;
+			perf_arch.devices[0].devid = 0;
+			perf_arch.devices[0].ncores = 1;
+			int comb = starpu_get_arch_comb(perf_arch.ndevices, perf_arch.devices);
+			STARPU_ASSERT(comb != -1);
+			int nimpls = model->nimpls[comb];
+			for (implid = 0; implid < nimpls; implid++)
 				starpu_perfmodel_print(model, &perf_arch,implid, parameter, footprint, output); /* Display all codelets on cpu */
+			free(perf_arch.devices);
 			return 0;
 		}
 
@@ -216,11 +212,18 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 
 			unsigned implid;
 			struct starpu_perfmodel_arch perf_arch;
-			perf_arch.type = STARPU_CPU_WORKER;
-			perf_arch.devid = 0;
-			perf_arch.ncore = k-1;
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+			perf_arch.ndevices = 1;
+			perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+			perf_arch.devices[0].type = STARPU_CPU_WORKER;
+			perf_arch.devices[0].devid = 0;
+			perf_arch.devices[0].ncores = k-1;
+			int comb = starpu_get_arch_comb(perf_arch.ndevices, perf_arch.devices);
+			STARPU_ASSERT(comb != -1);
+			int nimpls = model->nimpls[comb];
+
+			for (implid = 0; implid < nimpls; implid++)
 				starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
+			free(perf_arch.devices);
 			return 0;
 		}
 
@@ -229,15 +232,24 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 			unsigned devid;
 			unsigned implid;
 			struct starpu_perfmodel_arch perf_arch;
-			perf_arch.type = STARPU_CUDA_WORKER;
-			perf_arch.ncore = 0;
 
-			for (devid = 0; model->per_arch[STARPU_CUDA_WORKER] != NULL; devid++)
+			perf_arch.ndevices = 1;
+			perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+			perf_arch.devices[0].type = STARPU_CUDA_WORKER;
+			perf_arch.devices[0].ncores = 1;
+			int comb;
+			for(comb = 0; comb < narch_combs; comb++)
 			{
-				perf_arch.devid = devid;
-				for (implid = 0; implid <STARPU_MAXIMPLEMENTATIONS; implid ++)
-					starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
+				if(arch_combs[comb]->ndevices == 1 && arch_combs[comb]->devices[0].type == STARPU_CUDA_WORKER)
+				{
+					perf_arch.devices[0].devid = arch_combs[comb]->devices[0].devid;
+					int nimpls = model->nimpls[comb];
+
+					for (implid = 0; implid < nimpls; implid++)
+						starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
+				}
 			}
+			free(perf_arch.devices);
 			return 0;
 		}
 
@@ -248,11 +260,19 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 		if (nmatched == 1)
 		{
 			struct starpu_perfmodel_arch perf_arch;
-			perf_arch.type = STARPU_CUDA_WORKER;
-			perf_arch.devid = gpuid;
-			perf_arch.ncore = 0;
+			perf_arch.ndevices = 1;
+			perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+
+			perf_arch.devices[0].type = STARPU_CUDA_WORKER;
+			perf_arch.devices[0].devid = gpuid;
+			perf_arch.devices[0].ncores = 1;
+
+			int comb = starpu_get_arch_comb(perf_arch.ndevices, perf_arch.devices);
+			STARPU_ASSERT(comb != -1);
+			int nimpls = model->nimpls[comb];
+
 			unsigned implid;
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+			for (implid = 0; implid < nimpls; implid++)
 				starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
 			return 0;
 		}

+ 6 - 0
src/core/sched_ctx.c

@@ -1951,3 +1951,9 @@ void starpu_sched_ctx_unbook_workers_for_task(unsigned sched_ctx_id, int master)
 	/* wake up starpu workers */
 	_starpu_sched_ctx_wake_up_workers(sched_ctx_id, master);
 }
+
+struct starpu_perfmodel_arch * _starpu_sched_ctx_get_perf_archtype(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	return &sched_ctx->perf_arch;
+}

+ 3 - 0
src/core/sched_ctx.h

@@ -150,6 +150,8 @@ struct _starpu_sched_ctx
 	/* ctx nesting the current ctx */
 	unsigned nesting_sched_ctx;
 
+	/* perf model for the device comb of the ctx */
+	struct starpu_perfmodel_arch perf_arch;
 };
 
 struct _starpu_machine_config;
@@ -224,6 +226,7 @@ void _starpu_fetch_tasks_from_empty_ctx_list(struct _starpu_sched_ctx *sched_ctx
 
 unsigned _starpu_sched_ctx_allow_hypervisor(unsigned sched_ctx_id);
 
+struct starpu_perfmodel_arch * _starpu_sched_ctx_get_perf_archtype(unsigned sched_ctx);
 #ifdef STARPU_USE_SC_HYPERVISOR
 /* Notifies the hypervisor that a tasks was poped from the workers' list */
 void _starpu_sched_ctx_post_exec_task_cb(int workerid, struct starpu_task *task, size_t data_size, uint32_t footprint);

+ 6 - 3
src/core/task.c

@@ -255,10 +255,13 @@ int _starpu_submit_job(struct _starpu_job *j)
 	   && sched_ctx->perf_counters != NULL)
 	{
 		struct starpu_perfmodel_arch arch;
-		arch.type = STARPU_CPU_WORKER;
-		arch.devid = 0;
-		arch.ncore = 0;
+		arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+		arch.ndevices = 1;
+		arch.devices[0].type = STARPU_CPU_WORKER;
+		arch.devices[0].devid = 0;
+		arch.devices[0].ncores = 1;
 		_starpu_compute_buffers_footprint(j->task->cl->model, &arch, 0, j);
+		free(arch.devices);
 		int i;
 		size_t data_size = 0;
 		for(i = 0; i < STARPU_NMAXBUFS; i++)

+ 22 - 12
src/core/topology.c

@@ -862,12 +862,15 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 		for (i = 0; i < nworker_per_cuda; i++)
 		{
 			int worker_idx = topology->nworkers + cudagpu * nworker_per_cuda + i;
+
 			config->workers[worker_idx].arch = STARPU_CUDA_WORKER;
-			config->workers[worker_idx].perf_arch.type = STARPU_CUDA_WORKER;
-			config->workers[worker_idx].perf_arch.devid = devid;
+			config->workers[worker_idx].perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+			config->workers[worker_idx].perf_arch.ndevices = 1;
+			config->workers[worker_idx].perf_arch.devices[0].type = STARPU_CUDA_WORKER;
+			config->workers[worker_idx].perf_arch.devices[0].devid = devid;
 			// TODO: fix perfmodels etc.
 			//config->workers[worker_idx].perf_arch.ncore = nworker_per_cuda - 1;
-			config->workers[worker_idx].perf_arch.ncore = 0;
+			config->workers[worker_idx].perf_arch.devices[0].ncores = 1;
 			config->workers[worker_idx].devid = devid;
 			config->workers[worker_idx].subworkerid = i;
 			config->workers[worker_idx].worker_mask = STARPU_CUDA;
@@ -940,9 +943,11 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 			break;
 		}
 		config->workers[worker_idx].arch = STARPU_OPENCL_WORKER;
-		config->workers[worker_idx].perf_arch.type = STARPU_OPENCL_WORKER;
-		config->workers[worker_idx].perf_arch.devid = devid;
-		config->workers[worker_idx].perf_arch.ncore = 0;
+		config->workers[worker_idx].perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+		config->workers[worker_idx].perf_arch.ndevices = 1;
+		config->workers[worker_idx].perf_arch.devices[0].type = STARPU_OPENCL_WORKER;
+		config->workers[worker_idx].perf_arch.devices[0].devid = devid;
+		config->workers[worker_idx].perf_arch.devices[0].ncores = 1;
 		config->workers[worker_idx].subworkerid = 0;
 		config->workers[worker_idx].devid = devid;
 		config->workers[worker_idx].worker_mask = STARPU_OPENCL;
@@ -1002,9 +1007,12 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 	{
 		config->workers[topology->nworkers + sccdev].arch = STARPU_SCC_WORKER;
 		int devid = _starpu_get_next_scc_deviceid(config);
-		config->workers[topology->nworkers + sccdev].perf_arch.type = STARPU_SCC_WORKER;
-		config->workers[topology->nworkers + sccdev].perf_arch.devid = sccdev;
-		config->workers[topology->nworkers + sccdev].perf_arch.ncore = 0;
+		config->workers[topology->nworkers + sccdev].perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+		config->workers[topology->nworkers + sccdev].perf_arch.ndevices = 1;
+
+		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].type = STARPU_SCC_WORKER;
+		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].devid = sccdev;
+		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].ncores = 1;
 		config->workers[topology->nworkers + sccdev].subworkerid = 0;
 		config->workers[topology->nworkers + sccdev].devid = devid;
 		config->workers[topology->nworkers + sccdev].worker_mask = STARPU_SCC;
@@ -1068,9 +1076,11 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 	{
 		int worker_idx = topology->nworkers + cpu;
 		config->workers[worker_idx].arch = STARPU_CPU_WORKER;
-		config->workers[worker_idx].perf_arch.type = STARPU_CPU_WORKER;
-		config->workers[worker_idx].perf_arch.devid = 0;
-		config->workers[worker_idx].perf_arch.ncore = 0;
+		config->workers[worker_idx].perf_arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+		config->workers[worker_idx].perf_arch.ndevices = 1;
+		config->workers[worker_idx].perf_arch.devices[0].type = STARPU_CPU_WORKER;
+		config->workers[worker_idx].perf_arch.devices[0].devid = 0;
+		config->workers[worker_idx].perf_arch.devices[0].ncores = 1;
 		config->workers[worker_idx].subworkerid = 0;
 		config->workers[worker_idx].devid = cpu;
 		config->workers[worker_idx].worker_mask = STARPU_CPU;

+ 20 - 17
src/datawizard/footprint.c

@@ -37,7 +37,7 @@ uint32_t starpu_task_data_footprint(struct starpu_task *task)
 	return footprint;
 }
 
-uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned nimpl, struct _starpu_job *j)
+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, struct _starpu_job *j)
 {
 	if (j->footprint_is_computed)
 		return j->footprint;
@@ -50,23 +50,26 @@ uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struc
 	{
 		footprint = model->footprint(task);
 	}
-	else if (model != NULL && model->per_arch &&
-			model->per_arch[arch->type] != NULL &&
-			model->per_arch[arch->type][arch->devid] != NULL &&
-			model->per_arch[arch->type][arch->devid][arch->ncore] != NULL &&
-			model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base)
-	{
-		size_t size = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base(task, arch, nimpl);
-		footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
-	}
-	else if (model && model->size_base)
-	{
-		size_t size = model->size_base(task, nimpl);
-		footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
-	}
 	else
-	{
-		footprint = starpu_task_data_footprint(task);
+	{ 
+		if (model != NULL && model->per_arch)
+		{
+			struct starpu_perfmodel_per_arch *per_arch = starpu_perfmodel_get_model_per_arch(model, arch, nimpl);
+			if(per_arch != NULL && per_arch->size_base)
+			{
+				size_t size = per_arch->size_base(task, arch, nimpl);
+				footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
+			}
+		}
+		else if (model && model->size_base)
+		{
+			size_t size = model->size_base(task, nimpl);
+			footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
+		}
+		else
+		{
+			footprint = starpu_task_data_footprint(task);
+		}
 	}
 
 	j->footprint = footprint;

+ 22 - 18
src/debug/traces/starpu_fxt.c

@@ -388,6 +388,8 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
 	char *kindstr = "";
 	struct starpu_perfmodel_arch arch;
+	arch.ndevices = 1;
+	arch.devices = (struct starpu_perfmodel_device *)malloc(sizeof(struct starpu_perfmodel_device));
 
 	switch (ev->param[0])
 	{
@@ -398,37 +400,37 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 		case _STARPU_FUT_CPU_KEY:
 			set_next_cpu_worker_color(workerid);
 			kindstr = "CPU";
-			arch.type = STARPU_CPU_WORKER;
-			arch.devid = 0;
-			arch.ncore = 0;
+			arch.devices[0].type = STARPU_CPU_WORKER;
+			arch.devices[0].devid = 0;
+			arch.devices[0].ncores = 1;
 			break;
 		case _STARPU_FUT_CUDA_KEY:
 			set_next_cuda_worker_color(workerid);
 			kindstr = "CUDA";
-			arch.type = STARPU_CUDA_WORKER;
-			arch.devid = devid;
-			arch.ncore = 0;
+			arch.devices[0].type = STARPU_CUDA_WORKER;
+			arch.devices[0].devid = devid;
+			arch.devices[0].ncores = 1;
 			break;
 		case _STARPU_FUT_OPENCL_KEY:
 			set_next_opencl_worker_color(workerid);
 			kindstr = "OPENCL";
-			arch.type = STARPU_OPENCL_WORKER;
-			arch.devid = devid;
-			arch.ncore = 0;
+			arch.devices[0].type = STARPU_OPENCL_WORKER;
+			arch.devices[0].devid = devid;
+			arch.devices[0].ncores = 1;
 			break;
 		case _STARPU_FUT_MIC_KEY:
 			set_next_mic_worker_color(workerid);
 			kindstr = "mic";
-			arch.type = STARPU_MIC_WORKER;
-			arch.devid = devid;
-			arch.ncore = 0;
+			arch.devices[0].type = STARPU_MIC_WORKER;
+			arch.devices[0].devid = devid;
+			arch.devices[0].ncores = 1;
 			break;
 		case _STARPU_FUT_SCC_KEY:
 			set_next_scc_worker_color(workerid);
 			kindstr = "scc";
-			arch.type = STARPU_SCC_WORKER;
-			arch.devid = devid;
-			arch.ncore = 0;
+			arch.devices[0].type = STARPU_SCC_WORKER;
+			arch.devices[0].devid = devid;
+			arch.devices[0].ncores = 1;
 			break;
 		default:
 			STARPU_ABORT();
@@ -757,9 +759,11 @@ static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 
 		snprintf(dumped_codelets[dumped_codelets_count - 1].symbol, 256, "%s", last_codelet_symbol[worker]);
 		dumped_codelets[dumped_codelets_count - 1].workerid = worker;
-		dumped_codelets[dumped_codelets_count - 1].arch.type = ev->param[3];
-		dumped_codelets[dumped_codelets_count - 1].arch.devid = ev->param[4];
-		dumped_codelets[dumped_codelets_count - 1].arch.ncore = ev->param[5];
+		dumped_codelets[dumped_codelets_count - 1].arch.ndevices = 1;
+		dumped_codelets[dumped_codelets_count - 1].arch.devices = (struct starpu_perfmodel_device *)malloc(sizeof(struct starpu_perfmodel_device));
+		dumped_codelets[dumped_codelets_count - 1].arch.devices[0].type = ev->param[3];
+		dumped_codelets[dumped_codelets_count - 1].arch.devices[0].devid = ev->param[4];
+		dumped_codelets[dumped_codelets_count - 1].arch.devices[0].ncores = ev->param[5];
 
 		dumped_codelets[dumped_codelets_count - 1].size = codelet_size;
 		dumped_codelets[dumped_codelets_count - 1].hash = codelet_hash;

+ 12 - 12
src/profiling/bound.c

@@ -426,7 +426,7 @@ static void _starpu_get_tasks_times(int nw, int nt, double *times)
 				.footprint = tp->footprint,
 				.footprint_is_computed = 1,
 			};
-			struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
+			struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
 			double length = _starpu_history_based_job_expected_perf(tp->cl->model, arch, &j, j.nimpl);
 			if (isnan(length))
 				times[w*nt+t] = NAN;
@@ -512,15 +512,15 @@ void starpu_bound_print_lp(FILE *output)
 			};
 			for (w = 0; w < nw; w++)
 			{
-				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
-				if (_STARPU_IS_ZERO(t1->duration[arch->type][arch->devid][arch->ncore]))
+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
+				if (_STARPU_IS_ZERO(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
 				{
 					double length = _starpu_history_based_job_expected_perf(t1->cl->model, arch, &j,j.nimpl);
 					if (isnan(length))
 						/* Avoid problems with binary coding of doubles */
-						t1->duration[arch->type][arch->devid][arch->ncore] = NAN;
+						t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores] = NAN;
 					else
-						t1->duration[arch->type][arch->devid][arch->ncore] = length / 1000.;
+						t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores] = length / 1000.;
 				}
 			}
 			nt++;
@@ -545,8 +545,8 @@ void starpu_bound_print_lp(FILE *output)
 		{
 			for (w = 0; w < nw; w++)
 			{
-				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
-				if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
+				if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
 					fprintf(output, " +t%luw%d", t1->id, w);
 			}
 			fprintf(output, " = 1;\n");
@@ -559,9 +559,9 @@ void starpu_bound_print_lp(FILE *output)
 			fprintf(output, "/* %s %x */\tc%lu = s%lu", _starpu_codelet_get_model_name(t1->cl), (unsigned) t1->footprint, t1->id, t1->id);
 			for (w = 0; w < nw; w++)
 			{
-				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
-				if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
-					fprintf(output, " + %f t%luw%d", t1->duration[arch->type][arch->devid][arch->ncore], t1->id, w);
+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
+				if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
+					fprintf(output, " + %f t%luw%d", t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores], t1->id, w);
 			}
 			fprintf(output, ";\n");
 		}
@@ -642,8 +642,8 @@ void starpu_bound_print_lp(FILE *output)
 				{
 					for (w = 0; w < nw; w++)
 					{
-						struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
-						if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
+						struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
+						if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
 						{
 							fprintf(output, "s%lu - c%lu >= -3e5 + 1e5 t%luw%d + 1e5 t%luw%d + 1e5 t%luafter%lu;\n",
 									t1->id, t2->id, t1->id, w, t2->id, w, t1->id, t2->id);

+ 4 - 4
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -417,7 +417,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 		worker = workers->get_next_master(workers, &it);
 		struct _starpu_fifo_taskq *fifo  = dt->queue_array[worker];
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
-		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
 
 		/* Sometimes workers didn't take the tasks as early as we expected */
 		double exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
@@ -555,7 +555,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 		worker = workers->get_next_master(workers, &it);
 
 		struct _starpu_fifo_taskq *fifo = dt->queue_array[worker];
-		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 
 		/* Sometimes workers didn't take the tasks as early as we expected */
@@ -770,7 +770,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	}
 	else if (task->bundle)
 	{
-		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(best_in_ctx);
+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(best_in_ctx, sched_ctx_id);
 		unsigned memory_node = starpu_worker_get_memory_node(best);
 		model_best = starpu_task_expected_length(task, perf_arch, selected_impl);
 		transfer_model_best = starpu_task_expected_data_transfer_time(memory_node, task);
@@ -943,7 +943,7 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 	/* Compute the expected penality */
-	struct starpu_perfmodel_arch *perf_arch = starpu_worker_get_perf_archtype(perf_workerid);
+	struct starpu_perfmodel_arch *perf_arch = starpu_worker_get_perf_archtype(perf_workerid, sched_ctx_id);
 	unsigned memory_node = starpu_worker_get_memory_node(workerid);
 
 	double predicted = starpu_task_expected_length(task, perf_arch,

+ 4 - 4
src/sched_policies/parallel_heft.c

@@ -232,9 +232,9 @@ static double compute_expected_end(int workerid, double length)
 	}
 }
 
-static double compute_ntasks_end(int workerid)
+static double compute_ntasks_end(int workerid, unsigned sched_ctx_id)
 {
-	struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(workerid);
+	struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
 	starpu_pthread_mutex_t *sched_mutex;
 	starpu_pthread_cond_t *sched_cond;
 
@@ -351,14 +351,14 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 			}
 
 
-			struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
+			struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
 
 			local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch,nimpl);
 
 			unsigned memory_node = starpu_worker_get_memory_node(worker);
 			local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time(memory_node, task);
 
-			double ntasks_end = compute_ntasks_end(worker);
+			double ntasks_end = compute_ntasks_end(worker, sched_ctx_id);
 
 			if (ntasks_best == -1
 			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */

+ 1 - 1
src/sched_policies/random_policy.c

@@ -50,7 +50,7 @@ static int _random_push_task(struct starpu_task *task, unsigned prio)
 		{
 			if(starpu_worker_can_execute_task(worker, task, impl))
 			{
-				struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
+				struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
 				double speedup = starpu_worker_get_relative_speedup(perf_arch);
 				alpha_sum += speedup;
 				speedup_arr[size] = speedup;

+ 6 - 4
tests/perfmodels/feed.c

@@ -73,15 +73,17 @@ int main(int argc, char **argv)
 		measured_slow = 0.001+size*0.0000001;
 
 		struct starpu_perfmodel_arch arch;
-		arch.type = STARPU_CUDA_WORKER;
-		arch.ncore = 0;
+		arch.ndevices = 1;
+		arch.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+		arch.devices[0].type = STARPU_CUDA_WORKER;
+		arch.devices[0].ncores = 0;
 		/* Simulate Fast GPU */
-		arch.devid = 0;
+		arch.devices[0].devid = 0;
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_fast);
 		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_fast);
 
 		/* Simulate Slow GPU */
-		arch.devid = 1;
+		arch.devices[0].devid = 1;
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_slow);
 		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_slow);
 		starpu_task_clean(&task);

+ 1 - 1
tests/perfmodels/regression_based.c

@@ -128,7 +128,7 @@ static void show_task_perfs(int size, struct starpu_task *task)
 		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
 			FPRINTF(stdout, "Expected time for %d on %s (impl %d):\t%f\n",
-				size, name, nimpl, starpu_task_expected_length(task, starpu_worker_get_perf_archtype(workerid), nimpl));
+				size, name, nimpl, starpu_task_expected_length(task, starpu_worker_get_perf_archtype(workerid, task->sched_ctx), nimpl));
 		}
 	}
 }

+ 6 - 10
tests/perfmodels/valid_model.c

@@ -77,12 +77,11 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 	lmodel.is_init=0;
 	lmodel.type = model->type;
 	ret = starpu_perfmodel_load_symbol(codelet->model->symbol, &lmodel);
+	int narch_combs = starpu_get_narch_combs();
+	int comb;
 	if (ret != 1)
-		for (archtype = 0; archtype < STARPU_NARCH; archtype++)
-			if(lmodel.per_arch[archtype] != NULL)
-				for(devid=0; lmodel.per_arch[archtype][devid] != NULL; devid++)
-					for(ncore=0; lmodel.per_arch[archtype][devid][ncore] != NULL; ncore++)
-						old_nsamples += lmodel.per_arch[archtype][devid][ncore][0].regression.nsample;
+		for(comb = 0; comb < narch_combs; comb++)
+			old_nsamples += lmodel.per_arch[comb][0].regression.nsample;
 
         starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, 100, sizeof(int));
 	for (loop = 0; loop < nloops; loop++)
@@ -107,11 +106,8 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 	}
 
 	new_nsamples = 0;
-	for (archtype = 0; archtype < STARPU_NARCH; archtype++)
-		if(lmodel.per_arch[archtype] != NULL)
-			for(devid=0; lmodel.per_arch[archtype][devid] != NULL; devid++)
-				for(ncore=0; lmodel.per_arch[archtype][devid][ncore] != NULL; ncore++)
-					new_nsamples += lmodel.per_arch[archtype][devid][ncore][0].regression.nsample;
+	for(comb = 0; comb < narch_combs; comb++)
+		new_nsamples += lmodel.per_arch[comb][0].regression.nsample;
 
 	ret = starpu_perfmodel_unload_model(&lmodel);
 	starpu_shutdown();

+ 86 - 38
tests/sched_policies/simple_cpu_gpu_sched.c

@@ -99,44 +99,92 @@ init_perfmodels(void)
 {
 	unsigned devid, ncore;
 
-	starpu_perfmodel_init(&model_cpu_task);
-	starpu_perfmodel_init(&model_gpu_task);
-
-	if(model_cpu_task.per_arch[STARPU_CPU_WORKER] != NULL)
-	{
-		for(devid=0; model_cpu_task.per_arch[STARPU_CPU_WORKER][devid] != NULL; devid++)
-		{
-			for(ncore=0; model_cpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore] != NULL; ncore++)
-			{
-				model_cpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore][0].cost_function = cpu_task_cpu;
-				model_gpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore][0].cost_function = gpu_task_cpu;
-			}
-		}
-	}
-
-	if(model_cpu_task.per_arch[STARPU_CUDA_WORKER] != NULL)
-	{
-		for(devid=0; model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid] != NULL; devid++)
-		{
-			for(ncore=0; model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore] != NULL; ncore++)
-			{
-				model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore][0].cost_function = cpu_task_gpu;
-				model_gpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore][0].cost_function = gpu_task_gpu;
-			}
-		}
-	}
-
-	if(model_cpu_task.per_arch[STARPU_OPENCL_WORKER] != NULL)
-	{
-		for(devid=0; model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid] != NULL; devid++)
-		{
-			for(ncore=0; model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore] != NULL; ncore++)
-			{
-				model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore][0].cost_function = cpu_task_gpu;
-				model_gpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore][0].cost_function = gpu_task_gpu;
-			}
-		}
-	}
+	starpu_perfmodel_init(NULL, &model_cpu_task);
+	starpu_perfmodel_init(NULL, &model_gpu_task);
+
+	struct starpu_perfmodel_arch arch_cpu;
+	arch_cpu.ndevices = 1;
+	arch_cpu.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+	arch_cpu.devices[0].type = STARPU_CPU_WORKER;	
+	arch_cpu.devices[0].devid = 0;
+	arch_cpu.devices[0].ncores = 1;
+
+	int comb_cpu = starpu_get_arch_comb(arch_cpu.ndevices, arch_cpu.devices);
+	if(comb_cpu == -1)
+		comb_cpu = starpu_add_arch_comb(arch_cpu.ndevices, arch_cpu.devices);
+
+
+	model_cpu_task.per_arch[comb_cpu] = (struct starpu_perfmodel_per_arch*)malloc(sizeof(struct starpu_perfmodel_per_arch));
+	memset(&model_cpu_task.per_arch[comb_cpu][0], 0, sizeof(struct starpu_perfmodel_per_arch));
+	model_cpu_task.nimpls[comb_cpu] = 1;
+	model_cpu_task.per_arch[comb_cpu][0].cost_function = cpu_task_cpu;
+
+	model_gpu_task.per_arch[comb_cpu] = (struct starpu_perfmodel_per_arch*)malloc(sizeof(struct starpu_perfmodel_per_arch));
+	memset(&model_gpu_task.per_arch[comb_cpu][0], 0, sizeof(struct starpu_perfmodel_per_arch));
+	model_gpu_task.nimpls[comb_cpu] = 1;
+	model_gpu_task.per_arch[comb_cpu][0].cost_function = gpu_task_cpu;
+
+
+
+	struct starpu_perfmodel_arch arch_cuda;
+	arch_cuda.ndevices = 1;
+	arch_cuda.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+	arch_cuda.devices[0].type = STARPU_CUDA_WORKER;	
+	arch_cuda.devices[0].devid = 0;
+	arch_cuda.devices[0].ncores = 1;
+	
+
+
+	int comb_cuda = starpu_get_arch_comb(arch_cuda.ndevices, arch_cuda.devices);
+	if(comb_cuda == -1)
+		comb_cuda = starpu_add_arch_comb(arch_cuda.ndevices, arch_cuda.devices);
+
+	model_cpu_task.per_arch[comb_cuda] = (struct starpu_perfmodel_per_arch*)malloc(sizeof(struct starpu_perfmodel_per_arch));
+	memset(&model_cpu_task.per_arch[comb_cuda][0], 0, sizeof(struct starpu_perfmodel_per_arch));
+	model_cpu_task.nimpls[comb_cuda] = 1;
+	model_cpu_task.per_arch[comb_cuda][0].cost_function = cpu_task_gpu;
+
+	model_gpu_task.per_arch[comb_cuda] = (struct starpu_perfmodel_per_arch*)malloc(sizeof(struct starpu_perfmodel_per_arch));
+	memset(&model_gpu_task.per_arch[comb_cuda][0], 0, sizeof(struct starpu_perfmodel_per_arch));
+	model_gpu_task.nimpls[comb_cuda] = 1;
+	model_gpu_task.per_arch[comb_cuda][0].cost_function = gpu_task_gpu;
+
+
+/* 	if(model_cpu_task.per_arch[STARPU_CPU_WORKER] != NULL) */
+/* 	{ */
+/* 		for(devid=0; model_cpu_task.per_arch[STARPU_CPU_WORKER][devid] != NULL; devid++) */
+/* 		{ */
+/* 			for(ncore=0; model_cpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore] != NULL; ncore++) */
+/* 			{ */
+/* 				model_cpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore][0].cost_function = cpu_task_cpu; */
+/* 				model_gpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore][0].cost_function = gpu_task_cpu; */
+/* 			} */
+/* 		} */
+/* 	} */
+
+/* 	if(model_cpu_task.per_arch[STARPU_CUDA_WORKER] != NULL) */
+/* 	{ */
+/* 		for(devid=0; model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid] != NULL; devid++) */
+/* 		{ */
+/* 			for(ncore=0; model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore] != NULL; ncore++) */
+/* 			{ */
+/* 				model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore][0].cost_function = cpu_task_gpu; */
+/* 				model_gpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore][0].cost_function = gpu_task_gpu; */
+/* 			} */
+/* 		} */
+/* 	} */
+
+/* 	if(model_cpu_task.per_arch[STARPU_OPENCL_WORKER] != NULL) */
+/* 	{ */
+/* 		for(devid=0; model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid] != NULL; devid++) */
+/* 		{ */
+/* 			for(ncore=0; model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore] != NULL; ncore++) */
+/* 			{ */
+/* 				model_cpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore][0].cost_function = cpu_task_gpu; */
+/* 				model_gpu_task.per_arch[STARPU_OPENCL_WORKER][devid][ncore][0].cost_function = gpu_task_gpu; */
+/* 			} */
+/* 		} */
+/* 	} */
 }
 
 /*