@@ -111,7 +111,7 @@ init_perfmodels(void)
 	arch_cpu.devices[0].ncores = 1;
 
 	int comb_cpu = starpu_get_arch_comb(arch_cpu.ndevices, arch_cpu.devices);
-	if(comb_cpu == -1)
+	if (comb_cpu == -1)
 		comb_cpu = starpu_add_arch_comb(arch_cpu.ndevices, arch_cpu.devices);
 
 	model_cpu_task.per_arch[comb_cpu] = (struct starpu_perfmodel_per_arch*)malloc(sizeof(struct starpu_perfmodel_per_arch));
@@ -124,27 +124,39 @@ init_perfmodels(void)
 	model_gpu_task.nimpls[comb_cpu] = 1;
 	model_gpu_task.per_arch[comb_cpu][0].cost_function = gpu_task_cpu;
 
-	struct starpu_perfmodel_arch arch_cuda;
-	arch_cuda.ndevices = 1;
-	arch_cuda.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
-	arch_cuda.devices[0].type = STARPU_CUDA_WORKER;
-	arch_cuda.devices[0].devid = 0;
-	arch_cuda.devices[0].ncores = 1;
-
-	int comb_cuda = starpu_get_arch_comb(arch_cuda.ndevices, arch_cuda.devices);
-	if(comb_cuda == -1)
-		comb_cuda = starpu_add_arch_comb(arch_cuda.ndevices, arch_cuda.devices);
-
-	model_cpu_task.per_arch[comb_cuda] = (struct starpu_perfmodel_per_arch*)malloc(sizeof(struct starpu_perfmodel_per_arch));
-	memset(&model_cpu_task.per_arch[comb_cuda][0], 0, sizeof(struct starpu_perfmodel_per_arch));
-	model_cpu_task.nimpls[comb_cuda] = 1;
-	model_cpu_task.per_arch[comb_cuda][0].cost_function = cpu_task_gpu;
-
-	model_gpu_task.per_arch[comb_cuda] = (struct starpu_perfmodel_per_arch*)malloc(sizeof(struct starpu_perfmodel_per_arch));
-	memset(&model_gpu_task.per_arch[comb_cuda][0], 0, sizeof(struct starpu_perfmodel_per_arch));
-	model_gpu_task.nimpls[comb_cuda] = 1;
-	model_gpu_task.per_arch[comb_cuda][0].cost_function = gpu_task_gpu;
-
+	{
+		// We need to set the cost function for each combination with a CUDA worker
+		int nb_worker_cuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER);
+		int *worker_cuda_ids = malloc(nb_worker_cuda * sizeof(int));
+		int worker_cuda;
+
+		starpu_worker_get_ids_by_type(STARPU_CUDA_WORKER, worker_cuda_ids, nb_worker_cuda);
+		for(worker_cuda = 0 ; worker_cuda < nb_worker_cuda ; worker_cuda ++)
+		{
+			struct starpu_perfmodel_arch arch_cuda;
+			arch_cuda.ndevices = 1;
+			arch_cuda.devices = (struct starpu_perfmodel_device*)malloc(sizeof(struct starpu_perfmodel_device));
+			arch_cuda.devices[0].type = STARPU_CUDA_WORKER;
+			arch_cuda.devices[0].devid = starpu_worker_get_devid(worker_cuda_ids[worker_cuda]);
+			arch_cuda.devices[0].ncores = 1;
+
+			int comb_cuda = starpu_get_arch_comb(arch_cuda.ndevices, arch_cuda.devices);
+			if(comb_cuda == -1)
+			{
+				comb_cuda = starpu_add_arch_comb(arch_cuda.ndevices, arch_cuda.devices);
+
+				model_cpu_task.per_arch[comb_cuda] = (struct starpu_perfmodel_per_arch*)malloc(sizeof(struct starpu_perfmodel_per_arch));
+				memset(&model_cpu_task.per_arch[comb_cuda][0], 0, sizeof(struct starpu_perfmodel_per_arch));
+				model_cpu_task.nimpls[comb_cuda] = 1;
+				model_cpu_task.per_arch[comb_cuda][0].cost_function = cpu_task_gpu;
+
+				model_gpu_task.per_arch[comb_cuda] = (struct starpu_perfmodel_per_arch*)malloc(sizeof(struct starpu_perfmodel_per_arch));
+				memset(&model_gpu_task.per_arch[comb_cuda][0], 0, sizeof(struct starpu_perfmodel_per_arch));
+				model_gpu_task.nimpls[comb_cuda] = 1;
+				model_gpu_task.per_arch[comb_cuda][0].cost_function = gpu_task_gpu;
+			}
+		}
+	}
 
 /* 	if(model_cpu_task.per_arch[STARPU_CPU_WORKER] != NULL) */
 /* 	{ */