Browse Source

Multi-implementation.

Users can now specify multiple kernels for a single arch type.
Cyril Roelandt 14 years ago
parent
commit
ffcfb86084

+ 2 - 0
AUTHORS

@@ -10,3 +10,5 @@ William Braik <wbraik@gmail.com>
 Yann Courtois <yann.courtois33@gmail.com>
 Jean-Marie Couteyen <jm.couteyen@gmail.com>
 Anthony Roy <theanthony33@gmail.com>
+David Gómez <david_gomez1380@yahoo.com.mx>
+NGUYEN quôc dinh <nguyen.quocdinh@gmail.com>

+ 10 - 0
configure.ac

@@ -2,6 +2,7 @@
 #
 # Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2011  Télécom-SudParis
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -889,6 +890,15 @@ AC_MSG_CHECKING(Maximum number of workers)
 AC_MSG_RESULT($nmaxworkers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
 
+# Maximum number of implementations that can be given for one arch type.
+# The value is exported as STARPU_MAXIMPLEMENTATIONS; it defaults to 1 and
+# can be changed with --enable-maximplementations=<number>.
+# NOTE(review): examples/basic_examples/mult_impl.c lists two CPU kernels in
+# its codelet, which presumably needs a value of at least 2 -- confirm.
+AC_MSG_CHECKING(maximum number of implementations)
+AC_ARG_ENABLE(maximplementations, [AS_HELP_STRING([--enable-maximplementations=<number>],
+		[maximum number of implementations])],
+		maximplementations=$enableval, maximplementations=1)
+AC_MSG_RESULT($maximplementations)
+AC_DEFINE_UNQUOTED(STARPU_MAXIMPLEMENTATIONS, [$maximplementations],
+		[maximum number of implementations])
+
 ###############################################################################
 #                                                                             #
 #                                    MPI                                      #

+ 2 - 0
examples/Makefile.am

@@ -2,6 +2,7 @@
 #
 # Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2011  Télécom-SudParis
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -148,6 +149,7 @@ examplebin_PROGRAMS +=				\
 	basic_examples/mult			\
 	basic_examples/block			\
 	basic_examples/variable			\
+	basic_examples/mult_impl                \
 	filters/fvector				\
 	filters/fblock				\
 	filters/fmatrix				\

+ 384 - 0
examples/basic_examples/mult_impl.c

@@ -0,0 +1,384 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Télécom-SudParis
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <pthread.h>
+#include <signal.h>
+
+#include <starpu.h>
+
+static float *A, *B, *C;
+static starpu_data_handle A_handle, B_handle, C_handle;
+
+static unsigned nslicesx = 4;
+static unsigned nslicesy = 4;
+static unsigned xdim = 1024;
+static unsigned ydim = 1024;
+static unsigned zdim = 512;
+
+
+/* Cost function of the block product C = A * B.
+ * descr[0] describes A and descr[2] describes C. The estimate is the
+ * product of nx(C), ny(C) and nx(A), divided by 1000 and by 4.11; it is
+ * printed on stdout and then returned. */
+double mult_gemm_cost(starpu_buffer_descr *descr)
+{
+	const uint32_t c_nx = starpu_matrix_get_nx(descr[2].handle);
+	const uint32_t c_ny = starpu_matrix_get_ny(descr[2].handle);
+	const uint32_t a_nx = starpu_matrix_get_nx(descr[0].handle);
+
+	double estimate = ((double)c_nx)*((double)c_ny)*((double)a_nx/1000.0f/4.11f);
+
+	printf("cost %e \n", estimate);
+
+	return estimate;
+}
+
+/* First CPU kernel of the block product C = A * B.
+ * descr[0], descr[1] and descr[2] are the matrix interfaces of the A, B
+ * and C blocks; arg is unused. The outermost loop walks the ny dimension
+ * of C and the middle loop its nx dimension. */
+static void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
+{
+	printf("On application: Hello, this is kernel cpu_mult\n\n");
+
+	/* pointers to the first element of each local copy */
+	float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+	float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	/* nx counts consecutive elements, ny counts the runs that are ld
+	 * elements apart (ld stands for leading dimension).
+	 * NB: when filters were used, the leading dimension of the local copy
+	 * is not guaranteed to match the one of the original matrix in main
+	 * memory! */
+	uint32_t c_nx = STARPU_MATRIX_GET_NX(descr[2]);
+	uint32_t c_ny = STARPU_MATRIX_GET_NY(descr[2]);
+	uint32_t a_ny = STARPU_MATRIX_GET_NY(descr[0]);
+
+	uint32_t lda = STARPU_MATRIX_GET_LD(descr[0]);
+	uint32_t ldb = STARPU_MATRIX_GET_LD(descr[1]);
+	uint32_t ldc = STARPU_MATRIX_GET_LD(descr[2]);
+
+	/* FORTRAN ordering is assumed */
+	unsigned x, y, k;
+	for (y = 0; y < c_ny; y++)
+	{
+		for (x = 0; x < c_nx; x++)
+		{
+			float acc = 0.0;
+
+			for (k = 0; k < a_ny; k++)
+				acc += a[x+k*lda]*b[k+y*ldb];
+
+			c[x + y*ldc] = acc;
+		}
+	}
+}
+
+/* Second CPU kernel of the block product C = A * B.
+ * Same computation as cpu_mult, but the two outer loops are swapped: the
+ * outermost loop walks the nx dimension of C and the middle loop its ny
+ * dimension. descr[0], descr[1] and descr[2] are the matrix interfaces of
+ * the A, B and C blocks; arg is unused. */
+static void cpu_mult_2(void *descr[], __attribute__((unused))  void *arg)
+{
+	printf("On application: this is kernel cpu_mult_2\n\n");
+
+	/* pointers to the first element of each local copy */
+	float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+	float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	uint32_t c_nx = STARPU_MATRIX_GET_NX(descr[2]);
+	uint32_t c_ny = STARPU_MATRIX_GET_NY(descr[2]);
+	uint32_t a_ny = STARPU_MATRIX_GET_NY(descr[0]);
+
+	uint32_t lda = STARPU_MATRIX_GET_LD(descr[0]);
+	uint32_t ldb = STARPU_MATRIX_GET_LD(descr[1]);
+	uint32_t ldc = STARPU_MATRIX_GET_LD(descr[2]);
+
+	/* FORTRAN ordering is assumed */
+	unsigned x, y, k;
+	for (x = 0; x < c_nx; x++)
+	{
+		for (y = 0; y < c_ny; y++)
+		{
+			float acc = 0.0;
+
+			for (k = 0; k < a_ny; k++)
+				acc += a[x+k*lda]*b[k+y*ldb];
+
+			c[x + y*ldc] = acc;
+		}
+	}
+}
+
+
+
+/* Allocate the matrices A (ydim x zdim), B (zdim x xdim) and C (ydim x xdim),
+ * fill A and B with values drawn from starpu_drand48() and set C to zero.
+ * If one of the allocations fails, an error is printed on stderr and the
+ * process exits with EXIT_FAILURE instead of writing through a NULL pointer. */
+static void init_problem_data(void)
+{
+	unsigned i,j;
+
+	/* the element counts are widened to size_t before being multiplied, so
+	 * that the byte size is not computed in unsigned arithmetic */
+	A = malloc((size_t)zdim*ydim*sizeof(float));
+	B = malloc((size_t)xdim*zdim*sizeof(float));
+	C = malloc((size_t)xdim*ydim*sizeof(float));
+
+	if (!A || !B || !C)
+	{
+		fprintf(stderr, "mult_impl: unable to allocate the matrices\n");
+		free(A);
+		free(B);
+		free(C);
+		exit(EXIT_FAILURE);
+	}
+
+	/* fill the A and B matrices */
+	/* NOTE(review): srand() seeds rand(); whether starpu_drand48() draws
+	 * from that generator is not visible here -- confirm */
+	srand(2009);
+	for (j=0; j < ydim; j++) {
+		for (i=0; i < zdim; i++) {
+			A[j+i*ydim] = (float)(starpu_drand48());
+		}
+	}
+
+	for (j=0; j < zdim; j++) {
+		for (i=0; i < xdim; i++) {
+			B[j+i*zdim] = (float)(starpu_drand48());
+		}
+	}
+
+	/* C starts as the zero matrix */
+	for (j=0; j < ydim; j++) {
+		for (i=0; i < xdim; i++) {
+			C[j+i*ydim] = (float)(0);
+		}
+	}
+}
+
+/* Register A, B and C to StarPU, then cut them into blocks: A is split in
+ * nslicesy pieces, B in nslicesx pieces, and C receives both filters, one
+ * after the other. */
+static void partition_mult_data(void)
+{
+	/* the matrices are stored with a FORTRAN ordering */
+	starpu_matrix_data_register(&A_handle, 0, (uintptr_t)A, ydim, ydim, zdim, sizeof(float));
+	starpu_matrix_data_register(&B_handle, 0, (uintptr_t)B, zdim, zdim, xdim, sizeof(float));
+	starpu_matrix_data_register(&C_handle, 0, (uintptr_t)C, ydim, ydim, xdim, sizeof(float));
+
+	/* A filter partitions a piece of data into disjoint chunks. It is
+	 * described by a "struct starpu_data_filter", which holds the function
+	 * applied on the data handle to split it, together with the argument
+	 * given to that function (here, the number of blocks to create).
+	 */
+	struct starpu_data_filter vertical_filter = {
+		.nchildren = nslicesx,
+		.filter_func = starpu_vertical_block_filter_func,
+		.get_child_ops = NULL,
+		.get_nchildren = NULL
+	};
+
+	struct starpu_data_filter horizontal_filter = {
+		.nchildren = nslicesy,
+		.filter_func = starpu_block_filter_func,
+		.get_child_ops = NULL,
+		.get_nchildren = NULL
+	};
+
+/*
+ *	Illustration with nslicex = 4 and nslicey = 2. Sub-data are reached with
+ *	"starpu_data_get_sub_data", which takes a data handle, the number of
+ *	filters to go through, and one index per filter, for instance:
+ *
+ *		A' handle is starpu_data_get_sub_data(A_handle, 1, 1);
+ *		B' handle is starpu_data_get_sub_data(B_handle, 1, 2);
+ *		C' handle is starpu_data_get_sub_data(C_handle, 2, 2, 1);
+ *
+ *	Two filters were applied recursively onto C, hence the two indexes.
+ *
+ *	"starpu_data_get_sub_data(C_handle, 1, 3)" would for example return a
+ *	handle to the 4th column of the blocked matrix C.
+ *
+ *		              |---|---|---|---|
+ *		              |   |   | B'|   | B
+ *		              |---|---|---|---|
+ *		                0   1   2   3
+ *		     |----|   |---|---|---|---|
+ *		     |    |   |   |   |   |   |
+ *		     |    | 0 |   |   |   |   |
+ *		     |----|   |---|---|---|---|
+ *		     | A' |   |   |   | C'|   |
+ *		     |    |   |   |   |   |   |
+ *		     |----|   |---|---|---|---|
+ *		       A              C
+ *
+ *	IMPORTANT: applying filters partitions a piece of data hierarchically,
+ *	and memory consistency is enforced for each element independently.
+ *	Tasks should therefore NOT access inner nodes (eg. one column of C or
+ *	the whole C) but only the leaves of the tree (ie. blocks here). Inner
+ *	nodes can only be manipulated once the filters have been removed with
+ *	starpu_data_unpartition, so that memory consistency is enforced.
+ */
+
+	starpu_data_partition(B_handle, &vertical_filter);
+	starpu_data_partition(A_handle, &horizontal_filter);
+
+	/* starpu_data_map_filters is a variable-arity function: the data handle
+	 * comes first, then the number of filters to apply recursively, then
+	 * the filters, which are applied in the order they are listed.
+	 * This amounts to partitioning C_handle with the vertical filter and
+	 * then applying the horizontal one on each sub-data (ie. each column
+	 * of C).
+	 */
+	starpu_data_map_filters(C_handle, 2, &vertical_filter, &horizontal_filter);
+}
+
+/* Performance model attached to the codelet: history-based, named
+ * "mult_perf_model", with mult_gemm_cost registered as cost function.
+ * STARPU_COMMON and STARPU_PER_ARCH were the alternative types left
+ * commented out next to the chosen one. */
+static struct starpu_perfmodel_t starpu_dgemm_model_common = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "mult_perf_model",
+	.cost_model = mult_gemm_cost
+};
+
+/*
+static struct starpu_perfmodel_t mult_perf_model = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "mult_perf_model"
+};
+*/
+
+/* Configuration given to starpu_init(): scheduling policy "heft",
+ * calibrate set to 1 and ncpus set to 4. The fields that are not listed
+ * are zero-initialized. */
+struct starpu_conf conf = {
+		.ncpus = 4,
+		.calibrate = 1,
+		.sched_policy_name = "heft"
+};
+
+
+/* Codelet of the block product, with two CPU kernels to choose from.
+ * NOTE(review): listing two entries in cpu_funcs presumably requires
+ * STARPU_MAXIMPLEMENTATIONS to be at least 2 -- confirm the build
+ * configuration. */
+static starpu_codelet cl = {
+        /* for now this kernel can only be executed on a CPU */
+        .where = STARPU_CPU,
+        /* 3 buffers are manipulated, all managed by the DSM */
+        .nbuffers = 3,
+        /* cpu_func carries the "multiple implementations" marker, the
+         * kernels themselves are listed in cpu_funcs */
+        .cpu_func = STARPU_MULTIPLE_CPU_IMPLEMENTATIONS,
+        .cpu_funcs = {cpu_mult,cpu_mult_2},
+        /* performance model, for the scheduling policies that use one */
+        .model = &starpu_dgemm_model_common
+};
+
+/* Submit one task per block of C: nslicesx * nslicesy tasks in total.
+ * The value returned by each submission is printed on stdout. */
+static void launch_tasks(void)
+{
+	unsigned bx, by;
+
+	for (bx = 0; bx < nslicesx; bx++)
+	{
+		for (by = 0; by < nslicesy; by++)
+		{
+			/* C[bx, by] = A[by] B[bx]
+			 *
+			 *              |---|---|---|---|
+			 *              |   | * |   |   | B
+			 *              |---|---|---|---|
+			 *                    X
+			 *     |----|   |---|---|---|---|
+			 *     |****| Y |   |***|   |   |
+			 *     |****|   |   |***|   |   |
+			 *     |----|   |---|---|---|---|
+			 *     |    |   |   |   |   |   |
+			 *     |    |   |   |   |   |   |
+			 *     |----|   |---|---|---|---|
+			 *       A              C
+			 */
+
+			/* starpu_task_create() returns an asynchronous task
+			 * by default (ie. task->synchronous = 0) */
+			struct starpu_task *task = starpu_task_create();
+
+			/* A and B went through a single filter each, so one
+			 * index is enough to pick a chunk: "by" for A, "bx"
+			 * for B. The "1" is the number of indexes given to
+			 * the variable-arity starpu_data_get_sub_data */
+			starpu_data_handle a_block = starpu_data_get_sub_data(A_handle, 1, by);
+			starpu_data_handle b_block = starpu_data_get_sub_data(B_handle, 1, bx);
+
+			/* C went through 2 filters, so 2 indexes are given,
+			 * in the order in which the filters were applied.
+			 * NB: starpu_data_get_sub_data(C_handle, 1, k) would
+			 * return a handle to column number k of matrix C.
+			 * NB2: starpu_data_get_sub_data(C_handle, 2, bx, by)
+			 * is equivalent to
+			 * starpu_data_get_sub_data(starpu_data_get_sub_data(C_handle, 1, bx), 1, by) */
+			starpu_data_handle c_block = starpu_data_get_sub_data(C_handle, 2, bx, by);
+
+			/* the task runs codelet "cl" on these three blocks */
+			task->cl = &cl;
+			task->buffers[0].handle = a_block;
+			task->buffers[0].mode = STARPU_R;
+			task->buffers[1].handle = b_block;
+			task->buffers[1].mode = STARPU_R;
+			task->buffers[2].handle = c_block;
+			task->buffers[2].mode = STARPU_W;
+
+			/* not a blocking call since task->synchronous = 0 */
+			int submit_ret = starpu_task_submit(task);
+			printf("task is submmited or not %d\n",submit_ret);
+		}
+	}
+}
+
+/* Run the blocked matrix product with StarPU: start the runtime, build and
+ * partition the matrices, submit one task per block of C, wait for them,
+ * then release every handle and buffer.
+ * Returns 0 on success and 1 when the runtime could not be started. */
+int main(void)
+{
+	/* start the runtime; nothing else can be done if this fails */
+	int ret = starpu_init(&conf);
+	if (ret != 0)
+	{
+		fprintf(stderr, "mult_impl: starpu_init failed (%d)\n", ret);
+		return 1;
+	}
+
+	/* initialize matrices A, B and C and register them to StarPU */
+	init_problem_data();
+
+	/* partition matrices into blocks that can be manipulated by the
+	 * codelets */
+	partition_mult_data();
+
+	/* submit all tasks in an asynchronous fashion */
+	launch_tasks();
+
+	/* wait for termination */
+	starpu_task_wait_for_all();
+
+	/* remove the filters applied on the three matrices; after this the
+	 * sub-data can no longer be reached with starpu_data_get_sub_data
+	 * until the data is partitioned again.
+	 * The second argument is the memory node where the different subsets
+	 * should be reassembled, 0 = main memory (RAM) */
+	starpu_data_unpartition(A_handle, 0);
+	starpu_data_unpartition(B_handle, 0);
+	starpu_data_unpartition(C_handle, 0);
+
+	/* stop monitoring the matrices: after this, it is not possible to
+	 * pass them (or any subset of them) as a codelet input/output. This
+	 * also implements a barrier so that the data is put back into main
+	 * memory in case it was only available on a GPU for instance. */
+	starpu_data_unregister(A_handle);
+	starpu_data_unregister(B_handle);
+	starpu_data_unregister(C_handle);
+
+	starpu_shutdown();
+
+	/* the buffers allocated by init_problem_data are no longer used */
+	free(A);
+	free(B);
+	free(C);
+
+	return 0;
+}

+ 10 - 9
examples/cholesky/cholesky_models.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -125,27 +126,27 @@ static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
 }
 
 struct starpu_perfmodel_t chol_model_11 = {
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_11_cost },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_11_cost }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_11_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_11_cost }
 	},
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "chol_model_11"
 };
 
 struct starpu_perfmodel_t chol_model_21 = {
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_21_cost },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_21_cost }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_21_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_21_cost }
 	},
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "chol_model_21"
 };
 
 struct starpu_perfmodel_t chol_model_22 = {
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_22_cost },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_22_cost }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_22_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_22_cost }
 	},
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "chol_model_22"

+ 13 - 12
examples/heat/lu_kernels_model.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -215,9 +216,9 @@ double task_22_cost_cpu(starpu_buffer_descr *descr)
 
 struct starpu_perfmodel_t model_11 = {
 	.cost_model = task_11_cost,
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = task_11_cost_cpu },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_11_cost_cuda }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_11_cost_cpu },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_11_cost_cuda }
 	},
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
@@ -231,9 +232,9 @@ struct starpu_perfmodel_t model_11 = {
 
 struct starpu_perfmodel_t model_12 = {
 	.cost_model = task_12_cost,
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = task_12_cost_cpu },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_12_cost_cuda }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_12_cost_cpu },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_12_cost_cuda }
 	},
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
@@ -247,9 +248,9 @@ struct starpu_perfmodel_t model_12 = {
 
 struct starpu_perfmodel_t model_21 = {
 	.cost_model = task_21_cost,
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = task_21_cost_cpu },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_21_cost_cuda }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_21_cost_cpu },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_21_cost_cuda }
 	},
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
@@ -263,9 +264,9 @@ struct starpu_perfmodel_t model_21 = {
 
 struct starpu_perfmodel_t model_22 = {
 	.cost_model = task_22_cost,
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = task_22_cost_cpu },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_22_cost_cuda }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_22_cost_cpu },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_22_cost_cuda }
 	},
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS

+ 1 - 0
include/starpu_config.h.in

@@ -46,6 +46,7 @@
 #undef STARPU_MAXCUDADEVS
 #undef STARPU_MAXOPENCLDEVS
 #undef STARPU_NMAXWORKERS
+#undef STARPU_MAXIMPLEMENTATIONS
 
 #undef STARPU_HAVE_LIBNUMA
 

+ 4 - 4
include/starpu_perfmodel.h

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -104,7 +105,7 @@ struct starpu_perfmodel_t {
 	double (*cost_model)(struct starpu_buffer_descr_t *);
 
 	/* per-architecture model */
-	struct starpu_per_arch_perfmodel_t per_arch[STARPU_NARCH_VARIATIONS];
+	struct starpu_per_arch_perfmodel_t per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS];
 
 	/* Name of the performance model, this is used as a file name when saving history-based performance models */
 	const char *symbol;
@@ -126,9 +127,8 @@ enum starpu_perf_archtype starpu_worker_get_perf_archtype(int workerid);
  * performance model files */
 int starpu_load_history_debug(const char *symbol, struct starpu_perfmodel_t *model);
 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel_t *model,
-		enum starpu_perf_archtype arch, char *path, size_t maxlen);
-void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch,
-		char *archname, size_t maxlen);
+		enum starpu_perf_archtype arch, char *path, size_t maxlen, unsigned nimpl);
+void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch,	char *archname, size_t maxlen, unsigned nimpl);
 int starpu_list_models(void);
 
 void starpu_force_bus_sampling(void);

+ 5 - 4
include/starpu_scheduler.h

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -113,7 +114,7 @@ void starpu_worker_set_sched_condition(int workerid, pthread_cond_t *sched_cond,
 #endif
 
 /* Check if the worker specified by workerid can execute the codelet. */
-int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task);
+int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
 
 /* The scheduling policy may put tasks directly into a worker's local queue so
  * that it is not always necessary to create its own queue when the local queue
@@ -151,7 +152,7 @@ void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *to
 /* Get the description of a combined worker */
 int starpu_combined_worker_get_description(int workerid, int *worker_size, int **combined_workerid);
 /* Variant of starpu_worker_may_execute_task compatible with combined workers */
-int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task);
+int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
 
 /*
  *	Data prefetching
@@ -169,7 +170,7 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, uint32_t node);
 /* Return the current date */
 double starpu_timing_now(void);
 /* Returns expected task duration in µs */
-double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch);
+double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 /* Returns an estimated speedup factor relative to CPU speed */
 double starpu_worker_get_relative_speedup(enum starpu_perf_archtype perf_archtype);
 /* Returns expected data transfer time in µs */
@@ -177,6 +178,6 @@ double starpu_task_expected_data_transfer_time(uint32_t memory_node, struct star
 /* Predict the transfer time (in µs) to move a handle to a memory node */
 double starpu_data_expected_transfer_time(starpu_data_handle handle, unsigned memory_node, starpu_access_mode mode);
 /* Returns expected power consumption in J */
-double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch);
+double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 
 #endif /* __STARPU_SCHEDULER_H__ */

+ 18 - 0
include/starpu_task.h

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -56,6 +57,18 @@ extern "C" {
 
 typedef uint64_t starpu_tag_t;
 
+
+/* Types of the kernels a codelet can provide for each kind of worker. */
+typedef void (*starpu_cpu_func_t)(void **, void*);    /* CPU core */
+typedef void (*starpu_cuda_func_t)(void **, void*);   /* NVIDIA CUDA device */
+typedef void (*starpu_opencl_func_t)(void **, void*); /* OpenCL device */
+typedef uint8_t starpu_gordon_func_t; /* Cell SPU */
+
+/* Marker values for the single-kernel fields of a codelet (cpu_func, ...),
+ * presumably telling that the kernels are listed in the matching *_funcs
+ * array instead -- confirm against the workers' code.
+ * The casts are wrapped in parentheses so that each macro expands to one
+ * primary expression, whatever operator is written next to it. */
+#define STARPU_MULTIPLE_CPU_IMPLEMENTATIONS    ((starpu_cpu_func_t) -1)
+#define STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS   ((starpu_cuda_func_t) -1)
+#define STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS ((starpu_opencl_func_t) -1)
+#define STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS 255
+
+
 /*
  * A codelet describes the various function 
  * that may be called from a worker
@@ -72,6 +85,11 @@ typedef struct starpu_codelet_t {
 	void (*opencl_func)(void **, void *);
 	uint8_t gordon_func;
 
+	starpu_cpu_func_t cpu_funcs[STARPU_MAXIMPLEMENTATIONS];
+	starpu_cuda_func_t cuda_funcs[STARPU_MAXIMPLEMENTATIONS];
+	starpu_opencl_func_t opencl_funcs[STARPU_MAXIMPLEMENTATIONS];
+	starpu_gordon_func_t gordon_funcs[STARPU_MAXIMPLEMENTATIONS];
+
 	/* how many buffers do the codelet takes as argument ? */
 	unsigned nbuffers;
 

+ 3 - 2
include/starpu_task_bundle.h

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -71,10 +72,10 @@ int starpu_task_bundle_remove(struct starpu_task_bundle *bundle, struct starpu_t
 void starpu_task_bundle_close(struct starpu_task_bundle *bundle);
 
 /* Return the expected duration of the entire task bundle in µs. */
-double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle, enum starpu_perf_archtype arch);
+double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle, enum starpu_perf_archtype arch, unsigned nimpl);
 /* Return the time (in µs) expected to transfer all data used within the bundle */
 double starpu_task_bundle_expected_data_transfer_time(struct starpu_task_bundle *bundle, unsigned memory_node);
 /* Return the expected power consumption of the entire task bundle in J. */
-double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch);
+double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch, unsigned nimpl);
 
 #endif // __STARPU_TASK_BUNDLE_H__

+ 2 - 0
src/core/jobs.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -62,6 +63,7 @@ starpu_job_t __attribute__((malloc)) _starpu_job_create(struct starpu_task *task
 
 	job = starpu_job_new();
 
+	job->nimpl =0; /* best implementation */
 	job->task = task;
 
 	job->footprint_is_computed = 0;

+ 5 - 0
src/core/jobs.h

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -55,6 +56,10 @@ typedef void (*callback)(void *);
 
 /* A job is the internal representation of a task. */
 LIST_TYPE(starpu_job,
+
+	/* The implementation associated to the job */
+	unsigned nimpl;
+
 	/* The task associated to that job */
 	struct starpu_task *task;
 

+ 15 - 11
src/core/perfmodel/perfmodel.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -67,12 +68,12 @@ enum starpu_perf_archtype starpu_worker_get_perf_archtype(int workerid)
  * PER ARCH model
  */
 
-static double per_arch_task_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_task *task)
+static double per_arch_task_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_task *task, unsigned nimpl)
 {
 	double exp = -1.0;
 	double (*per_arch_cost_model)(struct starpu_buffer_descr_t *);
 	
-	per_arch_cost_model = model->per_arch[arch].cost_model;
+	per_arch_cost_model = model->per_arch[arch][nimpl].cost_model;
 
 	if (per_arch_cost_model)
 		exp = per_arch_cost_model(task->buffers);
@@ -153,25 +154,27 @@ void _starpu_load_perfmodel(struct starpu_perfmodel_t *model)
 	model->is_loaded = 1;
 }
 
-static double starpu_model_expected_perf(struct starpu_task *task, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch)
+static double starpu_model_expected_perf(struct starpu_task *task, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch,  unsigned nimpl)
 {
 	if (model) {
 		starpu_job_t j = _starpu_get_job_associated_to_task(task);
 		switch (model->type) {
 			case STARPU_PER_ARCH:
-				return per_arch_task_expected_perf(model, arch, task);
 
+				return per_arch_task_expected_perf(model, arch, task, nimpl);
 			case STARPU_COMMON:
 				return common_task_expected_perf(model, arch, task);
 
 			case STARPU_HISTORY_BASED:
-				return _starpu_history_based_job_expected_perf(model, arch, j);
 
+				return _starpu_history_based_job_expected_perf(model, arch, j, nimpl);
 			case STARPU_REGRESSION_BASED:
-				return _starpu_regression_based_job_expected_perf(model, arch, j);
+
+				return _starpu_regression_based_job_expected_perf(model, arch, j, nimpl);
 
 			case STARPU_NL_REGRESSION_BASED:
-				return _starpu_non_linear_regression_based_job_expected_perf(model, arch, j);
+
+				return _starpu_non_linear_regression_based_job_expected_perf(model, arch, j,nimpl);
 
 			default:
 				STARPU_ABORT();
@@ -182,14 +185,15 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 	return 0.0;
 }
 
-double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch)
+double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
 {
-	return starpu_model_expected_perf(task, task->cl->model, arch);
+
+	return starpu_model_expected_perf(task, task->cl->model, arch, nimpl);
 }
 
-double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch)
+double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
 {
-	return starpu_model_expected_perf(task, task->cl->power_model, arch);
+	return starpu_model_expected_perf(task, task->cl->power_model, arch, nimpl);
 }
 
 /* Predict the transfer time (in µs) to move a handle to a memory node */

+ 5 - 4
src/core/perfmodel/perfmodel.h

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -91,7 +92,7 @@ void _starpu_get_perf_model_dir_codelets(char *path, size_t maxlen);
 void _starpu_get_perf_model_dir_bus(char *path, size_t maxlen);
 void _starpu_get_perf_model_dir_debug(char *path, size_t maxlen);
 
-double _starpu_history_based_job_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j);
+double _starpu_history_based_job_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j, unsigned nimpl);
 void _starpu_register_model(struct starpu_perfmodel_t *model);
 void _starpu_load_history_based_model(struct starpu_perfmodel_t *model, unsigned scan_history);
 void _starpu_load_perfmodel(struct starpu_perfmodel_t *model);
@@ -99,11 +100,11 @@ void _starpu_initialize_registered_performance_models(void);
 void _starpu_deinitialize_registered_performance_models(void);
 
 double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel_t *model,
-					enum starpu_perf_archtype arch, struct starpu_job_s *j);
+					enum starpu_perf_archtype arch, struct starpu_job_s *j, unsigned nimpl);
 double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel_t *model,
-					enum starpu_perf_archtype arch, struct starpu_job_s *j);
+					enum starpu_perf_archtype arch, struct starpu_job_s *j, unsigned nimpl);
 void _starpu_update_perfmodel_history(struct starpu_job_s *j, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch,
-				unsigned cpuid, double measured);
+				unsigned cpuid, double measured, unsigned nimpl);
 
 void _starpu_create_sampling_directory_if_needed(void);
 

+ 54 - 35
src/core/perfmodel/perfmodel_history.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -57,11 +58,11 @@ static void insert_history_entry(struct starpu_history_entry_t *entry, struct st
 }
 
 
-static void dump_reg_model(FILE *f, struct starpu_perfmodel_t *model, unsigned arch)
+static void dump_reg_model(FILE *f, struct starpu_perfmodel_t *model, unsigned arch, unsigned nimpl)
 {
 	struct starpu_per_arch_perfmodel_t *per_arch_model;
-	per_arch_model = &model->per_arch[arch];
 
+	per_arch_model = &model->per_arch[arch][nimpl];
 	struct starpu_regression_model_t *reg_model;
 	reg_model = &per_arch_model->regression;
 
@@ -206,15 +207,20 @@ static void parse_per_arch_model_file(FILE *f, struct starpu_per_arch_perfmodel_
 static void parse_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned scan_history)
 {
 	unsigned arch;
-	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
-		parse_per_arch_model_file(f, &model->per_arch[arch], scan_history);
+	unsigned nimpl;
+	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++) {
+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++) {
+			parse_per_arch_model_file(f, &model->per_arch[arch][nimpl], scan_history);
+		}
+	}
 }
 
-static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned arch)
+
+static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned arch, unsigned nimpl)
 {
 	struct starpu_per_arch_perfmodel_t *per_arch_model;
-	per_arch_model = &model->per_arch[arch];
 
+	per_arch_model = &model->per_arch[arch][nimpl];
 	/* count the number of elements in the lists */
 	struct starpu_history_list_t *ptr = NULL;
 	unsigned nentries = 0;
@@ -232,7 +238,7 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel_t *model,
 	/* header */
 	fprintf(f, "# number of entries\n%u\n", nentries);
 
-	dump_reg_model(f, model, arch);
+	dump_reg_model(f, model, arch, nimpl);
 
 	/* Dump the history into the model file in case it is necessary */
 	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
@@ -251,13 +257,17 @@ static void dump_model_file(FILE *f, struct starpu_perfmodel_t *model)
 	fprintf(f, "#################\n");
 
 	unsigned arch;
+	unsigned nimpl;
 	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
 	{
-		char archname[32];
-		starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) arch, archname, 32);
-		fprintf(f, "# Model for %s\n", archname);
-		dump_per_arch_model_file(f, model, arch);
-		fprintf(f, "\n##################\n");
+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+		{
+			char archname[32];
+			starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) arch, archname, 32, nimpl);
+			fprintf(f, "# Model for %s\n", archname);
+			dump_per_arch_model_file(f, model, arch, nimpl);
+			fprintf(f, "\n##################\n");
+		}
 	}
 }
 
@@ -270,8 +280,14 @@ static void initialize_per_arch_model(struct starpu_per_arch_perfmodel_t *per_ar
 static void initialize_model(struct starpu_perfmodel_t *model)
 {
 	unsigned arch;
+	unsigned nimpl;
 	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
-		initialize_per_arch_model(&model->per_arch[arch]);
+	{
+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+		{
+			initialize_per_arch_model(&model->per_arch[arch][nimpl]);
+		}
+	}
 }
 
 static void get_model_debug_path(struct starpu_perfmodel_t *model, const char *arch, char *path, size_t maxlen)
@@ -312,12 +328,15 @@ void _starpu_register_model(struct starpu_perfmodel_t *model)
 	_starpu_create_sampling_directory_if_needed();
 
 	unsigned arch;
-	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
-	{
-		char debugpath[256];
-		starpu_perfmodel_debugfilepath(model, arch, debugpath, 256);
-		model->per_arch[arch].debug_file = fopen(debugpath, "a+");
-		STARPU_ASSERT(model->per_arch[arch].debug_file);
+	unsigned nimpl;
+
+	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++) {
+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++) {
+			char debugpath[256];
+			starpu_perfmodel_debugfilepath(model, arch, debugpath, 256, nimpl);
+			model->per_arch[arch][nimpl].debug_file = fopen(debugpath, "a+");
+			STARPU_ASSERT(model->per_arch[arch][nimpl].debug_file);
+		}
 	}
 #endif
 
@@ -554,37 +573,37 @@ int starpu_load_history_debug(const char *symbol, struct starpu_perfmodel_t *mod
 	return 0;
 }
 
-void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch, char *archname, size_t maxlen)
+void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch, char *archname, size_t maxlen,unsigned nimpl)
 {
 	if (arch < STARPU_CUDA_DEFAULT)
 	{
 		if (arch == STARPU_CPU_DEFAULT)
 		{
 			/* NB: We could just use cpu_1 as well ... */
-			snprintf(archname, maxlen, "cpu");
+			snprintf(archname, maxlen, "cpu_impl_%u",nimpl);
 		}
 		else
 		{
 			/* For combined CPU workers */
 			int cpu_count = arch - STARPU_CPU_DEFAULT + 1;
-			snprintf(archname, maxlen, "cpu_%d", cpu_count);
+			snprintf(archname, maxlen, "cpu_%d_impl_%u", cpu_count,nimpl);
 		}
 	}
 	else if ((STARPU_CUDA_DEFAULT <= arch)
 		&& (arch < STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS))
 	{
 		int devid = arch - STARPU_CUDA_DEFAULT;
-		snprintf(archname, maxlen, "cuda_%d", devid);
+		snprintf(archname, maxlen, "cuda_%d_impl_%u", devid,nimpl);
 	}
 	else if ((STARPU_OPENCL_DEFAULT <= arch)
 		&& (arch < STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS))
 	{
 		int devid = arch - STARPU_OPENCL_DEFAULT;
-		snprintf(archname, maxlen, "opencl_%d", devid);
+		snprintf(archname, maxlen, "opencl_%d_impl_%u", devid,nimpl);
 	}
 	else if (arch == STARPU_GORDON_DEFAULT)
 	{
-		snprintf(archname, maxlen, "gordon");
+		snprintf(archname, maxlen, "gordon_impl_%u",nimpl);
 	}
 	else
 	{
@@ -593,23 +612,23 @@ void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch, char *archna
 }
 
 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel_t *model,
-		enum starpu_perf_archtype arch, char *path, size_t maxlen)
+		enum starpu_perf_archtype arch, char *path, size_t maxlen, unsigned nimpl)
 {
 	char archname[32];
-	starpu_perfmodel_get_arch_name(arch, archname, 32);
+	starpu_perfmodel_get_arch_name(arch, archname, 32, nimpl);
 
 	STARPU_ASSERT(path);
 
 	get_model_debug_path(model, archname, path, maxlen);
 }
 
-double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j)
+double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j, unsigned nimpl)
 {
 	double exp = -1.0;
 	size_t size = _starpu_job_get_data_size(j);
 	struct starpu_regression_model_t *regmodel;
 
-	regmodel = &model->per_arch[arch].regression;
+	regmodel = &model->per_arch[arch][nimpl].regression;
 
 	if (regmodel->valid)
                 exp = regmodel->alpha*pow((double)size, regmodel->beta);
@@ -617,13 +636,13 @@ double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel_t *mod
 	return exp;
 }
 
-double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j)
+double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j,unsigned nimpl)
 {
 	double exp = -1.0;
 	size_t size = _starpu_job_get_data_size(j);
 	struct starpu_regression_model_t *regmodel;
 
-	regmodel = &model->per_arch[arch].regression;
+	regmodel = &model->per_arch[arch][nimpl].regression;
 
 	if (regmodel->nl_valid)
 		exp = regmodel->a*pow((double)size, regmodel->b) + regmodel->c;
@@ -631,7 +650,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 	return exp;
 }
 
-double _starpu_history_based_job_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j)
+double _starpu_history_based_job_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j,unsigned nimpl)
 {
 	double exp;
 	struct starpu_per_arch_perfmodel_t *per_arch_model;
@@ -640,7 +659,7 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel_t *model,
 
 	uint32_t key = _starpu_compute_buffers_footprint(j);
 
-	per_arch_model = &model->per_arch[arch];
+	per_arch_model = &model->per_arch[arch][nimpl];
 
 	history = per_arch_model->history;
 	if (!history)
@@ -661,13 +680,13 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel_t *model,
 	return exp;
 }
 
-void _starpu_update_perfmodel_history(starpu_job_t j, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured)
+void _starpu_update_perfmodel_history(starpu_job_t j, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured, unsigned nimpl)
 {
 	if (model)
 	{
 		PTHREAD_RWLOCK_WRLOCK(&model->model_rwlock);
 
-		struct starpu_per_arch_perfmodel_t *per_arch_model = &model->per_arch[arch];
+		struct starpu_per_arch_perfmodel_t *per_arch_model = &model->per_arch[arch][nimpl];
 
 		if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
 		{

+ 2 - 1
src/core/task.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -256,7 +257,7 @@ int starpu_task_submit(struct starpu_task *task)
 		/* In case we require that a task should be explicitely
 		 * executed on a specific worker, we make sure that the worker
 		 * is able to execute this task.  */
-		if (task->execute_on_a_specific_worker && !starpu_combined_worker_may_execute_task(task->workerid, task)) {
+		if (task->execute_on_a_specific_worker && !starpu_combined_worker_may_execute_task(task->workerid, task, 0)) {
                         _STARPU_LOG_OUT_TAG("ENODEV");
 			return -ENODEV;
                 }

+ 5 - 4
src/core/task_bundle.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -177,7 +178,7 @@ void starpu_task_bundle_close(struct starpu_task_bundle *bundle)
 }
 
 /* Return the expected duration of the entire task bundle in µs */
-double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch)
+double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch, unsigned nimpl)
 {
 	double expected_length = 0.0;
 
@@ -188,7 +189,7 @@ double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle,  en
 	entry = bundle->list;
 
 	while (entry) {
-		double task_length = starpu_task_expected_length(entry->task, arch);
+		double task_length = starpu_task_expected_length(entry->task, arch, nimpl);
 
 		/* In case the task is not calibrated, we consider the task
 		 * ends immediately. */
@@ -204,7 +205,7 @@ double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle,  en
 }
 
 /* Return the expected power consumption of the entire task bundle in J */
-double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch)
+double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch, unsigned nimpl)
 {
 	double expected_power = 0.0;
 
@@ -215,7 +216,7 @@ double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enu
 	entry = bundle->list;
 
 	while (entry) {
-		double task_power = starpu_task_expected_power(entry->task, arch);
+		double task_power = starpu_task_expected_power(entry->task, arch, nimpl);
 
 		/* In case the task is not calibrated, we consider the task
 		 * ends immediately. */

+ 30 - 5
src/core/workers.c

@@ -3,6 +3,7 @@
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Institut National de Recherche en Informatique et Automatique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -68,17 +69,39 @@ uint32_t _starpu_may_submit_opencl_task(void)
 	return (STARPU_OPENCL & config.worker_mask);
 }
 
-int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task)
+/* Tell whether implementation number nimpl of codelet cl can be run by a worker
+ * of type arch; a codelet with a single kernel only provides implementation 0. */
+static int _starpu_may_use_nth_implementation(enum starpu_archtype arch, struct starpu_codelet_t *cl, unsigned nimpl)
+{
+	if (nimpl >= STARPU_MAXIMPLEMENTATIONS) /* callers only iterate below this bound */
+		return 0;
+	switch(arch) {
+	case STARPU_CPU_WORKER:
+		return (cl->cpu_func == STARPU_MULTIPLE_CPU_IMPLEMENTATIONS) ? (cl->cpu_funcs[nimpl] != NULL) : (nimpl == 0);
+	case STARPU_CUDA_WORKER:
+		return (cl->cuda_func == STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS) ? (cl->cuda_funcs[nimpl] != NULL) : (nimpl == 0);
+	case STARPU_OPENCL_WORKER:
+		return (cl->opencl_func == STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS) ? (cl->opencl_funcs[nimpl] != NULL) : (nimpl == 0);
+	case STARPU_GORDON_WORKER:
+		return (cl->gordon_func == STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS) ? (cl->gordon_funcs[nimpl] != NULL) : (nimpl == 0);
+	default:
+		return 0;
+	}
+}
+
+
+int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 {
 	/* TODO: check that the task operand sizes will fit on that device */
 	/* TODO: call application-provided function for various cases like
 	 * double support, shared memory size limit, etc. */
-	return !!(task->cl->where & config.workers[workerid].worker_mask);
+	return !!((task->cl->where & config.workers[workerid].worker_mask) &&
+		_starpu_may_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl));
 }
 
 
 
-int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task)
+int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 {
 	/* TODO: check that the task operand sizes will fit on that device */
 	/* TODO: call application-provided function for various cases like
@@ -90,7 +113,8 @@ int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_tas
 	/* Is this a parallel worker ? */
 	if (workerid < nworkers)
 	{
-		return !!(task->cl->where & config.workers[workerid].worker_mask);
+		return !!((task->cl->where & config.workers[workerid].worker_mask) &&
+				_starpu_may_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl));
 	}
 	else {
 		if ((cl->type == STARPU_SPMD) || (cl->type == STARPU_FORKJOIN))
@@ -99,7 +123,8 @@ int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_tas
 
 			/* Is the worker larger than requested ? */
 			int worker_size = (int)config.combined_workers[workerid - nworkers].worker_size;
-			return !!(worker_size <= task->cl->max_parallelism);
+			/* workerid >= nworkers here, so it must not index config.workers; combined workers are presumed CPU-only -- TODO confirm */
+			return !!((worker_size <= task->cl->max_parallelism) && _starpu_may_use_nth_implementation(STARPU_CPU_WORKER, task->cl, nimpl));
 		}
 		else
 		{

+ 14 - 3
src/drivers/cpu/driver_cpu.c

@@ -3,6 +3,7 @@
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -55,9 +56,19 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args,
 	 * execute the kernel at all. */
 	if ((rank == 0) || (cl->type != STARPU_FORKJOIN))
 	{
-		cl_func func = cl->cpu_func;
-		STARPU_ASSERT(func);
-		func(task->interfaces, task->cl_arg);
+		if (cl->cpu_func != STARPU_MULTIPLE_CPU_IMPLEMENTATIONS) {
+			cl_func func = cl->cpu_func;
+			STARPU_ASSERT(func);
+			func(task->interfaces, task->cl_arg);
+		}
+		else {
+			if (cl->cpu_funcs[j->nimpl] != NULL) {
+				/* _STARPU_DEBUG("CPU driver : running kernel (%d)\n", j->nimpl); */
+				cl_func func = cl->cpu_funcs[j->nimpl];
+				STARPU_ASSERT(func);
+				func(task->interfaces, task->cl_arg);
+			}
+		}
 	}
 
 	_starpu_driver_end_job(cpu_args, j, &codelet_end, rank);

+ 14 - 3
src/drivers/cuda/driver_cuda.c

@@ -3,6 +3,7 @@
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -198,9 +199,19 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 	cures = cudaSetDevice(args->devid);
 #endif
 
-	cl_func func = cl->cuda_func;
-	STARPU_ASSERT(func);
-	func(task->interfaces, task->cl_arg);
+	if (cl->cuda_func != STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS) {
+		cl_func func = cl->cuda_func;
+		STARPU_ASSERT(func);
+		func(task->interfaces, task->cl_arg);
+	}
+	else {
+		if (cl->cuda_funcs[j->nimpl] != NULL) {
+			/* _STARPU_DEBUG("Cuda driver : running kernel * (%d)\n", j->nimpl); */
+			cl_func func = cl->cuda_funcs[j->nimpl];
+			STARPU_ASSERT(func);
+			func(task->interfaces, task->cl_arg);
+		}
+	}
 
 	_starpu_driver_end_job(args, j, &codelet_end, 0);
 

+ 6 - 2
src/drivers/driver_common/driver_common.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -120,14 +121,17 @@ void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *
 		}
 
 		if (calibrate_model)
-			_starpu_update_perfmodel_history(j, j->task->cl->model,  perf_arch, worker_args->devid, measured);
+
+			_starpu_update_perfmodel_history(j, j->task->cl->model,  perf_arch, worker_args->devid, measured,j->nimpl);
+
+
 	}
 
 	if (!updated)
 		_starpu_worker_update_profiling_info_executing(workerid, NULL, 1, 0, 0, 0);
 
 	if (profiling_info && profiling_info->power_consumed && cl->power_model && cl->power_model->benchmarking) {
-		_starpu_update_perfmodel_history(j, j->task->cl->power_model,  perf_arch, worker_args->devid, profiling_info->power_consumed);
+		_starpu_update_perfmodel_history(j, j->task->cl->power_model,  perf_arch, worker_args->devid, profiling_info->power_consumed,j->nimpl);
 		}
 }
 

+ 5 - 1
src/drivers/gordon/driver_gordon.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -161,7 +162,10 @@ static struct gordon_task_wrapper_s *starpu_to_gordon_job(starpu_job_t j)
 	task_wrapper->j = j;
 	task_wrapper->terminated = 0;
 
-	gordon_job->index = j->task->cl->gordon_func;
+	if (j->task->cl->gordon_func != STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS)
+		gordon_job->index = j->task->cl->gordon_func; /* single kernel */
+	else
+		gordon_job->index = j->task->cl->gordon_funcs[j->nimpl]; /* kernel selected for this job */
 
 	/* we should not hardcore the memory node ... XXX */
 	unsigned memory_node = 0;

+ 14 - 3
src/drivers/opencl/driver_opencl.c

@@ -3,6 +3,7 @@
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -529,9 +530,19 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 
 	_starpu_driver_start_job(args, j, &codelet_start, 0);
 
-	cl_func func = cl->opencl_func;
-	STARPU_ASSERT(func);
-	func(task->interfaces, task->cl_arg);
+	if (cl->opencl_func != STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS) {
+		cl_func func = cl->opencl_func;
+		STARPU_ASSERT(func);
+		func(task->interfaces, task->cl_arg);
+	}
+	else {
+		if (cl->opencl_funcs[j->nimpl] != NULL) {
+			/* _STARPU_DEBUG("OpenCL driver : running kernel (%d)\n", j->nimpl); */
+			cl_func func = cl->opencl_funcs[j->nimpl];
+			STARPU_ASSERT(func);
+			func(task->interfaces, task->cl_arg);
+		}
+	}
 
 	_starpu_driver_end_job(args, j, &codelet_end, 0);
 

+ 3 - 2
src/profiling/bound.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -335,7 +336,7 @@ static void _starpu_get_tasks_times(int nw, int nt, double *times) {
 				.footprint_is_computed = 1,
 			};
 			enum starpu_perf_archtype arch = starpu_worker_get_perf_archtype(w);
-			double length = _starpu_history_based_job_expected_perf(tp->cl->model, arch, &j);
+			double length = _starpu_history_based_job_expected_perf(tp->cl->model, arch, &j, j.nimpl);
 			if (length == -1.0)
 				times[w*nt+t] = -1.0;
 			else
@@ -401,7 +402,7 @@ void starpu_bound_print_lp(FILE *output)
 			for (w = 0; w < nw; w++) {
 				enum starpu_perf_archtype arch = starpu_worker_get_perf_archtype(w);
 				if (t1->duration[arch] == 0.) {
-					double length = _starpu_history_based_job_expected_perf(t1->cl->model, arch, &j);
+					double length = _starpu_history_based_job_expected_perf(t1->cl->model, arch, &j,j.nimpl);
 					if (length == -1.0)
 						/* Avoid problems with binary coding of doubles */
 						t1->duration[arch] = -1.0;

+ 117 - 86
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -311,57 +312,65 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio)
 	/* A priori, we know all estimations */
 	int unknown = 0;
 
+	unsigned best_impl = 0;
+	unsigned nimpl;
 	for (worker = 0; worker < nworkers; worker++)
 	{
-		double exp_end;
-		
-		fifo = queue_array[worker];
+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+		{
+			double exp_end;
 
-		/* Sometimes workers didn't take the tasks as early as we expected */
-		fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
-		fifo->exp_end = fifo->exp_start + fifo->exp_len;
+			fifo = queue_array[worker];
 
-		if (!starpu_worker_may_execute_task(worker, task))
-		{
-			/* no one on that queue may execute this task */
-			continue;
-		}
+			/* Sometimes workers didn't take the tasks as early as we expected */
+			fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
+			fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
-		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
-		double local_length = starpu_task_expected_length(task, perf_arch);
-		double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
-
-		if (ntasks_best == -1
-				|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
-				|| (!calibrating && local_length == -1.0) /* Not calibrating but this worker is being calibrated */
-				|| (calibrating && local_length == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
-				) {
-			ntasks_best_end = ntasks_end;
-			ntasks_best = worker;
-		}
+			if (!starpu_worker_may_execute_task(worker, task, nimpl))
+			{
+				/* no one on that queue may execute this task */
+				continue;
+			}
 
-		if (local_length == -1.0)
-			/* we are calibrating, we want to speed-up calibration time
-			 * so we privilege non-calibrated tasks (but still
-			 * greedily distribute them to avoid dumb schedules) */
-			calibrating = 1;
+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
+			double local_length = starpu_task_expected_length(task, perf_arch, nimpl);
+			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 
-		if (local_length <= 0.0)
-			/* there is no prediction available for that task
-			 * with that arch yet, so switch to a greedy strategy */
-			unknown = 1;
+			//_STARPU_DEBUG("Scheduler dm: task length (%lf) worker (%u) kernel (%u) \n", local_length,worker,nimpl);
 
-		if (unknown)
-			continue;
+			if (ntasks_best == -1
+					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
+					|| (!calibrating && local_length == -1.0) /* Not calibrating but this worker is being calibrated */
+					|| (calibrating && local_length == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+					) {
+				ntasks_best_end = ntasks_end;
+				ntasks_best = worker;
+			}
 
-		exp_end = fifo->exp_start + fifo->exp_len + local_length;
+			if (local_length == -1.0)
+				/* we are calibrating, we want to speed-up calibration time
+				 * so we privilege non-calibrated tasks (but still
+				 * greedily distribute them to avoid dumb schedules) */
+				calibrating = 1;
 
-		if (best == -1 || exp_end < best_exp_end)
-		{
-			/* a better solution was found */
-			best_exp_end = exp_end;
-			best = worker;
-			model_best = local_length;
+			if (local_length <= 0.0)
+				/* there is no prediction available for that task
+				 * with that arch yet, so switch to a greedy strategy */
+				unknown = 1;
+
+			if (unknown)
+				continue;
+
+			exp_end = fifo->exp_start + fifo->exp_len + local_length;
+
+			if (best == -1 || exp_end < best_exp_end)
+			{
+				/* a better solution was found */
+				best_exp_end = exp_end;
+				best = worker;
+				model_best = local_length;
+				best_impl = nimpl;
+			}
 		}
 	}
 
@@ -370,6 +379,10 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio)
 		model_best = 0.0;
 	}
 	
+	//_STARPU_DEBUG("Scheduler dm: kernel (%u)\n", best_impl);
+
+	 _starpu_get_job_associated_to_task(task)->nimpl = 0;//best_impl;
+
 	/* we should now have the best worker in variable "best" */
 	return push_task_on_best_worker(task, best, model_best, prio);
 }
@@ -404,64 +417,78 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio)
 	/* A priori, we know all estimations */
 	int unknown = 0;
 
+	unsigned best_impl = 0;
+	unsigned nimpl=0;
 	for (worker = 0; worker < nworkers; worker++)
 	{
-		fifo = queue_array[worker];
+		for(nimpl  = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+	 	{
+			fifo = queue_array[worker];
 
-		/* Sometimes workers didn't take the tasks as early as we expected */
-		fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
-		fifo->exp_end = fifo->exp_start + fifo->exp_len;
-		if (fifo->exp_end > max_exp_end)
-			max_exp_end = fifo->exp_end;
+			/* Sometimes workers didn't take the tasks as early as we expected */
+			fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
+			fifo->exp_end = fifo->exp_start + fifo->exp_len;
+			if (fifo->exp_end > max_exp_end)
+				max_exp_end = fifo->exp_end;
 
-		if (!starpu_worker_may_execute_task(worker, task))
-		{
-			/* no one on that queue may execute this task */
-			continue;
-		}
+			if (!starpu_worker_may_execute_task(worker, task, nimpl))
+			{
+				/* no one on that queue may execute this task */
+				continue;
+			}
 
-		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
-		local_task_length[worker] = starpu_task_expected_length(task, perf_arch);
+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
+			local_task_length[worker] = starpu_task_expected_length(task, perf_arch, nimpl);
 
-		unsigned memory_node = starpu_worker_get_memory_node(worker);
-		local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
+			//_STARPU_DEBUG("Scheduler dmda: task length (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],worker,nimpl);
 
-		double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
+			unsigned memory_node = starpu_worker_get_memory_node(worker);
+			local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
 
-		if (ntasks_best == -1
-				|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
-				|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
-				|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
-				) {
-			ntasks_best_end = ntasks_end;
-			ntasks_best = worker;
-		}
+			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 
-		if (local_task_length[worker] == -1.0)
-			/* we are calibrating, we want to speed-up calibration time
-			 * so we privilege non-calibrated tasks (but still
-			 * greedily distribute them to avoid dumb schedules) */
-			calibrating = 1;
+			if (ntasks_best == -1
+					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
+					|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
+					|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+					) {
+				ntasks_best_end = ntasks_end;
+				ntasks_best = worker;
 
-		if (local_task_length[worker] <= 0.0)
-			/* there is no prediction available for that task
-			 * with that arch yet, so switch to a greedy strategy */
-			unknown = 1;
+			}
 
-		if (unknown)
-			continue;
+			if (local_task_length[worker] == -1.0)
+				/* we are calibrating, we want to speed-up calibration time
+				 * so we privilege non-calibrated tasks (but still
+				 * greedily distribute them to avoid dumb schedules) */
+				calibrating = 1;
 
-		exp_end[worker] = fifo->exp_start + fifo->exp_len + local_task_length[worker];
+			if (local_task_length[worker] <= 0.0)
+				/* there is no prediction available for that task
+				 * with that arch yet, so switch to a greedy strategy */
+				unknown = 1;
 
-		if (exp_end[worker] < best_exp_end)
-		{
-			/* a better solution was found */
-			best_exp_end = exp_end[worker];
-		}
+			if (unknown)
+					continue;
 
-		local_power[worker] = starpu_task_expected_power(task, perf_arch);
-		if (local_power[worker] == -1.0)
-			local_power[worker] = 0.;
+			exp_end[worker] = fifo->exp_start + fifo->exp_len + local_task_length[worker];
+
+			if (exp_end[worker] < best_exp_end)
+			{
+				/* a better solution was found */
+				best_exp_end = exp_end[worker];
+				best_impl = nimpl;
+
+			}
+
+
+
+			local_power[worker] = starpu_task_expected_power(task, perf_arch, nimpl);
+			if (local_power[worker] == -1.0)
+				local_power[worker] = 0.;
+
+
+		 }
 	}
 
 	if (unknown)
@@ -475,7 +502,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio)
 		{
 			fifo = queue_array[worker];
 	
-			if (!starpu_worker_may_execute_task(worker, task))
+			if (!starpu_worker_may_execute_task(worker, task, 0))
 			{
 				/* no one on that queue may execute this task */
 				continue;
@@ -519,6 +546,10 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio)
 		//penality_best = local_data_penalty[best];
 	}
 
+
+	//_STARPU_DEBUG("Scheduler dmda: kernel (%u)\n", best_impl);
+	 _starpu_get_job_associated_to_task(task)->nimpl = best_impl;
+
 	/* we should now have the best worker in variable "best" */
 	return push_task_on_best_worker(task, best, model_best, prio);
 }

+ 6 - 1
src/sched_policies/deque_queues.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -109,7 +110,11 @@ struct starpu_job_list_s *_starpu_deque_pop_every_task(struct starpu_deque_jobq_
 		{
 			next_job = starpu_job_list_next(i);
 
-			if (starpu_worker_may_execute_task(workerid, i->task))
+			/* In case there are multiple implementations of the
+			 * codelet for a single device, we don't really care
+			 * about the implementation used, so let's try the
+			 * first one. */
+			if (starpu_worker_may_execute_task(workerid, i->task, 0))
 			{
 				/* this elements can be moved into the new list */
 				new_list_size++;

+ 2 - 1
src/sched_policies/fifo_queues.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -111,7 +112,7 @@ struct starpu_task *_starpu_fifo_pop_every_task(struct starpu_fifo_taskq_s *fifo
 		{
 			next_task = task->next;
 
-			if (starpu_worker_may_execute_task(workerid, task))
+			if (starpu_worker_may_execute_task(workerid, task, 0))
 			{
 				/* this elements can be moved into the new list */
 				new_list_size++;

+ 80 - 63
src/sched_policies/heft.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -115,7 +116,9 @@ static void heft_push_task_notify(struct starpu_task *task, int workerid)
 {
 	/* Compute the expected penality */
 	enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
-	double predicted = starpu_task_expected_length(task, perf_arch);
+
+	double predicted = starpu_task_expected_length(task, perf_arch,
+			_starpu_get_job_associated_to_task(task)->nimpl);
 
 	/* Update the predictions */
 	PTHREAD_MUTEX_LOCK(&sched_mutex[workerid]);
@@ -179,78 +182,92 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
 	/* A priori, we know all estimations */
 	int unknown = 0;
-
 	unsigned worker;
-	for (worker = 0; worker < nworkers; worker++)
-	{
-		/* Sometimes workers didn't take the tasks as early as we expected */
-		exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
-		exp_end[worker] = exp_start[worker] + exp_len[worker];
-		if (exp_end[worker] > max_exp_end)
-			max_exp_end = exp_end[worker];
 
-		if (!starpu_worker_may_execute_task(worker, task))
-		{
-			/* no one on that queue may execute this task */
-			continue;
-		}
-
-		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
-		unsigned memory_node = starpu_worker_get_memory_node(worker);
-
-		if (bundle)
-		{
-			local_task_length[worker] = starpu_task_bundle_expected_length(bundle, perf_arch);
-			local_data_penalty[worker] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
-			local_power[worker] = starpu_task_bundle_expected_power(bundle, perf_arch);
-		}
-		else {
-			local_task_length[worker] = starpu_task_expected_length(task, perf_arch);
-			local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
-			local_power[worker] = starpu_task_expected_power(task, perf_arch);
-		}
-
-		double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
-
-		if (ntasks_best == -1
+	unsigned nimpl;
+	unsigned best_impl = 0;
+
+	for (worker = 0; worker < nworkers; worker++) {
+		for (nimpl = 0; nimpl <STARPU_MAXIMPLEMENTATIONS; nimpl++) {
+			/* Sometimes workers didn't take the tasks as early as we expected */
+			exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
+			exp_end[worker] = exp_start[worker] + exp_len[worker];
+			if (exp_end[worker] > max_exp_end)
+				max_exp_end = exp_end[worker];
+
+			if (!starpu_worker_may_execute_task(worker, task, nimpl))
+			{
+				/* no one on that queue may execute this task */
+				continue;
+			}
+
+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
+			unsigned memory_node = starpu_worker_get_memory_node(worker);
+
+			if (bundle)
+			{
+				local_task_length[worker] = starpu_task_bundle_expected_length(bundle, perf_arch, nimpl);
+				local_data_penalty[worker] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
+				local_power[worker] = starpu_task_bundle_expected_power(bundle, perf_arch,nimpl);
+				//_STARPU_DEBUG("Scheduler heft bundle: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],local_power[worker],worker,nimpl);
+
+			}
+			else {
+				local_task_length[worker] = starpu_task_expected_length(task, perf_arch, nimpl);
+				local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
+				local_power[worker] = starpu_task_expected_power(task, perf_arch,nimpl);
+				//_STARPU_DEBUG("Scheduler heft: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],local_power[worker],worker,nimpl);
+
+			}
+
+			double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
+
+			if (ntasks_best == -1
 				|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
 				|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
 				|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
 				) {
-			ntasks_best_end = ntasks_end;
-			ntasks_best = worker;
-		}
+				ntasks_best_end = ntasks_end;
+				ntasks_best = worker;
+			}
 
-		if (local_task_length[worker] == -1.0)
-			/* we are calibrating, we want to speed-up calibration time
-			 * so we privilege non-calibrated tasks (but still
-			 * greedily distribute them to avoid dumb schedules) */
-			calibrating = 1;
+			if (local_task_length[worker] == -1.0)
+				/* we are calibrating, we want to speed-up calibration time
+				 * so we privilege non-calibrated tasks (but still
+				 * greedily distribute them to avoid dumb schedules) */
+				calibrating = 1;
 
-		if (local_task_length[worker] <= 0.0)
-			/* there is no prediction available for that task
-			 * with that arch yet, so switch to a greedy strategy */
-			unknown = 1;
+			if (local_task_length[worker] <= 0.0)
+				/* there is no prediction available for that task
+				 * with that arch yet, so switch to a greedy strategy */
+				unknown = 1;
 
-		if (unknown)
-			continue;
+			if (unknown)
+				continue;
 
-		exp_end[worker] = exp_start[worker] + exp_len[worker] + local_task_length[worker];
+			exp_end[worker] = exp_start[worker] + exp_len[worker] + local_task_length[worker];
 
-		if (exp_end[worker] < best_exp_end)
-		{
-			/* a better solution was found */
-			best_exp_end = exp_end[worker];
-		}
+			if (exp_end[worker] < best_exp_end)
+			{
+				/* a better solution was found */
+				best_exp_end = exp_end[worker];
+				best_impl = nimpl;
+			}
+
+			if (local_power[worker] == -1.0)
+				local_power[worker] = 0.;
 
-		if (local_power[worker] == -1.0)
-			local_power[worker] = 0.;
+		}
 	}
 
 	*forced_best = unknown?ntasks_best:-1;
 
 	*best_exp_endp = best_exp_end;
 	*max_exp_endp = max_exp_end;
+
+	/* save the best implementation */
+	//_STARPU_DEBUG("Scheduler heft: kernel (%u)\n", best_impl);
+	_starpu_get_job_associated_to_task(task)->nimpl = best_impl;
 }
 
 static int _heft_push_task(struct starpu_task *task, unsigned prio)
@@ -298,7 +315,7 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio)
 
 	for (worker = 0; worker < nworkers; worker++)
 	{
-		if (!starpu_worker_may_execute_task(worker, task))
+		if (!starpu_worker_may_execute_task(worker, task, 0))
 		{
 			/* no one on that queue may execute this task */
 			continue;
@@ -314,12 +331,11 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio)
 			 * consumption of other cpus */
 			fitness[worker] += _gamma * idle_power * (exp_end[worker] - max_exp_end) / 1000000.0;
 
-		if (best == -1 || fitness[worker] < best_fitness)
-		{
-			/* we found a better solution */
-			best_fitness = fitness[worker];
-			best = worker;
-		}
+			if (best == -1 || fitness[worker] < best_fitness)
+			{
+				/* we found a better solution */
+				best_fitness = fitness[worker]; best = worker;
+			}
 	}
 
 	/* By now, we must have found a solution */
@@ -333,7 +349,8 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio)
 		/* If we have a task bundle, we have computed the expected
 		 * length for the entire bundle, but not for the task alone. */
 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(best);
-		model_best = starpu_task_expected_length(task, perf_arch);
+		model_best = starpu_task_expected_length(task, perf_arch,
+				_starpu_get_job_associated_to_task(task)->nimpl);
 
 		/* Remove the task from the bundle since we have made a
 		 * decision for it, and that other tasks should not consider it

+ 2 - 1
src/sched_policies/parallel_greedy.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -167,7 +168,7 @@ static struct starpu_task *pop_task_pgreedy_policy(void)
 			if (possible_combinations_size[workerid][i] > best_size)
 			{
 				int combined_worker = possible_combinations[workerid][i];
-				if (starpu_combined_worker_may_execute_task(combined_worker, task))
+				if (starpu_combined_worker_may_execute_task(combined_worker, task, 0))
 				{
 					best_size = possible_combinations_size[workerid][i];
 					best_workerid = combined_worker;

+ 57 - 42
src/sched_policies/parallel_heft.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -225,62 +226,73 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
 			max_exp_end = worker_exp_end[worker];
 	}
 
+	unsigned nimpl;
+	unsigned best_impl = 0;
 	for (worker = 0; worker < (nworkers+ncombinedworkers); worker++)
 	{
-		if (!starpu_combined_worker_may_execute_task(worker, task))
+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
-			/* no one on that queue may execute this task */
-			skip_worker[worker] = 1;
-			continue;
-		}
-		else {
-			skip_worker[worker] = 0;
-		}
+			if (!starpu_combined_worker_may_execute_task(worker, task, nimpl))
+			{
+				/* no one on that queue may execute this task */
+				skip_worker[worker] = 1;
+				continue;
+			}
+			else {
+				skip_worker[worker] = 0;
+			}
 
-		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
-		local_task_length[worker] = starpu_task_expected_length(task, perf_arch);
+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
 
-		unsigned memory_node = starpu_worker_get_memory_node(worker);
-		local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
+			local_task_length[worker] = starpu_task_expected_length(task, perf_arch,nimpl);
 
-		double ntasks_end = compute_ntasks_end(worker);
+			unsigned memory_node = starpu_worker_get_memory_node(worker);
+			local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
 
-		if (ntasks_best == -1
-				|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
-				|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
-				|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
-				) {
-			ntasks_best_end = ntasks_end;
-			ntasks_best = worker;
-		}
+			double ntasks_end = compute_ntasks_end(worker);
 
-		if (local_task_length[worker] == -1.0)
-			/* we are calibrating, we want to speed-up calibration time
-			 * so we privilege non-calibrated tasks (but still
-			 * greedily distribute them to avoid dumb schedules) */
-			calibrating = 1;
+			if (ntasks_best == -1
+					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
+					|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
+					|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+					) {
+				ntasks_best_end = ntasks_end;
+				ntasks_best = worker;
+			}
 
-		if (local_task_length[worker] <= 0.0)
-			/* there is no prediction available for that task
-			 * with that arch yet, so switch to a greedy strategy */
-			unknown = 1;
+			if (local_task_length[worker] == -1.0)
+				/* we are calibrating, we want to speed-up calibration time
+				 * so we privilege non-calibrated tasks (but still
+				 * greedily distribute them to avoid dumb schedules) */
+				calibrating = 1;
 
-		if (unknown)
-			continue;
+			if (local_task_length[worker] <= 0.0)
+				/* there is no prediction available for that task
+				 * with that arch yet, so switch to a greedy strategy */
+				unknown = 1;
 
-		local_exp_end[worker] = compute_expected_end(worker, local_task_length[worker]);
+			if (unknown)
+				continue;
 
-		//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker], local_exp_end[worker]);
+			local_exp_end[worker] = compute_expected_end(worker, local_task_length[worker]);
 
-		if (local_exp_end[worker] < best_exp_end)
-		{
-			/* a better solution was found */
-			best_exp_end = local_exp_end[worker];
-		}
+			//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker], local_exp_end[worker]);
+
+			if (local_exp_end[worker] < best_exp_end)
+			{
+				/* a better solution was found */
+				best_exp_end = local_exp_end[worker];
+				best_impl = nimpl;
+			}
 
-		local_power[worker] = starpu_task_expected_power(task, perf_arch);
-		if (local_power[worker] == -1.0)
-			local_power[worker] = 0.;
+
+			local_power[worker] = starpu_task_expected_power(task, perf_arch,nimpl);
+			//_STARPU_DEBUG("Scheduler parallel heft: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],local_power[worker],worker,nimpl);
+
+			if (local_power[worker] == -1.0)
+				local_power[worker] = 0.;
+
+		} //end for
 	}
 
 	if (unknown)
@@ -338,6 +350,9 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
 		best_exp_end = local_exp_end[best];
 	}
 
+
+	//_STARPU_DEBUG("Scheduler parallel heft: kernel (%u)\n", best_impl);
+	_starpu_get_job_associated_to_task(task)->nimpl = best_impl;
 	/* we should now have the best worker in variable "best" */
 	return push_task_on_best_worker(task, best, best_exp_end, prio);
 }

+ 6 - 1
tests/perfmodels/regression_based.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -97,7 +98,11 @@ static void show_task_perfs(int size, struct starpu_task *task) {
 		char name[16];
 		starpu_worker_get_name(workerid, name, sizeof(name));
 
-		printf("Expected time for %d on %s:\t%f\n", size, name, starpu_task_expected_length(task, starpu_worker_get_perf_archtype(workerid)));
+		unsigned nimpl;
+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++) {
+			printf("Expected time for %d on %s:\t%f\n",
+				size, name, starpu_task_expected_length(task, starpu_worker_get_perf_archtype(workerid), nimpl));
+		}
 	}
 }
 

+ 31 - 18
tools/starpu_perfmodel_display.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2011  Université de Bordeaux 1
  * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -143,14 +144,14 @@ static void display_history_based_perf_model(struct starpu_per_arch_perfmodel_t
 	}
 }
 
-static void display_perf_model(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch)
+static void display_perf_model(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, unsigned nimpl)
 {
-	struct starpu_per_arch_perfmodel_t *arch_model = &model->per_arch[arch];
+	struct starpu_per_arch_perfmodel_t *arch_model = &model->per_arch[arch][nimpl];
 	char archname[32];
 
 	if (arch_model->regression.nsample || arch_model->regression.valid || arch_model->regression.nl_valid || arch_model->list) {
 
-		starpu_perfmodel_get_arch_name(arch, archname, 32);
+		starpu_perfmodel_get_arch_name(arch, archname, 32, nimpl);
 		fprintf(stderr, "performance model for %s\n", archname);
 	}
 
@@ -187,7 +188,7 @@ static void display_perf_model(struct starpu_perfmodel_t *model, enum starpu_per
 
 #if 0
 		char debugname[1024];
-		starpu_perfmodel_debugfilepath(model, arch, debugname, 1024);
+		starpu_perfmodel_debugfilepath(model, arch, debugname, 1024, nimpl);
 		printf("\t debug file path : %s\n", debugname);
 #endif
 	}
@@ -220,7 +221,7 @@ static void display_perf_model(struct starpu_perfmodel_t *model, enum starpu_per
 
 		if (strcmp(parameter, "path-file-debug") == 0) {
 			char debugname[256];
-			starpu_perfmodel_debugfilepath(model, arch, debugname, 1024);
+			starpu_perfmodel_debugfilepath(model, arch, debugname, 1024, nimpl);
 			printf("%s\n", debugname);
 			return;
 		}
@@ -243,14 +244,18 @@ static void display_all_perf_models(struct starpu_perfmodel_t *model)
 	{
 		/* display all architectures */
 		unsigned archid;
-		for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++)
-		{
-			display_perf_model(model, (enum starpu_perf_archtype) archid);
+		unsigned implid;
+		for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++) {
+			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++) { /* Display all codelets on each arch */
+				display_perf_model(model, (enum starpu_perf_archtype) archid, implid);
+			}
 		}
 	}
 	else {
 		if (strcmp(arch, "cpu") == 0) {
-			display_perf_model(model, STARPU_CPU_DEFAULT);
+			unsigned implid;
+			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+				display_perf_model(model, STARPU_CPU_DEFAULT,implid); /* Display all codelets on cpu */
 			return;
 		}
 
@@ -264,18 +269,22 @@ static void display_all_perf_models(struct starpu_perfmodel_t *model)
 				exit(-1);
 			}
 
-			display_perf_model(model, (enum starpu_perf_archtype) (STARPU_CPU_DEFAULT + k - 1));
+			unsigned implid;
+			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+				display_perf_model(model, (enum starpu_perf_archtype) STARPU_CPU_DEFAULT + k - 1, implid);
 			return;
 		}
 
 		if (strcmp(arch, "cuda") == 0) {
 			unsigned archid;
-			for (archid = STARPU_CUDA_DEFAULT; archid < STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS; archid++)
-			{
-				char archname[32];
-				starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) archid, archname, 32);
-				fprintf(stderr, "performance model for %s\n", archname);
-				display_perf_model(model, (enum starpu_perf_archtype) archid);
+			unsigned implid;
+			for (archid = STARPU_CUDA_DEFAULT; archid < STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS; archid++) {
+				for (implid = 0; implid <STARPU_MAXIMPLEMENTATIONS; implid ++) {
+					char archname[32];
+					starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) archid, archname, 32, implid);
+					fprintf(stderr, "performance model for %s\n", archname);
+					display_perf_model(model, (enum starpu_perf_archtype) archid, implid);
+				}
 			}
 			return;
 		}
@@ -287,13 +296,17 @@ static void display_all_perf_models(struct starpu_perfmodel_t *model)
 		if (nmatched == 1)
 		{
 			unsigned archid = STARPU_CUDA_DEFAULT+ gpuid;
-			display_perf_model(model, (enum starpu_perf_archtype) archid);
+			unsigned implid;
+			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+				display_perf_model(model, (enum starpu_perf_archtype) archid, implid);
 			return;
 		}
 
 		if (strcmp(arch, "gordon") == 0) {
 			fprintf(stderr, "performance model for gordon\n");
-			display_perf_model(model, STARPU_GORDON_DEFAULT);
+			unsigned implid;
+			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+				display_perf_model(model, STARPU_GORDON_DEFAULT, implid);
 			return;
 		}
 

+ 29 - 13
tools/starpu_perfmodel_plot.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2011  Université de Bordeaux 1
  * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -141,10 +142,10 @@ static void print_comma(FILE *gnuplot_file, int *first)
 	}
 }
 
-static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, int *first)
+static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, int *first, unsigned nimpl)
 {
 	char arch_name[256];
-	starpu_perfmodel_get_arch_name(arch, arch_name, 256);
+	starpu_perfmodel_get_arch_name(arch, arch_name, 256, nimpl);
 
 	fprintf(stderr,"Arch: %s\n", arch_name);
 
@@ -156,7 +157,8 @@ static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel_t *mo
 	}
 #endif
 
-	struct starpu_per_arch_perfmodel_t *arch_model = &model->per_arch[arch];
+	struct starpu_per_arch_perfmodel_t *arch_model =
+		&model->per_arch[arch][nimpl];
 
 	/* Only display the regression model if we could actually build a model */
 	if (arch_model->regression.valid)
@@ -203,16 +205,20 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 	free(command);
 
 	col = 2;
+	unsigned implid;
 	for (arch = arch1; arch < arch2; arch++) {
-		struct starpu_per_arch_perfmodel_t *arch_model = &model->per_arch[arch];
-		starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) arch, archname, 32);
+		for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++) {
+			struct starpu_per_arch_perfmodel_t *arch_model =
+				&model->per_arch[arch][implid];
+			starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) arch, archname, 32, implid);
 
-		ptrs[arch-arch1] = ptr[arch-arch1] = arch_model->list;
+			ptrs[arch-arch1] = ptr[arch-arch1] = arch_model->list;
 
-		if (ptr[arch-arch1]) {
-			print_comma(gnuplot_file, first);
-			fprintf(gnuplot_file, "\"%s\" using 1:%d:%d with errorlines title \"Measured %s\"", avg_file_name, col, col+1, archname);
-			col += 2;
+			if (ptr[arch-arch1]) {
+				print_comma(gnuplot_file, first);
+				fprintf(gnuplot_file, "\"%s\" using 1:%d:%d with errorlines title \"Measured %s\"", avg_file_name, col, col+1, archname);
+				col += 2;
+			}
 		}
 	}
 
@@ -260,8 +266,13 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 static void display_perf_models(FILE *gnuplot_file, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch1, enum starpu_perf_archtype arch2, int *first)
 {
 	unsigned arch;
-	for (arch = arch1; arch < arch2; arch++)
-		display_perf_model(gnuplot_file, model, (enum starpu_perf_archtype) arch, first);
+	unsigned implid;
+	for (arch = arch1; arch < arch2; arch++) {
+		for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++) {
+			display_perf_model(gnuplot_file, model, (enum starpu_perf_archtype) arch, first,
+implid);
+		}
+	}
 	display_history_based_perf_models(gnuplot_file, model, arch1, arch2, first);
 }
 
@@ -317,7 +328,12 @@ static void display_selected_models(FILE *gnuplot_file, struct starpu_perfmodel_
 	}
 	else {
 		if (strcmp(arch, "cpu") == 0) {
-			display_perf_model(gnuplot_file, model, STARPU_CPU_DEFAULT, &first);
+			unsigned impl;
+			for (impl = 0; impl < STARPU_MAXIMPLEMENTATIONS; impl++) {
+				display_perf_model(gnuplot_file, model,
+							STARPU_CPU_DEFAULT,
+							&first, impl);
+			}
 			return;
 		}