14 years ago · ffcfb86084
--- a/AUTHORS
+++ b/AUTHORS
@@ -10,3 +10,5 @@ William Braik <wbraik@gmail.com>
 
				 Yann Courtois <yann.courtois33@gmail.com>
			
 
				 Jean-Marie Couteyen <jm.couteyen@gmail.com>
			
 
				 Anthony Roy <theanthony33@gmail.com>
			
 
				+David Gómez <david_gomez1380@yahoo.com.mx>
			
 
				+NGUYEN quôc dinh <nguyen.quocdinh@gmail.com>
			
--- a/configure.ac
+++ b/configure.ac
@@ -2,6 +2,7 @@
 
				 #
			
 
				 # Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+# Copyright (C) 2011  Télécom-SudParis
			
 
				 #
			
 
				 # StarPU is free software; you can redistribute it and/or modify
			
 
				 # it under the terms of the GNU Lesser General Public License as published by
			
@@ -889,6 +890,15 @@ AC_MSG_CHECKING(Maximum number of workers)
 
				 AC_MSG_RESULT($nmaxworkers)
			
 
				 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
			
 
				 
			
 
				+# Computes the maximum number of implementations per arch.
				+# The default has to be at least 2: examples/basic_examples/mult_impl.c
				+# stores two CPU implementations in a single codelet, and the arrays that
				+# hold them are sized with STARPU_MAXIMPLEMENTATIONS.
				+AC_MSG_CHECKING(maximum number of implementations)
				+AC_ARG_ENABLE(maximplementations, [AS_HELP_STRING([--enable-maximplementations=<number>],
				+		[maximum number of implementations])],
				+		maximplementations=$enableval, maximplementations=4)
				+AC_MSG_RESULT($maximplementations)
				+AC_DEFINE_UNQUOTED(STARPU_MAXIMPLEMENTATIONS, [$maximplementations],
				+		[maximum number of implementations])
			
 
				+
			
 
				 ###############################################################################
			
 
				 #                                                                             #
			
 
				 #                                    MPI                                      #
			
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -2,6 +2,7 @@
 
				 #
			
 
				 # Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+# Copyright (C) 2011  Télécom-SudParis
			
 
				 #
			
 
				 # StarPU is free software; you can redistribute it and/or modify
			
 
				 # it under the terms of the GNU Lesser General Public License as published by
			
@@ -148,6 +149,7 @@ examplebin_PROGRAMS +=				\
 
				 	basic_examples/mult			\
			
 
				 	basic_examples/block			\
			
 
				 	basic_examples/variable			\
			
 
				+	basic_examples/mult_impl                \
			
 
				 	filters/fvector				\
			
 
				 	filters/fblock				\
			
 
				 	filters/fmatrix				\
			
--- a/examples/basic_examples/mult_impl.c
+++ b/examples/basic_examples/mult_impl.c
@@ -0,0 +1,384 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Télécom-SudParis
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+
			
 
				+#include <stdio.h>
				+#include <stdlib.h>
				+#include <string.h>
				+#include <math.h>
				+#include <sys/types.h>
				+#include <sys/time.h>
				+#include <pthread.h>
				+#include <signal.h>
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+static float *A, *B, *C;
			
 
				+static starpu_data_handle A_handle, B_handle, C_handle;
			
 
				+
			
 
				+static unsigned nslicesx = 4;
			
 
				+static unsigned nslicesy = 4;
			
 
				+static unsigned xdim = 1024;
			
 
				+static unsigned ydim = 1024;
			
 
				+static unsigned zdim = 512;
			
 
				+
			
 
				+
			
 
				+/*
				+ * Cost model for one block product C = A * B.
				+ *
				+ * descr[0] describes the block of A and descr[2] the block of C, in the
				+ * same order as the task buffers filled in launch_tasks().
				+ *
				+ * The estimate is proportional to the number of multiply-add operations
				+ * the kernels perform: nxC * nyC iterations of an inner loop that runs
				+ * over the ny dimension of A.
				+ * NOTE(review): the 1000 and 4.11 scaling factors are inherited as-is;
				+ * their unit and calibration are not visible here -- confirm before
				+ * relying on absolute values.
				+ *
				+ * Returns the estimated cost; the value is also printed on stdout.
				+ */
				+double mult_gemm_cost(starpu_buffer_descr *descr)
				+{
				+	/* C = A * B */
				+	uint32_t nxC, nyC, nyA;
				+
				+	nxC = starpu_matrix_get_nx(descr[2].handle);
				+	nyC = starpu_matrix_get_ny(descr[2].handle);
				+	/* cpu_mult() and cpu_mult_2() use NY of A as the inner loop bound,
				+	 * so that is the dimension the cost has to scale with */
				+	nyA = starpu_matrix_get_ny(descr[0].handle);
				+
				+	double cost = ((double)nxC)*((double)nyC)*((double)nyA/1000.0f/4.11f);
				+
				+	printf("cost %e \n", cost);
				+
				+	return cost;
				+}
			
 
				+
			
 
				+/*
				+ * First CPU implementation of the block product C = A * B.
				+ *
				+ * descr[0], descr[1] and descr[2] are the matrix interfaces of the blocks
				+ * of A, B and C; arg is unused. The block of C is fully overwritten.
				+ * The outer loop walks the NY dimension of C, the middle loop its NX
				+ * dimension.
				+ */
				+static void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
				+{
				+	printf("On application: Hello, this is kernel cpu_mult\n\n");
				+
				+	/* pointers to the first element of each local copy */
				+	float *blockA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
				+	float *blockB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
				+	float *blockC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
				+
				+	/* NX counts consecutive elements, NY counts the runs of elements
				+	 * that lie LD elements apart (LD stands for leading dimension).
				+	 * NB: when filters are used, the leading dimension of a local copy
				+	 * is not guaranteed to match the one of the original matrix in main
				+	 * memory, so it must always be read from the interface. */
				+	uint32_t nx_c = STARPU_MATRIX_GET_NX(descr[2]);
				+	uint32_t ny_c = STARPU_MATRIX_GET_NY(descr[2]);
				+	uint32_t ny_a = STARPU_MATRIX_GET_NY(descr[0]);
				+
				+	uint32_t ld_a = STARPU_MATRIX_GET_LD(descr[0]);
				+	uint32_t ld_b = STARPU_MATRIX_GET_LD(descr[1]);
				+	uint32_t ld_c = STARPU_MATRIX_GET_LD(descr[2]);
				+
				+	/* FORTRAN ordering is assumed */
				+	unsigned y, x, k;
				+	for (y = 0; y < ny_c; y++)
				+	{
				+		for (x = 0; x < nx_c; x++)
				+		{
				+			float acc = 0.0;
				+
				+			for (k = 0; k < ny_a; k++)
				+				acc += blockA[x+k*ld_a]*blockB[k+y*ld_b];
				+
				+			blockC[x + y*ld_c] = acc;
				+		}
				+	}
				+}
			
 
				+
			
 
				+/*
				+ * Second CPU implementation of the block product C = A * B.
				+ *
				+ * Same buffers and same result as cpu_mult(); the only difference is the
				+ * traversal order: here the outer loop walks the NX dimension of C and
				+ * the middle loop its NY dimension. arg is unused.
				+ */
				+static void cpu_mult_2(void *descr[], __attribute__((unused))  void *arg)
				+{
				+	printf("On application: this is kernel cpu_mult_2\n\n");
				+
				+	/* pointers to the first element of each local copy */
				+	float *blockA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
				+	float *blockB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
				+	float *blockC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
				+
				+	uint32_t nx_c = STARPU_MATRIX_GET_NX(descr[2]);
				+	uint32_t ny_c = STARPU_MATRIX_GET_NY(descr[2]);
				+	uint32_t ny_a = STARPU_MATRIX_GET_NY(descr[0]);
				+
				+	uint32_t ld_a = STARPU_MATRIX_GET_LD(descr[0]);
				+	uint32_t ld_b = STARPU_MATRIX_GET_LD(descr[1]);
				+	uint32_t ld_c = STARPU_MATRIX_GET_LD(descr[2]);
				+
				+	/* FORTRAN ordering is assumed */
				+	unsigned x, y, k;
				+	for (x = 0; x < nx_c; x++)
				+	{
				+		for (y = 0; y < ny_c; y++)
				+		{
				+			float acc = 0.0;
				+
				+			for (k = 0; k < ny_a; k++)
				+				acc += blockA[x+k*ld_a]*blockB[k+y*ld_b];
				+
				+			blockC[x + y*ld_c] = acc;
				+		}
				+	}
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+/*
				+ * Allocate the global matrices A (ydim x zdim), B (zdim x xdim) and
				+ * C (ydim x xdim), fill A and B with pseudo-random values and zero C.
				+ *
				+ * The program exits with EXIT_FAILURE if any allocation fails, since
				+ * nothing useful can be done without the three matrices.
				+ */
				+static void init_problem_data(void)
				+{
				+	unsigned i,j;
				+
				+	/* widen to size_t before multiplying so the element count cannot
				+	 * wrap around in unsigned arithmetic */
				+	A = malloc((size_t)zdim*ydim*sizeof(float));
				+	B = malloc((size_t)xdim*zdim*sizeof(float));
				+	C = malloc((size_t)xdim*ydim*sizeof(float));
				+
				+	if (!A || !B || !C)
				+	{
				+		fprintf(stderr, "mult_impl: could not allocate the matrices\n");
				+		free(A);
				+		free(B);
				+		free(C);
				+		exit(EXIT_FAILURE);
				+	}
				+
				+	/* fill the A and B matrices */
				+	/* NOTE(review): the seed is set with srand() while the values come
				+	 * from starpu_drand48(); confirm that both use the same generator */
				+	srand(2009);
				+	for (j=0; j < ydim; j++) {
				+		for (i=0; i < zdim; i++) {
				+			A[j+i*ydim] = (float)(starpu_drand48());
				+		}
				+	}
				+
				+	for (j=0; j < zdim; j++) {
				+		for (i=0; i < xdim; i++) {
				+			B[j+i*zdim] = (float)(starpu_drand48());
				+		}
				+	}
				+
				+	/* C starts out as the zero matrix */
				+	for (j=0; j < ydim; j++) {
				+		for (i=0; i < xdim; i++) {
				+			C[j+i*ydim] = (float)(0);
				+		}
				+	}
				+}
			
 
				+
			
 
				+/*
				+ * Register A, B and C with StarPU and cut them into blocks.
				+ *
				+ * A is cut with the "horiz" filter into nslicesy pieces, B with the "vert"
				+ * filter into nslicesx pieces, and C with both filters applied
				+ * recursively (vert first, then horiz on each resulting piece).
				+ * FORTRAN ordering is assumed throughout.
				+ */
				+static void partition_mult_data(void)
				+{
				+	/* A filter is a method to partition a piece of data into disjoint
				+	 * chunks. It is described by a "struct starpu_data_filter", which
				+	 * holds the function applied to a data handle to split it and the
				+	 * argument given to that function (here, the number of chunks). */
				+	struct starpu_data_filter vert = {
				+		.nchildren = nslicesx,
				+		.filter_func = starpu_vertical_block_filter_func,
				+		.get_child_ops = NULL,
				+		.get_nchildren = NULL
				+	};
				+
				+	struct starpu_data_filter horiz = {
				+		.nchildren = nslicesy,
				+		.filter_func = starpu_block_filter_func,
				+		.get_child_ops = NULL,
				+		.get_nchildren = NULL
				+	};
				+
				+	starpu_matrix_data_register(&A_handle, 0, (uintptr_t)A,
				+		ydim, ydim, zdim, sizeof(float));
				+	starpu_matrix_data_register(&B_handle, 0, (uintptr_t)B,
				+		zdim, zdim, xdim, sizeof(float));
				+	starpu_matrix_data_register(&C_handle, 0, (uintptr_t)C,
				+		ydim, ydim, xdim, sizeof(float));
				+
				+/*
				+ *	Illustration with nslicesx = 4 and nslicesy = 2. Sub-data is reached
				+ *	with "starpu_data_get_sub_data", which takes a data handle, the
				+ *	number of filters to go through, and one index per filter, for
				+ *	instance:
				+ *
				+ *		A' handle is starpu_data_get_sub_data(A_handle, 1, 1);
				+ *		B' handle is starpu_data_get_sub_data(B_handle, 1, 2);
				+ *		C' handle is starpu_data_get_sub_data(C_handle, 2, 2, 1);
				+ *
				+ *	Two filters were applied recursively onto C, hence the two indexes.
				+ *
				+ *	"starpu_data_get_sub_data(C_handle, 1, 3)" would for example return a
				+ *	handle to the 4th column of the blocked matrix C.
				+ *
				+ *		              |---|---|---|---|
				+ *		              |   |   | B'|   | B
				+ *		              |---|---|---|---|
				+ *		                0   1   2   3
				+ *		     |----|   |---|---|---|---|
				+ *		     |    |   |   |   |   |   |
				+ *		     |    | 0 |   |   |   |   |
				+ *		     |----|   |---|---|---|---|
				+ *		     | A' |   |   |   | C'|   |
				+ *		     |    |   |   |   |   |   |
				+ *		     |----|   |---|---|---|---|
				+ *		       A              C
				+ *
				+ *	IMPORTANT: applying filters partitions a piece of data in a
				+ *	hierarchical manner, and memory consistency is enforced for each
				+ *	element independently. Tasks should therefore NOT access inner nodes
				+ *	(eg. one column of C or the whole C) but only the leaves of the tree
				+ *	(ie. the blocks here). Inner nodes can only be manipulated after the
				+ *	filters have been removed with starpu_data_unpartition, which
				+ *	enforces memory consistency.
				+ */
				+
				+	starpu_data_partition(B_handle, &vert);
				+	starpu_data_partition(A_handle, &horiz);
				+
				+	/* starpu_data_map_filters is a variable-arity function: the first
				+	 * argument is the handle of the data to partition, the second is
				+	 * the number of filters to apply recursively, and the filters follow
				+	 * in the order in which they are applied.
				+	 * This call is equivalent to starpu_data_partition(C_handle, &vert)
				+	 * followed by applying horiz on each sub-data (ie. each column of C).
				+	 */
				+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
				+}
			
 
				+
			
 
				+/*
				+ * Performance model attached to the codelet: history-based, stored under
				+ * the symbol "mult_perf_model", with mult_gemm_cost as its cost function.
				+ */
				+static struct starpu_perfmodel_t starpu_dgemm_model_common = {
				+	.symbol = "mult_perf_model",
				+	.type = STARPU_HISTORY_BASED,
				+	.cost_model = mult_gemm_cost
				+};
			
 
				+
			
 
				+/*
			
 
				+static struct starpu_perfmodel_t mult_perf_model = {
			
 
				+	.type = STARPU_HISTORY_BASED,
			
 
				+	.symbol = "mult_perf_model"
			
 
				+};
			
 
				+*/
			
 
				+
			
 
				+/*
				+ * Configuration handed to starpu_init() in main(): the "heft" scheduling
				+ * policy, calibration switched on, and 4 CPUs requested.
				+ */
				+struct starpu_conf conf = {
				+	.ncpus = 4,
				+	.calibrate = 1,
				+	.sched_policy_name = "heft"
				+};
			
 
				+
			
 
				+
			
 
				+/*
				+ * Codelet for the block product. It only runs on CPUs for now and
				+ * provides two interchangeable CPU implementations.
				+ */
				+static starpu_codelet cl = {
				+	/* the kernels work on 3 buffers that are managed by the DSM */
				+	.nbuffers = 3,
				+	/* CPU is the only kind of worker able to execute this codelet */
				+	.where = STARPU_CPU,
				+	/* cpu_func holds the marker value; the implementations themselves
				+	 * are listed in cpu_funcs */
				+	.cpu_func = STARPU_MULTIPLE_CPU_IMPLEMENTATIONS,
				+	.cpu_funcs = {cpu_mult,cpu_mult_2},
				+	/* used when the scheduling policy relies on performance models */
				+	.model = &starpu_dgemm_model_common
				+};
			
 
				+
			
 
				+/*
				+ * Submit one asynchronous task per block of C: block (taskx, tasky) of C
				+ * is computed from chunk tasky of A and chunk taskx of B.
				+ *
				+ * The result of every submission is printed on stdout. If a submission
				+ * fails, the error is reported on stderr and no further task is
				+ * submitted.
				+ */
				+static void launch_tasks(void)
				+{
				+	/* partition the work into slices */
				+	unsigned taskx, tasky;
				+
				+	for (taskx = 0; taskx < nslicesx; taskx++)
				+	{
				+		for (tasky = 0; tasky < nslicesy; tasky++)
				+		{
				+			/* C[taskx, tasky] = A[tasky] B[taskx] */
				+
				+			/* by default, starpu_task_create() returns an
				+			 * asynchronous task (ie. task->synchronous = 0) */
				+			struct starpu_task *task = starpu_task_create();
				+
				+			/* this task implements codelet "cl" */
				+			task->cl = &cl;
				+
				+			/*
				+			 *              |---|---|---|---|
				+			 *              |   | * |   |   | B
				+			 *              |---|---|---|---|
				+			 *                    X
				+			 *     |----|   |---|---|---|---|
				+			 *     |****| Y |   |***|   |   |
				+			 *     |****|   |   |***|   |   |
				+			 *     |----|   |---|---|---|---|
				+			 *     |    |   |   |   |   |   |
				+			 *     |    |   |   |   |   |   |
				+			 *     |----|   |---|---|---|---|
				+			 *       A              C
				+			 */
				+
				+			/* a single filter was applied to matrices A
				+			 * (respectively B), so we grab the handle to the chunk
				+			 * identified by "tasky" (respectively "taskx"). The "1"
				+			 * tells StarPU that a single index follows in the call to
				+			 * the variable-arity function starpu_data_get_sub_data */
				+			task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, tasky);
				+			task->buffers[0].mode = STARPU_R;
				+			task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, taskx);
				+			task->buffers[1].mode = STARPU_R;
				+
				+			/* 2 filters were applied on matrix C, so
				+			 * starpu_data_get_sub_data gets 2 indexes. Their order
				+			 * must match the order in which the filters were
				+			 * applied.
				+			 * NB: starpu_data_get_sub_data(C_handle, 1, k) would have returned
				+			 * a handle to the column number k of matrix C.
				+			 * NB2: starpu_data_get_sub_data(C_handle, 2, taskx, tasky) is
				+			 * equivalent to
				+			 * starpu_data_get_sub_data(starpu_data_get_sub_data(C_handle, 1, taskx), 1, tasky)*/
				+			task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
				+			task->buffers[2].mode = STARPU_W;
				+
				+			/* this is not a blocking call since task->synchronous = 0 */
				+			int ret = starpu_task_submit(task);
				+			printf("task is submmited or not %d\n",ret);
				+
				+			/* a non-zero result means the task was not accepted:
				+			 * report it and stop instead of carrying on silently */
				+			if (ret != 0)
				+			{
				+				fprintf(stderr, "starpu_task_submit failed for block (%u, %u): error %d\n",
				+					taskx, tasky, ret);
				+				return;
				+			}
				+		}
				+	}
				+}
			
 
				+
			
 
				+/*
				+ * Run the blocked matrix product example: start StarPU, build and
				+ * partition the matrices, submit one task per block of C, wait for all of
				+ * them, then release every resource.
				+ *
				+ * Returns 0 on success and 1 if the runtime could not be started.
				+ */
				+int main(void)
				+{
				+	/* start the runtime; nothing below is usable if this fails */
				+	int ret = starpu_init(&conf);
				+	if (ret != 0)
				+	{
				+		fprintf(stderr, "starpu_init failed: error %d\n", ret);
				+		return 1;
				+	}
				+
				+	/* allocate and fill matrices A, B and C */
				+	init_problem_data();
				+
				+	/* register the matrices and partition them into blocks that can
				+	 * be manipulated by the codelets */
				+	partition_mult_data();
				+
				+	/* submit all tasks in an asynchronous fashion */
				+	launch_tasks();
				+
				+	/* wait for termination */
				+	starpu_task_wait_for_all();
				+
				+	/* remove the filters from the three matrices; after this, subsets
				+	 * can no longer be reached with starpu_data_get_sub_data until the
				+	 * data is partitioned again.
				+	 * The second argument is the memory node where the different subsets
				+	 * should be reassembled, 0 = main memory (RAM) */
				+	starpu_data_unpartition(A_handle, 0);
				+	starpu_data_unpartition(B_handle, 0);
				+	starpu_data_unpartition(C_handle, 0);
				+
				+	/* stop monitoring the matrices: after this, it is not possible to
				+	 * pass them (or any subset of them) as a codelet input/output. This
				+	 * also implements a barrier so that each piece of data is put back
				+	 * into main memory in case it was only available on a GPU for
				+	 * instance. */
				+	starpu_data_unregister(A_handle);
				+	starpu_data_unregister(B_handle);
				+	starpu_data_unregister(C_handle);
				+
				+	starpu_shutdown();
				+
				+	/* the buffers were allocated by init_problem_data() */
				+	free(A);
				+	free(B);
				+	free(C);
				+
				+	return 0;
				+}
			
--- a/examples/cholesky/cholesky_models.c
+++ b/examples/cholesky/cholesky_models.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -125,27 +126,27 @@ static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
 
				 }
			
 
				 
			
 
				 struct starpu_perfmodel_t chol_model_11 = {
			
 
				-	.per_arch = { 
			
 
				-		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_11_cost },
			
 
				-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_11_cost }
			
 
				+	.per_arch = {
			
 
				+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_11_cost },
			
 
				+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_11_cost }
			
 
				 	},
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 	.symbol = "chol_model_11"
			
 
				 };
			
 
				 
			
 
				 struct starpu_perfmodel_t chol_model_21 = {
			
 
				-	.per_arch = { 
			
 
				-		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_21_cost },
			
 
				-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_21_cost }
			
 
				+	.per_arch = {
			
 
				+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_21_cost },
			
 
				+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_21_cost }
			
 
				 	},
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 	.symbol = "chol_model_21"
			
 
				 };
			
 
				 
			
 
				 struct starpu_perfmodel_t chol_model_22 = {
			
 
				-	.per_arch = { 
			
 
				-		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_22_cost },
			
 
				-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_22_cost }
			
 
				+	.per_arch = {
			
 
				+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_22_cost },
			
 
				+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_22_cost }
			
 
				 	},
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 	.symbol = "chol_model_22"
			
--- a/examples/heat/lu_kernels_model.c
+++ b/examples/heat/lu_kernels_model.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -215,9 +216,9 @@ double task_22_cost_cpu(starpu_buffer_descr *descr)
 
				 
			
 
				 struct starpu_perfmodel_t model_11 = {
			
 
				 	.cost_model = task_11_cost,
			
 
				-	.per_arch = { 
			
 
				-		[STARPU_CPU_DEFAULT] = { .cost_model = task_11_cost_cpu },
			
 
				-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_11_cost_cuda }
			
 
				+	.per_arch = {
			
 
				+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_11_cost_cpu },
			
 
				+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_11_cost_cuda }
			
 
				 	},
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 #ifdef STARPU_ATLAS
			
@@ -231,9 +232,9 @@ struct starpu_perfmodel_t model_11 = {
 
				 
			
 
				 struct starpu_perfmodel_t model_12 = {
			
 
				 	.cost_model = task_12_cost,
			
 
				-	.per_arch = { 
			
 
				-		[STARPU_CPU_DEFAULT] = { .cost_model = task_12_cost_cpu },
			
 
				-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_12_cost_cuda }
			
 
				+	.per_arch = {
			
 
				+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_12_cost_cpu },
			
 
				+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_12_cost_cuda }
			
 
				 	},
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 #ifdef STARPU_ATLAS
			
@@ -247,9 +248,9 @@ struct starpu_perfmodel_t model_12 = {
 
				 
			
 
				 struct starpu_perfmodel_t model_21 = {
			
 
				 	.cost_model = task_21_cost,
			
 
				-	.per_arch = { 
			
 
				-		[STARPU_CPU_DEFAULT] = { .cost_model = task_21_cost_cpu },
			
 
				-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_21_cost_cuda }
			
 
				+	.per_arch = {
			
 
				+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_21_cost_cpu },
			
 
				+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_21_cost_cuda }
			
 
				 	},
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 #ifdef STARPU_ATLAS
			
@@ -263,9 +264,9 @@ struct starpu_perfmodel_t model_21 = {
 
				 
			
 
				 struct starpu_perfmodel_t model_22 = {
			
 
				 	.cost_model = task_22_cost,
			
 
				-	.per_arch = { 
			
 
				-		[STARPU_CPU_DEFAULT] = { .cost_model = task_22_cost_cpu },
			
 
				-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_22_cost_cuda }
			
 
				+	.per_arch = {
			
 
				+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_22_cost_cpu },
			
 
				+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_22_cost_cuda }
			
 
				 	},
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 #ifdef STARPU_ATLAS
			
--- a/include/starpu_config.h.in
+++ b/include/starpu_config.h.in
@@ -46,6 +46,7 @@
 
				 #undef STARPU_MAXCUDADEVS
			
 
				 #undef STARPU_MAXOPENCLDEVS
			
 
				 #undef STARPU_NMAXWORKERS
			
 
				+#undef STARPU_MAXIMPLEMENTATIONS
			
 
				 
			
 
				 #undef STARPU_HAVE_LIBNUMA
			
 
				 
			
--- a/include/starpu_perfmodel.h
+++ b/include/starpu_perfmodel.h
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -104,7 +105,7 @@ struct starpu_perfmodel_t {
 
				 	double (*cost_model)(struct starpu_buffer_descr_t *);
			
 
				 
			
 
				 	/* per-architecture model */
			
 
				-	struct starpu_per_arch_perfmodel_t per_arch[STARPU_NARCH_VARIATIONS];
			
 
				+	struct starpu_per_arch_perfmodel_t per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS];
			
 
				 
			
 
				 	/* Name of the performance model, this is used as a file name when saving history-based performance models */
			
 
				 	const char *symbol;
			
@@ -126,9 +127,8 @@ enum starpu_perf_archtype starpu_worker_get_perf_archtype(int workerid);
 
				  * performance model files */
			
 
				 int starpu_load_history_debug(const char *symbol, struct starpu_perfmodel_t *model);
			
 
				 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel_t *model,
			
 
				-		enum starpu_perf_archtype arch, char *path, size_t maxlen);
			
 
				-void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch,
			
 
				-		char *archname, size_t maxlen);
			
 
				+		enum starpu_perf_archtype arch, char *path, size_t maxlen, unsigned nimpl);
			
 
				+void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch,	char *archname, size_t maxlen, unsigned nimpl);
			
 
				 int starpu_list_models(void);
			
 
				 
			
 
				 void starpu_force_bus_sampling(void);
			
--- a/include/starpu_scheduler.h
+++ b/include/starpu_scheduler.h
@@ -1,6 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -113,7 +114,7 @@ void starpu_worker_set_sched_condition(int workerid, pthread_cond_t *sched_cond,
 
				 #endif
			
 
				 
			
 
				 /* Check if the worker specified by workerid can execute the codelet. */
			
 
				-int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task);
			
 
				+int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
			
 
				 
			
 
				 /* The scheduling policy may put tasks directly into a worker's local queue so
			
 
				  * that it is not always necessary to create its own queue when the local queue
			
@@ -151,7 +152,7 @@ void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *to
 
				 /* Get the description of a combined worker */
			
 
				 int starpu_combined_worker_get_description(int workerid, int *worker_size, int **combined_workerid);
			
 
				 /* Variant of starpu_worker_may_execute_task compatible with combined workers */
			
 
				-int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task);
			
 
				+int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
			
 
				 
			
 
				 /*
			
 
				  *	Data prefetching
			
@@ -169,7 +170,7 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, uint32_t node);
 
				 /* Return the current date */
			
 
				 double starpu_timing_now(void);
			
 
				 /* Returns expected task duration in µs */
			
 
				-double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch);
			
 
				+double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
			
 
				 /* Returns an estimated speedup factor relative to CPU speed */
			
 
				 double starpu_worker_get_relative_speedup(enum starpu_perf_archtype perf_archtype);
			
 
				 /* Returns expected data transfer time in µs */
			
@@ -177,6 +178,6 @@ double starpu_task_expected_data_transfer_time(uint32_t memory_node, struct star
 
				 /* Predict the transfer time (in µs) to move a handle to a memory node */
			
 
				 double starpu_data_expected_transfer_time(starpu_data_handle handle, unsigned memory_node, starpu_access_mode mode);
			
 
				 /* Returns expected power consumption in J */
			
 
				-double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch);
			
 
				+double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
			
 
				 
			
 
				 #endif /* __STARPU_SCHEDULER_H__ */
			
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -56,6 +57,18 @@ extern "C" {
 
				 
			
 
				 typedef uint64_t starpu_tag_t;
			
 
				 
			
 
				+
			
 
				+/* Types of the per-architecture implementations stored in a codelet */
				+typedef void (*starpu_cpu_func_t)(void **, void*);    /* CPU core */
				+typedef void (*starpu_cuda_func_t)(void **, void*);   /* NVIDIA CUDA device */
				+typedef void (*starpu_opencl_func_t)(void **, void*); /* OpenCL device */
				+typedef uint8_t starpu_gordon_func_t; /* Cell SPU */
				+
				+/* Marker values for the single-implementation fields of a codelet
				+ * (cpu_func, cuda_func, ...), used together with the matching *_funcs
				+ * array. Each expansion is fully parenthesized so that the cast cannot
				+ * bind to neighbouring operators at the point of use. */
				+#define STARPU_MULTIPLE_CPU_IMPLEMENTATIONS    ((starpu_cpu_func_t) -1)
				+#define STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS   ((starpu_cuda_func_t) -1)
				+#define STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS ((starpu_opencl_func_t) -1)
				+#define STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS 255
			
 
				+
			
 
				+
			
 
				 /*
			
 
				  * A codelet describes the various function 
			
 
				  * that may be called from a worker
			
@@ -72,6 +85,11 @@ typedef struct starpu_codelet_t {
 
				 	void (*opencl_func)(void **, void *);
			
 
				 	uint8_t gordon_func;
			
 
				 
			
 
				+	starpu_cpu_func_t cpu_funcs[STARPU_MAXIMPLEMENTATIONS];
			
 
				+	starpu_cuda_func_t cuda_funcs[STARPU_MAXIMPLEMENTATIONS];
			
 
				+	starpu_opencl_func_t opencl_funcs[STARPU_MAXIMPLEMENTATIONS];
			
 
				+	starpu_gordon_func_t gordon_funcs[STARPU_MAXIMPLEMENTATIONS];
			
 
				+
			
 
				 	/* how many buffers do the codelet takes as argument ? */
			
 
				 	unsigned nbuffers;
			
 
				 
			
--- a/include/starpu_task_bundle.h
+++ b/include/starpu_task_bundle.h
@@ -1,6 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -71,10 +72,10 @@ int starpu_task_bundle_remove(struct starpu_task_bundle *bundle, struct starpu_t
 
				 void starpu_task_bundle_close(struct starpu_task_bundle *bundle);
			
 
				 
			
 
				 /* Return the expected duration of the entire task bundle in µs. */
			
 
				-double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle, enum starpu_perf_archtype arch);
			
 
				+double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle, enum starpu_perf_archtype arch, unsigned nimpl);
			
 
				 /* Return the time (in µs) expected to transfer all data used within the bundle */
			
 
				 double starpu_task_bundle_expected_data_transfer_time(struct starpu_task_bundle *bundle, unsigned memory_node);
			
 
				 /* Return the expected power consumption of the entire task bundle in J. */
			
 
				-double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch);
			
 
				+double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch, unsigned nimpl);
			
 
				 
			
 
				 #endif // __STARPU_TASK_BUNDLE_H__
			
--- a/src/core/jobs.c
+++ b/src/core/jobs.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -62,6 +63,7 @@ starpu_job_t __attribute__((malloc)) _starpu_job_create(struct starpu_task *task
 
				 
			
 
				 	job = starpu_job_new();
			
 
				 
			
 
				+	job->nimpl =0; /* default: first implementation (index 0) */
			
 
				 	job->task = task;
			
 
				 
			
 
				 	job->footprint_is_computed = 0;
			
--- a/src/core/jobs.h
+++ b/src/core/jobs.h
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -55,6 +56,10 @@ typedef void (*callback)(void *);
 
				 
			
 
				 /* A job is the internal representation of a task. */
			
 
				 LIST_TYPE(starpu_job,
			
 
				+
			
 
				+	/* The implementation associated to the job */
			
 
				+	unsigned nimpl;
			
 
				+
			
 
				 	/* The task associated to that job */
			
 
				 	struct starpu_task *task;
			
 
				 
			
--- a/src/core/perfmodel/perfmodel.c
+++ b/src/core/perfmodel/perfmodel.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -67,12 +68,12 @@ enum starpu_perf_archtype starpu_worker_get_perf_archtype(int workerid)
 
				  * PER ARCH model
			
 
				  */
			
 
				 
			
 
				-static double per_arch_task_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_task *task)
			
 
				+static double per_arch_task_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_task *task, unsigned nimpl)
			
 
				 {
			
 
				 	double exp = -1.0;
			
 
				 	double (*per_arch_cost_model)(struct starpu_buffer_descr_t *);
			
 
				 	
			
 
				-	per_arch_cost_model = model->per_arch[arch].cost_model;
			
 
				+	per_arch_cost_model = model->per_arch[arch][nimpl].cost_model;
			
 
				 
			
 
				 	if (per_arch_cost_model)
			
 
				 		exp = per_arch_cost_model(task->buffers);
			
@@ -153,25 +154,27 @@ void _starpu_load_perfmodel(struct starpu_perfmodel_t *model)
 
				 	model->is_loaded = 1;
			
 
				 }
			
 
				 
			
 
				-static double starpu_model_expected_perf(struct starpu_task *task, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch)
			
 
				+static double starpu_model_expected_perf(struct starpu_task *task, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch,  unsigned nimpl)
			
 
				 {
			
 
				 	if (model) {
			
 
				 		starpu_job_t j = _starpu_get_job_associated_to_task(task);
			
 
				 		switch (model->type) {
			
 
				 			case STARPU_PER_ARCH:
			
 
				-				return per_arch_task_expected_perf(model, arch, task);
			
 
				 
			
 
				+				return per_arch_task_expected_perf(model, arch, task, nimpl);
			
 
				 			case STARPU_COMMON:
			
 
				 				return common_task_expected_perf(model, arch, task);
			
 
				 
			
 
				 			case STARPU_HISTORY_BASED:
			
 
				-				return _starpu_history_based_job_expected_perf(model, arch, j);
			
 
				 
			
 
				+				return _starpu_history_based_job_expected_perf(model, arch, j, nimpl);
			
 
				 			case STARPU_REGRESSION_BASED:
			
 
				-				return _starpu_regression_based_job_expected_perf(model, arch, j);
			
 
				+
			
 
				+				return _starpu_regression_based_job_expected_perf(model, arch, j, nimpl);
			
 
				 
			
 
				 			case STARPU_NL_REGRESSION_BASED:
			
 
				-				return _starpu_non_linear_regression_based_job_expected_perf(model, arch, j);
			
 
				+
			
 
				+				return _starpu_non_linear_regression_based_job_expected_perf(model, arch, j,nimpl);
			
 
				 
			
 
				 			default:
			
 
				 				STARPU_ABORT();
			
@@ -182,14 +185,15 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 
				 	return 0.0;
			
 
				 }
			
 
				 
			
 
				-double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch)
			
 
				+double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
			
 
				 {
			
 
				-	return starpu_model_expected_perf(task, task->cl->model, arch);
			
 
				+
			
 
				+	return starpu_model_expected_perf(task, task->cl->model, arch, nimpl);
			
 
				 }
			
 
				 
			
 
				-double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch)
			
 
				+double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
			
 
				 {
			
 
				-	return starpu_model_expected_perf(task, task->cl->power_model, arch);
			
 
				+	return starpu_model_expected_perf(task, task->cl->power_model, arch, nimpl);
			
 
				 }
			
 
				 
			
 
				 /* Predict the transfer time (in µs) to move a handle to a memory node */
			
--- a/src/core/perfmodel/perfmodel.h
+++ b/src/core/perfmodel/perfmodel.h
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -91,7 +92,7 @@ void _starpu_get_perf_model_dir_codelets(char *path, size_t maxlen);
 
				 void _starpu_get_perf_model_dir_bus(char *path, size_t maxlen);
			
 
				 void _starpu_get_perf_model_dir_debug(char *path, size_t maxlen);
			
 
				 
			
 
				-double _starpu_history_based_job_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j);
			
 
				+double _starpu_history_based_job_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j, unsigned nimpl);
			
 
				 void _starpu_register_model(struct starpu_perfmodel_t *model);
			
 
				 void _starpu_load_history_based_model(struct starpu_perfmodel_t *model, unsigned scan_history);
			
 
				 void _starpu_load_perfmodel(struct starpu_perfmodel_t *model);
			
@@ -99,11 +100,11 @@ void _starpu_initialize_registered_performance_models(void);
 
				 void _starpu_deinitialize_registered_performance_models(void);
			
 
				 
			
 
				 double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel_t *model,
			
 
				-					enum starpu_perf_archtype arch, struct starpu_job_s *j);
			
 
				+					enum starpu_perf_archtype arch, struct starpu_job_s *j, unsigned nimpl);
			
 
				 double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel_t *model,
			
 
				-					enum starpu_perf_archtype arch, struct starpu_job_s *j);
			
 
				+					enum starpu_perf_archtype arch, struct starpu_job_s *j, unsigned nimpl);
			
 
				 void _starpu_update_perfmodel_history(struct starpu_job_s *j, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch,
			
 
				-				unsigned cpuid, double measured);
			
 
				+				unsigned cpuid, double measured, unsigned nimpl);
			
 
				 
			
 
				 void _starpu_create_sampling_directory_if_needed(void);
			
 
				 
			
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -57,11 +58,11 @@ static void insert_history_entry(struct starpu_history_entry_t *entry, struct st
 
				 }
			
 
				 
			
 
				 
			
 
				-static void dump_reg_model(FILE *f, struct starpu_perfmodel_t *model, unsigned arch)
			
 
				+static void dump_reg_model(FILE *f, struct starpu_perfmodel_t *model, unsigned arch, unsigned nimpl)
			
 
				 {
			
 
				 	struct starpu_per_arch_perfmodel_t *per_arch_model;
			
 
				-	per_arch_model = &model->per_arch[arch];
			
 
				 
			
 
				+	per_arch_model = &model->per_arch[arch][nimpl];
			
 
				 	struct starpu_regression_model_t *reg_model;
			
 
				 	reg_model = &per_arch_model->regression;
			
 
				 
			
@@ -206,15 +207,20 @@ static void parse_per_arch_model_file(FILE *f, struct starpu_per_arch_perfmodel_
 
				 static void parse_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned scan_history)
			
 
				 {
			
 
				 	unsigned arch;
			
 
				-	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
			
 
				-		parse_per_arch_model_file(f, &model->per_arch[arch], scan_history);
			
 
				+	unsigned nimpl;
			
 
				+	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++) {
			
 
				+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++) {
			
 
				+			parse_per_arch_model_file(f, &model->per_arch[arch][nimpl], scan_history);
			
 
				+		}
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				-static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned arch)
			
 
				+
			
 
				+static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned arch, unsigned nimpl)
			
 
				 {
			
 
				 	struct starpu_per_arch_perfmodel_t *per_arch_model;
			
 
				-	per_arch_model = &model->per_arch[arch];
			
 
				 
			
 
				+	per_arch_model = &model->per_arch[arch][nimpl];
			
 
				 	/* count the number of elements in the lists */
			
 
				 	struct starpu_history_list_t *ptr = NULL;
			
 
				 	unsigned nentries = 0;
			
@@ -232,7 +238,7 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel_t *model,
 
				 	/* header */
			
 
				 	fprintf(f, "# number of entries\n%u\n", nentries);
			
 
				 
			
 
				-	dump_reg_model(f, model, arch);
			
 
				+	dump_reg_model(f, model, arch, nimpl);
			
 
				 
			
 
				 	/* Dump the history into the model file in case it is necessary */
			
 
				 	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
			
@@ -251,13 +257,17 @@ static void dump_model_file(FILE *f, struct starpu_perfmodel_t *model)
 
				 	fprintf(f, "#################\n");
			
 
				 
			
 
				 	unsigned arch;
			
 
				+	unsigned nimpl;
			
 
				 	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
			
 
				 	{
			
 
				-		char archname[32];
			
 
				-		starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) arch, archname, 32);
			
 
				-		fprintf(f, "# Model for %s\n", archname);
			
 
				-		dump_per_arch_model_file(f, model, arch);
			
 
				-		fprintf(f, "\n##################\n");
			
 
				+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
			
 
				+		{
			
 
				+			char archname[32];
			
 
				+			starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) arch, archname, 32, nimpl);
			
 
				+			fprintf(f, "# Model for %s\n", archname);
			
 
				+			dump_per_arch_model_file(f, model, arch, nimpl);
			
 
				+			fprintf(f, "\n##################\n");
			
 
				+		}
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -270,8 +280,14 @@ static void initialize_per_arch_model(struct starpu_per_arch_perfmodel_t *per_ar
 
				 static void initialize_model(struct starpu_perfmodel_t *model)
			
 
				 {
			
 
				 	unsigned arch;
			
 
				+	unsigned nimpl;
			
 
				 	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
			
 
				-		initialize_per_arch_model(&model->per_arch[arch]);
			
 
				+	{
			
 
				+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
			
 
				+		{
			
 
				+			initialize_per_arch_model(&model->per_arch[arch][nimpl]);
			
 
				+		}
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 static void get_model_debug_path(struct starpu_perfmodel_t *model, const char *arch, char *path, size_t maxlen)
			
@@ -312,12 +328,15 @@ void _starpu_register_model(struct starpu_perfmodel_t *model)
 
				 	_starpu_create_sampling_directory_if_needed();
			
 
				 
			
 
				 	unsigned arch;
			
 
				-	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
			
 
				-	{
			
 
				-		char debugpath[256];
			
 
				-		starpu_perfmodel_debugfilepath(model, arch, debugpath, 256);
			
 
				-		model->per_arch[arch].debug_file = fopen(debugpath, "a+");
			
 
				-		STARPU_ASSERT(model->per_arch[arch].debug_file);
			
 
				+	unsigned nimpl;
			
 
				+
			
 
				+	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++) {
			
 
				+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++) {
			
 
				+			char debugpath[256];
			
 
				+			starpu_perfmodel_debugfilepath(model, arch, debugpath, 256, nimpl);
			
 
				+			model->per_arch[arch][nimpl].debug_file = fopen(debugpath, "a+");
			
 
				+			STARPU_ASSERT(model->per_arch[arch][nimpl].debug_file);
			
 
				+		}
			
 
				 	}
			
 
				 #endif
			
 
				 
			
@@ -554,37 +573,37 @@ int starpu_load_history_debug(const char *symbol, struct starpu_perfmodel_t *mod
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch, char *archname, size_t maxlen)
			
 
				+void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch, char *archname, size_t maxlen,unsigned nimpl)
			
 
				 {
			
 
				 	if (arch < STARPU_CUDA_DEFAULT)
			
 
				 	{
			
 
				 		if (arch == STARPU_CPU_DEFAULT)
			
 
				 		{
			
 
				 			/* NB: We could just use cpu_1 as well ... */
			
 
				-			snprintf(archname, maxlen, "cpu");
			
 
				+			snprintf(archname, maxlen, "cpu_impl_%u",nimpl);
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
 
				 			/* For combined CPU workers */
			
 
				 			int cpu_count = arch - STARPU_CPU_DEFAULT + 1;
			
 
				-			snprintf(archname, maxlen, "cpu_%d", cpu_count);
			
 
				+			snprintf(archname, maxlen, "cpu_%d_impl_%u", cpu_count,nimpl);
			
 
				 		}
			
 
				 	}
			
 
				 	else if ((STARPU_CUDA_DEFAULT <= arch)
			
 
				 		&& (arch < STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS))
			
 
				 	{
			
 
				 		int devid = arch - STARPU_CUDA_DEFAULT;
			
 
				-		snprintf(archname, maxlen, "cuda_%d", devid);
			
 
				+		snprintf(archname, maxlen, "cuda_%d_impl_%u", devid,nimpl);
			
 
				 	}
			
 
				 	else if ((STARPU_OPENCL_DEFAULT <= arch)
			
 
				 		&& (arch < STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS))
			
 
				 	{
			
 
				 		int devid = arch - STARPU_OPENCL_DEFAULT;
			
 
				-		snprintf(archname, maxlen, "opencl_%d", devid);
			
 
				+		snprintf(archname, maxlen, "opencl_%d_impl_%u", devid,nimpl);
			
 
				 	}
			
 
				 	else if (arch == STARPU_GORDON_DEFAULT)
			
 
				 	{
			
 
				-		snprintf(archname, maxlen, "gordon");
			
 
				+		snprintf(archname, maxlen, "gordon_impl_%u",nimpl);
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
@@ -593,23 +612,23 @@ void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch, char *archna
 
				 }
			
 
				 
			
 
				 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel_t *model,
			
 
				-		enum starpu_perf_archtype arch, char *path, size_t maxlen)
			
 
				+		enum starpu_perf_archtype arch, char *path, size_t maxlen, unsigned nimpl)
			
 
				 {
			
 
				 	char archname[32];
			
 
				-	starpu_perfmodel_get_arch_name(arch, archname, 32);
			
 
				+	starpu_perfmodel_get_arch_name(arch, archname, 32, nimpl);
			
 
				 
			
 
				 	STARPU_ASSERT(path);
			
 
				 
			
 
				 	get_model_debug_path(model, archname, path, maxlen);
			
 
				 }
			
 
				 
			
 
				-double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j)
			
 
				+double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j, unsigned nimpl)
			
 
				 {
			
 
				 	double exp = -1.0;
			
 
				 	size_t size = _starpu_job_get_data_size(j);
			
 
				 	struct starpu_regression_model_t *regmodel;
			
 
				 
			
 
				-	regmodel = &model->per_arch[arch].regression;
			
 
				+	regmodel = &model->per_arch[arch][nimpl].regression;
			
 
				 
			
 
				 	if (regmodel->valid)
			
 
				                 exp = regmodel->alpha*pow((double)size, regmodel->beta);
			
@@ -617,13 +636,13 @@ double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel_t *mod
 
				 	return exp;
			
 
				 }
			
 
				 
			
 
				-double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j)
			
 
				+double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j,unsigned nimpl)
			
 
				 {
			
 
				 	double exp = -1.0;
			
 
				 	size_t size = _starpu_job_get_data_size(j);
			
 
				 	struct starpu_regression_model_t *regmodel;
			
 
				 
			
 
				-	regmodel = &model->per_arch[arch].regression;
			
 
				+	regmodel = &model->per_arch[arch][nimpl].regression;
			
 
				 
			
 
				 	if (regmodel->nl_valid)
			
 
				 		exp = regmodel->a*pow((double)size, regmodel->b) + regmodel->c;
			
@@ -631,7 +650,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 
				 	return exp;
			
 
				 }
			
 
				 
			
 
				-double _starpu_history_based_job_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j)
			
 
				+double _starpu_history_based_job_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j,unsigned nimpl)
			
 
				 {
			
 
				 	double exp;
			
 
				 	struct starpu_per_arch_perfmodel_t *per_arch_model;
			
@@ -640,7 +659,7 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel_t *model,
 
				 
			
 
				 	uint32_t key = _starpu_compute_buffers_footprint(j);
			
 
				 
			
 
				-	per_arch_model = &model->per_arch[arch];
			
 
				+	per_arch_model = &model->per_arch[arch][nimpl];
			
 
				 
			
 
				 	history = per_arch_model->history;
			
 
				 	if (!history)
			
@@ -661,13 +680,13 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel_t *model,
 
				 	return exp;
			
 
				 }
			
 
				 
			
 
				-void _starpu_update_perfmodel_history(starpu_job_t j, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured)
			
 
				+void _starpu_update_perfmodel_history(starpu_job_t j, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured, unsigned nimpl)
			
 
				 {
			
 
				 	if (model)
			
 
				 	{
			
 
				 		PTHREAD_RWLOCK_WRLOCK(&model->model_rwlock);
			
 
				 
			
 
				-		struct starpu_per_arch_perfmodel_t *per_arch_model = &model->per_arch[arch];
			
 
				+		struct starpu_per_arch_perfmodel_t *per_arch_model = &model->per_arch[arch][nimpl];
			
 
				 
			
 
				 		if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
			
 
				 		{
			
--- a/src/core/task.c
+++ b/src/core/task.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -256,7 +257,7 @@ int starpu_task_submit(struct starpu_task *task)
 
				 		/* In case we require that a task should be explicitely
			
 
				 		 * executed on a specific worker, we make sure that the worker
			
 
				 		 * is able to execute this task.  */
			
 
				-		if (task->execute_on_a_specific_worker && !starpu_combined_worker_may_execute_task(task->workerid, task)) {
			
 
				+		if (task->execute_on_a_specific_worker && !starpu_combined_worker_may_execute_task(task->workerid, task, 0)) {
			
 
				                         _STARPU_LOG_OUT_TAG("ENODEV");
			
 
				 			return -ENODEV;
			
 
				                 }
			
--- a/src/core/task_bundle.c
+++ b/src/core/task_bundle.c
@@ -1,6 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -177,7 +178,7 @@ void starpu_task_bundle_close(struct starpu_task_bundle *bundle)
 
				 }
			
 
				 
			
 
				 /* Return the expected duration of the entire task bundle in µs */
			
 
				-double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch)
			
 
				+double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch, unsigned nimpl)
			
 
				 {
			
 
				 	double expected_length = 0.0;
			
 
				 
			
@@ -188,7 +189,7 @@ double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle,  en
 
				 	entry = bundle->list;
			
 
				 
			
 
				 	while (entry) {
			
 
				-		double task_length = starpu_task_expected_length(entry->task, arch);
			
 
				+		double task_length = starpu_task_expected_length(entry->task, arch, nimpl);
			
 
				 
			
 
				 		/* In case the task is not calibrated, we consider the task
			
 
				 		 * ends immediately. */
			
@@ -204,7 +205,7 @@ double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle,  en
 
				 }
			
 
				 
			
 
				 /* Return the expected power consumption of the entire task bundle in J */
			
 
				-double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch)
			
 
				+double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch, unsigned nimpl)
			
 
				 {
			
 
				 	double expected_power = 0.0;
			
 
				 
			
@@ -215,7 +216,7 @@ double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enu
 
				 	entry = bundle->list;
			
 
				 
			
 
				 	while (entry) {
			
 
				-		double task_power = starpu_task_expected_power(entry->task, arch);
			
 
				+		double task_power = starpu_task_expected_power(entry->task, arch, nimpl);
			
 
				 
			
 
				 		/* In case the task is not calibrated, we consider the task
			
 
				 		 * ends immediately. */
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -3,6 +3,7 @@
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2010, 2011  Institut National de Recherche en Informatique et Automatique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -68,17 +69,39 @@ uint32_t _starpu_may_submit_opencl_task(void)
 
				 	return (STARPU_OPENCL & config.worker_mask);
			
 
				 }
			
 
				 
			
 
				-int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task)
			
 
				+static int _starpu_may_use_nth_implementation(enum starpu_archtype arch, struct starpu_codelet_t *cl, unsigned nimpl)
			
 
				+{
			
 
				+	switch(arch) {
			
 
				+	case STARPU_CPU_WORKER:
			
 
				+		return !(cl->cpu_func == STARPU_MULTIPLE_CPU_IMPLEMENTATIONS &&
			
 
				+			cl->cpu_funcs[nimpl] == NULL);
			
 
				+	case STARPU_CUDA_WORKER:
			
 
				+		return !(cl->cuda_func == STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS &&
			
 
				+			cl->cuda_funcs[nimpl] == NULL);
			
 
				+	case STARPU_OPENCL_WORKER:
			
 
				+		return !(cl->opencl_func == STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS &&
			
 
				+			cl->opencl_funcs[nimpl] == NULL);
			
 
				+	case STARPU_GORDON_WORKER:
			
 
				+		return !(cl->gordon_func == STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS &&
			
 
				+			cl->gordon_funcs[nimpl] == NULL);
			
 
				+	default:
			
 
				+		return 0;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+
			
 
				+int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
			
 
				 {
			
 
				 	/* TODO: check that the task operand sizes will fit on that device */
			
 
				 	/* TODO: call application-provided function for various cases like
			
 
				 	 * double support, shared memory size limit, etc. */
			
 
				-	return !!(task->cl->where & config.workers[workerid].worker_mask);
			
 
				+	return !!((task->cl->where & config.workers[workerid].worker_mask) &&
			
 
				+		_starpu_may_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl));
			
 
				 }
			
 
				 
			
 
				 
			
 
				 
			
 
				-int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task)
			
 
				+int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
			
 
				 {
			
 
				 	/* TODO: check that the task operand sizes will fit on that device */
			
 
				 	/* TODO: call application-provided function for various cases like
			
@@ -90,7 +113,8 @@ int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_tas
 
				 	/* Is this a parallel worker ? */
			
 
				 	if (workerid < nworkers)
			
 
				 	{
			
 
				-		return !!(task->cl->where & config.workers[workerid].worker_mask);
			
 
				+		return !!((task->cl->where & config.workers[workerid].worker_mask) &&
			
 
				+				_starpu_may_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl));
			
 
				 	}
			
 
				 	else {
			
 
				 		if ((cl->type == STARPU_SPMD) || (cl->type == STARPU_FORKJOIN))
			
@@ -99,7 +123,8 @@ int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_tas
 
				 
			
 
				 			/* Is the worker larger than requested ? */
			
 
				 			int worker_size = (int)config.combined_workers[workerid - nworkers].worker_size;
			
 
				-			return !!(worker_size <= task->cl->max_parallelism);
			
 
				+			return !!((worker_size <= task->cl->max_parallelism) &&
			
 
				+				_starpu_may_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl));
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -3,6 +3,7 @@
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -55,9 +56,19 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args,
 
				 	 * execute the kernel at all. */
			
 
				 	if ((rank == 0) || (cl->type != STARPU_FORKJOIN))
			
 
				 	{
			
 
				-		cl_func func = cl->cpu_func;
			
 
				-		STARPU_ASSERT(func);
			
 
				-		func(task->interfaces, task->cl_arg);
			
 
				+		if (cl->cpu_func != STARPU_MULTIPLE_CPU_IMPLEMENTATIONS) {
			
 
				+			cl_func func = cl->cpu_func;
			
 
				+			STARPU_ASSERT(func);
			
 
				+			func(task->interfaces, task->cl_arg);
			
 
				+		}
			
 
				+		else {
			
 
				+			if (cl->cpu_funcs[j->nimpl] != NULL) {
			
 
				+				/* _STARPU_DEBUG("CPU driver : running kernel (%d)\n", j->nimpl); */
			
 
				+				cl_func func = cl->cpu_funcs[j->nimpl];
			
 
				+				STARPU_ASSERT(func);
			
 
				+				func(task->interfaces, task->cl_arg);
			
 
				+			}
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	_starpu_driver_end_job(cpu_args, j, &codelet_end, rank);
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -3,6 +3,7 @@
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -198,9 +199,19 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 
				 	cures = cudaSetDevice(args->devid);
			
 
				 #endif
			
 
				 
			
 
				-	cl_func func = cl->cuda_func;
			
 
				-	STARPU_ASSERT(func);
			
 
				-	func(task->interfaces, task->cl_arg);
			
 
				+	if (cl->cuda_func != STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS) {
			
 
				+		cl_func func = cl->cuda_func;
			
 
				+		STARPU_ASSERT(func);
			
 
				+		func(task->interfaces, task->cl_arg);
			
 
				+	}
			
 
				+	else {
			
 
				+		if (cl->cuda_funcs[j->nimpl] != NULL) {
			
 
				+			/* _STARPU_DEBUG("Cuda driver : running kernel (%d)\n", j->nimpl); */
			
 
				+			cl_func func = cl->cuda_funcs[j->nimpl];
			
 
				+			STARPU_ASSERT(func);
			
 
				+			func(task->interfaces, task->cl_arg);
			
 
				+		}
			
 
				+	}
			
 
				 
			
 
				 	_starpu_driver_end_job(args, j, &codelet_end, 0);
			
 
				 
			
--- a/src/drivers/driver_common/driver_common.c
+++ b/src/drivers/driver_common/driver_common.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -120,14 +121,17 @@ void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *
 
				 		}
			
 
				 
			
 
				 		if (calibrate_model)
			
 
				-			_starpu_update_perfmodel_history(j, j->task->cl->model,  perf_arch, worker_args->devid, measured);
			
 
				+
			
 
				+			_starpu_update_perfmodel_history(j, j->task->cl->model,  perf_arch, worker_args->devid, measured,j->nimpl);
			
 
				+
			
 
				+
			
 
				 	}
			
 
				 
			
 
				 	if (!updated)
			
 
				 		_starpu_worker_update_profiling_info_executing(workerid, NULL, 1, 0, 0, 0);
			
 
				 
			
 
				 	if (profiling_info && profiling_info->power_consumed && cl->power_model && cl->power_model->benchmarking) {
			
 
				-		_starpu_update_perfmodel_history(j, j->task->cl->power_model,  perf_arch, worker_args->devid, profiling_info->power_consumed);
			
 
				+		_starpu_update_perfmodel_history(j, j->task->cl->power_model,  perf_arch, worker_args->devid, profiling_info->power_consumed,j->nimpl);
			
 
				 		}
			
 
				 }
			
 
				 
			
--- a/src/drivers/gordon/driver_gordon.c
+++ b/src/drivers/gordon/driver_gordon.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -161,7 +162,10 @@ static struct gordon_task_wrapper_s *starpu_to_gordon_job(starpu_job_t j)
 
				 	task_wrapper->j = j;
			
 
				 	task_wrapper->terminated = 0;
			
 
				 
			
 
				-	gordon_job->index = j->task->cl->gordon_func;
			
 
				+	if (j->task->cl->gordon_func != STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS)
			
 
				+		gordon_job->index = j->task->cl->gordon_func;
			
 
				+	else
			
 
				+		gordon_job->index = j->task->cl->gordon_funcs[j->nimpl];
			
 
				 
			
 
				 	/* we should not hardcore the memory node ... XXX */
			
 
				 	unsigned memory_node = 0;
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -3,6 +3,7 @@
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -529,9 +530,19 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 
				 
			
 
				 	_starpu_driver_start_job(args, j, &codelet_start, 0);
			
 
				 
			
 
				-	cl_func func = cl->opencl_func;
			
 
				-	STARPU_ASSERT(func);
			
 
				-	func(task->interfaces, task->cl_arg);
			
 
				+	if (cl->opencl_func != STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS) {
			
 
				+		cl_func func = cl->opencl_func;
			
 
				+		STARPU_ASSERT(func);
			
 
				+		func(task->interfaces, task->cl_arg);
			
 
				+	}
			
 
				+	else {
			
 
				+		if (cl->opencl_funcs[j->nimpl] != NULL) {
			
 
				+			/* _STARPU_DEBUG("OpenCL driver : running kernel (%d)\n", j->nimpl); */
			
 
				+			cl_func func = cl->opencl_funcs[j->nimpl];
			
 
				+			STARPU_ASSERT(func);
			
 
				+			func(task->interfaces, task->cl_arg);
			
 
				+		}
			
 
				+	}
			
 
				 
			
 
				 	_starpu_driver_end_job(args, j, &codelet_end, 0);
			
 
				 
			
--- a/src/profiling/bound.c
+++ b/src/profiling/bound.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -335,7 +336,7 @@ static void _starpu_get_tasks_times(int nw, int nt, double *times) {
 
				 				.footprint_is_computed = 1,
			
 
				 			};
			
 
				 			enum starpu_perf_archtype arch = starpu_worker_get_perf_archtype(w);
			
 
				-			double length = _starpu_history_based_job_expected_perf(tp->cl->model, arch, &j);
			
 
				+			double length = _starpu_history_based_job_expected_perf(tp->cl->model, arch, &j, j.nimpl);
			
 
				 			if (length == -1.0)
			
 
				 				times[w*nt+t] = -1.0;
			
 
				 			else
			
@@ -401,7 +402,7 @@ void starpu_bound_print_lp(FILE *output)
 
				 			for (w = 0; w < nw; w++) {
			
 
				 				enum starpu_perf_archtype arch = starpu_worker_get_perf_archtype(w);
			
 
				 				if (t1->duration[arch] == 0.) {
			
 
				-					double length = _starpu_history_based_job_expected_perf(t1->cl->model, arch, &j);
			
 
				+					double length = _starpu_history_based_job_expected_perf(t1->cl->model, arch, &j,j.nimpl);
			
 
				 					if (length == -1.0)
			
 
				 						/* Avoid problems with binary coding of doubles */
			
 
				 						t1->duration[arch] = -1.0;
			
--- a/src/sched_policies/deque_modeling_policy_data_aware.c
+++ b/src/sched_policies/deque_modeling_policy_data_aware.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -311,57 +312,65 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio)
 
				 	/* A priori, we know all estimations */
			
 
				 	int unknown = 0;
			
 
				 
			
 
				+	unsigned best_impl = 0;
			
 
				+	unsigned nimpl;
			
 
				 	for (worker = 0; worker < nworkers; worker++)
			
 
				 	{
			
 
				-		double exp_end;
			
 
				-		
			
 
				-		fifo = queue_array[worker];
			
 
				+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
			
 
				+		{
			
 
				+			double exp_end;
			
 
				 
			
 
				-		/* Sometimes workers didn't take the tasks as early as we expected */
			
 
				-		fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
			
 
				-		fifo->exp_end = fifo->exp_start + fifo->exp_len;
			
 
				+			fifo = queue_array[worker];
			
 
				 
			
 
				-		if (!starpu_worker_may_execute_task(worker, task))
			
 
				-		{
			
 
				-			/* no one on that queue may execute this task */
			
 
				-			continue;
			
 
				-		}
			
 
				+			/* Sometimes workers didn't take the tasks as early as we expected */
			
 
				+			fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
			
 
				+			fifo->exp_end = fifo->exp_start + fifo->exp_len;
			
 
				 
			
 
				-		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				-		double local_length = starpu_task_expected_length(task, perf_arch);
			
 
				-		double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
			
 
				-
			
 
				-		if (ntasks_best == -1
			
 
				-				|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
			
 
				-				|| (!calibrating && local_length == -1.0) /* Not calibrating but this worker is being calibrated */
			
 
				-				|| (calibrating && local_length == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
			
 
				-				) {
			
 
				-			ntasks_best_end = ntasks_end;
			
 
				-			ntasks_best = worker;
			
 
				-		}
			
 
				+			if (!starpu_worker_may_execute_task(worker, task, nimpl))
			
 
				+			{
			
 
				+				/* no one on that queue may execute this task */
			
 
				+				continue;
			
 
				+			}
			
 
				 
			
 
				-		if (local_length == -1.0)
			
 
				-			/* we are calibrating, we want to speed-up calibration time
			
 
				-			 * so we privilege non-calibrated tasks (but still
			
 
				-			 * greedily distribute them to avoid dumb schedules) */
			
 
				-			calibrating = 1;
			
 
				+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				+			double local_length = starpu_task_expected_length(task, perf_arch, nimpl);
			
 
				+			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
			
 
				 
			
 
				-		if (local_length <= 0.0)
			
 
				-			/* there is no prediction available for that task
			
 
				-			 * with that arch yet, so switch to a greedy strategy */
			
 
				-			unknown = 1;
			
 
				+			//_STARPU_DEBUG("Scheduler dm: task length (%lf) worker (%u) kernel (%u) \n", local_length,worker,nimpl);
			
 
				 
			
 
				-		if (unknown)
			
 
				-			continue;
			
 
				+			if (ntasks_best == -1
			
 
				+					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
			
 
				+					|| (!calibrating && local_length == -1.0) /* Not calibrating but this worker is being calibrated */
			
 
				+					|| (calibrating && local_length == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
			
 
				+					) {
			
 
				+				ntasks_best_end = ntasks_end;
			
 
				+				ntasks_best = worker;
			
 
				+			}
			
 
				 
			
 
				-		exp_end = fifo->exp_start + fifo->exp_len + local_length;
			
 
				+			if (local_length == -1.0)
			
 
				+				/* we are calibrating, we want to speed-up calibration time
			
 
				+				 * so we privilege non-calibrated tasks (but still
			
 
				+				 * greedily distribute them to avoid dumb schedules) */
			
 
				+				calibrating = 1;
			
 
				 
			
 
				-		if (best == -1 || exp_end < best_exp_end)
			
 
				-		{
			
 
				-			/* a better solution was found */
			
 
				-			best_exp_end = exp_end;
			
 
				-			best = worker;
			
 
				-			model_best = local_length;
			
 
				+			if (local_length <= 0.0)
			
 
				+				/* there is no prediction available for that task
			
 
				+				 * with that arch yet, so switch to a greedy strategy */
			
 
				+				unknown = 1;
			
 
				+
			
 
				+			if (unknown)
			
 
				+				continue;
			
 
				+
			
 
				+			exp_end = fifo->exp_start + fifo->exp_len + local_length;
			
 
				+
			
 
				+			if (best == -1 || exp_end < best_exp_end)
			
 
				+			{
			
 
				+				/* a better solution was found */
			
 
				+				best_exp_end = exp_end;
			
 
				+				best = worker;
			
 
				+				model_best = local_length;
			
 
				+				best_impl = nimpl;
			
 
				+			}
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -370,6 +379,10 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio)
 
				 		model_best = 0.0;
			
 
				 	}
			
 
				 	
			
 
				+	//_STARPU_DEBUG("Scheduler dm: kernel (%u)\n", best_impl);
			
 
				+
			
 
				+	 _starpu_get_job_associated_to_task(task)->nimpl = 0;//best_impl;
			
 
				+
			
 
				 	/* we should now have the best worker in variable "best" */
			
 
				 	return push_task_on_best_worker(task, best, model_best, prio);
			
 
				 }
			
@@ -404,64 +417,78 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio)
 
				 	/* A priori, we know all estimations */
			
 
				 	int unknown = 0;
			
 
				 
			
 
				+	unsigned best_impl = 0;
			
 
				+	unsigned nimpl=0;
			
 
				 	for (worker = 0; worker < nworkers; worker++)
			
 
				 	{
			
 
				-		fifo = queue_array[worker];
			
 
				+		for(nimpl  = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
			
 
				+	 	{
			
 
				+			fifo = queue_array[worker];
			
 
				 
			
 
				-		/* Sometimes workers didn't take the tasks as early as we expected */
			
 
				-		fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
			
 
				-		fifo->exp_end = fifo->exp_start + fifo->exp_len;
			
 
				-		if (fifo->exp_end > max_exp_end)
			
 
				-			max_exp_end = fifo->exp_end;
			
 
				+			/* Sometimes workers didn't take the tasks as early as we expected */
			
 
				+			fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
			
 
				+			fifo->exp_end = fifo->exp_start + fifo->exp_len;
			
 
				+			if (fifo->exp_end > max_exp_end)
			
 
				+				max_exp_end = fifo->exp_end;
			
 
				 
			
 
				-		if (!starpu_worker_may_execute_task(worker, task))
			
 
				-		{
			
 
				-			/* no one on that queue may execute this task */
			
 
				-			continue;
			
 
				-		}
			
 
				+			if (!starpu_worker_may_execute_task(worker, task, nimpl))
			
 
				+			{
			
 
				+				/* no one on that queue may execute this task */
			
 
				+				continue;
			
 
				+			}
			
 
				 
			
 
				-		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				-		local_task_length[worker] = starpu_task_expected_length(task, perf_arch);
			
 
				+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				+			local_task_length[worker] = starpu_task_expected_length(task, perf_arch, nimpl);
			
 
				 
			
 
				-		unsigned memory_node = starpu_worker_get_memory_node(worker);
			
 
				-		local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
			
 
				+			//_STARPU_DEBUG("Scheduler dmda: task length (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],worker,nimpl);
			
 
				 
			
 
				-		double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
			
 
				+			unsigned memory_node = starpu_worker_get_memory_node(worker);
			
 
				+			local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
			
 
				 
			
 
				-		if (ntasks_best == -1
			
 
				-				|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
			
 
				-				|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
			
 
				-				|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
			
 
				-				) {
			
 
				-			ntasks_best_end = ntasks_end;
			
 
				-			ntasks_best = worker;
			
 
				-		}
			
 
				+			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
			
 
				 
			
 
				-		if (local_task_length[worker] == -1.0)
			
 
				-			/* we are calibrating, we want to speed-up calibration time
			
 
				-			 * so we privilege non-calibrated tasks (but still
			
 
				-			 * greedily distribute them to avoid dumb schedules) */
			
 
				-			calibrating = 1;
			
 
				+			if (ntasks_best == -1
			
 
				+					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
			
 
				+					|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
			
 
				+					|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
			
 
				+					) {
			
 
				+				ntasks_best_end = ntasks_end;
			
 
				+				ntasks_best = worker;
			
 
				 
			
 
				-		if (local_task_length[worker] <= 0.0)
			
 
				-			/* there is no prediction available for that task
			
 
				-			 * with that arch yet, so switch to a greedy strategy */
			
 
				-			unknown = 1;
			
 
				+			}
			
 
				 
			
 
				-		if (unknown)
			
 
				-			continue;
			
 
				+			if (local_task_length[worker] == -1.0)
			
 
				+				/* we are calibrating, we want to speed-up calibration time
			
 
				+				 * so we privilege non-calibrated tasks (but still
			
 
				+				 * greedily distribute them to avoid dumb schedules) */
			
 
				+				calibrating = 1;
			
 
				 
			
 
				-		exp_end[worker] = fifo->exp_start + fifo->exp_len + local_task_length[worker];
			
 
				+			if (local_task_length[worker] <= 0.0)
			
 
				+				/* there is no prediction available for that task
			
 
				+				 * with that arch yet, so switch to a greedy strategy */
			
 
				+				unknown = 1;
			
 
				 
			
 
				-		if (exp_end[worker] < best_exp_end)
			
 
				-		{
			
 
				-			/* a better solution was found */
			
 
				-			best_exp_end = exp_end[worker];
			
 
				-		}
			
 
				+			if (unknown)
			
 
				+					continue;
			
 
				 
			
 
				-		local_power[worker] = starpu_task_expected_power(task, perf_arch);
			
 
				-		if (local_power[worker] == -1.0)
			
 
				-			local_power[worker] = 0.;
			
 
				+			exp_end[worker] = fifo->exp_start + fifo->exp_len + local_task_length[worker];
			
 
				+
			
 
				+			if (exp_end[worker] < best_exp_end)
			
 
				+			{
			
 
				+				/* a better solution was found */
			
 
				+				best_exp_end = exp_end[worker];
			
 
				+				best_impl = nimpl;
			
 
				+
			
 
				+			}
			
 
				+
			
 
				+
			
 
				+
			
 
				+			local_power[worker] = starpu_task_expected_power(task, perf_arch, nimpl);
			
 
				+			if (local_power[worker] == -1.0)
			
 
				+				local_power[worker] = 0.;
			
 
				+
			
 
				+
			
 
				+		 }
			
 
				 	}
			
 
				 
			
 
				 	if (unknown)
			
@@ -475,7 +502,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio)
 
				 		{
			
 
				 			fifo = queue_array[worker];
			
 
				 	
			
 
				-			if (!starpu_worker_may_execute_task(worker, task))
			
 
				+			if (!starpu_worker_may_execute_task(worker, task, 0))
			
 
				 			{
			
 
				 				/* no one on that queue may execute this task */
			
 
				 				continue;
			
@@ -519,6 +546,10 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio)
 
				 		//penality_best = local_data_penalty[best];
			
 
				 	}
			
 
				 
			
 
				+
			
 
				+	//_STARPU_DEBUG("Scheduler dmda: kernel (%u)\n", best_impl);
			
 
				+	 _starpu_get_job_associated_to_task(task)->nimpl = best_impl;
			
 
				+
			
 
				 	/* we should now have the best worker in variable "best" */
			
 
				 	return push_task_on_best_worker(task, best, model_best, prio);
			
 
				 }
			
--- a/src/sched_policies/deque_queues.c
+++ b/src/sched_policies/deque_queues.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -109,7 +110,11 @@ struct starpu_job_list_s *_starpu_deque_pop_every_task(struct starpu_deque_jobq_
 
				 		{
			
 
				 			next_job = starpu_job_list_next(i);
			
 
				 
			
 
				-			if (starpu_worker_may_execute_task(workerid, i->task))
			
 
				+			/* In case there are multiple implementations of the
			
 
				+ 			 * codelet for a single device, we don't really care
			
 
				+			 * about the implementation used, so let's try the 
			
 
				+			 * first one. */
			
 
				+			if (starpu_worker_may_execute_task(workerid, i->task, 0))
			
 
				 			{
			
 
				 				/* this elements can be moved into the new list */
			
 
				 				new_list_size++;
			
--- a/src/sched_policies/fifo_queues.c
+++ b/src/sched_policies/fifo_queues.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -111,7 +112,7 @@ struct starpu_task *_starpu_fifo_pop_every_task(struct starpu_fifo_taskq_s *fifo
 
				 		{
			
 
				 			next_task = task->next;
			
 
				 
			
 
				-			if (starpu_worker_may_execute_task(workerid, task))
			
 
				+			if (starpu_worker_may_execute_task(workerid, task, 0))
			
 
				 			{
			
 
				 				/* this elements can be moved into the new list */
			
 
				 				new_list_size++;
			
--- a/src/sched_policies/heft.c
+++ b/src/sched_policies/heft.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -115,7 +116,9 @@ static void heft_push_task_notify(struct starpu_task *task, int workerid)
 
				 {
			
 
				 	/* Compute the expected penality */
			
 
				 	enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
			
 
				-	double predicted = starpu_task_expected_length(task, perf_arch);
			
 
				+
			
 
				+	double predicted = starpu_task_expected_length(task, perf_arch,
			
 
				+			_starpu_get_job_associated_to_task(task)->nimpl);
			
 
				 
			
 
				 	/* Update the predictions */
			
 
				 	PTHREAD_MUTEX_LOCK(&sched_mutex[workerid]);
			
@@ -179,78 +182,92 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
				 
			
 
				 	/* A priori, we know all estimations */
			
 
				 	int unknown = 0;
			
 
				-
			
 
				 	unsigned worker;
			
 
				-	for (worker = 0; worker < nworkers; worker++)
			
 
				-	{
			
 
				-		/* Sometimes workers didn't take the tasks as early as we expected */
			
 
				-		exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
			
 
				-		exp_end[worker] = exp_start[worker] + exp_len[worker];
			
 
				-		if (exp_end[worker] > max_exp_end)
			
 
				-			max_exp_end = exp_end[worker];
			
 
				 
			
 
				-		if (!starpu_worker_may_execute_task(worker, task))
			
 
				-		{
			
 
				-			/* no one on that queue may execute this task */
			
 
				-			continue;
			
 
				-		}
			
 
				-
			
 
				-		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				-		unsigned memory_node = starpu_worker_get_memory_node(worker);
			
 
				-
			
 
				-		if (bundle)
			
 
				-		{
			
 
				-			local_task_length[worker] = starpu_task_bundle_expected_length(bundle, perf_arch);
			
 
				-			local_data_penalty[worker] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
			
 
				-			local_power[worker] = starpu_task_bundle_expected_power(bundle, perf_arch);
			
 
				-		}
			
 
				-		else {
			
 
				-			local_task_length[worker] = starpu_task_expected_length(task, perf_arch);
			
 
				-			local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
			
 
				-			local_power[worker] = starpu_task_expected_power(task, perf_arch);
			
 
				-		}
			
 
				-
			
 
				-		double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
			
 
				-
			
 
				-		if (ntasks_best == -1
			
 
				+	unsigned nimpl;
			
 
				+	unsigned best_impl = 0;
			
 
				+
			
 
				+	for (worker = 0; worker < nworkers; worker++) {
			
 
				+		for (nimpl = 0; nimpl <STARPU_MAXIMPLEMENTATIONS; nimpl++) {
			
 
				+			/* Sometimes workers didn't take the tasks as early as we expected */
			
 
				+			exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
			
 
				+			exp_end[worker] = exp_start[worker] + exp_len[worker];
			
 
				+			if (exp_end[worker] > max_exp_end)
			
 
				+				max_exp_end = exp_end[worker];
			
 
				+
			
 
				+			if (!starpu_worker_may_execute_task(worker, task, nimpl))
			
 
				+			{
			
 
				+				/* no one on that queue may execute this task */
			
 
				+				continue;
			
 
				+			}
			
 
				+
			
 
				+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				+			unsigned memory_node = starpu_worker_get_memory_node(worker);
			
 
				+
			
 
				+			if (bundle)
			
 
				+			{
			
 
				+				local_task_length[worker] = starpu_task_bundle_expected_length(bundle, perf_arch, nimpl);
			
 
				+				local_data_penalty[worker] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
			
 
				+				local_power[worker] = starpu_task_bundle_expected_power(bundle, perf_arch,nimpl);
			
 
				+				//_STARPU_DEBUG("Scheduler heft bundle: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],local_power[worker],worker,nimpl);
			
 
				+
			
 
				+			}
			
 
				+			else {
			
 
				+				local_task_length[worker] = starpu_task_expected_length(task, perf_arch, nimpl);
			
 
				+				local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
			
 
				+				local_power[worker] = starpu_task_expected_power(task, perf_arch,nimpl);
			
 
				+				//_STARPU_DEBUG("Scheduler heft: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],local_power[worker],worker,nimpl);
			
 
				+
			
 
				+			}
			
 
				+
			
 
				+			double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
			
 
				+
			
 
				+			if (ntasks_best == -1
			
 
				 				|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
			
 
				 				|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
			
 
				 				|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
			
 
				 				) {
			
 
				-			ntasks_best_end = ntasks_end;
			
 
				-			ntasks_best = worker;
			
 
				-		}
			
 
				+				ntasks_best_end = ntasks_end;
			
 
				+				ntasks_best = worker;
			
 
				+			}
			
 
				 
			
 
				-		if (local_task_length[worker] == -1.0)
			
 
				-			/* we are calibrating, we want to speed-up calibration time
			
 
				-			 * so we privilege non-calibrated tasks (but still
			
 
				-			 * greedily distribute them to avoid dumb schedules) */
			
 
				-			calibrating = 1;
			
 
				+			if (local_task_length[worker] == -1.0)
			
 
				+				/* we are calibrating, we want to speed-up calibration time
			
 
				+				 * so we privilege non-calibrated tasks (but still
			
 
				+				 * greedily distribute them to avoid dumb schedules) */
			
 
				+				calibrating = 1;
			
 
				 
			
 
				-		if (local_task_length[worker] <= 0.0)
			
 
				-			/* there is no prediction available for that task
			
 
				-			 * with that arch yet, so switch to a greedy strategy */
			
 
				-			unknown = 1;
			
 
				+			if (local_task_length[worker] <= 0.0)
			
 
				+				/* there is no prediction available for that task
			
 
				+				 * with that arch yet, so switch to a greedy strategy */
			
 
				+				unknown = 1;
			
 
				 
			
 
				-		if (unknown)
			
 
				-			continue;
			
 
				+			if (unknown)
			
 
				+				continue;
			
 
				 
			
 
				-		exp_end[worker] = exp_start[worker] + exp_len[worker] + local_task_length[worker];
			
 
				+			exp_end[worker] = exp_start[worker] + exp_len[worker] + local_task_length[worker];
			
 
				 
			
 
				-		if (exp_end[worker] < best_exp_end)
			
 
				-		{
			
 
				-			/* a better solution was found */
			
 
				-			best_exp_end = exp_end[worker];
			
 
				-		}
			
 
				+			if (exp_end[worker] < best_exp_end)
			
 
				+			{
			
 
				+				/* a better solution was found */
			
 
				+				best_exp_end = exp_end[worker];
			
 
				+				best_impl = nimpl;
			
 
				+			}
			
 
				+
			
 
				+			if (local_power[worker] == -1.0)
			
 
				+				local_power[worker] = 0.;
			
 
				 
			
 
				-		if (local_power[worker] == -1.0)
			
 
				-			local_power[worker] = 0.;
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	*forced_best = unknown?ntasks_best:-1;
			
 
				 
			
 
				 	*best_exp_endp = best_exp_end;
			
 
				 	*max_exp_endp = max_exp_end;
			
 
				+
			
 
				+	/* save the best implementation */
			
 
				+	//_STARPU_DEBUG("Scheduler heft: kernel (%u)\n", best_impl);
			
 
				+	_starpu_get_job_associated_to_task(task)->nimpl = best_impl;
			
 
				 }
			
 
				 
			
 
				 static int _heft_push_task(struct starpu_task *task, unsigned prio)
			
@@ -298,7 +315,7 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio)
 
				 
			
 
				 	for (worker = 0; worker < nworkers; worker++)
			
 
				 	{
			
 
				-		if (!starpu_worker_may_execute_task(worker, task))
			
 
				+		if (!starpu_worker_may_execute_task(worker, task, 0))
			
 
				 		{
			
 
				 			/* no one on that queue may execute this task */
			
 
				 			continue;
			
@@ -314,12 +331,11 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio)
 
				 			 * consumption of other cpus */
			
 
				 			fitness[worker] += _gamma * idle_power * (exp_end[worker] - max_exp_end) / 1000000.0;
			
 
				 
			
 
				-		if (best == -1 || fitness[worker] < best_fitness)
			
 
				-		{
			
 
				-			/* we found a better solution */
			
 
				-			best_fitness = fitness[worker];
			
 
				-			best = worker;
			
 
				-		}
			
 
				+			if (best == -1 || fitness[worker] < best_fitness)
			
 
				+			{
			
 
				+				/* we found a better solution */
			
 
				+				best_fitness = fitness[worker]; best = worker;
			
 
				+			}
			
 
				 	}
			
 
				 
			
 
				 	/* By now, we must have found a solution */
			
@@ -333,7 +349,8 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio)
 
				 		/* If we have a task bundle, we have computed the expected
			
 
				 		 * length for the entire bundle, but not for the task alone. */
			
 
				 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(best);
			
 
				-		model_best = starpu_task_expected_length(task, perf_arch);
			
 
				+		model_best = starpu_task_expected_length(task, perf_arch,
			
 
				+				_starpu_get_job_associated_to_task(task)->nimpl);
			
 
				 
			
 
				 		/* Remove the task from the bundle since we have made a
			
 
				 		 * decision for it, and that other tasks should not consider it
			
--- a/src/sched_policies/parallel_greedy.c
+++ b/src/sched_policies/parallel_greedy.c
@@ -1,6 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -167,7 +168,7 @@ static struct starpu_task *pop_task_pgreedy_policy(void)
 
				 			if (possible_combinations_size[workerid][i] > best_size)
			
 
				 			{
			
 
				 				int combined_worker = possible_combinations[workerid][i];
			
 
				-				if (starpu_combined_worker_may_execute_task(combined_worker, task))
			
 
				+				if (starpu_combined_worker_may_execute_task(combined_worker, task, 0))
			
 
				 				{
			
 
				 					best_size = possible_combinations_size[workerid][i];
			
 
				 					best_workerid = combined_worker;
			
--- a/src/sched_policies/parallel_heft.c
+++ b/src/sched_policies/parallel_heft.c
@@ -1,6 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -225,62 +226,73 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
 
				 			max_exp_end = worker_exp_end[worker];
			
 
				 	}
			
 
				 
			
 
				+	unsigned nimpl;
			
 
				+	unsigned best_impl = 0;
			
 
				 	for (worker = 0; worker < (nworkers+ncombinedworkers); worker++)
			
 
				 	{
			
 
				-		if (!starpu_combined_worker_may_execute_task(worker, task))
			
 
				+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
			
 
				 		{
			
 
				-			/* no one on that queue may execute this task */
			
 
				-			skip_worker[worker] = 1;
			
 
				-			continue;
			
 
				-		}
			
 
				-		else {
			
 
				-			skip_worker[worker] = 0;
			
 
				-		}
			
 
				+			if (!starpu_combined_worker_may_execute_task(worker, task, nimpl))
			
 
				+			{
			
 
				+				/* no one on that queue may execute this task */
			
 
				+				skip_worker[worker] = 1;
			
 
				+				continue; /* NOTE(review): this now advances to the next nimpl, not the next worker; skip_worker[worker] may be overwritten by a later nimpl */
			
 
				+			}
			
 
				+			else {
			
 
				+				skip_worker[worker] = 0;
			
 
				+			}
			
 
				 
			
 
				-		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				-		local_task_length[worker] = starpu_task_expected_length(task, perf_arch);
			
 
				+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				 
			
 
				-		unsigned memory_node = starpu_worker_get_memory_node(worker);
			
 
				-		local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
			
 
				+			local_task_length[worker] = starpu_task_expected_length(task, perf_arch,nimpl);
			
 
				 
			
 
				-		double ntasks_end = compute_ntasks_end(worker);
			
 
				+			unsigned memory_node = starpu_worker_get_memory_node(worker);
			
 
				+			local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
			
 
				 
			
 
				-		if (ntasks_best == -1
			
 
				-				|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
			
 
				-				|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
			
 
				-				|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
			
 
				-				) {
			
 
				-			ntasks_best_end = ntasks_end;
			
 
				-			ntasks_best = worker;
			
 
				-		}
			
 
				+			double ntasks_end = compute_ntasks_end(worker);
			
 
				 
			
 
				-		if (local_task_length[worker] == -1.0)
			
 
				-			/* we are calibrating, we want to speed-up calibration time
			
 
				-			 * so we privilege non-calibrated tasks (but still
			
 
				-			 * greedily distribute them to avoid dumb schedules) */
			
 
				-			calibrating = 1;
			
 
				+			if (ntasks_best == -1
			
 
				+					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
			
 
				+					|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
			
 
				+					|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
			
 
				+					) {
			
 
				+				ntasks_best_end = ntasks_end;
			
 
				+				ntasks_best = worker;
			
 
				+			}
			
 
				 
			
 
				-		if (local_task_length[worker] <= 0.0)
			
 
				-			/* there is no prediction available for that task
			
 
				-			 * with that arch yet, so switch to a greedy strategy */
			
 
				-			unknown = 1;
			
 
				+			if (local_task_length[worker] == -1.0)
			
 
				+				/* we are calibrating, we want to speed-up calibration time
			
 
				+				 * so we privilege non-calibrated tasks (but still
			
 
				+				 * greedily distribute them to avoid dumb schedules) */
			
 
				+				calibrating = 1;
			
 
				 
			
 
				-		if (unknown)
			
 
				-			continue;
			
 
				+			if (local_task_length[worker] <= 0.0)
			
 
				+				/* there is no prediction available for that task
			
 
				+				 * with that arch yet, so switch to a greedy strategy */
			
 
				+				unknown = 1;
			
 
				 
			
 
				-		local_exp_end[worker] = compute_expected_end(worker, local_task_length[worker]);
			
 
				+			if (unknown)
			
 
				+				continue;
			
 
				 
			
 
				-		//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker], local_exp_end[worker]);
			
 
				+			local_exp_end[worker] = compute_expected_end(worker, local_task_length[worker]);
			
 
				 
			
 
				-		if (local_exp_end[worker] < best_exp_end)
			
 
				-		{
			
 
				-			/* a better solution was found */
			
 
				-			best_exp_end = local_exp_end[worker];
			
 
				-		}
			
 
				+			//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker], local_exp_end[worker]);
			
 
				+
			
 
				+			if (local_exp_end[worker] < best_exp_end)
			
 
				+			{
			
 
				+				/* a better solution was found */
			
 
				+				best_exp_end = local_exp_end[worker];
			
 
				+				best_impl = nimpl;
			
 
				+			}
			
 
				 
			
 
				-		local_power[worker] = starpu_task_expected_power(task, perf_arch);
			
 
				-		if (local_power[worker] == -1.0)
			
 
				-			local_power[worker] = 0.;
			
 
				+
			
 
				+			local_power[worker] = starpu_task_expected_power(task, perf_arch,nimpl);
			
 
				+			//_STARPU_DEBUG("Scheduler parallel heft: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],local_power[worker],worker,nimpl);
			
 
				+
			
 
				+			if (local_power[worker] == -1.0)
			
 
				+				local_power[worker] = 0.;
			
 
				+
			
 
				+		} //end for
			
 
				 	}
			
 
				 
			
 
				 	if (unknown)
			
@@ -338,6 +350,9 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
 
				 		best_exp_end = local_exp_end[best];
			
 
				 	}
			
 
				 
			
 
				+
			
 
				+	//_STARPU_DEBUG("Scheduler parallel heft: kernel (%u)\n", best_impl);
			
 
				+	_starpu_get_job_associated_to_task(task)->nimpl = best_impl;
			
 
				 	/* we should now have the best worker in variable "best" */
			
 
				 	return push_task_on_best_worker(task, best, best_exp_end, prio);
			
 
				 }
			
--- a/tests/perfmodels/regression_based.c
+++ b/tests/perfmodels/regression_based.c
@@ -1,6 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -97,7 +98,11 @@ static void show_task_perfs(int size, struct starpu_task *task) {
 
				 		char name[16];
			
 
				 		starpu_worker_get_name(workerid, name, sizeof(name));
			
 
				 
			
 
				-		printf("Expected time for %d on %s:\t%f\n", size, name, starpu_task_expected_length(task, starpu_worker_get_perf_archtype(workerid)));
			
 
				+		unsigned nimpl;
			
 
				+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++) {
			
 
				+			printf("Expected time for %d on %s:\t%f\n",
			
 
				+				size, name, starpu_task_expected_length(task, starpu_worker_get_perf_archtype(workerid), nimpl));
			
 
				+		}
			
 
				 	}
			
 
				 }
			
 
				 
			
--- a/tools/starpu_perfmodel_display.c
+++ b/tools/starpu_perfmodel_display.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -143,14 +144,14 @@ static void display_history_based_perf_model(struct starpu_per_arch_perfmodel_t
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static void display_perf_model(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch)
			
 
				+static void display_perf_model(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, unsigned nimpl)
			
 
				 {
			
 
				-	struct starpu_per_arch_perfmodel_t *arch_model = &model->per_arch[arch];
			
 
				+	struct starpu_per_arch_perfmodel_t *arch_model = &model->per_arch[arch][nimpl];
			
 
				 	char archname[32];
			
 
				 
			
 
				 	if (arch_model->regression.nsample || arch_model->regression.valid || arch_model->regression.nl_valid || arch_model->list) {
			
 
				 
			
 
				-		starpu_perfmodel_get_arch_name(arch, archname, 32);
			
 
				+		starpu_perfmodel_get_arch_name(arch, archname, 32, nimpl);
			
 
				 		fprintf(stderr, "performance model for %s\n", archname);
			
 
				 	}
			
 
				 
			
@@ -187,7 +188,7 @@ static void display_perf_model(struct starpu_perfmodel_t *model, enum starpu_per
 
				 
			
 
				 #if 0
			
 
				 		char debugname[1024];
			
 
				-		starpu_perfmodel_debugfilepath(model, arch, debugname, 1024);
			
 
				+		starpu_perfmodel_debugfilepath(model, arch, debugname, 1024, nimpl);
			
 
				 		printf("\t debug file path : %s\n", debugname);
			
 
				 #endif
			
 
				 	}
			
@@ -220,7 +221,7 @@ static void display_perf_model(struct starpu_perfmodel_t *model, enum starpu_per
 
				 
			
 
				 		if (strcmp(parameter, "path-file-debug") == 0) {
			
 
				 			char debugname[256];
			
 
				-			starpu_perfmodel_debugfilepath(model, arch, debugname, 1024);
			
 
				+			starpu_perfmodel_debugfilepath(model, arch, debugname, sizeof(debugname), nimpl); /* length must match the 256-byte buffer declared above */
			
 
				 			printf("%s\n", debugname);
			
 
				 			return;
			
 
				 		}
			
@@ -243,14 +244,18 @@ static void display_all_perf_models(struct starpu_perfmodel_t *model)
 
				 	{
			
 
				 		/* display all architectures */
			
 
				 		unsigned archid;
			
 
				-		for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++)
			
 
				-		{
			
 
				-			display_perf_model(model, (enum starpu_perf_archtype) archid);
			
 
				+		unsigned implid;
			
 
				+		for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++) {
			
 
				+			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++) { /* Display the model of every implementation on each arch */
			
 
				+				display_perf_model(model, (enum starpu_perf_archtype) archid, implid);
			
 
				+			}
			
 
				 		}
			
 
				 	}
			
 
				 	else {
			
 
				 		if (strcmp(arch, "cpu") == 0) {
			
 
				-			display_perf_model(model, STARPU_CPU_DEFAULT);
			
 
				+			unsigned implid;
			
 
				+			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				+				display_perf_model(model, STARPU_CPU_DEFAULT,implid); /* Display the model of every implementation on cpu */
			
 
				 			return;
			
 
				 		}
			
 
				 
			
@@ -264,18 +269,22 @@ static void display_all_perf_models(struct starpu_perfmodel_t *model)
 
				 				exit(-1);
			
 
				 			}
			
 
				 
			
 
				-			display_perf_model(model, (enum starpu_perf_archtype) (STARPU_CPU_DEFAULT + k - 1));
			
 
				+			unsigned implid;
			
 
				+			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				+				display_perf_model(model, (enum starpu_perf_archtype) STARPU_CPU_DEFAULT + k - 1, implid);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
 
				 		if (strcmp(arch, "cuda") == 0) {
			
 
				 			unsigned archid;
			
 
				-			for (archid = STARPU_CUDA_DEFAULT; archid < STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS; archid++)
			
 
				-			{
			
 
				-				char archname[32];
			
 
				-				starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) archid, archname, 32);
			
 
				-				fprintf(stderr, "performance model for %s\n", archname);
			
 
				-				display_perf_model(model, (enum starpu_perf_archtype) archid);
			
 
				+			unsigned implid;
			
 
				+			for (archid = STARPU_CUDA_DEFAULT; archid < STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS; archid++) {
			
 
				+				for (implid = 0; implid <STARPU_MAXIMPLEMENTATIONS; implid ++) {
			
 
				+					char archname[32];
			
 
				+					starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) archid, archname, 32, implid);
			
 
				+					fprintf(stderr, "performance model for %s\n", archname);
			
 
				+					display_perf_model(model, (enum starpu_perf_archtype) archid, implid);
			
 
				+				}
			
 
				 			}
			
 
				 			return;
			
 
				 		}
			
@@ -287,13 +296,17 @@ static void display_all_perf_models(struct starpu_perfmodel_t *model)
 
				 		if (nmatched == 1)
			
 
				 		{
			
 
				 			unsigned archid = STARPU_CUDA_DEFAULT+ gpuid;
			
 
				-			display_perf_model(model, (enum starpu_perf_archtype) archid);
			
 
				+			unsigned implid;
			
 
				+			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				+				display_perf_model(model, (enum starpu_perf_archtype) archid, implid);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
 
				 		if (strcmp(arch, "gordon") == 0) {
			
 
				 			fprintf(stderr, "performance model for gordon\n");
			
 
				-			display_perf_model(model, STARPU_GORDON_DEFAULT);
			
 
				+			unsigned implid;
			
 
				+			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				+				display_perf_model(model, STARPU_GORDON_DEFAULT, implid);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
--- a/tools/starpu_perfmodel_plot.c
+++ b/tools/starpu_perfmodel_plot.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -141,10 +142,10 @@ static void print_comma(FILE *gnuplot_file, int *first)
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, int *first)
			
 
				+static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, int *first, unsigned nimpl)
			
 
				 {
			
 
				 	char arch_name[256];
			
 
				-	starpu_perfmodel_get_arch_name(arch, arch_name, 256);
			
 
				+	starpu_perfmodel_get_arch_name(arch, arch_name, 256, nimpl);
			
 
				 
			
 
				 	fprintf(stderr,"Arch: %s\n", arch_name);
			
 
				 
			
@@ -156,7 +157,8 @@ static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel_t *mo
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				-	struct starpu_per_arch_perfmodel_t *arch_model = &model->per_arch[arch];
			
 
				+	struct starpu_per_arch_perfmodel_t *arch_model =
			
 
				+		&model->per_arch[arch][nimpl];
			
 
				 
			
 
				 	/* Only display the regression model if we could actually build a model */
			
 
				 	if (arch_model->regression.valid)
			
@@ -203,16 +205,20 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 
				 	free(command);
			
 
				 
			
 
				 	col = 2;
			
 
				+	unsigned implid;
			
 
				 	for (arch = arch1; arch < arch2; arch++) {
			
 
				-		struct starpu_per_arch_perfmodel_t *arch_model = &model->per_arch[arch];
			
 
				-		starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) arch, archname, 32);
			
 
				+		for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++) {
			
 
				+			struct starpu_per_arch_perfmodel_t *arch_model =
			
 
				+				&model->per_arch[arch][implid];
			
 
				+			starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) arch, archname, 32, implid);
			
 
				 
			
 
				-		ptrs[arch-arch1] = ptr[arch-arch1] = arch_model->list;
			
 
				+			ptrs[arch-arch1] = ptr[arch-arch1] = arch_model->list; /* NOTE(review): slot is indexed by arch only, so each implid pass overwrites the previous implementation's list */
			
 
				 
			
 
				-		if (ptr[arch-arch1]) {
			
 
				-			print_comma(gnuplot_file, first);
			
 
				-			fprintf(gnuplot_file, "\"%s\" using 1:%d:%d with errorlines title \"Measured %s\"", avg_file_name, col, col+1, archname);
			
 
				-			col += 2;
			
 
				+			if (ptr[arch-arch1]) {
			
 
				+				print_comma(gnuplot_file, first);
			
 
				+				fprintf(gnuplot_file, "\"%s\" using 1:%d:%d with errorlines title \"Measured %s\"", avg_file_name, col, col+1, archname);
			
 
				+				col += 2;
			
 
				+			}
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -260,8 +266,13 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 
				 static void display_perf_models(FILE *gnuplot_file, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch1, enum starpu_perf_archtype arch2, int *first)
			
 
				 {
			
 
				 	unsigned arch;
			
 
				-	for (arch = arch1; arch < arch2; arch++)
			
 
				-		display_perf_model(gnuplot_file, model, (enum starpu_perf_archtype) arch, first);
			
 
				+	unsigned implid;
			
 
				+	for (arch = arch1; arch < arch2; arch++) {
			
 
				+		for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++) {
			
 
				+			display_perf_model(gnuplot_file, model, (enum starpu_perf_archtype) arch, first,
			
 
				+implid);
			
 
				+		}
			
 
				+	}
			
 
				 	display_history_based_perf_models(gnuplot_file, model, arch1, arch2, first);
			
 
				 }
			
 
				 
			
@@ -317,7 +328,12 @@ static void display_selected_models(FILE *gnuplot_file, struct starpu_perfmodel_
 
				 	}
			
 
				 	else {
			
 
				 		if (strcmp(arch, "cpu") == 0) {
			
 
				-			display_perf_model(gnuplot_file, model, STARPU_CPU_DEFAULT, &first);
			
 
				+			unsigned impl;
			
 
				+			for (impl = 0; impl < STARPU_MAXIMPLEMENTATIONS; impl++) {
			
 
				+				display_perf_model(gnuplot_file, model,
			
 
				+							STARPU_CPU_DEFAULT,
			
 
				+							&first, impl);
			
 
				+			}
			
 
				 			return;
			
 
				 		}