14 年之前 · a48b287faa
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -119,7 +119,9 @@ noinst_HEADERS = 				\
 
				 	common/blas_model.h			\
			
 
				 	common/blas.h				\
			
 
				 	mult/dw_mult.h				\
			
 
				-	mult/gordon/func_sgemm_ibm.h		\
			
 
				+	mult/simple.h				\
			
 
				+	mult/double.h				\
			
 
				+	mult/gordon/func_gemm_ibm.h		\
			
 
				 	gordon/null.h				\
			
 
				 	fortran/bindings/StarPU_fortran.h	\
			
 
				 	ppm_downscaler/ppm_downscaler.h		\
			
@@ -338,13 +340,11 @@ mult_dgemm_SOURCES = 				\
 
				 
			
 
				 mult_dw_mult_no_stride_SOURCES = 		\
			
 
				 	mult/dw_mult_no_stride.c		\
			
 
				-	mult/sgemm_kernels.c			\
			
 
				 	common/blas.c				\
			
 
				 	common/blas_model.c
			
 
				 
			
 
				 mult_dw_mult_no_stride_no_tag_SOURCES =		\
			
 
				 	mult/dw_mult_no_stride_no_tag.c		\
			
 
				-	mult/sgemm_kernels.c			\
			
 
				 	common/blas.c				\
			
 
				 	common/blas_model.c
			
 
				 
			
--- a/examples/common/blas_model.h
+++ b/examples/common/blas_model.h
@@ -21,8 +21,7 @@
 
				 
			
 
				 double gemm_cost(starpu_buffer_descr *descr);
			
 
				 
			
 
				-static struct starpu_perfmodel_t sgemm_model = {
			
 
				-	.cost_model = gemm_cost,
			
 
				+static struct starpu_perfmodel_t starpu_sgemm_model = {
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 #ifdef STARPU_ATLAS
			
 
				 	.symbol = "sgemm_atlas"
			
@@ -33,7 +32,23 @@ static struct starpu_perfmodel_t sgemm_model = {
 
				 #endif
			
 
				 };
			
 
				 
			
 
				-static struct starpu_perfmodel_t sgemm_model_common = {
			
 
				+static struct starpu_perfmodel_t starpu_sgemm_model_common = {
			
 
				+	.cost_model = gemm_cost,
			
 
				+	.type = STARPU_COMMON,
			
 
				+};
			
 
				+
			
 
				+static struct starpu_perfmodel_t starpu_dgemm_model = {
			
 
				+	.type = STARPU_HISTORY_BASED,
			
 
				+#ifdef STARPU_ATLAS
			
 
				+	.symbol = "dgemm_atlas"
			
 
				+#elif defined(STARPU_GOTO)
			
 
				+	.symbol = "dgemm_goto"
			
 
				+#else
			
 
				+	.symbol = "dgemm"
			
 
				+#endif
			
 
				+};
			
 
				+
			
 
				+static struct starpu_perfmodel_t starpu_dgemm_model_common = {
			
 
				 	.cost_model = gemm_cost,
			
 
				 	.type = STARPU_COMMON,
			
 
				 };
			
--- a/examples/mult/dgemm.c
+++ b/examples/mult/dgemm.c
@@ -14,13 +14,7 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				-#define TYPE	double
			
 
				-
			
 
				-#define CUBLAS_GEMM cublasDgemm
			
 
				-#define CPU_GEMM	DGEMM
			
 
				-#define CPU_ASUM	DASUM
			
 
				-#define CPU_IAMAX	IDAMAX
			
 
				-#define STARPU_GEMM(name)	starpu_dgemm_##name
			
 
				+#include "double.h"
			
 
				 
			
 
				 #include "xgemm_kernels.c"
			
 
				 #include "xgemm.c" 
			
--- a/examples/mult/dw_mult.c
+++ b/examples/mult/dw_mult.c
@@ -1,278 +0,0 @@
 
				-/*
			
 
				- * StarPU
			
 
				- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
			
 
				- *
			
 
				- * This program is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * This program is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include "dw_mult.h"
			
 
				-
			
 
				-#define TAG(taskx, tasky)	((((unsigned long long)(taskx))<<32) | (unsigned long long)(tasky))
			
 
				-
			
 
				-
			
 
				-
			
 
				-float *A, *B, *C;
			
 
				-starpu_data_handle A_handle, B_handle, C_handle;
			
 
				-
			
 
				-/*
			
 
				- * That program should compute C = A * B 
			
 
				- * 
			
 
				- *   A of size (z,y)
			
 
				- *   B of size (x,z)
			
 
				- *   C of size (x,y)
			
 
				-
			
 
				-              |---------------|
			
 
				-            z |       B       |
			
 
				-              |---------------|
			
 
				-       z              x
			
 
				-     |----|   |---------------|
			
 
				-     |    |   |               |
			
 
				-     |    |   |               |
			
 
				-     | A  | y |       C       |
			
 
				-     |    |   |               |
			
 
				-     |    |   |               |
			
 
				-     |----|   |---------------|
			
 
				-
			
 
				- */
			
 
				-
			
 
				-void terminate(void)
			
 
				-{
			
 
				-
			
 
				-	fprintf(stderr, "unpartition !!\n");
			
 
				-	starpu_data_unpartition(C_handle, 0);
			
 
				-
			
 
				-	starpu_data_unregister(C_handle);
			
 
				-
			
 
				-	gettimeofday(&end, NULL);
			
 
				-
			
 
				-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				-
			
 
				-	display_stats(timing);
			
 
				-
			
 
				-#ifdef CHECK_OUTPUT
			
 
				-	/* check results */
			
 
				-	/* compute C = C - AB */
			
 
				-
			
 
				-	SGEMM("N", "N", ydim, xdim, zdim, -1.0f, A, ydim, B, zdim, 1.0f, C, ydim);
			
 
				-		
			
 
				-	/* make sure C = 0 */
			
 
				-	float err;
			
 
				-	err = SASUM(xdim*ydim, C, 1);	
			
 
				-	
			
 
				-	if (err < xdim*ydim*0.001) {
			
 
				-		fprintf(stderr, "Results are OK\n");
			
 
				-	}
			
 
				-	else {
			
 
				-		fprintf(stderr, "There were errors ... err = %f\n", err);
			
 
				-	}
			
 
				-#endif // CHECK_OUTPUT
			
 
				-}
			
 
				-
			
 
				-void callback_func(void *arg)
			
 
				-{
			
 
				-	/* do some accounting */
			
 
				-	int id = starpu_worker_get_id();
			
 
				-	flop_per_worker[id] += BLAS3_FLOP(conf.m, conf.n, conf.k);
			
 
				-	ls_per_worker[id] += BLAS3_LS(conf.m, conf.n, conf.k);
			
 
				-}
			
 
				-
			
 
				-static void init_problem_data(void)
			
 
				-{
			
 
				-	unsigned i,j;
			
 
				-
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	if (pin) {
			
 
				-		starpu_data_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(float));
			
 
				-		starpu_data_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(float));
			
 
				-		starpu_data_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(float));
			
 
				-	} else
			
 
				-#endif
			
 
				-	{
			
 
				-#ifdef STARPU_HAVE_POSIX_MEMALIGN
			
 
				-		posix_memalign((void **)&A, 4096, zdim*ydim*sizeof(float));
			
 
				-		posix_memalign((void **)&B, 4096, xdim*zdim*sizeof(float));
			
 
				-		posix_memalign((void **)&C, 4096, xdim*ydim*sizeof(float));
			
 
				-#else
			
 
				-		A = malloc(zdim*ydim*sizeof(float));
			
 
				-		B = malloc(xdim*zdim*sizeof(float));
			
 
				-		C = malloc(xdim*ydim*sizeof(float));
			
 
				-#endif
			
 
				-	}
			
 
				-
			
 
				-	/* fill the A and B matrices */
			
 
				-	if (norandom) {
			
 
				-		for (j=0; j < ydim; j++) {
			
 
				-			for (i=0; i < zdim; i++) {
			
 
				-				A[j+i*ydim] = (float)(i);
			
 
				-			}
			
 
				-		}
			
 
				-	
			
 
				-		for (j=0; j < zdim; j++) {
			
 
				-			for (i=0; i < xdim; i++) {
			
 
				-				B[j+i*zdim] = (float)(j);
			
 
				-			}
			
 
				-		}
			
 
				-	} 
			
 
				-	else {
			
 
				-#ifdef NORANDOM
			
 
				-		srand(2008);
			
 
				-		STARPU_ABORT();
			
 
				-#endif
			
 
				-		for (j=0; j < ydim; j++) {
			
 
				-			for (i=0; i < zdim; i++) {
			
 
				-				A[j+i*ydim] = (float)(starpu_drand48());
			
 
				-			}
			
 
				-		}
			
 
				-	
			
 
				-		for (j=0; j < zdim; j++) {
			
 
				-			for (i=0; i < xdim; i++) {
			
 
				-				B[j+i*zdim] = (float)(starpu_drand48());
			
 
				-			}
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	for (j=0; j < ydim; j++) {
			
 
				-		for (i=0; i < xdim; i++) {
			
 
				-			C[j+i*ydim] = (float)(0);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	display_memory_consumption();
			
 
				-}
			
 
				-
			
 
				-static void partition_mult_data(void)
			
 
				-{
			
 
				-	gettimeofday(&start, NULL);
			
 
				-
			
 
				-	starpu_matrix_data_register(&A_handle, 0, (uintptr_t)A, 
			
 
				-		ydim, ydim, zdim, sizeof(float));
			
 
				-	starpu_matrix_data_register(&B_handle, 0, (uintptr_t)B, 
			
 
				-		zdim, zdim, xdim, sizeof(float));
			
 
				-	starpu_matrix_data_register(&C_handle, 0, (uintptr_t)C, 
			
 
				-		ydim, ydim, xdim, sizeof(float));
			
 
				-
			
 
				-	starpu_data_set_wt_mask(C_handle, 1<<0);
			
 
				-
			
 
				-	conf.k = zdim;
			
 
				-	conf.m = ydim/nslicesy;
			
 
				-	conf.n = xdim/nslicesx;
			
 
				-
			
 
				-	struct starpu_data_filter f;
			
 
				-	f.filter_func = starpu_vertical_block_filter_func;
			
 
				-	f.nchildren = nslicesx;
			
 
				-	f.get_nchildren = NULL;
			
 
				-	f.get_child_ops = NULL;
			
 
				-		
			
 
				-	struct starpu_data_filter f2;
			
 
				-	f2.filter_func = starpu_block_filter_func;
			
 
				-	f2.nchildren = nslicesy;
			
 
				-	f2.get_nchildren = NULL;
			
 
				-	f2.get_child_ops = NULL;
			
 
				-		
			
 
				-	starpu_data_partition(B_handle, &f);
			
 
				-	starpu_data_partition(A_handle, &f2);
			
 
				-
			
 
				-	starpu_data_map_filters(C_handle, 2, &f, &f2);
			
 
				-}
			
 
				-
			
 
				-static starpu_codelet cl = {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
			
 
				-	.cpu_func = cpu_mult,
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	.cuda_func = cublas_mult,
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_GORDON
			
 
				-#ifdef SPU_FUNC_SGEMM
			
 
				-	.gordon_func = SPU_FUNC_SGEMM,
			
 
				-#else
			
 
				-#warning SPU_FUNC_SGEMM is not available
			
 
				-#endif
			
 
				-#endif
			
 
				-	.nbuffers = 3
			
 
				-};
			
 
				-
			
 
				-static void launch_codelets(void)
			
 
				-{
			
 
				-#ifdef STARPU_USE_FXT
			
 
				-	_starpu_fxt_register_thread(0);
			
 
				-#endif
			
 
				-	/* partition the work into slices */
			
 
				-	unsigned taskx, tasky;
			
 
				-
			
 
				-	srand(time(NULL));
			
 
				-
			
 
				-	/* should we use a single performance model for all archs and use an
			
 
				- 	 * acceleration factor ? */
			
 
				-	if (use_common_model) {
			
 
				-		cl.model = &sgemm_model_common;
			
 
				-	}
			
 
				-	else {
			
 
				-		cl.model = &sgemm_model;
			
 
				-	}
			
 
				-
			
 
				-	for (taskx = 0; taskx < nslicesx; taskx++) 
			
 
				-	{
			
 
				-		for (tasky = 0; tasky < nslicesy; tasky++)
			
 
				-		{
			
 
				-			/* A B[task] = C[task] */
			
 
				-			struct starpu_task *task = starpu_task_create();
			
 
				-
			
 
				-			task->cl = &cl;
			
 
				-			task->cl_arg = &conf;
			
 
				-			task->cl_arg_size = sizeof(struct block_conf);
			
 
				-
			
 
				-			task->callback_func = callback_func;
			
 
				-			task->callback_arg = NULL;
			
 
				-
			
 
				-			starpu_tag_t tag = TAG(taskx, tasky); 
			
 
				-
			
 
				-			task->use_tag = 1;
			
 
				-			task->tag_id = tag;
			
 
				-
			
 
				-			task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, tasky);
			
 
				-			task->buffers[0].mode = STARPU_R;
			
 
				-			task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, taskx);
			
 
				-			task->buffers[1].mode = STARPU_R;
			
 
				-			task->buffers[2].handle = 
			
 
				-				starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
			
 
				-			task->buffers[2].mode = STARPU_RW;
			
 
				-
			
 
				-			starpu_task_submit(task);
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-int main(__attribute__ ((unused)) int argc, 
			
 
				-	 __attribute__ ((unused)) char **argv)
			
 
				-{
			
 
				-
			
 
				-	parse_args(argc, argv);
			
 
				-
			
 
				-	/* start the runtime */
			
 
				-	starpu_init(NULL);
			
 
				-	starpu_helper_cublas_init();
			
 
				-
			
 
				-	init_problem_data();
			
 
				-
			
 
				-	partition_mult_data();
			
 
				-
			
 
				-	launch_codelets();
			
 
				-
			
 
				-	starpu_task_wait_for_all();
			
 
				-
			
 
				-	terminate();
			
 
				-	
			
 
				-	starpu_helper_cublas_shutdown();
			
 
				-	starpu_shutdown();
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
--- a/examples/mult/dw_mult.h
+++ b/examples/mult/dw_mult.h
@@ -182,15 +182,15 @@ static void parse_args(int argc, char **argv)
 
				 static void display_memory_consumption(void)
			
 
				 {
			
 
				 	fprintf(stderr, "Total memory : %ld MB\n",
			
 
				-		(MAXSLICESY*MAXSLICESZ*sizeof(float *) 
			
 
				-		+ MAXSLICESZ*MAXSLICESX*sizeof(float *)
			
 
				-		+ MAXSLICESY*MAXSLICESX*sizeof(float *)
			
 
				+		(MAXSLICESY*MAXSLICESZ*sizeof(TYPE *) 
			
 
				+		+ MAXSLICESZ*MAXSLICESX*sizeof(TYPE *)
			
 
				+		+ MAXSLICESY*MAXSLICESX*sizeof(TYPE *)
			
 
				 		+ MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle)
			
 
				 		+ MAXSLICESZ*MAXSLICESX*sizeof(starpu_data_handle)
			
 
				 		+ MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle)
			
 
				-		+ ydim*zdim*sizeof(float)
			
 
				-		+ zdim*xdim*sizeof(float)
			
 
				-		+ ydim*xdim*sizeof(float))/(1024*1024) );
			
 
				+		+ ydim*zdim*sizeof(TYPE)
			
 
				+		+ zdim*xdim*sizeof(TYPE)
			
 
				+		+ ydim*xdim*sizeof(TYPE))/(1024*1024) );
			
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
--- a/examples/mult/dw_mult_no_stride.c
+++ b/examples/mult/dw_mult_no_stride.c
@@ -14,14 +14,16 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				+#include "simple.h"
			
 
				 #include "dw_mult.h"
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				 #include "gordon/func_sgemm_ibm.h"
			
 
				 #endif
			
 
				+#include "xgemm_kernels.c"
			
 
				 
			
 
				-float *A[MAXSLICESY][MAXSLICESZ];
			
 
				-float *B[MAXSLICESZ][MAXSLICESX];
			
 
				-float *C[MAXSLICESY][MAXSLICESX];
			
 
				+TYPE *A[MAXSLICESY][MAXSLICESZ];
			
 
				+TYPE *B[MAXSLICESZ][MAXSLICESX];
			
 
				+TYPE *C[MAXSLICESY][MAXSLICESX];
			
 
				 
			
 
				 starpu_data_handle A_state[MAXSLICESY][MAXSLICESZ];
			
 
				 starpu_data_handle B_state[MAXSLICESZ][MAXSLICESX];
			
@@ -33,7 +35,10 @@ starpu_data_handle C_state[MAXSLICESY][MAXSLICESX];
 
				 static void submit_new_iter(unsigned x, unsigned y, unsigned iter);
			
 
				 
			
 
				 /*
			
 
				- * That program should compute C = A * B 
			
 
				+ * This program computes C = A * B 
			
 
				+ *
			
 
				+ * The difference with xgemm.c is that matrices are here already split in
			
 
				+ * blocks, and thus no data partitioning is needed.
			
 
				  * 
			
 
				  *   A of size (z,y)
			
 
				  *   B of size (x,z)
			
@@ -59,6 +64,14 @@ static void init_problem_data(void)
 
				 {
			
 
				 	unsigned i,j;
			
 
				 
			
 
				+	/* debug ... */
			
 
				+	memset(A, 0, MAXSLICESY*MAXSLICESZ*sizeof(TYPE *));
			
 
				+	memset(B, 0, MAXSLICESZ*MAXSLICESZ*sizeof(TYPE *));
			
 
				+	memset(C, 0, MAXSLICESY*MAXSLICESX*sizeof(TYPE *));
			
 
				+	memset(&A_state, 0, MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle));
			
 
				+	memset(&B_state, 0, MAXSLICESZ*MAXSLICESZ*sizeof(starpu_data_handle));
			
 
				+	memset(&C_state, 0, MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle));
			
 
				+
			
 
				 	/* Allocate grids of buffer */
			
 
				 	/* TODO pin ... */
			
 
				 	unsigned z, y, x;
			
@@ -68,9 +81,9 @@ static void init_problem_data(void)
 
				 		for (z = 0; z < nslicesz; z++)
			
 
				 		{
			
 
				 #ifdef STARPU_HAVE_POSIX_MEMALIGN
			
 
				-			posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(float));
			
 
				+			posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
			
 
				 #else
			
 
				-			A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(float));
			
 
				+			A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
			
 
				 #endif
			
 
				 			assert(A[y][z]);
			
 
				 		}
			
@@ -81,9 +94,9 @@ static void init_problem_data(void)
 
				 		for (x = 0; x < nslicesx; x++)
			
 
				 		{
			
 
				 #ifdef STARPU_HAVE_POSIX_MEMALIGN
			
 
				-			posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(float));
			
 
				+			posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
			
 
				 #else
			
 
				-			B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(float));
			
 
				+			B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
			
 
				 #endif
			
 
				 			assert(B[z][x]);
			
 
				 		}
			
@@ -94,9 +107,9 @@ static void init_problem_data(void)
 
				 		for (x = 0; x < nslicesx; x++)
			
 
				 		{
			
 
				 #ifdef STARPU_HAVE_POSIX_MEMALIGN
			
 
				-			posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(float));
			
 
				+			posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
			
 
				 #else
			
 
				-			C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(float));
			
 
				+			C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
			
 
				 #endif
			
 
				 			assert(C[y][x]);
			
 
				 		}
			
@@ -111,7 +124,7 @@ static void init_problem_data(void)
 
				 				for (j = 0; j < BLOCKSIZEY; j++)
			
 
				 					for (i = 0; i < BLOCKSIZEZ; i++)
			
 
				 					{
			
 
				-						A[blocky][blockz][i*BLOCKSIZEY + j] = (float)(1 + blockz + blocky*nslicesz);
			
 
				+						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(1 + blockz + blocky*nslicesz);
			
 
				 					}
			
 
				 
			
 
				 		for (blockz = 0; blockz < nslicesz; blockz++)
			
@@ -119,7 +132,7 @@ static void init_problem_data(void)
 
				 				for (j = 0; j < BLOCKSIZEZ; j++)
			
 
				 					for (i = 0; i < BLOCKSIZEX; i++)
			
 
				 					{
			
 
				-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (float)(1 + blockx + blockz*nslicesx);
			
 
				+						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(1 + blockx + blockz*nslicesx);
			
 
				 					}
			
 
				 	} 
			
 
				 	else {
			
@@ -128,7 +141,7 @@ static void init_problem_data(void)
 
				 				for (j = 0; j < BLOCKSIZEY; j++)
			
 
				 					for (i = 0; i < BLOCKSIZEZ; i++)
			
 
				 					{
			
 
				-						A[blocky][blockz][i*BLOCKSIZEY + j] = (float)(starpu_drand48());
			
 
				+						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(starpu_drand48());
			
 
				 					}
			
 
				 
			
 
				 		for (blockz = 0; blockz < nslicesz; blockz++)
			
@@ -136,7 +149,7 @@ static void init_problem_data(void)
 
				 				for (j = 0; j < BLOCKSIZEZ; j++)
			
 
				 					for (i = 0; i < BLOCKSIZEX; i++)
			
 
				 					{
			
 
				-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (float)(starpu_drand48());
			
 
				+						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(starpu_drand48());
			
 
				 					}
			
 
				 
			
 
				 	}
			
@@ -146,9 +159,11 @@ static void init_problem_data(void)
 
				 			for (j = 0; j < BLOCKSIZEY; j++)
			
 
				 				for (i = 0; i < BLOCKSIZEX; i++)
			
 
				 				{
			
 
				-					C[blocky][blockx][i*BLOCKSIZEY + j] = (float)(blockx + blocky*nslicesx + 1);
			
 
				+					C[blocky][blockx][i*BLOCKSIZEY + j] = (TYPE)(blockx + blocky*nslicesx + 1);
			
 
				 				}
			
 
				 
			
 
				+	/* TODO: aren't we supposed to set data consistency to relaxed, since
			
 
				+	 * tags are supposed to provide the correct dependencies? */
			
 
				 
			
 
				 	/* declare the StarPU data to monitor */
			
 
				 	for (y = 0; y < nslicesy; y++)
			
@@ -156,7 +171,7 @@ static void init_problem_data(void)
 
				 		for (z = 0; z < nslicesz; z++)
			
 
				 		{
			
 
				 			starpu_matrix_data_register(&A_state[y][z], 0, (uintptr_t)A[y][z], 
			
 
				-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(float));
			
 
				+				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(TYPE));
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -165,7 +180,7 @@ static void init_problem_data(void)
 
				 		for (x = 0; x < nslicesx; x++)
			
 
				 		{
			
 
				 			starpu_matrix_data_register(&B_state[z][x], 0, (uintptr_t)B[z][x], 
			
 
				-				BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(float));
			
 
				+				BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(TYPE));
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -174,7 +189,7 @@ static void init_problem_data(void)
 
				 		for (x = 0; x < nslicesx; x++)
			
 
				 		{
			
 
				 			starpu_matrix_data_register(&C_state[y][x], 0, (uintptr_t)C[y][x], 
			
 
				-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(float));
			
 
				+				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(TYPE));
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -184,6 +199,8 @@ static void init_problem_data(void)
 
				 	conf.n = BLOCKSIZEX;
			
 
				 #endif
			
 
				 
			
 
				+	fprintf(stderr, "block size : x %d y %d z %d\n", BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
			
 
				+
			
 
				 	display_memory_consumption();
			
 
				 }
			
 
				 
			
@@ -191,6 +208,41 @@ static void cleanup_problem(void)
 
				 {
			
 
				 	unsigned z, y, x;
			
 
				 
			
 
				+#ifdef CHECK_OUTPUT
			
 
				+	TYPE maxerr = 0.0;
			
 
				+	TYPE err;
			
 
				+	fprintf(stderr, "Checking results ....");
			
 
				+
			
 
				+	for (y = 0; y < nslicesy; y++)
			
 
				+	{
			
 
				+		for (x = 0; x < nslicesx; x++)
			
 
				+		{
			
 
				+			for (z = 0; z < nslicesz; z++)
			
 
				+			{
			
 
				+				SGEMM("N", "N", BLOCKSIZEY, BLOCKSIZEX, BLOCKSIZEZ, -(TYPE)(niter), A[y][z], BLOCKSIZEY, B[z][x], BLOCKSIZEZ, 1.0f, C[y][x], BLOCKSIZEY);
			
 
				+
			
 
				+			}
			
 
				+
			
 
				+			/* make sure C - niter AB = 0 */
			
 
				+			err = SASUM(BLOCKSIZEX*BLOCKSIZEY, C[y][x], 1);
			
 
				+
			
 
				+			if (err > BLOCKSIZEX*BLOCKSIZEY*niter*0.001) 
			
 
				+				fprintf(stderr, "\nerr = %f ( x = %d y = %d ) ... ", err/niter, x, y );
			
 
				+
			
 
				+			maxerr = STARPU_MAX(err, maxerr);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	if (maxerr > BLOCKSIZEX*BLOCKSIZEY*niter*0.001)
			
 
				+	{
			
 
				+		fprintf(stderr, " maxerr = %f\n", maxerr/niter);
			
 
				+	}
			
 
				+	else {
			
 
				+		fprintf(stderr, " OK\n");
			
 
				+	}
			
 
				+	fflush(stderr);
			
 
				+#endif
			
 
				+
			
 
				 	for (y = 0; y < nslicesy; y++)
			
 
				 	{
			
 
				 		for (z = 0; z < nslicesz; z++)
			
@@ -226,20 +278,24 @@ struct cb2_s {
 
				 	unsigned iter;
			
 
				 };
			
 
				 
			
 
				+
			
 
				 static starpu_codelet cl = {
			
 
				-	.cpu_func = cpu_mult,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA
			
 
				+#ifdef SPU_FUNC_SGEMM
			
 
				+		|STARPU_GORDON
			
 
				+#endif
			
 
				+		,
			
 
				+	.cpu_func = STARPU_GEMM(cpu_mult),
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	.cuda_func = cublas_mult,
			
 
				+	.cuda_func = STARPU_GEMM(cublas_mult),
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				 	/* .gordon_func will be set by load_elf_sgemm */
			
 
				 #endif
			
 
				-
			
 
				-	.model = &sgemm_model,
			
 
				-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
			
 
				 	.nbuffers = 3
			
 
				 };
			
 
				 
			
 
				+
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				 static const char *spu_func_sgemm_elf_file = "./gordon/func_sgemm_ibm.spuelf";
			
 
				 static unsigned spu_func_sgemm_elf_id;
			
@@ -252,26 +308,20 @@ static void load_elf_sgemm(void)
 
				 
			
 
				 	spu_func_sgemm_ibm_id = gordon_register_kernel(spu_func_sgemm_elf_id, "func_sgemm_ibm");
			
 
				 
			
 
				-	
			
 
				 	gordon_load_plugin_on_all_spu(spu_func_sgemm_elf_id);
			
 
				 	gordon_load_kernel_on_all_spu(spu_func_sgemm_ibm_id);
			
 
				 
			
 
				 	cl.gordon_func = spu_func_sgemm_ibm_id;
			
 
				 }
			
 
				-
			
 
				 #endif // STARPU_USE_GORDON
			
 
				 
			
 
				 static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter)
			
 
				 {
			
 
				+	/* A B[task] = C[task] */
			
 
				 	struct starpu_task *task = starpu_task_create();
			
 
				 
			
 
				 	task->cl = &cl;
			
 
				 
			
 
				-#ifdef STARPU_USE_GORDON
			
 
				-	task->cl_arg = &conf;
			
 
				-	task->cl_arg_size = sizeof(struct ibm_sgemm_block_conf);
			
 
				-#endif
			
 
				-
			
 
				 	task->use_tag = 1;
			
 
				 	task->tag_id = TAG(z, y, x, iter);
			
 
				 
			
@@ -282,6 +332,11 @@ static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, un
 
				 	task->buffers[2].handle = C_state[y][x];
			
 
				 	task->buffers[2].mode = STARPU_RW;
			
 
				 
			
 
				+#ifdef STARPU_USE_GORDON
			
 
				+	task->cl_arg = &conf;
			
 
				+	task->cl_arg_size = sizeof(struct ibm_sgemm_block_conf);
			
 
				+#endif
			
 
				+
			
 
				 	return task;
			
 
				 }
			
 
				 
			
@@ -355,12 +410,21 @@ static void launch_codelets(void)
 
				 
			
 
				 	srand(time(NULL));
			
 
				 
			
 
				-	gettimeofday(&start, NULL);
			
 
				+	/* should we use a single performance model for all archs and use an
			
 
				+ 	 * acceleration factor ? */
			
 
				+	if (use_common_model) {
			
 
				+		cl.model = &STARPU_GEMM(model_common);
			
 
				+	}
			
 
				+	else {
			
 
				+		cl.model = &STARPU_GEMM(model);
			
 
				+	}
			
 
				 
			
 
				 	for (taskx = 0; taskx < nslicesx; taskx++) 
			
 
				-	for (tasky = 0; tasky < nslicesy; tasky++)
			
 
				 	{
			
 
				-		submit_new_iter(taskx, tasky, 0);
			
 
				+		for (tasky = 0; tasky < nslicesy; tasky++)
			
 
				+		{
			
 
				+			submit_new_iter(taskx, tasky, 0);
			
 
				+		}
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -381,16 +445,16 @@ int main(__attribute__ ((unused)) int argc,
 
				 
			
 
				 	init_problem_data();
			
 
				 
			
 
				+	gettimeofday(&start, NULL);
			
 
				+
			
 
				 	launch_codelets();
			
 
				 
			
 
				 	starpu_task_wait_for_all();
			
 
				 
			
 
				 	gettimeofday(&end, NULL);
			
 
				-
			
 
				 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				 	display_stats(timing);
			
 
				 
			
 
				-
			
 
				 	cleanup_problem();
			
 
				 
			
 
				 	starpu_helper_cublas_shutdown();
			
--- a/examples/mult/dw_mult_no_stride_no_tag.c
+++ b/examples/mult/dw_mult_no_stride_no_tag.c
@@ -14,10 +14,12 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				+#include "simple.h"
			
 
				 #include "dw_mult.h"
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				 #include "gordon/func_sgemm_ibm.h"
			
 
				 #endif
			
 
				+#include "xgemm_kernels.c"
			
 
				 
			
 
				 
			
 
				 struct pos {
			
@@ -26,9 +28,9 @@ struct pos {
 
				 
			
 
				 struct pos currentpos [MAXSLICESY][MAXSLICESX];
			
 
				 
			
 
				-float *A[MAXSLICESY][MAXSLICESZ];
			
 
				-float *B[MAXSLICESZ][MAXSLICESX];
			
 
				-float *C[MAXSLICESY][MAXSLICESX];
			
 
				+TYPE *A[MAXSLICESY][MAXSLICESZ];
			
 
				+TYPE *B[MAXSLICESZ][MAXSLICESX];
			
 
				+TYPE *C[MAXSLICESY][MAXSLICESX];
			
 
				 
			
 
				 starpu_data_handle A_state[MAXSLICESY][MAXSLICESZ];
			
 
				 starpu_data_handle B_state[MAXSLICESZ][MAXSLICESX];
			
@@ -37,8 +39,10 @@ starpu_data_handle C_state[MAXSLICESY][MAXSLICESX];
 
				 
			
 
				 static void callback_func_3(void *arg);
			
 
				 /*
			
 
				- * That program should compute C = A * B 
			
 
				+ * This program computes C = A * B 
			
 
				  * 
			
 
				+ * The difference with dw_mult_no_stride.c is that here we do not use tags, and
			
 
				+ * just rely on sequential data consistency.
			
 
				  *   A of size (z,y)
			
 
				  *   B of size (x,z)
			
 
				  *   C of size (x,y)
			
@@ -64,9 +68,9 @@ static void init_problem_data(void)
 
				 	unsigned i,j;
			
 
				 
			
 
				 	/* debug ... */
			
 
				-	memset(A, 0, MAXSLICESY*MAXSLICESZ*sizeof(float *));
			
 
				-	memset(B, 0, MAXSLICESZ*MAXSLICESZ*sizeof(float *));
			
 
				-	memset(C, 0, MAXSLICESY*MAXSLICESX*sizeof(float *));
			
 
				+	memset(A, 0, MAXSLICESY*MAXSLICESZ*sizeof(TYPE *));
			
 
				+	memset(B, 0, MAXSLICESZ*MAXSLICESZ*sizeof(TYPE *));
			
 
				+	memset(C, 0, MAXSLICESY*MAXSLICESX*sizeof(TYPE *));
			
 
				 	memset(&A_state, 0, MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle));
			
 
				 	memset(&B_state, 0, MAXSLICESZ*MAXSLICESZ*sizeof(starpu_data_handle));
			
 
				 	memset(&C_state, 0, MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle));
			
@@ -80,9 +84,9 @@ static void init_problem_data(void)
 
				 		for (z = 0; z < nslicesz; z++)
			
 
				 		{
			
 
				 #ifdef STARPU_HAVE_POSIX_MEMALIGN
			
 
				-			posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(float));
			
 
				+			posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
			
 
				 #else
			
 
				-			A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(float));
			
 
				+			A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
			
 
				 #endif
			
 
				 			assert(A[y][z]);
			
 
				 		}
			
@@ -93,9 +97,9 @@ static void init_problem_data(void)
 
				 		for (x = 0; x < nslicesx; x++)
			
 
				 		{
			
 
				 #ifdef STARPU_HAVE_POSIX_MEMALIGN
			
 
				-			posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(float));
			
 
				+			posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
			
 
				 #else
			
 
				-			B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(float));
			
 
				+			B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
			
 
				 #endif
			
 
				 			assert(B[z][x]);
			
 
				 		}
			
@@ -106,9 +110,9 @@ static void init_problem_data(void)
 
				 		for (x = 0; x < nslicesx; x++)
			
 
				 		{
			
 
				 #ifdef STARPU_HAVE_POSIX_MEMALIGN
			
 
				-			posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(float));
			
 
				+			posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
			
 
				 #else
			
 
				-			C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(float));
			
 
				+			C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
			
 
				 #endif
			
 
				 			currentpos[y][x].x = x;
			
 
				 			currentpos[y][x].y = y;
			
@@ -127,7 +131,7 @@ static void init_problem_data(void)
 
				 				for (j = 0; j < BLOCKSIZEY; j++)
			
 
				 					for (i = 0; i < BLOCKSIZEZ; i++)
			
 
				 					{
			
 
				-						A[blocky][blockz][i*BLOCKSIZEY + j] = (float)(1 + blockz + blocky*nslicesz);
			
 
				+						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(1 + blockz + blocky*nslicesz);
			
 
				 					}
			
 
				 
			
 
				 		for (blockz = 0; blockz < nslicesz; blockz++)
			
@@ -135,7 +139,7 @@ static void init_problem_data(void)
 
				 				for (j = 0; j < BLOCKSIZEZ; j++)
			
 
				 					for (i = 0; i < BLOCKSIZEX; i++)
			
 
				 					{
			
 
				-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (float)(1 + blockx + blockz*nslicesx);
			
 
				+						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(1 + blockx + blockz*nslicesx);
			
 
				 					}
			
 
				 	} 
			
 
				 	else {
			
@@ -144,7 +148,7 @@ static void init_problem_data(void)
 
				 				for (j = 0; j < BLOCKSIZEY; j++)
			
 
				 					for (i = 0; i < BLOCKSIZEZ; i++)
			
 
				 					{
			
 
				-						A[blocky][blockz][i*BLOCKSIZEY + j] = (float)(starpu_drand48());
			
 
				+						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(starpu_drand48());
			
 
				 					}
			
 
				 
			
 
				 		for (blockz = 0; blockz < nslicesz; blockz++)
			
@@ -152,7 +156,7 @@ static void init_problem_data(void)
 
				 				for (j = 0; j < BLOCKSIZEZ; j++)
			
 
				 					for (i = 0; i < BLOCKSIZEX; i++)
			
 
				 					{
			
 
				-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (float)(starpu_drand48());
			
 
				+						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(starpu_drand48());
			
 
				 					}
			
 
				 
			
 
				 	}
			
@@ -162,7 +166,7 @@ static void init_problem_data(void)
 
				 			for (j = 0; j < BLOCKSIZEY; j++)
			
 
				 				for (i = 0; i < BLOCKSIZEX; i++)
			
 
				 				{
			
 
				-					C[blocky][blockx][i*BLOCKSIZEY + j] = (float)0;
			
 
				+					C[blocky][blockx][i*BLOCKSIZEY + j] = (TYPE)0;
			
 
				 				}
			
 
				 
			
 
				 
			
@@ -172,7 +176,7 @@ static void init_problem_data(void)
 
				 		for (z = 0; z < nslicesz; z++)
			
 
				 		{
			
 
				 			starpu_matrix_data_register(&A_state[y][z], 0, (uintptr_t)A[y][z], 
			
 
				-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(float));
			
 
				+				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(TYPE));
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -181,7 +185,7 @@ static void init_problem_data(void)
 
				 		for (x = 0; x < nslicesx; x++)
			
 
				 		{
			
 
				 			starpu_matrix_data_register(&B_state[z][x], 0, (uintptr_t)B[z][x], 
			
 
				-				BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(float));
			
 
				+				BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(TYPE));
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -190,7 +194,7 @@ static void init_problem_data(void)
 
				 		for (x = 0; x < nslicesx; x++)
			
 
				 		{
			
 
				 			starpu_matrix_data_register(&C_state[y][x], 0, (uintptr_t)C[y][x], 
			
 
				-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(float));
			
 
				+				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(TYPE));
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -210,8 +214,8 @@ static void cleanup_problem(void)
 
				 	unsigned z, y, x;
			
 
				 
			
 
				 #ifdef CHECK_OUTPUT
			
 
				-	float maxerr = 0.0;
			
 
				-	float err;
			
 
				+	TYPE maxerr = 0.0;
			
 
				+	TYPE err;
			
 
				 	fprintf(stderr, "Checking results ....");
			
 
				 
			
 
				 	for (y = 0; y < nslicesy; y++)
			
@@ -220,7 +224,7 @@ static void cleanup_problem(void)
 
				 		{
			
 
				 			for (z = 0; z < nslicesz; z++)
			
 
				 			{
			
 
				-				SGEMM("N", "N", BLOCKSIZEY, BLOCKSIZEX, BLOCKSIZEZ, -(float)(niter), A[y][z], BLOCKSIZEY, B[z][x], BLOCKSIZEZ, 1.0f, C[y][x], BLOCKSIZEY);
			
 
				+				SGEMM("N", "N", BLOCKSIZEY, BLOCKSIZEX, BLOCKSIZEZ, -(TYPE)(niter), A[y][z], BLOCKSIZEY, B[z][x], BLOCKSIZEZ, 1.0f, C[y][x], BLOCKSIZEY);
			
 
				 
			
 
				 			}
			
 
				 
			
@@ -280,10 +284,14 @@ struct cb2_s {
 
				 
			
 
				 
			
 
				 static starpu_codelet cl = {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
			
 
				-	.cpu_func = cpu_mult,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA
			
 
				+#ifdef SPU_FUNC_SGEMM
			
 
				+		|STARPU_GORDON
			
 
				+#endif
			
 
				+		,
			
 
				+	.cpu_func = STARPU_GEMM(cpu_mult),
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	.cuda_func = cublas_mult,
			
 
				+	.cuda_func = STARPU_GEMM(cublas_mult),
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				 	/* .gordon_func will be set by load_elf_sgemm */
			
@@ -309,13 +317,11 @@ static void load_elf_sgemm(void)
 
				 
			
 
				 	cl.gordon_func = spu_func_sgemm_ibm_id;
			
 
				 }
			
 
				-#endif
			
 
				+#endif // STARPU_USE_GORDON
			
 
				 
			
 
				-
			
 
				-static void construct_task(unsigned x, unsigned y, unsigned z, unsigned iter, struct pos *posp)
			
 
				+static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter, struct pos *posp)
			
 
				 {
			
 
				-	struct starpu_task *task;
			
 
				-	task = starpu_task_create();
			
 
				+	struct starpu_task *task = starpu_task_create();
			
 
				 
			
 
				 	task->cl = &cl;
			
 
				 
			
@@ -337,7 +343,7 @@ static void construct_task(unsigned x, unsigned y, unsigned z, unsigned iter, st
 
				 	posp->z = z;
			
 
				 	posp->iter = iter;
			
 
				 
			
 
				-	starpu_task_submit(task);
			
 
				+	return task;
			
 
				 }
			
 
				 
			
 
				 
			
@@ -359,13 +365,15 @@ static void callback_func_3(void *arg)
 
				 
			
 
				 	if (z < nslicesz - 1)
			
 
				 	{
			
 
				-		construct_task(x, y, z+1, iter, posp);
			
 
				+		struct starpu_task *task = construct_task(x, y, z+1, iter, posp);
			
 
				+		starpu_task_submit(task);
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				 		if (iter < niter - 1)
			
 
				 		{
			
 
				-			construct_task(x, y, 0, iter+1, posp);
			
 
				+			struct starpu_task *task = construct_task(x, y, 0, iter+1, posp);
			
 
				+			starpu_task_submit(task);
			
 
				 		}
			
 
				 	}
			
 
				 }
			
@@ -383,12 +391,22 @@ static void launch_codelets(void)
 
				 
			
 
				 	srand(time(NULL));
			
 
				 
			
 
				-	gettimeofday(&start, NULL);
			
 
				+	/* should we use a single performance model for all archs and use an
			
 
				+ 	 * acceleration factor ? */
			
 
				+	if (use_common_model) {
			
 
				+		cl.model = &STARPU_GEMM(model_common);
			
 
				+	}
			
 
				+	else {
			
 
				+		cl.model = &STARPU_GEMM(model);
			
 
				+	}
			
 
				 
			
 
				 	for (taskx = 0; taskx < nslicesx; taskx++) 
			
 
				-	for (tasky = 0; tasky < nslicesy; tasky++)
			
 
				 	{
			
 
				-		construct_task(taskx, tasky, 0, 0, &currentpos[tasky][taskx]);
			
 
				+		for (tasky = 0; tasky < nslicesy; tasky++)
			
 
				+		{
			
 
				+			struct starpu_task *task = construct_task(taskx, tasky, 0, 0, &currentpos[tasky][taskx]);
			
 
				+			starpu_task_submit(task);
			
 
				+		}
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -409,14 +427,14 @@ int main(__attribute__ ((unused)) int argc,
 
				 
			
 
				 	init_problem_data();
			
 
				 
			
 
				+	gettimeofday(&start, NULL);
			
 
				+
			
 
				 	launch_codelets();
			
 
				 
			
 
				 	starpu_task_wait_for_all();
			
 
				 
			
 
				 	gettimeofday(&end, NULL);
			
 
				-
			
 
				 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				-
			
 
				 	display_stats(timing);
			
 
				 
			
 
				 	cleanup_problem();
			
--- a/examples/mult/gordon/func_dgemm_ibm.c
+++ b/examples/mult/gordon/func_dgemm_ibm.c
@@ -0,0 +1,42 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "func_gemm_ibm.h"
			
 
				+
			
 
				+#include <blas_s.h>
			
 
				+
			
 
				+void func_dgemm_ibm(__attribute__ ((unused)) void **alloc,
			
 
				+		__attribute__ ((unused)) void **in,
			
 
				+		__attribute__ ((unused)) void **inout,
			
 
				+		__attribute__ ((unused)) void **out)
			
 
				+{
			
 
				+	/* we assume data will be in A:R,B:R,C:RW mode
			
 
				+ 	 *  -> in[0] : describe problem
			
 
				+ 	 *  -> in[1] : A
			
 
				+ 	 *  -> in[2] : B
			
 
				+ 	 *  -> inout[0] : C
			
 
				+ 	 *
			
 
				+ 	 *   C = AB + C
			
 
				+ 	 *   but, being in fortran ordering, we compute
			
 
				+ 	 *   t(C) = t(B)t(A) + t(C) instead
			
 
				+ 	 */
			
 
				+	struct ibm_gemm_block_conf *conf = in[0];
			
 
				+	double *A = in[1];
			
 
				+	double *B = in[2];
			
 
				+	double *C = inout[0];
			
 
				+
			
 
				+	dgemm_spu(conf->m, conf->n, conf->k, B, A, C);
			
 
				+}
			
--- a/examples/mult/gordon/func_sgemm_ibm.h
+++ b/examples/mult/gordon/func_sgemm_ibm.h
--- a/examples/mult/gordon/func_sgemm_ibm.c
+++ b/examples/mult/gordon/func_sgemm_ibm.c
@@ -14,7 +14,7 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				-#include "func_sgemm_ibm.h"
			
 
				+#include "func_gemm_ibm.h"
			
 
				 
			
 
				 #include <blas_s.h>
			
 
				 
			
@@ -33,7 +33,7 @@ void func_sgemm_ibm(__attribute__ ((unused)) void **alloc,
 
				  	 *   but, being in fortran ordering, we compute
			
 
				  	 *   t(C) = t(B)t(A) + t(C) instead
			
 
				  	 */
			
 
				-	struct ibm_sgemm_block_conf *conf = in[0];
			
 
				+	struct ibm_gemm_block_conf *conf = in[0];
			
 
				 	float *A = in[1];
			
 
				 	float *B = in[2];
			
 
				 	float *C = inout[0];
			
--- a/examples/mult/sgemm.c
+++ b/examples/mult/sgemm.c
@@ -14,13 +14,7 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				-#define TYPE	float
			
 
				-
			
 
				-#define CUBLAS_GEMM cublasSgemm
			
 
				-#define CPU_GEMM	SGEMM
			
 
				-#define CPU_ASUM	SASUM
			
 
				-#define CPU_IAMAX	ISAMAX
			
 
				-#define STARPU_GEMM(name)	starpu_sgemm_##name
			
 
				+#include "simple.h"
			
 
				 
			
 
				 #include "xgemm_kernels.c"
			
 
				 #include "xgemm.c" 
			
--- a/examples/mult/sgemm_kernels.c
+++ b/examples/mult/sgemm_kernels.c
@@ -1,77 +0,0 @@
 
				-/*
			
 
				- * StarPU
			
 
				- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
			
 
				- *
			
 
				- * This program is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * This program is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include <starpu.h>
			
 
				-#include <starpu_cuda.h>
			
 
				-#include <common/blas.h>
			
 
				-
			
 
				-#define COMMON_CODE			\
			
 
				-	uint32_t nxC, nyC, nyA;		\
			
 
				-	uint32_t ldA, ldB, ldC;		\
			
 
				-					\
			
 
				-	float *subA;			\
			
 
				-	float *subB;			\
			
 
				-	float *subC;			\
			
 
				-					\
			
 
				-	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);	\
			
 
				-	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);	\
			
 
				-	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);	\
			
 
				-					\
			
 
				-	nxC = STARPU_MATRIX_GET_NX(descr[2]);		\
			
 
				-	nyC = STARPU_MATRIX_GET_NY(descr[2]);		\
			
 
				-	nyA = STARPU_MATRIX_GET_NY(descr[0]);		\
			
 
				-					\
			
 
				-	ldA = STARPU_MATRIX_GET_LD(descr[0]);		\
			
 
				-	ldB = STARPU_MATRIX_GET_LD(descr[1]);		\
			
 
				-	ldC = STARPU_MATRIX_GET_LD(descr[2]);
			
 
				-
			
 
				-
			
 
				-
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-
			
 
				-#ifdef STARPU_HAVE_MAGMA
			
 
				-#define GPU_SGEMM magmablas_sgemm
			
 
				-#else
			
 
				-#define GPU_SGEMM cublasSgemm
			
 
				-#endif
			
 
				-
			
 
				-void cublas_mult(void *descr[], __attribute__((unused)) void *arg)
			
 
				-{
			
 
				-	COMMON_CODE
			
 
				-
			
 
				-	starpu_trace_user_event(0x42);
			
 
				-
			
 
				-	GPU_SGEMM('n', 'n', nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 
			
 
				-					     0.0f, subC, ldC);
			
 
				-	cublasStatus st;
			
 
				-	st = cublasGetError();
			
 
				-	if (st != CUBLAS_STATUS_SUCCESS)
			
 
				-		STARPU_ABORT();
			
 
				-
			
 
				-	cudaThreadSynchronize();
			
 
				-
			
 
				-	starpu_trace_user_event(0x43);
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
			
 
				-{
			
 
				-	COMMON_CODE
			
 
				-
			
 
				-	starpu_trace_user_event(0x42);
			
 
				-	SGEMM("N", "N", nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 0.0f, subC, ldC);
			
 
				-	starpu_trace_user_event(0x43);
			
 
				-}
			
--- a/examples/mult/xgemm.c
+++ b/examples/mult/xgemm.c
@@ -16,15 +16,11 @@
 
				 
			
 
				 #include "dw_mult.h"
			
 
				 
			
 
				-#define str(s) #s
			
 
				-#define xstr(s)        str(s)
			
 
				-#define STARPU_GEMM_STR(name)  xstr(STARPU_GEMM(name))
			
 
				-
			
 
				 TYPE *A, *B, *C;
			
 
				 starpu_data_handle A_handle, B_handle, C_handle;
			
 
				 
			
 
				 /*
			
 
				- * That program should compute C = A * B 
			
 
				+ * This program computes C = A * B 
			
 
				  * 
			
 
				  *   A of size (z,y)
			
 
				  *   B of size (x,z)
			
@@ -49,17 +45,22 @@ static void check_output(void)
 
				 	/* check results */
			
 
				 	/* compute C = C - AB */
			
 
				 
			
 
				-	CPU_GEMM("N", "N", ydim, xdim, zdim, (TYPE)-1.0, A, ydim, B, zdim, (TYPE)1.0f, C, ydim);
			
 
				+	CPU_GEMM("N", "N", ydim, xdim, zdim, (TYPE)-1.0f, A, ydim, B, zdim, (TYPE)1.0f, C, ydim);
			
 
				 		
			
 
				 	/* make sure C = 0 */
			
 
				 	TYPE err;
			
 
				 	err = CPU_ASUM(xdim*ydim, C, 1);
			
 
				 
			
 
				-	int max;
			
 
				-	max = CPU_IAMAX(xdim*ydim, C, 1);
			
 
				+	if (err < xdim*ydim*0.001) {
			
 
				+		fprintf(stderr, "Results are OK\n");
			
 
				+	}
			
 
				+	else {
			
 
				+		int max;
			
 
				+		max = CPU_IAMAX(xdim*ydim, C, 1);
			
 
				 
			
 
				-	fprintf(stderr, "Avg error : %e\n", err/(xdim*ydim));
			
 
				-	fprintf(stderr, "Max error : %e\n", C[max]);
			
 
				+		fprintf(stderr, "There were errors ... err = %f\n", err);
			
 
				+		fprintf(stderr, "Max error : %e\n", C[max]);
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 void callback_func(void *arg)
			
@@ -108,6 +109,10 @@ static void init_problem_data(void)
 
				 		}
			
 
				 	} 
			
 
				 	else {
			
 
				+#ifdef NORANDOM
			
 
				+		srand(2008);
			
 
				+		STARPU_ABORT();
			
 
				+#endif
			
 
				 		for (j=0; j < ydim; j++) {
			
 
				 			for (i=0; i < zdim; i++) {
			
 
				 				A[j+i*ydim] = (TYPE)(starpu_drand48());
			
@@ -127,12 +132,7 @@ static void init_problem_data(void)
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	/* display memory consumption */
			
 
				-	fprintf(stderr, "Total memory : %ld MB\n",
			
 
				-		( ydim*zdim*sizeof(TYPE)
			
 
				-		+ zdim*xdim*sizeof(TYPE)
			
 
				-		+ ydim*xdim*sizeof(TYPE) )/(1024*1024));
			
 
				-
			
 
				+	display_memory_consumption();
			
 
				 }
			
 
				 
			
 
				 static void partition_mult_data(void)
			
@@ -170,60 +170,94 @@ static void partition_mult_data(void)
 
				 
			
 
				 static void unpartition_mult_data(void)
			
 
				 {
			
 
				+	fprintf(stderr, "unpartition !!\n");
			
 
				+
			
 
				 	starpu_data_unpartition(C_handle, 0);
			
 
				 
			
 
				 	starpu_data_unregister(C_handle);
			
 
				 }
			
 
				 
			
 
				-static struct starpu_perfmodel_t gemm_model = {
			
 
				-	.type = STARPU_HISTORY_BASED,
			
 
				-#ifdef STARPU_ATLAS
			
 
				-	.symbol = STARPU_GEMM_STR(gemm_atlas)
			
 
				-#elif defined(STARPU_GOTO)
			
 
				-	.symbol = STARPU_GEMM_STR(gemm_goto)
			
 
				-#else
			
 
				-	.symbol = STARPU_GEMM_STR(gemm)
			
 
				-#endif
			
 
				-};
			
 
				-
			
 
				 static starpu_codelet cl = {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA
			
 
				+#ifdef SPU_FUNC_SGEMM
			
 
				+		|STARPU_GORDON
			
 
				+#endif
			
 
				+		,
			
 
				 	.cpu_func = STARPU_GEMM(cpu_mult),
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_func = STARPU_GEMM(cublas_mult),
			
 
				 #endif
			
 
				-	.model = &gemm_model,
			
 
				+#ifdef STARPU_USE_GORDON
			
 
				+#ifdef SPU_FUNC_SGEMM
			
 
				+	.gordon_func = SPU_FUNC_SGEMM,
			
 
				+#else
			
 
				+#warning SPU_FUNC_SGEMM is not available
			
 
				+#endif
			
 
				+#endif
			
 
				 	.nbuffers = 3
			
 
				 };
			
 
				 
			
 
				+static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter)
			
 
				+{
			
 
				+	/* A B[task] = C[task] */
			
 
				+	struct starpu_task *task = starpu_task_create();
			
 
				+
			
 
				+	task->cl = &cl;
			
 
				+
			
 
				+	/* we have a callback to do some accounting */
			
 
				+	task->callback_func = callback_func;
			
 
				+	task->callback_arg = NULL;
			
 
				+
			
 
				+	task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, y);
			
 
				+	task->buffers[0].mode = STARPU_R;
			
 
				+	task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, x);
			
 
				+	task->buffers[1].mode = STARPU_R;
			
 
				+	task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, x, y);
			
 
				+	task->buffers[2].mode = STARPU_RW;
			
 
				+
			
 
				+	task->cl_arg = &conf;
			
 
				+	task->cl_arg_size = sizeof(struct block_conf);
			
 
				+	return task;
			
 
				+}
			
 
				+
			
 
				+static void submit_new_iter(unsigned x, unsigned y, unsigned iter)
			
 
				+{
			
 
				+	unsigned z;
			
 
				+
			
 
				+	z = 0;
			
 
				+
			
 
				+	{
			
 
				+		struct starpu_task *task;
			
 
				+		task = construct_task(x, y, z, iter);
			
 
				+
			
 
				+		starpu_task_submit(task);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 static void launch_codelets(void)
			
 
				 {
			
 
				+#ifdef STARPU_USE_FXT
			
 
				+	_starpu_fxt_register_thread(0);
			
 
				+#endif
			
 
				 	/* partition the work into slices */
			
 
				 	unsigned taskx, tasky;
			
 
				 
			
 
				+	srand(time(NULL));
			
 
				+
			
 
				+	/* should we use a single performance model for all archs and use an
			
 
				+ 	 * acceleration factor ? */
			
 
				+	if (use_common_model) {
			
 
				+		cl.model = &STARPU_GEMM(model_common);
			
 
				+	}
			
 
				+	else {
			
 
				+		cl.model = &STARPU_GEMM(model);
			
 
				+	}
			
 
				+
			
 
				 	for (taskx = 0; taskx < nslicesx; taskx++) 
			
 
				 	{
			
 
				 		for (tasky = 0; tasky < nslicesy; tasky++)
			
 
				 		{
			
 
				-			/* A B[task] = C[task] */
			
 
				-			struct starpu_task *task = starpu_task_create();
			
 
				-
			
 
				-			task->cl = &cl;
			
 
				-			task->cl_arg = &conf;
			
 
				-			task->cl_arg_size = sizeof(struct block_conf);
			
 
				-
			
 
				-			/* we have a callback to do some accounting */
			
 
				-			task->callback_func = callback_func;
			
 
				-			task->callback_arg = NULL;
			
 
				-
			
 
				-			task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, tasky);
			
 
				-			task->buffers[0].mode = STARPU_R;
			
 
				-			task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, taskx);
			
 
				-			task->buffers[1].mode = STARPU_R;
			
 
				-			task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
			
 
				-			task->buffers[2].mode = STARPU_RW;
			
 
				-
			
 
				-			starpu_task_submit(task);
			
 
				+			submit_new_iter(taskx, tasky, 0);
			
 
				 		}
			
 
				 	}
			
 
				 }
			
@@ -236,6 +270,7 @@ int main(__attribute__ ((unused)) int argc,
 
				 
			
 
				 	/* start the runtime */
			
 
				 	starpu_init(NULL);
			
 
				+
			
 
				 	starpu_helper_cublas_init();
			
 
				 
			
 
				 	init_problem_data();
			
@@ -245,11 +280,11 @@ int main(__attribute__ ((unused)) int argc,
 
				 	partition_mult_data();
			
 
				 
			
 
				 	launch_codelets();
			
 
				+
			
 
				 	starpu_task_wait_for_all();
			
 
				 
			
 
				 	gettimeofday(&end, NULL);
			
 
				-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
			
 
				-					(end.tv_usec - start.tv_usec));
			
 
				+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				 	display_stats(timing);
			
 
				 
			
 
				 	unpartition_mult_data();
			
--- a/examples/mult/xgemm_kernels.c
+++ b/examples/mult/xgemm_kernels.c
@@ -41,13 +41,20 @@
 
				 
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				+
			
 
				+#ifdef STARPU_HAVE_MAGMA
			
 
				+#define GPU_GEMM MAGMABLAS_GEMM
			
 
				+#else
			
 
				+#define GPU_GEMM CUBLAS_GEMM
			
 
				+#endif
			
 
				+
			
 
				 void STARPU_GEMM(cublas_mult)(void *descr[], __attribute__((unused)) void *arg)
			
 
				 {
			
 
				 	COMMON_CODE
			
 
				 
			
 
				 	starpu_trace_user_event(0x42);
			
 
				 
			
 
				-	CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
			
 
				+	GPU_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
			
 
				 					     (TYPE)0.0, subC, ldC);
			
 
				 	cublasStatus st;
			
 
				 	st = cublasGetError();
			
@@ -65,8 +72,6 @@ void STARPU_GEMM(cpu_mult)(void *descr[], __attribute__((unused))  void *arg)
 
				 	COMMON_CODE
			
 
				 
			
 
				 	starpu_trace_user_event(0x42);
			
 
				-	CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
			
 
				-					  (TYPE)0.0, subC, ldC);
			
 
				-
			
 
				+	CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
			
 
				 	starpu_trace_user_event(0x43);
			
 
				 }