浏览代码

harmonize mult codes

Samuel Thibault 14 年之前
父节点
当前提交
a48b287faa

+ 3 - 3
examples/Makefile.am

@@ -119,7 +119,9 @@ noinst_HEADERS = 				\
 	common/blas_model.h			\
 	common/blas.h				\
 	mult/dw_mult.h				\
-	mult/gordon/func_sgemm_ibm.h		\
+	mult/simple.h				\
+	mult/double.h				\
+	mult/gordon/func_gemm_ibm.h		\
 	gordon/null.h				\
 	fortran/bindings/StarPU_fortran.h	\
 	ppm_downscaler/ppm_downscaler.h		\
@@ -338,13 +340,11 @@ mult_dgemm_SOURCES = 				\
 
 mult_dw_mult_no_stride_SOURCES = 		\
 	mult/dw_mult_no_stride.c		\
-	mult/sgemm_kernels.c			\
 	common/blas.c				\
 	common/blas_model.c
 
 mult_dw_mult_no_stride_no_tag_SOURCES =		\
 	mult/dw_mult_no_stride_no_tag.c		\
-	mult/sgemm_kernels.c			\
 	common/blas.c				\
 	common/blas_model.c
 

+ 18 - 3
examples/common/blas_model.h

@@ -21,8 +21,7 @@
 
 double gemm_cost(starpu_buffer_descr *descr);
 
-static struct starpu_perfmodel_t sgemm_model = {
-	.cost_model = gemm_cost,
+static struct starpu_perfmodel_t starpu_sgemm_model = {
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
 	.symbol = "sgemm_atlas"
@@ -33,7 +32,23 @@ static struct starpu_perfmodel_t sgemm_model = {
 #endif
 };
 
-static struct starpu_perfmodel_t sgemm_model_common = {
+static struct starpu_perfmodel_t starpu_sgemm_model_common = {
+	.cost_model = gemm_cost,
+	.type = STARPU_COMMON,
+};
+
+static struct starpu_perfmodel_t starpu_dgemm_model = {
+	.type = STARPU_HISTORY_BASED,
+#ifdef STARPU_ATLAS
+	.symbol = "dgemm_atlas"
+#elif defined(STARPU_GOTO)
+	.symbol = "dgemm_goto"
+#else
+	.symbol = "dgemm"
+#endif
+};
+
+static struct starpu_perfmodel_t starpu_dgemm_model_common = {
 	.cost_model = gemm_cost,
 	.type = STARPU_COMMON,
 };

+ 1 - 7
examples/mult/dgemm.c

@@ -14,13 +14,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#define TYPE	double
-
-#define CUBLAS_GEMM cublasDgemm
-#define CPU_GEMM	DGEMM
-#define CPU_ASUM	DASUM
-#define CPU_IAMAX	IDAMAX
-#define STARPU_GEMM(name)	starpu_dgemm_##name
+#include "double.h"
 
 #include "xgemm_kernels.c"
 #include "xgemm.c" 

+ 0 - 278
examples/mult/dw_mult.c

@@ -1,278 +0,0 @@
-/*
- * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "dw_mult.h"
-
-#define TAG(taskx, tasky)	((((unsigned long long)(taskx))<<32) | (unsigned long long)(tasky))
-
-
-
-float *A, *B, *C;
-starpu_data_handle A_handle, B_handle, C_handle;
-
-/*
- * That program should compute C = A * B 
- * 
- *   A of size (z,y)
- *   B of size (x,z)
- *   C of size (x,y)
-
-              |---------------|
-            z |       B       |
-              |---------------|
-       z              x
-     |----|   |---------------|
-     |    |   |               |
-     |    |   |               |
-     | A  | y |       C       |
-     |    |   |               |
-     |    |   |               |
-     |----|   |---------------|
-
- */
-
-void terminate(void)
-{
-
-	fprintf(stderr, "unpartition !!\n");
-	starpu_data_unpartition(C_handle, 0);
-
-	starpu_data_unregister(C_handle);
-
-	gettimeofday(&end, NULL);
-
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-
-	display_stats(timing);
-
-#ifdef CHECK_OUTPUT
-	/* check results */
-	/* compute C = C - AB */
-
-	SGEMM("N", "N", ydim, xdim, zdim, -1.0f, A, ydim, B, zdim, 1.0f, C, ydim);
-		
-	/* make sure C = 0 */
-	float err;
-	err = SASUM(xdim*ydim, C, 1);	
-	
-	if (err < xdim*ydim*0.001) {
-		fprintf(stderr, "Results are OK\n");
-	}
-	else {
-		fprintf(stderr, "There were errors ... err = %f\n", err);
-	}
-#endif // CHECK_OUTPUT
-}
-
-void callback_func(void *arg)
-{
-	/* do some accounting */
-	int id = starpu_worker_get_id();
-	flop_per_worker[id] += BLAS3_FLOP(conf.m, conf.n, conf.k);
-	ls_per_worker[id] += BLAS3_LS(conf.m, conf.n, conf.k);
-}
-
-static void init_problem_data(void)
-{
-	unsigned i,j;
-
-#ifdef STARPU_USE_CUDA
-	if (pin) {
-		starpu_data_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(float));
-		starpu_data_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(float));
-		starpu_data_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(float));
-	} else
-#endif
-	{
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
-		posix_memalign((void **)&A, 4096, zdim*ydim*sizeof(float));
-		posix_memalign((void **)&B, 4096, xdim*zdim*sizeof(float));
-		posix_memalign((void **)&C, 4096, xdim*ydim*sizeof(float));
-#else
-		A = malloc(zdim*ydim*sizeof(float));
-		B = malloc(xdim*zdim*sizeof(float));
-		C = malloc(xdim*ydim*sizeof(float));
-#endif
-	}
-
-	/* fill the A and B matrices */
-	if (norandom) {
-		for (j=0; j < ydim; j++) {
-			for (i=0; i < zdim; i++) {
-				A[j+i*ydim] = (float)(i);
-			}
-		}
-	
-		for (j=0; j < zdim; j++) {
-			for (i=0; i < xdim; i++) {
-				B[j+i*zdim] = (float)(j);
-			}
-		}
-	} 
-	else {
-#ifdef NORANDOM
-		srand(2008);
-		STARPU_ABORT();
-#endif
-		for (j=0; j < ydim; j++) {
-			for (i=0; i < zdim; i++) {
-				A[j+i*ydim] = (float)(starpu_drand48());
-			}
-		}
-	
-		for (j=0; j < zdim; j++) {
-			for (i=0; i < xdim; i++) {
-				B[j+i*zdim] = (float)(starpu_drand48());
-			}
-		}
-	}
-
-	for (j=0; j < ydim; j++) {
-		for (i=0; i < xdim; i++) {
-			C[j+i*ydim] = (float)(0);
-		}
-	}
-
-	display_memory_consumption();
-}
-
-static void partition_mult_data(void)
-{
-	gettimeofday(&start, NULL);
-
-	starpu_matrix_data_register(&A_handle, 0, (uintptr_t)A, 
-		ydim, ydim, zdim, sizeof(float));
-	starpu_matrix_data_register(&B_handle, 0, (uintptr_t)B, 
-		zdim, zdim, xdim, sizeof(float));
-	starpu_matrix_data_register(&C_handle, 0, (uintptr_t)C, 
-		ydim, ydim, xdim, sizeof(float));
-
-	starpu_data_set_wt_mask(C_handle, 1<<0);
-
-	conf.k = zdim;
-	conf.m = ydim/nslicesy;
-	conf.n = xdim/nslicesx;
-
-	struct starpu_data_filter f;
-	f.filter_func = starpu_vertical_block_filter_func;
-	f.nchildren = nslicesx;
-	f.get_nchildren = NULL;
-	f.get_child_ops = NULL;
-		
-	struct starpu_data_filter f2;
-	f2.filter_func = starpu_block_filter_func;
-	f2.nchildren = nslicesy;
-	f2.get_nchildren = NULL;
-	f2.get_child_ops = NULL;
-		
-	starpu_data_partition(B_handle, &f);
-	starpu_data_partition(A_handle, &f2);
-
-	starpu_data_map_filters(C_handle, 2, &f, &f2);
-}
-
-static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
-	.cpu_func = cpu_mult,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = cublas_mult,
-#endif
-#ifdef STARPU_USE_GORDON
-#ifdef SPU_FUNC_SGEMM
-	.gordon_func = SPU_FUNC_SGEMM,
-#else
-#warning SPU_FUNC_SGEMM is not available
-#endif
-#endif
-	.nbuffers = 3
-};
-
-static void launch_codelets(void)
-{
-#ifdef STARPU_USE_FXT
-	_starpu_fxt_register_thread(0);
-#endif
-	/* partition the work into slices */
-	unsigned taskx, tasky;
-
-	srand(time(NULL));
-
-	/* should we use a single performance model for all archs and use an
- 	 * acceleration factor ? */
-	if (use_common_model) {
-		cl.model = &sgemm_model_common;
-	}
-	else {
-		cl.model = &sgemm_model;
-	}
-
-	for (taskx = 0; taskx < nslicesx; taskx++) 
-	{
-		for (tasky = 0; tasky < nslicesy; tasky++)
-		{
-			/* A B[task] = C[task] */
-			struct starpu_task *task = starpu_task_create();
-
-			task->cl = &cl;
-			task->cl_arg = &conf;
-			task->cl_arg_size = sizeof(struct block_conf);
-
-			task->callback_func = callback_func;
-			task->callback_arg = NULL;
-
-			starpu_tag_t tag = TAG(taskx, tasky); 
-
-			task->use_tag = 1;
-			task->tag_id = tag;
-
-			task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, tasky);
-			task->buffers[0].mode = STARPU_R;
-			task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, taskx);
-			task->buffers[1].mode = STARPU_R;
-			task->buffers[2].handle = 
-				starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
-			task->buffers[2].mode = STARPU_RW;
-
-			starpu_task_submit(task);
-		}
-	}
-}
-
-int main(__attribute__ ((unused)) int argc, 
-	 __attribute__ ((unused)) char **argv)
-{
-
-	parse_args(argc, argv);
-
-	/* start the runtime */
-	starpu_init(NULL);
-	starpu_helper_cublas_init();
-
-	init_problem_data();
-
-	partition_mult_data();
-
-	launch_codelets();
-
-	starpu_task_wait_for_all();
-
-	terminate();
-	
-	starpu_helper_cublas_shutdown();
-	starpu_shutdown();
-
-	return 0;
-}

+ 6 - 6
examples/mult/dw_mult.h

@@ -182,15 +182,15 @@ static void parse_args(int argc, char **argv)
 static void display_memory_consumption(void)
 {
 	fprintf(stderr, "Total memory : %ld MB\n",
-		(MAXSLICESY*MAXSLICESZ*sizeof(float *) 
-		+ MAXSLICESZ*MAXSLICESX*sizeof(float *)
-		+ MAXSLICESY*MAXSLICESX*sizeof(float *)
+		(MAXSLICESY*MAXSLICESZ*sizeof(TYPE *) 
+		+ MAXSLICESZ*MAXSLICESX*sizeof(TYPE *)
+		+ MAXSLICESY*MAXSLICESX*sizeof(TYPE *)
 		+ MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle)
 		+ MAXSLICESZ*MAXSLICESX*sizeof(starpu_data_handle)
 		+ MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle)
-		+ ydim*zdim*sizeof(float)
-		+ zdim*xdim*sizeof(float)
-		+ ydim*xdim*sizeof(float))/(1024*1024) );
+		+ ydim*zdim*sizeof(TYPE)
+		+ zdim*xdim*sizeof(TYPE)
+		+ ydim*xdim*sizeof(TYPE))/(1024*1024) );
 }
 
 #ifdef STARPU_USE_CUDA

+ 99 - 35
examples/mult/dw_mult_no_stride.c

@@ -14,14 +14,16 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include "simple.h"
 #include "dw_mult.h"
 #ifdef STARPU_USE_GORDON
 #include "gordon/func_sgemm_ibm.h"
 #endif
+#include "xgemm_kernels.c"
 
-float *A[MAXSLICESY][MAXSLICESZ];
-float *B[MAXSLICESZ][MAXSLICESX];
-float *C[MAXSLICESY][MAXSLICESX];
+TYPE *A[MAXSLICESY][MAXSLICESZ];
+TYPE *B[MAXSLICESZ][MAXSLICESX];
+TYPE *C[MAXSLICESY][MAXSLICESX];
 
 starpu_data_handle A_state[MAXSLICESY][MAXSLICESZ];
 starpu_data_handle B_state[MAXSLICESZ][MAXSLICESX];
@@ -33,7 +35,10 @@ starpu_data_handle C_state[MAXSLICESY][MAXSLICESX];
 static void submit_new_iter(unsigned x, unsigned y, unsigned iter);
 
 /*
- * That program should compute C = A * B 
+ * This program computes C = A * B 
+ *
+ * The difference with xgemm.c is that matrices are here already split in
+ * blocks, and thus no data partitioning is needed.
  * 
  *   A of size (z,y)
  *   B of size (x,z)
@@ -59,6 +64,14 @@ static void init_problem_data(void)
 {
 	unsigned i,j;
 
+	/* debug ... */
+	memset(A, 0, MAXSLICESY*MAXSLICESZ*sizeof(TYPE *));
+	memset(B, 0, MAXSLICESZ*MAXSLICESZ*sizeof(TYPE *));
+	memset(C, 0, MAXSLICESY*MAXSLICESX*sizeof(TYPE *));
+	memset(&A_state, 0, MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle));
+	memset(&B_state, 0, MAXSLICESZ*MAXSLICESZ*sizeof(starpu_data_handle));
+	memset(&C_state, 0, MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle));
+
 	/* Allocate grids of buffer */
 	/* TODO pin ... */
 	unsigned z, y, x;
@@ -68,9 +81,9 @@ static void init_problem_data(void)
 		for (z = 0; z < nslicesz; z++)
 		{
 #ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(float));
+			posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
 #else
-			A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(float));
+			A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
 #endif
 			assert(A[y][z]);
 		}
@@ -81,9 +94,9 @@ static void init_problem_data(void)
 		for (x = 0; x < nslicesx; x++)
 		{
 #ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(float));
+			posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
 #else
-			B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(float));
+			B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
 #endif
 			assert(B[z][x]);
 		}
@@ -94,9 +107,9 @@ static void init_problem_data(void)
 		for (x = 0; x < nslicesx; x++)
 		{
 #ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(float));
+			posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
 #else
-			C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(float));
+			C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
 #endif
 			assert(C[y][x]);
 		}
@@ -111,7 +124,7 @@ static void init_problem_data(void)
 				for (j = 0; j < BLOCKSIZEY; j++)
 					for (i = 0; i < BLOCKSIZEZ; i++)
 					{
-						A[blocky][blockz][i*BLOCKSIZEY + j] = (float)(1 + blockz + blocky*nslicesz);
+						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(1 + blockz + blocky*nslicesz);
 					}
 
 		for (blockz = 0; blockz < nslicesz; blockz++)
@@ -119,7 +132,7 @@ static void init_problem_data(void)
 				for (j = 0; j < BLOCKSIZEZ; j++)
 					for (i = 0; i < BLOCKSIZEX; i++)
 					{
-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (float)(1 + blockx + blockz*nslicesx);
+						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(1 + blockx + blockz*nslicesx);
 					}
 	} 
 	else {
@@ -128,7 +141,7 @@ static void init_problem_data(void)
 				for (j = 0; j < BLOCKSIZEY; j++)
 					for (i = 0; i < BLOCKSIZEZ; i++)
 					{
-						A[blocky][blockz][i*BLOCKSIZEY + j] = (float)(starpu_drand48());
+						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(starpu_drand48());
 					}
 
 		for (blockz = 0; blockz < nslicesz; blockz++)
@@ -136,7 +149,7 @@ static void init_problem_data(void)
 				for (j = 0; j < BLOCKSIZEZ; j++)
 					for (i = 0; i < BLOCKSIZEX; i++)
 					{
-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (float)(starpu_drand48());
+						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(starpu_drand48());
 					}
 
 	}
@@ -146,9 +159,11 @@ static void init_problem_data(void)
 			for (j = 0; j < BLOCKSIZEY; j++)
 				for (i = 0; i < BLOCKSIZEX; i++)
 				{
-					C[blocky][blockx][i*BLOCKSIZEY + j] = (float)(blockx + blocky*nslicesx + 1);
+					C[blocky][blockx][i*BLOCKSIZEY + j] = (TYPE)(blockx + blocky*nslicesx + 1);
 				}
 
+	/* TODO: aren't we supposed to set data consistency to relaxed, since
+	 * tags are supposed to provide the correct dependencies? */
 
 	/* declare the StarPU data to monitor */
 	for (y = 0; y < nslicesy; y++)
@@ -156,7 +171,7 @@ static void init_problem_data(void)
 		for (z = 0; z < nslicesz; z++)
 		{
 			starpu_matrix_data_register(&A_state[y][z], 0, (uintptr_t)A[y][z], 
-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(float));
+				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(TYPE));
 		}
 	}
 
@@ -165,7 +180,7 @@ static void init_problem_data(void)
 		for (x = 0; x < nslicesx; x++)
 		{
 			starpu_matrix_data_register(&B_state[z][x], 0, (uintptr_t)B[z][x], 
-				BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(float));
+				BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(TYPE));
 		}
 	}
 
@@ -174,7 +189,7 @@ static void init_problem_data(void)
 		for (x = 0; x < nslicesx; x++)
 		{
 			starpu_matrix_data_register(&C_state[y][x], 0, (uintptr_t)C[y][x], 
-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(float));
+				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(TYPE));
 		}
 	}
 
@@ -184,6 +199,8 @@ static void init_problem_data(void)
 	conf.n = BLOCKSIZEX;
 #endif
 
+	fprintf(stderr, "block size : x %d y %d z %d\n", BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
+
 	display_memory_consumption();
 }
 
@@ -191,6 +208,41 @@ static void cleanup_problem(void)
 {
 	unsigned z, y, x;
 
+#ifdef CHECK_OUTPUT
+	TYPE maxerr = 0.0;
+	TYPE err;
+	fprintf(stderr, "Checking results ....");
+
+	for (y = 0; y < nslicesy; y++)
+	{
+		for (x = 0; x < nslicesx; x++)
+		{
+			for (z = 0; z < nslicesz; z++)
+			{
+				SGEMM("N", "N", BLOCKSIZEY, BLOCKSIZEX, BLOCKSIZEZ, -(TYPE)(niter), A[y][z], BLOCKSIZEY, B[z][x], BLOCKSIZEZ, 1.0f, C[y][x], BLOCKSIZEY);
+
+			}
+
+			/* make sure C - niter AB = 0 */
+			err = SASUM(BLOCKSIZEX*BLOCKSIZEY, C[y][x], 1);
+
+			if (err > BLOCKSIZEX*BLOCKSIZEY*niter*0.001) 
+				fprintf(stderr, "\nerr = %f ( x = %d y = %d ) ... ", err/niter, x, y );
+
+			maxerr = STARPU_MAX(err, maxerr);
+		}
+	}
+
+	if (maxerr > BLOCKSIZEX*BLOCKSIZEY*niter*0.001)
+	{
+		fprintf(stderr, " maxerr = %f\n", maxerr/niter);
+	}
+	else {
+		fprintf(stderr, " OK\n");
+	}
+	fflush(stderr);
+#endif
+
 	for (y = 0; y < nslicesy; y++)
 	{
 		for (z = 0; z < nslicesz; z++)
@@ -226,20 +278,24 @@ struct cb2_s {
 	unsigned iter;
 };
 
+
 static starpu_codelet cl = {
-	.cpu_func = cpu_mult,
+	.where = STARPU_CPU|STARPU_CUDA
+#ifdef SPU_FUNC_SGEMM
+		|STARPU_GORDON
+#endif
+		,
+	.cpu_func = STARPU_GEMM(cpu_mult),
 #ifdef STARPU_USE_CUDA
-	.cuda_func = cublas_mult,
+	.cuda_func = STARPU_GEMM(cublas_mult),
 #endif
 #ifdef STARPU_USE_GORDON
 	/* .gordon_func will be set by load_elf_sgemm */
 #endif
-
-	.model = &sgemm_model,
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
 	.nbuffers = 3
 };
 
+
 #ifdef STARPU_USE_GORDON
 static const char *spu_func_sgemm_elf_file = "./gordon/func_sgemm_ibm.spuelf";
 static unsigned spu_func_sgemm_elf_id;
@@ -252,26 +308,20 @@ static void load_elf_sgemm(void)
 
 	spu_func_sgemm_ibm_id = gordon_register_kernel(spu_func_sgemm_elf_id, "func_sgemm_ibm");
 
-	
 	gordon_load_plugin_on_all_spu(spu_func_sgemm_elf_id);
 	gordon_load_kernel_on_all_spu(spu_func_sgemm_ibm_id);
 
 	cl.gordon_func = spu_func_sgemm_ibm_id;
 }
-
 #endif // STARPU_USE_GORDON
 
 static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter)
 {
+	/* A B[task] = C[task] */
 	struct starpu_task *task = starpu_task_create();
 
 	task->cl = &cl;
 
-#ifdef STARPU_USE_GORDON
-	task->cl_arg = &conf;
-	task->cl_arg_size = sizeof(struct ibm_sgemm_block_conf);
-#endif
-
 	task->use_tag = 1;
 	task->tag_id = TAG(z, y, x, iter);
 
@@ -282,6 +332,11 @@ static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, un
 	task->buffers[2].handle = C_state[y][x];
 	task->buffers[2].mode = STARPU_RW;
 
+#ifdef STARPU_USE_GORDON
+	task->cl_arg = &conf;
+	task->cl_arg_size = sizeof(struct ibm_sgemm_block_conf);
+#endif
+
 	return task;
 }
 
@@ -355,12 +410,21 @@ static void launch_codelets(void)
 
 	srand(time(NULL));
 
-	gettimeofday(&start, NULL);
+	/* should we use a single performance model for all archs and use an
+ 	 * acceleration factor ? */
+	if (use_common_model) {
+		cl.model = &STARPU_GEMM(model_common);
+	}
+	else {
+		cl.model = &STARPU_GEMM(model);
+	}
 
 	for (taskx = 0; taskx < nslicesx; taskx++) 
-	for (tasky = 0; tasky < nslicesy; tasky++)
 	{
-		submit_new_iter(taskx, tasky, 0);
+		for (tasky = 0; tasky < nslicesy; tasky++)
+		{
+			submit_new_iter(taskx, tasky, 0);
+		}
 	}
 }
 
@@ -381,16 +445,16 @@ int main(__attribute__ ((unused)) int argc,
 
 	init_problem_data();
 
+	gettimeofday(&start, NULL);
+
 	launch_codelets();
 
 	starpu_task_wait_for_all();
 
 	gettimeofday(&end, NULL);
-
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	display_stats(timing);
 
-
 	cleanup_problem();
 
 	starpu_helper_cublas_shutdown();

+ 58 - 40
examples/mult/dw_mult_no_stride_no_tag.c

@@ -14,10 +14,12 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include "simple.h"
 #include "dw_mult.h"
 #ifdef STARPU_USE_GORDON
 #include "gordon/func_sgemm_ibm.h"
 #endif
+#include "xgemm_kernels.c"
 
 
 struct pos {
@@ -26,9 +28,9 @@ struct pos {
 
 struct pos currentpos [MAXSLICESY][MAXSLICESX];
 
-float *A[MAXSLICESY][MAXSLICESZ];
-float *B[MAXSLICESZ][MAXSLICESX];
-float *C[MAXSLICESY][MAXSLICESX];
+TYPE *A[MAXSLICESY][MAXSLICESZ];
+TYPE *B[MAXSLICESZ][MAXSLICESX];
+TYPE *C[MAXSLICESY][MAXSLICESX];
 
 starpu_data_handle A_state[MAXSLICESY][MAXSLICESZ];
 starpu_data_handle B_state[MAXSLICESZ][MAXSLICESX];
@@ -37,8 +39,10 @@ starpu_data_handle C_state[MAXSLICESY][MAXSLICESX];
 
 static void callback_func_3(void *arg);
 /*
- * That program should compute C = A * B 
+ * This program computes C = A * B 
  * 
+ * The difference with dw_mult_no_stride.c is that here we do not use tags, and
+ * just rely on sequential data consistency.
  *   A of size (z,y)
  *   B of size (x,z)
  *   C of size (x,y)
@@ -64,9 +68,9 @@ static void init_problem_data(void)
 	unsigned i,j;
 
 	/* debug ... */
-	memset(A, 0, MAXSLICESY*MAXSLICESZ*sizeof(float *));
-	memset(B, 0, MAXSLICESZ*MAXSLICESZ*sizeof(float *));
-	memset(C, 0, MAXSLICESY*MAXSLICESX*sizeof(float *));
+	memset(A, 0, MAXSLICESY*MAXSLICESZ*sizeof(TYPE *));
+	memset(B, 0, MAXSLICESZ*MAXSLICESZ*sizeof(TYPE *));
+	memset(C, 0, MAXSLICESY*MAXSLICESX*sizeof(TYPE *));
 	memset(&A_state, 0, MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle));
 	memset(&B_state, 0, MAXSLICESZ*MAXSLICESZ*sizeof(starpu_data_handle));
 	memset(&C_state, 0, MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle));
@@ -80,9 +84,9 @@ static void init_problem_data(void)
 		for (z = 0; z < nslicesz; z++)
 		{
 #ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(float));
+			posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
 #else
-			A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(float));
+			A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
 #endif
 			assert(A[y][z]);
 		}
@@ -93,9 +97,9 @@ static void init_problem_data(void)
 		for (x = 0; x < nslicesx; x++)
 		{
 #ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(float));
+			posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
 #else
-			B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(float));
+			B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
 #endif
 			assert(B[z][x]);
 		}
@@ -106,9 +110,9 @@ static void init_problem_data(void)
 		for (x = 0; x < nslicesx; x++)
 		{
 #ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(float));
+			posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
 #else
-			C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(float));
+			C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
 #endif
 			currentpos[y][x].x = x;
 			currentpos[y][x].y = y;
@@ -127,7 +131,7 @@ static void init_problem_data(void)
 				for (j = 0; j < BLOCKSIZEY; j++)
 					for (i = 0; i < BLOCKSIZEZ; i++)
 					{
-						A[blocky][blockz][i*BLOCKSIZEY + j] = (float)(1 + blockz + blocky*nslicesz);
+						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(1 + blockz + blocky*nslicesz);
 					}
 
 		for (blockz = 0; blockz < nslicesz; blockz++)
@@ -135,7 +139,7 @@ static void init_problem_data(void)
 				for (j = 0; j < BLOCKSIZEZ; j++)
 					for (i = 0; i < BLOCKSIZEX; i++)
 					{
-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (float)(1 + blockx + blockz*nslicesx);
+						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(1 + blockx + blockz*nslicesx);
 					}
 	} 
 	else {
@@ -144,7 +148,7 @@ static void init_problem_data(void)
 				for (j = 0; j < BLOCKSIZEY; j++)
 					for (i = 0; i < BLOCKSIZEZ; i++)
 					{
-						A[blocky][blockz][i*BLOCKSIZEY + j] = (float)(starpu_drand48());
+						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(starpu_drand48());
 					}
 
 		for (blockz = 0; blockz < nslicesz; blockz++)
@@ -152,7 +156,7 @@ static void init_problem_data(void)
 				for (j = 0; j < BLOCKSIZEZ; j++)
 					for (i = 0; i < BLOCKSIZEX; i++)
 					{
-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (float)(starpu_drand48());
+						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(starpu_drand48());
 					}
 
 	}
@@ -162,7 +166,7 @@ static void init_problem_data(void)
 			for (j = 0; j < BLOCKSIZEY; j++)
 				for (i = 0; i < BLOCKSIZEX; i++)
 				{
-					C[blocky][blockx][i*BLOCKSIZEY + j] = (float)0;
+					C[blocky][blockx][i*BLOCKSIZEY + j] = (TYPE)0;
 				}
 
 
@@ -172,7 +176,7 @@ static void init_problem_data(void)
 		for (z = 0; z < nslicesz; z++)
 		{
 			starpu_matrix_data_register(&A_state[y][z], 0, (uintptr_t)A[y][z], 
-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(float));
+				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(TYPE));
 		}
 	}
 
@@ -181,7 +185,7 @@ static void init_problem_data(void)
 		for (x = 0; x < nslicesx; x++)
 		{
 			starpu_matrix_data_register(&B_state[z][x], 0, (uintptr_t)B[z][x], 
-				BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(float));
+				BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(TYPE));
 		}
 	}
 
@@ -190,7 +194,7 @@ static void init_problem_data(void)
 		for (x = 0; x < nslicesx; x++)
 		{
 			starpu_matrix_data_register(&C_state[y][x], 0, (uintptr_t)C[y][x], 
-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(float));
+				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(TYPE));
 		}
 	}
 
@@ -210,8 +214,8 @@ static void cleanup_problem(void)
 	unsigned z, y, x;
 
 #ifdef CHECK_OUTPUT
-	float maxerr = 0.0;
-	float err;
+	TYPE maxerr = 0.0;
+	TYPE err;
 	fprintf(stderr, "Checking results ....");
 
 	for (y = 0; y < nslicesy; y++)
@@ -220,7 +224,7 @@ static void cleanup_problem(void)
 		{
 			for (z = 0; z < nslicesz; z++)
 			{
-				SGEMM("N", "N", BLOCKSIZEY, BLOCKSIZEX, BLOCKSIZEZ, -(float)(niter), A[y][z], BLOCKSIZEY, B[z][x], BLOCKSIZEZ, 1.0f, C[y][x], BLOCKSIZEY);
+				SGEMM("N", "N", BLOCKSIZEY, BLOCKSIZEX, BLOCKSIZEZ, -(TYPE)(niter), A[y][z], BLOCKSIZEY, B[z][x], BLOCKSIZEZ, 1.0f, C[y][x], BLOCKSIZEY);
 
 			}
 
@@ -280,10 +284,14 @@ struct cb2_s {
 
 
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
-	.cpu_func = cpu_mult,
+	.where = STARPU_CPU|STARPU_CUDA
+#ifdef SPU_FUNC_SGEMM
+		|STARPU_GORDON
+#endif
+		,
+	.cpu_func = STARPU_GEMM(cpu_mult),
 #ifdef STARPU_USE_CUDA
-	.cuda_func = cublas_mult,
+	.cuda_func = STARPU_GEMM(cublas_mult),
 #endif
 #ifdef STARPU_USE_GORDON
 	/* .gordon_func will be set by load_elf_sgemm */
@@ -309,13 +317,11 @@ static void load_elf_sgemm(void)
 
 	cl.gordon_func = spu_func_sgemm_ibm_id;
 }
-#endif
+#endif // STARPU_USE_GORDON
 
-
-static void construct_task(unsigned x, unsigned y, unsigned z, unsigned iter, struct pos *posp)
+static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter, struct pos *posp)
 {
-	struct starpu_task *task;
-	task = starpu_task_create();
+	struct starpu_task *task = starpu_task_create();
 
 	task->cl = &cl;
 
@@ -337,7 +343,7 @@ static void construct_task(unsigned x, unsigned y, unsigned z, unsigned iter, st
 	posp->z = z;
 	posp->iter = iter;
 
-	starpu_task_submit(task);
+	return task;
 }
 
 
@@ -359,13 +365,15 @@ static void callback_func_3(void *arg)
 
 	if (z < nslicesz - 1)
 	{
-		construct_task(x, y, z+1, iter, posp);
+		struct starpu_task *task = construct_task(x, y, z+1, iter, posp);
+		starpu_task_submit(task);
 	}
 	else
 	{
 		if (iter < niter - 1)
 		{
-			construct_task(x, y, 0, iter+1, posp);
+			struct starpu_task *task = construct_task(x, y, 0, iter+1, posp);
+			starpu_task_submit(task);
 		}
 	}
 }
@@ -383,12 +391,22 @@ static void launch_codelets(void)
 
 	srand(time(NULL));
 
-	gettimeofday(&start, NULL);
+	/* should we use a single performance model for all archs and use an
+ 	 * acceleration factor ? */
+	if (use_common_model) {
+		cl.model = &STARPU_GEMM(model_common);
+	}
+	else {
+		cl.model = &STARPU_GEMM(model);
+	}
 
 	for (taskx = 0; taskx < nslicesx; taskx++) 
-	for (tasky = 0; tasky < nslicesy; tasky++)
 	{
-		construct_task(taskx, tasky, 0, 0, &currentpos[tasky][taskx]);
+		for (tasky = 0; tasky < nslicesy; tasky++)
+		{
+			struct starpu_task *task = construct_task(taskx, tasky, 0, 0, &currentpos[tasky][taskx]);
+			starpu_task_submit(task);
+		}
 	}
 }
 
@@ -409,14 +427,14 @@ int main(__attribute__ ((unused)) int argc,
 
 	init_problem_data();
 
+	gettimeofday(&start, NULL);
+
 	launch_codelets();
 
 	starpu_task_wait_for_all();
 
 	gettimeofday(&end, NULL);
-
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-
 	display_stats(timing);
 
 	cleanup_problem();

+ 42 - 0
examples/mult/gordon/func_dgemm_ibm.c

@@ -0,0 +1,42 @@
+/*
+ * StarPU
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "func_gemm_ibm.h"
+
+#include <blas_s.h>
+
+void func_dgemm_ibm(__attribute__ ((unused)) void **alloc,
+		__attribute__ ((unused)) void **in,
+		__attribute__ ((unused)) void **inout,
+		__attribute__ ((unused)) void **out)
+{
+	/* we assume data will be in A:R,B:R,C:RW mode
+ 	 *  -> in[0] : describe problem
+ 	 *  -> in[1] : A
+ 	 *  -> in[2] : B
+ 	 *  -> inout[0] : C
+ 	 *
+ 	 *   C = AB + C
+ 	 *   but, being in fortran ordering, we compute
+ 	 *   t(C) = t(B)t(A) + t(C) instead
+ 	 */
+	struct ibm_gemm_block_conf *conf = in[0];
+	double *A = in[1];
+	double *B = in[2];
+	double *C = inout[0];
+
+	dgemm_spu(conf->m, conf->n, conf->k, B, A, C);
+}

examples/mult/gordon/func_sgemm_ibm.h → examples/mult/gordon/func_gemm_ibm.h


+ 2 - 2
examples/mult/gordon/func_sgemm_ibm.c

@@ -14,7 +14,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "func_sgemm_ibm.h"
+#include "func_gemm_ibm.h"
 
 #include <blas_s.h>
 
@@ -33,7 +33,7 @@ void func_sgemm_ibm(__attribute__ ((unused)) void **alloc,
  	 *   but, being in fortran ordering, we compute
  	 *   t(C) = t(B)t(A) + t(C) instead
  	 */
-	struct ibm_sgemm_block_conf *conf = in[0];
+	struct ibm_gemm_block_conf *conf = in[0];
 	float *A = in[1];
 	float *B = in[2];
 	float *C = inout[0];

+ 1 - 7
examples/mult/sgemm.c

@@ -14,13 +14,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#define TYPE	float
-
-#define CUBLAS_GEMM cublasSgemm
-#define CPU_GEMM	SGEMM
-#define CPU_ASUM	SASUM
-#define CPU_IAMAX	ISAMAX
-#define STARPU_GEMM(name)	starpu_sgemm_##name
+#include "simple.h"
 
 #include "xgemm_kernels.c"
 #include "xgemm.c" 

+ 0 - 77
examples/mult/sgemm_kernels.c

@@ -1,77 +0,0 @@
-/*
- * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <starpu.h>
-#include <starpu_cuda.h>
-#include <common/blas.h>
-
-#define COMMON_CODE			\
-	uint32_t nxC, nyC, nyA;		\
-	uint32_t ldA, ldB, ldC;		\
-					\
-	float *subA;			\
-	float *subB;			\
-	float *subC;			\
-					\
-	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);	\
-	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);	\
-	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);	\
-					\
-	nxC = STARPU_MATRIX_GET_NX(descr[2]);		\
-	nyC = STARPU_MATRIX_GET_NY(descr[2]);		\
-	nyA = STARPU_MATRIX_GET_NY(descr[0]);		\
-					\
-	ldA = STARPU_MATRIX_GET_LD(descr[0]);		\
-	ldB = STARPU_MATRIX_GET_LD(descr[1]);		\
-	ldC = STARPU_MATRIX_GET_LD(descr[2]);
-
-
-
-#ifdef STARPU_USE_CUDA
-
-#ifdef STARPU_HAVE_MAGMA
-#define GPU_SGEMM magmablas_sgemm
-#else
-#define GPU_SGEMM cublasSgemm
-#endif
-
-void cublas_mult(void *descr[], __attribute__((unused)) void *arg)
-{
-	COMMON_CODE
-
-	starpu_trace_user_event(0x42);
-
-	GPU_SGEMM('n', 'n', nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 
-					     0.0f, subC, ldC);
-	cublasStatus st;
-	st = cublasGetError();
-	if (st != CUBLAS_STATUS_SUCCESS)
-		STARPU_ABORT();
-
-	cudaThreadSynchronize();
-
-	starpu_trace_user_event(0x43);
-}
-#endif
-
-void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
-{
-	COMMON_CODE
-
-	starpu_trace_user_event(0x42);
-	SGEMM("N", "N", nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 0.0f, subC, ldC);
-	starpu_trace_user_event(0x43);
-}

+ 85 - 50
examples/mult/xgemm.c

@@ -16,15 +16,11 @@
 
 #include "dw_mult.h"
 
-#define str(s) #s
-#define xstr(s)        str(s)
-#define STARPU_GEMM_STR(name)  xstr(STARPU_GEMM(name))
-
 TYPE *A, *B, *C;
 starpu_data_handle A_handle, B_handle, C_handle;
 
 /*
- * That program should compute C = A * B 
+ * This program computes C = A * B 
  * 
  *   A of size (z,y)
  *   B of size (x,z)
@@ -49,17 +45,22 @@ static void check_output(void)
 	/* check results */
 	/* compute C = C - AB */
 
-	CPU_GEMM("N", "N", ydim, xdim, zdim, (TYPE)-1.0, A, ydim, B, zdim, (TYPE)1.0f, C, ydim);
+	CPU_GEMM("N", "N", ydim, xdim, zdim, (TYPE)-1.0f, A, ydim, B, zdim, (TYPE)1.0f, C, ydim);
 		
 	/* make sure C = 0 */
 	TYPE err;
 	err = CPU_ASUM(xdim*ydim, C, 1);
 
-	int max;
-	max = CPU_IAMAX(xdim*ydim, C, 1);
+	if (err < xdim*ydim*0.001) {
+		fprintf(stderr, "Results are OK\n");
+	}
+	else {
+		int max;
+		max = CPU_IAMAX(xdim*ydim, C, 1);
 
-	fprintf(stderr, "Avg error : %e\n", err/(xdim*ydim));
-	fprintf(stderr, "Max error : %e\n", C[max]);
+		fprintf(stderr, "There were errors ... err = %f\n", err);
+		fprintf(stderr, "Max error : %e\n", C[max]);
+	}
 }
 
 void callback_func(void *arg)
@@ -108,6 +109,10 @@ static void init_problem_data(void)
 		}
 	} 
 	else {
+#ifdef NORANDOM
+		srand(2008);
+		STARPU_ABORT();
+#endif
 		for (j=0; j < ydim; j++) {
 			for (i=0; i < zdim; i++) {
 				A[j+i*ydim] = (TYPE)(starpu_drand48());
@@ -127,12 +132,7 @@ static void init_problem_data(void)
 		}
 	}
 
-	/* display memory consumption */
-	fprintf(stderr, "Total memory : %ld MB\n",
-		( ydim*zdim*sizeof(TYPE)
-		+ zdim*xdim*sizeof(TYPE)
-		+ ydim*xdim*sizeof(TYPE) )/(1024*1024));
-
+	display_memory_consumption();
 }
 
 static void partition_mult_data(void)
@@ -170,60 +170,94 @@ static void partition_mult_data(void)
 
 static void unpartition_mult_data(void)
 {
+	fprintf(stderr, "unpartition !!\n");
+
 	starpu_data_unpartition(C_handle, 0);
 
 	starpu_data_unregister(C_handle);
 }
 
-static struct starpu_perfmodel_t gemm_model = {
-	.type = STARPU_HISTORY_BASED,
-#ifdef STARPU_ATLAS
-	.symbol = STARPU_GEMM_STR(gemm_atlas)
-#elif defined(STARPU_GOTO)
-	.symbol = STARPU_GEMM_STR(gemm_goto)
-#else
-	.symbol = STARPU_GEMM_STR(gemm)
-#endif
-};
-
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA
+#ifdef SPU_FUNC_SGEMM
+		|STARPU_GORDON
+#endif
+		,
 	.cpu_func = STARPU_GEMM(cpu_mult),
 #ifdef STARPU_USE_CUDA
 	.cuda_func = STARPU_GEMM(cublas_mult),
 #endif
-	.model = &gemm_model,
+#ifdef STARPU_USE_GORDON
+#ifdef SPU_FUNC_SGEMM
+	.gordon_func = SPU_FUNC_SGEMM,
+#else
+#warning SPU_FUNC_SGEMM is not available
+#endif
+#endif
 	.nbuffers = 3
 };
 
+static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter)
+{
+	/* A B[task] = C[task] */
+	struct starpu_task *task = starpu_task_create();
+
+	task->cl = &cl;
+
+	/* we have a callback to do some accounting */
+	task->callback_func = callback_func;
+	task->callback_arg = NULL;
+
+	task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, y);
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, x);
+	task->buffers[1].mode = STARPU_R;
+	task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, x, y);
+	task->buffers[2].mode = STARPU_RW;
+
+	task->cl_arg = &conf;
+	task->cl_arg_size = sizeof(struct block_conf);
+	return task;
+}
+
+static void submit_new_iter(unsigned x, unsigned y, unsigned iter)
+{
+	unsigned z;
+
+	z = 0;
+
+	{
+		struct starpu_task *task;
+		task = construct_task(x, y, z, iter);
+
+		starpu_task_submit(task);
+	}
+}
+
 static void launch_codelets(void)
 {
+#ifdef STARPU_USE_FXT
+	_starpu_fxt_register_thread(0);
+#endif
 	/* partition the work into slices */
 	unsigned taskx, tasky;
 
+	srand(time(NULL));
+
+	/* should we use a single performance model for all archs and use an
+ 	 * acceleration factor ? */
+	if (use_common_model) {
+		cl.model = &STARPU_GEMM(model_common);
+	}
+	else {
+		cl.model = &STARPU_GEMM(model);
+	}
+
 	for (taskx = 0; taskx < nslicesx; taskx++) 
 	{
 		for (tasky = 0; tasky < nslicesy; tasky++)
 		{
-			/* A B[task] = C[task] */
-			struct starpu_task *task = starpu_task_create();
-
-			task->cl = &cl;
-			task->cl_arg = &conf;
-			task->cl_arg_size = sizeof(struct block_conf);
-
-			/* we have a callback to do some accounting */
-			task->callback_func = callback_func;
-			task->callback_arg = NULL;
-
-			task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, tasky);
-			task->buffers[0].mode = STARPU_R;
-			task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, taskx);
-			task->buffers[1].mode = STARPU_R;
-			task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
-			task->buffers[2].mode = STARPU_RW;
-
-			starpu_task_submit(task);
+			submit_new_iter(taskx, tasky, 0);
 		}
 	}
 }
@@ -236,6 +270,7 @@ int main(__attribute__ ((unused)) int argc,
 
 	/* start the runtime */
 	starpu_init(NULL);
+
 	starpu_helper_cublas_init();
 
 	init_problem_data();
@@ -245,11 +280,11 @@ int main(__attribute__ ((unused)) int argc,
 	partition_mult_data();
 
 	launch_codelets();
+
 	starpu_task_wait_for_all();
 
 	gettimeofday(&end, NULL);
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
-					(end.tv_usec - start.tv_usec));
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	display_stats(timing);
 
 	unpartition_mult_data();

+ 9 - 4
examples/mult/xgemm_kernels.c

@@ -41,13 +41,20 @@
 
 
 #ifdef STARPU_USE_CUDA
+
+#ifdef STARPU_HAVE_MAGMA
+#define GPU_GEMM MAGMABLAS_GEMM
+#else
+#define GPU_GEMM CUBLAS_GEMM
+#endif
+
 void STARPU_GEMM(cublas_mult)(void *descr[], __attribute__((unused)) void *arg)
 {
 	COMMON_CODE
 
 	starpu_trace_user_event(0x42);
 
-	CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
+	GPU_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
 					     (TYPE)0.0, subC, ldC);
 	cublasStatus st;
 	st = cublasGetError();
@@ -65,8 +72,6 @@ void STARPU_GEMM(cpu_mult)(void *descr[], __attribute__((unused))  void *arg)
 	COMMON_CODE
 
 	starpu_trace_user_event(0x42);
-	CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
-					  (TYPE)0.0, subC, ldC);
-
+	CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
 	starpu_trace_user_event(0x43);
 }