14 years ago · 53c60beeb5
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -41,8 +41,6 @@ EXTRA_DIST = 					\
 
				 	spmv/spmv_cuda.cu			\
			
 
				 	gordon/null_kernel_gordon.c		\
			
 
				 	mult/xgemm.c				\
			
 
				-	mult/xgemm_kernels.c			\
			
 
				-	mult/gordon/func_sgemm_ibm.c		\
			
 
				 	lu/xlu.c				\
			
 
				 	lu/xlu_pivot.c				\
			
 
				 	lu/xlu_implicit.c			\
			
@@ -56,8 +54,7 @@ EXTRA_DIST = 					\
 
				 	filters/fblock_opencl_kernel.cl
			
 
				 
			
 
				 CLEANFILES = 					\
			
 
				-	gordon/null_kernel_gordon.spuelf	\
			
 
				-	mult/gordon/func_sgemm_ibm.spuelf
			
 
				+	gordon/null_kernel_gordon.spuelf
			
 
				 
			
 
				 
			
 
				 CLEANFILES += *.gcno *.gcda *.linkinfo
			
@@ -88,8 +85,7 @@ SPULIBS = -lblas #-lc -lgloss -lc
 
				 	$(SPU_LD) $(SPULDFLAGS) $< -o $@ $(SPULIBS)
			
 
				 
			
 
				 BUILT_SOURCES +=				\
			
 
				-	gordon/null_kernel_gordon.spuelf	\
			
 
				-	mult/gordon/func_sgemm_ibm.spuelf
			
 
				+	gordon/null_kernel_gordon.spuelf
			
 
				 
			
 
				 endif
			
 
				 
			
@@ -116,10 +112,8 @@ noinst_HEADERS = 				\
 
				 	cholesky/dw_cholesky.h			\
			
 
				 	common/blas_model.h			\
			
 
				 	common/blas.h				\
			
 
				-	mult/dw_mult.h				\
			
 
				 	mult/simple.h				\
			
 
				 	mult/double.h				\
			
 
				-	mult/gordon/func_gemm_ibm.h		\
			
 
				 	gordon/null.h				\
			
 
				 	fortran/bindings/StarPU_fortran.h	\
			
 
				 	ppm_downscaler/ppm_downscaler.h		\
			
@@ -332,29 +326,15 @@ if !NO_BLAS_LIB
 
				 
			
 
				 examplebin_PROGRAMS += 				\
			
 
				 	mult/sgemm 				\
			
 
				-	mult/dgemm 				\
			
 
				-	mult/dw_mult_no_stride			\
			
 
				-	mult/dw_mult_no_stride_no_tag
			
 
				+	mult/dgemm
			
 
				 
			
 
				 mult_sgemm_SOURCES = 				\
			
 
				 	mult/sgemm.c				\
			
 
				-	common/blas.c				\
			
 
				-	common/blas_model.c
			
 
				+	common/blas.c
			
 
				 
			
 
				 mult_dgemm_SOURCES = 				\
			
 
				 	mult/dgemm.c				\
			
 
				-	common/blas.c				\
			
 
				-	common/blas_model.c
			
 
				-
			
 
				-mult_dw_mult_no_stride_SOURCES = 		\
			
 
				-	mult/dw_mult_no_stride.c		\
			
 
				-	common/blas.c				\
			
 
				-	common/blas_model.c
			
 
				-
			
 
				-mult_dw_mult_no_stride_no_tag_SOURCES =		\
			
 
				-	mult/dw_mult_no_stride_no_tag.c		\
			
 
				-	common/blas.c				\
			
 
				-	common/blas_model.c
			
 
				+	common/blas.c
			
 
				 
			
 
				 endif
			
 
				 
			
--- a/examples/mult/dgemm.c
+++ b/examples/mult/dgemm.c
@@ -16,6 +16,4 @@
 
				  */
			
 
				 
			
 
				 #include "double.h"
			
 
				-
			
 
				-#include "xgemm_kernels.c"
			
 
				 #include "xgemm.c" 
			
--- a/examples/mult/double.h
+++ b/examples/mult/double.h
@@ -17,7 +17,6 @@
 
				 #define TYPE	double
			
 
				 
			
 
				 #define CUBLAS_GEMM cublasDgemm
			
 
				-#define MAGMABLAS_GEMM magmablas_dgemm
			
 
				 #define CPU_GEMM	DGEMM
			
 
				 #define CPU_ASUM	DASUM
			
 
				 #define CPU_IAMAX	IDAMAX
			
--- a/examples/mult/dw_mult.h
+++ b/examples/mult/dw_mult.h
@@ -1,203 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#ifndef __MULT_H__
			
 
				-#define __MULT_H__
			
 
				-
			
 
				-#include <string.h>
			
 
				-#include <math.h>
			
 
				-#include <sys/types.h>
			
 
				-#include <sys/time.h>
			
 
				-#include <pthread.h>
			
 
				-#include <signal.h>
			
 
				-
			
 
				-#include <common/blas.h>
			
 
				-#include <common/blas_model.h>
			
 
				-
			
 
				-#include <starpu.h>
			
 
				-
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-#include <cuda.h>
			
 
				-#include <cublas.h>
			
 
				-#endif
			
 
				-
			
 
				-#define MAXSLICESX	64
			
 
				-#define MAXSLICESY	64
			
 
				-#define MAXSLICESZ	64
			
 
				-
			
 
				-#define BLAS3_FLOP(n1,n2,n3)	\
			
 
				-	(2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
			
 
				-
			
 
				-#define BLAS3_LS(n1,n2,n3)    \
			
 
				-	((2*(n1)*(n3) + (n1)*(n2) + (n2)*(n3))*sizeof(float))
			
 
				-
			
 
				-struct block_conf {
			
 
				-	uint32_t m;
			
 
				-	uint32_t n;
			
 
				-	uint32_t k;
			
 
				-	uint32_t pad;
			
 
				-};
			
 
				-
			
 
				-#define NITER	100
			
 
				-
			
 
				-unsigned niter = NITER;
			
 
				-unsigned nslicesx = 4;
			
 
				-unsigned nslicesy = 4;
			
 
				-unsigned nslicesz = 4;
			
 
				-unsigned xdim = 256;
			
 
				-unsigned ydim = 256;
			
 
				-unsigned zdim = 64;
			
 
				-unsigned norandom = 0;
			
 
				-unsigned pin = 0;
			
 
				-unsigned use_common_model = 0;
			
 
				-unsigned check = 0;
			
 
				-
			
 
				-/* to compute MFlop/s */
			
 
				-uint64_t flop_cublas = 0;
			
 
				-uint64_t flop_atlas = 0;
			
 
				-uint64_t flop_per_worker[STARPU_NMAXWORKERS] = {0};
			
 
				-
			
 
				-/* to compute MB/s (load/store) */
			
 
				-uint64_t ls_cublas = 0;
			
 
				-uint64_t ls_atlas = 0;
			
 
				-uint64_t ls_per_worker[STARPU_NMAXWORKERS] = {0};
			
 
				-
			
 
				-
			
 
				-struct timeval start;
			
 
				-struct timeval end;
			
 
				-
			
 
				-static int taskcounter __attribute__ ((unused));
			
 
				-static struct block_conf conf __attribute__ ((aligned (128)));
			
 
				-
			
 
				-#define BLOCKSIZEX	(xdim / nslicesx)
			
 
				-#define BLOCKSIZEY	(ydim / nslicesy)
			
 
				-#define BLOCKSIZEZ	(zdim / nslicesz)
			
 
				-
			
 
				-static void display_stats(double timing)
			
 
				-{
			
 
				-	unsigned worker;
			
 
				-	unsigned nworkers = starpu_worker_get_count();
			
 
				-
			
 
				-	fprintf(stderr, "Computation took (ms):\n");
			
 
				-	printf("%2.2f\n", timing/1000);
			
 
				-
			
 
				-	uint64_t flop_total = 0, ls_total = 0;
			
 
				-	
			
 
				-	for (worker = 0; worker < nworkers; worker++)
			
 
				-	{
			
 
				-		flop_total += flop_per_worker[worker];
			
 
				-		ls_total += ls_per_worker[worker];
			
 
				-
			
 
				-		char name[32];
			
 
				-		starpu_worker_get_name(worker, name, 32);
			
 
				-
			
 
				-		fprintf(stderr, "\t%s -> %2.2f GFlop\t%2.2f GFlop/s\n", name, (double)flop_per_worker[worker]/1000000000.0f, (double)flop_per_worker[worker]/(double)timing/1000);
			
 
				-	}
			
 
				-
			
 
				-	fprintf(stderr, "Total: %2.2f GFlops\t%2.2f GFlop/s\n", (double)flop_total/1000000000.0f, (double)flop_total/(double)timing/1000);
			
 
				-}
			
 
				-
			
 
				-static void parse_args(int argc, char **argv)
			
 
				-{
			
 
				-	int i;
			
 
				-	for (i = 1; i < argc; i++) {
			
 
				-		if (strcmp(argv[i], "-nblocks") == 0) {
			
 
				-			char *argptr;
			
 
				-			nslicesx = strtol(argv[++i], &argptr, 10);
			
 
				-			nslicesy = nslicesx;
			
 
				-			nslicesz = nslicesx;
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-nblocksx") == 0) {
			
 
				-			char *argptr;
			
 
				-			nslicesx = strtol(argv[++i], &argptr, 10);
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-nblocksy") == 0) {
			
 
				-			char *argptr;
			
 
				-			nslicesy = strtol(argv[++i], &argptr, 10);
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-nblocksz") == 0) {
			
 
				-			char *argptr;
			
 
				-			nslicesz = strtol(argv[++i], &argptr, 10);
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-x") == 0) {
			
 
				-			char *argptr;
			
 
				-			xdim = strtol(argv[++i], &argptr, 10);
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-y") == 0) {
			
 
				-			char *argptr;
			
 
				-			ydim = strtol(argv[++i], &argptr, 10);
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-z") == 0) {
			
 
				-			char *argptr;
			
 
				-			zdim = strtol(argv[++i], &argptr, 10);
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-iter") == 0) {
			
 
				-			char *argptr;
			
 
				-			niter = strtol(argv[++i], &argptr, 10);
			
 
				-		}
			
 
				-
			
 
				-
			
 
				-		if (strcmp(argv[i], "-no-random") == 0) {
			
 
				-			norandom = 1;
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-pin") == 0) {
			
 
				-			pin = 1;
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-check") == 0) {
			
 
				-			check = 1;
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-common-model") == 0) {
			
 
				-			use_common_model = 1;
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	assert(nslicesx <= MAXSLICESX); 
			
 
				-	assert(nslicesy <= MAXSLICESY); 
			
 
				-	assert(nslicesz <= MAXSLICESZ); 
			
 
				-}
			
 
				-
			
 
				-static void display_memory_consumption(void)
			
 
				-{
			
 
				-	fprintf(stderr, "Total memory : %ld MB\n",
			
 
				-		(MAXSLICESY*MAXSLICESZ*sizeof(TYPE *) 
			
 
				-		+ MAXSLICESZ*MAXSLICESX*sizeof(TYPE *)
			
 
				-		+ MAXSLICESY*MAXSLICESX*sizeof(TYPE *)
			
 
				-		+ MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle)
			
 
				-		+ MAXSLICESZ*MAXSLICESX*sizeof(starpu_data_handle)
			
 
				-		+ MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle)
			
 
				-		+ ydim*zdim*sizeof(TYPE)
			
 
				-		+ zdim*xdim*sizeof(TYPE)
			
 
				-		+ ydim*xdim*sizeof(TYPE))/(1024*1024) );
			
 
				-}
			
 
				-
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-void cublas_mult(void *descr[], __attribute__((unused)) void *arg);
			
 
				-#endif
			
 
				-
			
 
				-void cpu_mult(void *descr[], __attribute__((unused))  void *arg);
			
 
				-
			
 
				-#endif // __MULT_H__
			
--- a/examples/mult/dw_mult_no_stride.c
+++ b/examples/mult/dw_mult_no_stride.c
@@ -1,465 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include "simple.h"
			
 
				-#include "dw_mult.h"
			
 
				-#ifdef STARPU_USE_GORDON
			
 
				-#include "gordon/func_sgemm_ibm.h"
			
 
				-#endif
			
 
				-#include "xgemm_kernels.c"
			
 
				-
			
 
				-TYPE *A[MAXSLICESY][MAXSLICESZ];
			
 
				-TYPE *B[MAXSLICESZ][MAXSLICESX];
			
 
				-TYPE *C[MAXSLICESY][MAXSLICESX];
			
 
				-
			
 
				-starpu_data_handle A_state[MAXSLICESY][MAXSLICESZ];
			
 
				-starpu_data_handle B_state[MAXSLICESZ][MAXSLICESX];
			
 
				-starpu_data_handle C_state[MAXSLICESY][MAXSLICESX];
			
 
				-
			
 
				-#define TAG(x,y,z,iter)	\
			
 
				-		((starpu_tag_t)((z) + (iter)*nslicesz + (x)*(nslicesz*niter) + (y)*(nslicesx*nslicesz*niter)))
			
 
				-
			
 
				-static void submit_new_iter(unsigned x, unsigned y, unsigned iter);
			
 
				-
			
 
				-/*
			
 
				- * This program computes C = A * B 
			
 
				- *
			
 
				- * The difference with xgemm.c is that matrices are here already split in
			
 
				- * blocks, and thus no data partitioning is needed.
			
 
				- * 
			
 
				- *   A of size (z,y)
			
 
				- *   B of size (x,z)
			
 
				- *   C of size (x,y)
			
 
				-
			
 
				-              |---------------|
			
 
				-            z |       B       |
			
 
				-              |---------------|
			
 
				-       z              x
			
 
				-     |----|   |---------------|
			
 
				-     |    |   |               |
			
 
				-     |    |   |               |
			
 
				-     | A  | y |       C       |
			
 
				-     |    |   |               |
			
 
				-     |    |   |               |
			
 
				-     |----|   |---------------|
			
 
				-
			
 
				- */
			
 
				-
			
 
				-#define MEM_ALIGNMENT	16
			
 
				-
			
 
				-static void init_problem_data(void)
			
 
				-{
			
 
				-	unsigned i,j;
			
 
				-
			
 
				-	/* debug ... */
			
 
				-	memset(A, 0, MAXSLICESY*MAXSLICESZ*sizeof(TYPE *));
			
 
				-	memset(B, 0, MAXSLICESZ*MAXSLICESZ*sizeof(TYPE *));
			
 
				-	memset(C, 0, MAXSLICESY*MAXSLICESX*sizeof(TYPE *));
			
 
				-	memset(&A_state, 0, MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle));
			
 
				-	memset(&B_state, 0, MAXSLICESZ*MAXSLICESZ*sizeof(starpu_data_handle));
			
 
				-	memset(&C_state, 0, MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle));
			
 
				-
			
 
				-	/* Allocate grids of buffer */
			
 
				-	/* TODO pin ... */
			
 
				-	unsigned z, y, x;
			
 
				-
			
 
				-	for (y = 0; y < nslicesy; y++)
			
 
				-	{
			
 
				-		for (z = 0; z < nslicesz; z++)
			
 
				-		{
			
 
				-#ifdef STARPU_HAVE_POSIX_MEMALIGN
			
 
				-			posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
			
 
				-#else
			
 
				-			A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
			
 
				-#endif
			
 
				-			assert(A[y][z]);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	for (z = 0; z < nslicesz; z++)
			
 
				-	{
			
 
				-		for (x = 0; x < nslicesx; x++)
			
 
				-		{
			
 
				-#ifdef STARPU_HAVE_POSIX_MEMALIGN
			
 
				-			posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
			
 
				-#else
			
 
				-			B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
			
 
				-#endif
			
 
				-			assert(B[z][x]);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	for (y = 0; y < nslicesy; y++)
			
 
				-	{
			
 
				-		for (x = 0; x < nslicesx; x++)
			
 
				-		{
			
 
				-#ifdef STARPU_HAVE_POSIX_MEMALIGN
			
 
				-			posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
			
 
				-#else
			
 
				-			C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
			
 
				-#endif
			
 
				-			assert(C[y][x]);
			
 
				-		}
			
 
				-	}
			
 
				-	
			
 
				-	/* fill the A and B matrices */
			
 
				-	unsigned blockx, blocky, blockz;
			
 
				-
			
 
				-	if (norandom) {
			
 
				-		for (blocky = 0; blocky < nslicesy; blocky++)
			
 
				-			for (blockz = 0; blockz < nslicesz; blockz++)
			
 
				-				for (j = 0; j < BLOCKSIZEY; j++)
			
 
				-					for (i = 0; i < BLOCKSIZEZ; i++)
			
 
				-					{
			
 
				-						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(1 + blockz + blocky*nslicesz);
			
 
				-					}
			
 
				-
			
 
				-		for (blockz = 0; blockz < nslicesz; blockz++)
			
 
				-			for (blockx = 0; blockx < nslicesx; blockx++)
			
 
				-				for (j = 0; j < BLOCKSIZEZ; j++)
			
 
				-					for (i = 0; i < BLOCKSIZEX; i++)
			
 
				-					{
			
 
				-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(1 + blockx + blockz*nslicesx);
			
 
				-					}
			
 
				-	} 
			
 
				-	else {
			
 
				-		for (blocky = 0; blocky < nslicesy; blocky++)
			
 
				-			for (blockz = 0; blockz < nslicesz; blockz++)
			
 
				-				for (j = 0; j < BLOCKSIZEY; j++)
			
 
				-					for (i = 0; i < BLOCKSIZEZ; i++)
			
 
				-					{
			
 
				-						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(starpu_drand48());
			
 
				-					}
			
 
				-
			
 
				-		for (blockz = 0; blockz < nslicesz; blockz++)
			
 
				-			for (blockx = 0; blockx < nslicesx; blockx++)
			
 
				-				for (j = 0; j < BLOCKSIZEZ; j++)
			
 
				-					for (i = 0; i < BLOCKSIZEX; i++)
			
 
				-					{
			
 
				-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(starpu_drand48());
			
 
				-					}
			
 
				-
			
 
				-	}
			
 
				-
			
 
				-	for (blocky = 0; blocky < nslicesy; blocky++)
			
 
				-		for (blockx = 0; blockx < nslicesx; blockx++)
			
 
				-			for (j = 0; j < BLOCKSIZEY; j++)
			
 
				-				for (i = 0; i < BLOCKSIZEX; i++)
			
 
				-				{
			
 
				-					C[blocky][blockx][i*BLOCKSIZEY + j] = (TYPE)(blockx + blocky*nslicesx + 1);
			
 
				-				}
			
 
				-
			
 
				-	/* TODO: aren't we supposed to set data consistency to relaxed, since
			
 
				-	 * tags are supposed to provide the correct dependencies? */
			
 
				-
			
 
				-	/* declare the StarPU data to monitor */
			
 
				-	for (y = 0; y < nslicesy; y++)
			
 
				-	{
			
 
				-		for (z = 0; z < nslicesz; z++)
			
 
				-		{
			
 
				-			starpu_matrix_data_register(&A_state[y][z], 0, (uintptr_t)A[y][z], 
			
 
				-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(TYPE));
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	for (z = 0; z < nslicesz; z++)
			
 
				-	{
			
 
				-		for (x = 0; x < nslicesx; x++)
			
 
				-		{
			
 
				-			starpu_matrix_data_register(&B_state[z][x], 0, (uintptr_t)B[z][x], 
			
 
				-				BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(TYPE));
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	for (y = 0; y < nslicesy; y++)
			
 
				-	{
			
 
				-		for (x = 0; x < nslicesx; x++)
			
 
				-		{
			
 
				-			starpu_matrix_data_register(&C_state[y][x], 0, (uintptr_t)C[y][x], 
			
 
				-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(TYPE));
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-#ifdef STARPU_USE_GORDON
			
 
				-	conf.k = BLOCKSIZEZ;
			
 
				-	conf.m = BLOCKSIZEY;
			
 
				-	conf.n = BLOCKSIZEX;
			
 
				-#endif
			
 
				-
			
 
				-	fprintf(stderr, "block size : x %d y %d z %d\n", BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
			
 
				-
			
 
				-	display_memory_consumption();
			
 
				-}
			
 
				-
			
 
				-static void cleanup_problem(void)
			
 
				-{
			
 
				-	unsigned z, y, x;
			
 
				-
			
 
				-#ifdef CHECK_OUTPUT
			
 
				-	TYPE maxerr = 0.0;
			
 
				-	TYPE err;
			
 
				-	fprintf(stderr, "Checking results ....");
			
 
				-
			
 
				-	for (y = 0; y < nslicesy; y++)
			
 
				-	{
			
 
				-		for (x = 0; x < nslicesx; x++)
			
 
				-		{
			
 
				-			for (z = 0; z < nslicesz; z++)
			
 
				-			{
			
 
				-				SGEMM("N", "N", BLOCKSIZEY, BLOCKSIZEX, BLOCKSIZEZ, -(TYPE)(niter), A[y][z], BLOCKSIZEY, B[z][x], BLOCKSIZEZ, 1.0f, C[y][x], BLOCKSIZEY);
			
 
				-
			
 
				-			}
			
 
				-
			
 
				-			/* make sure C - niter AB = 0 */
			
 
				-			err = SASUM(BLOCKSIZEX*BLOCKSIZEY, C[y][x], 1);
			
 
				-
			
 
				-			if (err > BLOCKSIZEX*BLOCKSIZEY*niter*0.001) 
			
 
				-				fprintf(stderr, "\nerr = %f ( x = %d y = %d ) ... ", err/niter, x, y );
			
 
				-
			
 
				-			maxerr = STARPU_MAX(err, maxerr);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	if (maxerr > BLOCKSIZEX*BLOCKSIZEY*niter*0.001)
			
 
				-	{
			
 
				-		fprintf(stderr, " maxerr = %f\n", maxerr/niter);
			
 
				-	}
			
 
				-	else {
			
 
				-		fprintf(stderr, " OK\n");
			
 
				-	}
			
 
				-	fflush(stderr);
			
 
				-#endif
			
 
				-
			
 
				-	for (y = 0; y < nslicesy; y++)
			
 
				-	{
			
 
				-		for (z = 0; z < nslicesz; z++)
			
 
				-		{
			
 
				-	//		free(A[y][z]);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	for (z = 0; z < nslicesz; z++)
			
 
				-	{
			
 
				-		for (x = 0; x < nslicesx; x++)
			
 
				-		{
			
 
				-	//		free(B[z][x]);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	for (y = 0; y < nslicesy; y++)
			
 
				-	{
			
 
				-		for (x = 0; x < nslicesx; x++)
			
 
				-		{
			
 
				-	//		free(C[y][x]);
			
 
				-			starpu_tag_remove(TAG(nslicesz - 1, y, x, niter - 1));
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	
			
 
				-	
			
 
				-}
			
 
				-
			
 
				-struct cb2_s {
			
 
				-	unsigned blockx;
			
 
				-	unsigned blocky;
			
 
				-	unsigned iter;
			
 
				-};
			
 
				-
			
 
				-
			
 
				-static starpu_codelet cl = {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA
			
 
				-#ifdef SPU_FUNC_SGEMM
			
 
				-		|STARPU_GORDON
			
 
				-#endif
			
 
				-		,
			
 
				-	.cpu_func = STARPU_GEMM(cpu_mult),
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	.cuda_func = STARPU_GEMM(cublas_mult),
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_GORDON
			
 
				-	/* .gordon_func will be set by load_elf_sgemm */
			
 
				-#endif
			
 
				-	.nbuffers = 3
			
 
				-};
			
 
				-
			
 
				-
			
 
				-#ifdef STARPU_USE_GORDON
			
 
				-static const char *spu_func_sgemm_elf_file = "./gordon/func_sgemm_ibm.spuelf";
			
 
				-static unsigned spu_func_sgemm_elf_id;
			
 
				-static unsigned spu_func_sgemm_ibm_id;
			
 
				-
			
 
				-static void load_elf_sgemm(void)
			
 
				-{
			
 
				-	spu_func_sgemm_elf_id =
			
 
				-		gordon_register_elf_plugin(spu_func_sgemm_elf_file);
			
 
				-
			
 
				-	spu_func_sgemm_ibm_id = gordon_register_kernel(spu_func_sgemm_elf_id, "func_sgemm_ibm");
			
 
				-
			
 
				-	gordon_load_plugin_on_all_spu(spu_func_sgemm_elf_id);
			
 
				-	gordon_load_kernel_on_all_spu(spu_func_sgemm_ibm_id);
			
 
				-
			
 
				-	cl.gordon_func = spu_func_sgemm_ibm_id;
			
 
				-}
			
 
				-#endif // STARPU_USE_GORDON
			
 
				-
			
 
				-static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter)
			
 
				-{
			
 
				-	/* A B[task] = C[task] */
			
 
				-	struct starpu_task *task = starpu_task_create();
			
 
				-
			
 
				-	task->cl = &cl;
			
 
				-
			
 
				-	task->use_tag = 1;
			
 
				-	task->tag_id = TAG(z, y, x, iter);
			
 
				-
			
 
				-	task->buffers[0].handle = A_state[y][z];
			
 
				-	task->buffers[0].mode = STARPU_R;
			
 
				-	task->buffers[1].handle = B_state[z][x];
			
 
				-	task->buffers[1].mode = STARPU_R;
			
 
				-	task->buffers[2].handle = C_state[y][x];
			
 
				-	task->buffers[2].mode = STARPU_RW;
			
 
				-
			
 
				-#ifdef STARPU_USE_GORDON
			
 
				-	task->cl_arg = &conf;
			
 
				-	task->cl_arg_size = sizeof(struct ibm_sgemm_block_conf);
			
 
				-#endif
			
 
				-
			
 
				-	return task;
			
 
				-}
			
 
				-
			
 
				-static void callback_func_2(void *arg)
			
 
				-{
			
 
				-	/* the argument is a pointer to a counter of the remaining tasks */
			
 
				-	struct cb2_s *cb2 = arg;
			
 
				-	unsigned x,y,z,iter;
			
 
				-
			
 
				-	iter = cb2->iter;
			
 
				-	x = cb2->blockx;
			
 
				-	y = cb2->blocky;
			
 
				-
			
 
				-	free(cb2);
			
 
				-
			
 
				-	/* do some accounting */
			
 
				-	int id = starpu_worker_get_id();
			
 
				-	flop_per_worker[id] += BLAS3_FLOP(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
			
 
				-	ls_per_worker[id] += BLAS3_LS(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
			
 
				-
			
 
				-	/* TAG(nslicesz - 1, y, x, iter) remains ... */
			
 
				-	for (z = 0; z < nslicesz - 1; z++)
			
 
				-	{
			
 
				-		starpu_tag_remove(TAG(z, y, x, iter));
			
 
				-	}
			
 
				-
			
 
				-	if (iter > 0)
			
 
				-	{
			
 
				-		starpu_tag_remove(TAG(nslicesz - 1, y, x, iter-1));
			
 
				-	}
			
 
				-	
			
 
				-	if (iter != niter - 1) {
			
 
				-		submit_new_iter(x, y, iter+1);
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-
			
 
				-
			
 
				-static void submit_new_iter(unsigned x, unsigned y, unsigned iter)
			
 
				-{
			
 
				-	unsigned z;
			
 
				-	for (z = 0; z < nslicesz; z++) 
			
 
				-	{
			
 
				-		struct starpu_task *task;
			
 
				-		task = construct_task(x, y, z, iter);
			
 
				-		
			
 
				-		if (z != 0) {
			
 
				-			starpu_tag_declare_deps(TAG(z, y, x, iter), 1, TAG(z-1, y, x, iter));
			
 
				-		}
			
 
				-
			
 
				-		if (z == nslicesz - 1) {
			
 
				-			struct cb2_s *cb2 = malloc(sizeof(struct cb2_s));
			
 
				-				cb2->blockx = x;
			
 
				-				cb2->blocky = y;
			
 
				-				cb2->iter = iter;
			
 
				-			task->callback_func = callback_func_2;
			
 
				-			task->callback_arg = cb2;
			
 
				-		}
			
 
				-
			
 
				-		starpu_task_submit(task);
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-static void launch_codelets(void)
			
 
				-{
			
 
				-#ifdef STARPU_USE_FXT
			
 
				-	_starpu_fxt_register_thread(0);
			
 
				-#endif
			
 
				-	/* partition the work into slices */
			
 
				-	unsigned taskx, tasky;
			
 
				-
			
 
				-	srand(time(NULL));
			
 
				-
			
 
				-	/* should we use a single performance model for all archs and use an
			
 
				- 	 * acceleration factor ? */
			
 
				-	if (use_common_model) {
			
 
				-		cl.model = &STARPU_GEMM(model_common);
			
 
				-	}
			
 
				-	else {
			
 
				-		cl.model = &STARPU_GEMM(model);
			
 
				-	}
			
 
				-
			
 
				-	for (taskx = 0; taskx < nslicesx; taskx++) 
			
 
				-	{
			
 
				-		for (tasky = 0; tasky < nslicesy; tasky++)
			
 
				-		{
			
 
				-			submit_new_iter(taskx, tasky, 0);
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-int main(__attribute__ ((unused)) int argc, 
			
 
				-	 __attribute__ ((unused)) char **argv)
			
 
				-{
			
 
				-
			
 
				-	parse_args(argc, argv);
			
 
				-
			
 
				-	/* start the runtime */
			
 
				-	starpu_init(NULL);
			
 
				-
			
 
				-	starpu_helper_cublas_init();
			
 
				-
			
 
				-#ifdef STARPU_USE_GORDON
			
 
				-	load_elf_sgemm();
			
 
				-#endif
			
 
				-
			
 
				-	init_problem_data();
			
 
				-
			
 
				-	gettimeofday(&start, NULL);
			
 
				-
			
 
				-	launch_codelets();
			
 
				-
			
 
				-	starpu_task_wait_for_all();
			
 
				-
			
 
				-	gettimeofday(&end, NULL);
			
 
				-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				-	display_stats(timing);
			
 
				-
			
 
				-	cleanup_problem();
			
 
				-
			
 
				-	starpu_helper_cublas_shutdown();
			
 
				-	starpu_shutdown();
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
--- a/examples/mult/dw_mult_no_stride_no_tag.c
+++ b/examples/mult/dw_mult_no_stride_no_tag.c
@@ -1,447 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include "simple.h"
			
 
				-#include "dw_mult.h"
			
 
				-#ifdef STARPU_USE_GORDON
			
 
				-#include "gordon/func_sgemm_ibm.h"
			
 
				-#endif
			
 
				-#include "xgemm_kernels.c"
			
 
				-
			
 
				-
			
 
				-struct pos {
			
 
				-	unsigned x,y, z,iter;
			
 
				-};
			
 
				-
			
 
				-struct pos currentpos [MAXSLICESY][MAXSLICESX];
			
 
				-
			
 
				-TYPE *A[MAXSLICESY][MAXSLICESZ];
			
 
				-TYPE *B[MAXSLICESZ][MAXSLICESX];
			
 
				-TYPE *C[MAXSLICESY][MAXSLICESX];
			
 
				-
			
 
				-starpu_data_handle A_state[MAXSLICESY][MAXSLICESZ];
			
 
				-starpu_data_handle B_state[MAXSLICESZ][MAXSLICESX];
			
 
				-starpu_data_handle C_state[MAXSLICESY][MAXSLICESX];
			
 
				-
			
 
				-
			
 
				-static void callback_func_3(void *arg);
			
 
				-/*
			
 
				- * This program computes C = A * B 
			
 
				- * 
			
 
				- * The difference with dw_mult_no_stride.c is that here we do not use tags, and
			
 
				- * just rely on sequential data consistency.
			
 
				- *   A of size (z,y)
			
 
				- *   B of size (x,z)
			
 
				- *   C of size (x,y)
			
 
				-
			
 
				-              |---------------|
			
 
				-            z |       B       |
			
 
				-              |---------------|
			
 
				-       z              x
			
 
				-     |----|   |---------------|
			
 
				-     |    |   |               |
			
 
				-     |    |   |               |
			
 
				-     | A  | y |       C       |
			
 
				-     |    |   |               |
			
 
				-     |    |   |               |
			
 
				-     |----|   |---------------|
			
 
				-
			
 
				- */
			
 
				-
			
 
				-#define MEM_ALIGNMENT	16
			
 
				-
			
 
				-static void init_problem_data(void)
			
 
				-{
			
 
				-	unsigned i,j;
			
 
				-
			
 
				-	/* debug ... */
			
 
				-	memset(A, 0, MAXSLICESY*MAXSLICESZ*sizeof(TYPE *));
			
 
				-	memset(B, 0, MAXSLICESZ*MAXSLICESZ*sizeof(TYPE *));
			
 
				-	memset(C, 0, MAXSLICESY*MAXSLICESX*sizeof(TYPE *));
			
 
				-	memset(&A_state, 0, MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle));
			
 
				-	memset(&B_state, 0, MAXSLICESZ*MAXSLICESZ*sizeof(starpu_data_handle));
			
 
				-	memset(&C_state, 0, MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle));
			
 
				-
			
 
				-	/* Allocate grids of buffer */
			
 
				-	/* TODO pin ... */
			
 
				-	unsigned z, y, x;
			
 
				-
			
 
				-	for (y = 0; y < nslicesy; y++)
			
 
				-	{
			
 
				-		for (z = 0; z < nslicesz; z++)
			
 
				-		{
			
 
				-#ifdef STARPU_HAVE_POSIX_MEMALIGN
			
 
				-			posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
			
 
				-#else
			
 
				-			A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
			
 
				-#endif
			
 
				-			assert(A[y][z]);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	for (z = 0; z < nslicesz; z++)
			
 
				-	{
			
 
				-		for (x = 0; x < nslicesx; x++)
			
 
				-		{
			
 
				-#ifdef STARPU_HAVE_POSIX_MEMALIGN
			
 
				-			posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
			
 
				-#else
			
 
				-			B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
			
 
				-#endif
			
 
				-			assert(B[z][x]);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	for (y = 0; y < nslicesy; y++)
			
 
				-	{
			
 
				-		for (x = 0; x < nslicesx; x++)
			
 
				-		{
			
 
				-#ifdef STARPU_HAVE_POSIX_MEMALIGN
			
 
				-			posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
			
 
				-#else
			
 
				-			C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
			
 
				-#endif
			
 
				-			currentpos[y][x].x = x;
			
 
				-			currentpos[y][x].y = y;
			
 
				-			currentpos[y][x].z = 0;
			
 
				-			currentpos[y][x].iter = 0;
			
 
				-			assert(C[y][x]);
			
 
				-		}
			
 
				-	}
			
 
				-	
			
 
				-	/* fill the A and B matrices */
			
 
				-	unsigned blockx, blocky, blockz;
			
 
				-
			
 
				-	if (norandom) {
			
 
				-		for (blocky = 0; blocky < nslicesy; blocky++)
			
 
				-			for (blockz = 0; blockz < nslicesz; blockz++)
			
 
				-				for (j = 0; j < BLOCKSIZEY; j++)
			
 
				-					for (i = 0; i < BLOCKSIZEZ; i++)
			
 
				-					{
			
 
				-						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(1 + blockz + blocky*nslicesz);
			
 
				-					}
			
 
				-
			
 
				-		for (blockz = 0; blockz < nslicesz; blockz++)
			
 
				-			for (blockx = 0; blockx < nslicesx; blockx++)
			
 
				-				for (j = 0; j < BLOCKSIZEZ; j++)
			
 
				-					for (i = 0; i < BLOCKSIZEX; i++)
			
 
				-					{
			
 
				-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(1 + blockx + blockz*nslicesx);
			
 
				-					}
			
 
				-	} 
			
 
				-	else {
			
 
				-		for (blocky = 0; blocky < nslicesy; blocky++)
			
 
				-			for (blockz = 0; blockz < nslicesz; blockz++)
			
 
				-				for (j = 0; j < BLOCKSIZEY; j++)
			
 
				-					for (i = 0; i < BLOCKSIZEZ; i++)
			
 
				-					{
			
 
				-						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(starpu_drand48());
			
 
				-					}
			
 
				-
			
 
				-		for (blockz = 0; blockz < nslicesz; blockz++)
			
 
				-			for (blockx = 0; blockx < nslicesx; blockx++)
			
 
				-				for (j = 0; j < BLOCKSIZEZ; j++)
			
 
				-					for (i = 0; i < BLOCKSIZEX; i++)
			
 
				-					{
			
 
				-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(starpu_drand48());
			
 
				-					}
			
 
				-
			
 
				-	}
			
 
				-
			
 
				-	for (blocky = 0; blocky < nslicesy; blocky++)
			
 
				-		for (blockx = 0; blockx < nslicesx; blockx++)
			
 
				-			for (j = 0; j < BLOCKSIZEY; j++)
			
 
				-				for (i = 0; i < BLOCKSIZEX; i++)
			
 
				-				{
			
 
				-					C[blocky][blockx][i*BLOCKSIZEY + j] = (TYPE)0;
			
 
				-				}
			
 
				-
			
 
				-
			
 
				-	/* declare the StarPU data to monitor */
			
 
				-	for (y = 0; y < nslicesy; y++)
			
 
				-	{
			
 
				-		for (z = 0; z < nslicesz; z++)
			
 
				-		{
			
 
				-			starpu_matrix_data_register(&A_state[y][z], 0, (uintptr_t)A[y][z], 
			
 
				-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(TYPE));
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	for (z = 0; z < nslicesz; z++)
			
 
				-	{
			
 
				-		for (x = 0; x < nslicesx; x++)
			
 
				-		{
			
 
				-			starpu_matrix_data_register(&B_state[z][x], 0, (uintptr_t)B[z][x], 
			
 
				-				BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(TYPE));
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	for (y = 0; y < nslicesy; y++)
			
 
				-	{
			
 
				-		for (x = 0; x < nslicesx; x++)
			
 
				-		{
			
 
				-			starpu_matrix_data_register(&C_state[y][x], 0, (uintptr_t)C[y][x], 
			
 
				-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(TYPE));
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-#ifdef STARPU_USE_GORDON
			
 
				-	conf.k = BLOCKSIZEZ;
			
 
				-	conf.m = BLOCKSIZEY;
			
 
				-	conf.n = BLOCKSIZEX;
			
 
				-#endif
			
 
				-
			
 
				-	fprintf(stderr, "block size : x %d y %d z %d\n", BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
			
 
				-
			
 
				-	display_memory_consumption();
			
 
				-}
			
 
				-
			
 
				-static void cleanup_problem(void)
			
 
				-{
			
 
				-	unsigned z, y, x;
			
 
				-
			
 
				-#ifdef CHECK_OUTPUT
			
 
				-	TYPE maxerr = 0.0;
			
 
				-	TYPE err;
			
 
				-	fprintf(stderr, "Checking results ....");
			
 
				-
			
 
				-	for (y = 0; y < nslicesy; y++)
			
 
				-	{
			
 
				-		for (x = 0; x < nslicesx; x++)
			
 
				-		{
			
 
				-			for (z = 0; z < nslicesz; z++)
			
 
				-			{
			
 
				-				SGEMM("N", "N", BLOCKSIZEY, BLOCKSIZEX, BLOCKSIZEZ, -(TYPE)(niter), A[y][z], BLOCKSIZEY, B[z][x], BLOCKSIZEZ, 1.0f, C[y][x], BLOCKSIZEY);
			
 
				-
			
 
				-			}
			
 
				-
			
 
				-			/* make sure C - niter AB = 0 */
			
 
				-			err = SASUM(BLOCKSIZEX*BLOCKSIZEY, C[y][x], 1);
			
 
				-
			
 
				-			if (err > BLOCKSIZEX*BLOCKSIZEY*niter*0.001) 
			
 
				-				fprintf(stderr, "\nerr = %f ( x = %d y = %d ) ... ", err/niter, x, y );
			
 
				-
			
 
				-			maxerr = STARPU_MAX(err, maxerr);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	if (maxerr > BLOCKSIZEX*BLOCKSIZEY*niter*0.001)
			
 
				-	{
			
 
				-		fprintf(stderr, " maxerr = %f\n", maxerr/niter);
			
 
				-	}
			
 
				-	else {
			
 
				-		fprintf(stderr, " OK\n");
			
 
				-	}
			
 
				-	fflush(stderr);
			
 
				-#endif
			
 
				-
			
 
				-	for (y = 0; y < nslicesy; y++)
			
 
				-	{
			
 
				-		for (z = 0; z < nslicesz; z++)
			
 
				-		{
			
 
				-	//		free(A[y][z]);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	for (z = 0; z < nslicesz; z++)
			
 
				-	{
			
 
				-		for (x = 0; x < nslicesx; x++)
			
 
				-		{
			
 
				-	//		free(B[z][x]);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	for (y = 0; y < nslicesy; y++)
			
 
				-	{
			
 
				-		for (x = 0; x < nslicesx; x++)
			
 
				-		{
			
 
				-	//		free(C[y][x]);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	
			
 
				-	
			
 
				-}
			
 
				-
			
 
				-struct cb2_s {
			
 
				-	unsigned blockx;
			
 
				-	unsigned blocky;
			
 
				-	unsigned iter;
			
 
				-};
			
 
				-
			
 
				-
			
 
				-static starpu_codelet cl = {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA
			
 
				-#ifdef SPU_FUNC_SGEMM
			
 
				-		|STARPU_GORDON
			
 
				-#endif
			
 
				-		,
			
 
				-	.cpu_func = STARPU_GEMM(cpu_mult),
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	.cuda_func = STARPU_GEMM(cublas_mult),
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_GORDON
			
 
				-	/* .gordon_func will be set by load_elf_sgemm */
			
 
				-#endif
			
 
				-	.nbuffers = 3
			
 
				-};
			
 
				-
			
 
				-
			
 
				-#ifdef STARPU_USE_GORDON
			
 
				-static const char *spu_func_sgemm_elf_file = "./gordon/func_sgemm_ibm.spuelf";
			
 
				-static unsigned spu_func_sgemm_elf_id;
			
 
				-static unsigned spu_func_sgemm_ibm_id;
			
 
				-
			
 
				-static void load_elf_sgemm(void)
			
 
				-{
			
 
				-	spu_func_sgemm_elf_id =
			
 
				-		gordon_register_elf_plugin(spu_func_sgemm_elf_file);
			
 
				-
			
 
				-	spu_func_sgemm_ibm_id = gordon_register_kernel(spu_func_sgemm_elf_id, "func_sgemm_ibm");
			
 
				-	
			
 
				-	gordon_load_plugin_on_all_spu(spu_func_sgemm_elf_id);
			
 
				-	gordon_load_kernel_on_all_spu(spu_func_sgemm_ibm_id);
			
 
				-
			
 
				-	cl.gordon_func = spu_func_sgemm_ibm_id;
			
 
				-}
			
 
				-#endif // STARPU_USE_GORDON
			
 
				-
			
 
				-static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter, struct pos *posp)
			
 
				-{
			
 
				-	struct starpu_task *task = starpu_task_create();
			
 
				-
			
 
				-	task->cl = &cl;
			
 
				-
			
 
				-	task->buffers[0].handle = A_state[y][z];
			
 
				-	task->buffers[0].mode = STARPU_R;
			
 
				-	task->buffers[1].handle = B_state[z][x];
			
 
				-	task->buffers[1].mode = STARPU_R;
			
 
				-	task->buffers[2].handle = C_state[y][x];
			
 
				-	task->buffers[2].mode = STARPU_RW;
			
 
				-
			
 
				-	task->callback_func = callback_func_3;
			
 
				-	task->callback_arg = posp;
			
 
				-
			
 
				-#ifdef STARPU_USE_GORDON
			
 
				-	task->cl_arg = &conf;
			
 
				-	task->cl_arg_size = sizeof(struct ibm_sgemm_block_conf);
			
 
				-#endif
			
 
				-
			
 
				-	posp->z = z;
			
 
				-	posp->iter = iter;
			
 
				-
			
 
				-	return task;
			
 
				-}
			
 
				-
			
 
				-
			
 
				-static void callback_func_3(void *arg)
			
 
				-{
			
 
				-	/* do some accounting */
			
 
				-	int id = starpu_worker_get_id();
			
 
				-	flop_per_worker[id] += BLAS3_FLOP(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
			
 
				-	ls_per_worker[id] += BLAS3_LS(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
			
 
				-
			
 
				-	/* the argument is a pointer to a counter of the remaining tasks */
			
 
				-	struct pos *posp = arg;
			
 
				-	unsigned x,y,z,iter;
			
 
				-
			
 
				-	iter = posp->iter;
			
 
				-	x = posp->x;
			
 
				-	y = posp->y;
			
 
				-	z = posp->z;
			
 
				-
			
 
				-	if (z < nslicesz - 1)
			
 
				-	{
			
 
				-		struct starpu_task *task = construct_task(x, y, z+1, iter, posp);
			
 
				-		starpu_task_submit(task);
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		if (iter < niter - 1)
			
 
				-		{
			
 
				-			struct starpu_task *task = construct_task(x, y, 0, iter+1, posp);
			
 
				-			starpu_task_submit(task);
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-
			
 
				-
			
 
				-
			
 
				-static void launch_codelets(void)
			
 
				-{
			
 
				-#ifdef STARPU_USE_FXT
			
 
				-	_starpu_fxt_register_thread(0);
			
 
				-#endif
			
 
				-	/* partition the work into slices */
			
 
				-	unsigned taskx, tasky;
			
 
				-
			
 
				-	srand(time(NULL));
			
 
				-
			
 
				-	/* should we use a single performance model for all archs and use an
			
 
				- 	 * acceleration factor ? */
			
 
				-	if (use_common_model) {
			
 
				-		cl.model = &STARPU_GEMM(model_common);
			
 
				-	}
			
 
				-	else {
			
 
				-		cl.model = &STARPU_GEMM(model);
			
 
				-	}
			
 
				-
			
 
				-	for (taskx = 0; taskx < nslicesx; taskx++) 
			
 
				-	{
			
 
				-		for (tasky = 0; tasky < nslicesy; tasky++)
			
 
				-		{
			
 
				-			struct starpu_task *task = construct_task(taskx, tasky, 0, 0, &currentpos[tasky][taskx]);
			
 
				-			starpu_task_submit(task);
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-int main(__attribute__ ((unused)) int argc, 
			
 
				-	 __attribute__ ((unused)) char **argv)
			
 
				-{
			
 
				-
			
 
				-	parse_args(argc, argv);
			
 
				-
			
 
				-	/* start the runtime */
			
 
				-	starpu_init(NULL);
			
 
				-
			
 
				-	starpu_helper_cublas_init();
			
 
				-
			
 
				-#ifdef STARPU_USE_GORDON
			
 
				-	load_elf_sgemm();
			
 
				-#endif
			
 
				-
			
 
				-	init_problem_data();
			
 
				-
			
 
				-	gettimeofday(&start, NULL);
			
 
				-
			
 
				-	launch_codelets();
			
 
				-
			
 
				-	starpu_task_wait_for_all();
			
 
				-
			
 
				-	gettimeofday(&end, NULL);
			
 
				-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				-	display_stats(timing);
			
 
				-
			
 
				-	cleanup_problem();
			
 
				-
			
 
				-	starpu_helper_cublas_shutdown();
			
 
				-	starpu_shutdown();
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
--- a/examples/mult/gordon/func_dgemm_ibm.c
+++ b/examples/mult/gordon/func_dgemm_ibm.c
@@ -1,42 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include "func_gemm_ibm.h"
			
 
				-
			
 
				-#include <blas_s.h>
			
 
				-
			
 
				-void func_dgemm_ibm(__attribute__ ((unused)) void **alloc,
			
 
				-		__attribute__ ((unused)) void **in,
			
 
				-		__attribute__ ((unused)) void **inout,
			
 
				-		__attribute__ ((unused)) void **out)
			
 
				-{
			
 
				-	/* we assume data will be in A:R,B:R,C:RW mode
			
 
				- 	 *  -> in[0] : describe problem
			
 
				- 	 *  -> in[1] : A
			
 
				- 	 *  -> in[2] : B
			
 
				- 	 *  -> inout[0] : C
			
 
				- 	 *
			
 
				- 	 *   C = AB + C
			
 
				- 	 *   but, being in fortran ordering, we compute
			
 
				- 	 *   t(C) = t(B)t(A) + t(C) instead
			
 
				- 	 */
			
 
				-	struct ibm_gemm_block_conf *conf = in[0];
			
 
				-	double *A = in[1];
			
 
				-	double *B = in[2];
			
 
				-	double *C = inout[0];
			
 
				-
			
 
				-	dgemm_spu(conf->m, conf->n, conf->k, B, A, C);
			
 
				-}
			
--- a/examples/mult/gordon/func_gemm_ibm.h
+++ b/examples/mult/gordon/func_gemm_ibm.h
@@ -1,29 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#ifndef __FUNC_SGEMM_IBM_H__
			
 
				-#define __FUNC_SGEMM_IBM_H__
			
 
				-
			
 
				-#include <stdint.h>
			
 
				-
			
 
				-struct ibm_sgemm_block_conf {
			
 
				-	uint32_t m;
			
 
				-	uint32_t n;
			
 
				-	uint32_t k;
			
 
				-	uint32_t pad;
			
 
				-};
			
 
				-
			
 
				-#endif // __FUNC_SGEMM_IBM_H__
			
--- a/examples/mult/gordon/func_sgemm_ibm.c
+++ b/examples/mult/gordon/func_sgemm_ibm.c
@@ -1,43 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include "func_gemm_ibm.h"
			
 
				-
			
 
				-#include <blas_s.h>
			
 
				-
			
 
				-void func_sgemm_ibm(__attribute__ ((unused)) void **alloc,
			
 
				-		__attribute__ ((unused)) void **in,
			
 
				-		__attribute__ ((unused)) void **inout,
			
 
				-		__attribute__ ((unused)) void **out)
			
 
				-{
			
 
				-	/* we assume data will be in A:R,B:R,C:RW mode
			
 
				- 	 *  -> in[0] : describe problem
			
 
				- 	 *  -> in[1] : A
			
 
				- 	 *  -> in[2] : B
			
 
				- 	 *  -> inout[0] : C
			
 
				- 	 *
			
 
				- 	 *   C = AB + C
			
 
				- 	 *   but, being in fortran ordering, we compute
			
 
				- 	 *   t(C) = t(B)t(A) + t(C) instead
			
 
				- 	 */
			
 
				-	struct ibm_gemm_block_conf *conf = in[0];
			
 
				-	float *A = in[1];
			
 
				-	float *B = in[2];
			
 
				-	float *C = inout[0];
			
 
				-
			
 
				-	sgemm_spu(conf->m, conf->n, conf->k, B, A, C);
			
 
				-}
			
--- a/examples/mult/sgemm.c
+++ b/examples/mult/sgemm.c
@@ -16,6 +16,4 @@
 
				  */
			
 
				 
			
 
				 #include "simple.h"
			
 
				-
			
 
				-#include "xgemm_kernels.c"
			
 
				 #include "xgemm.c" 
			
--- a/examples/mult/simple.h
+++ b/examples/mult/simple.h
@@ -17,7 +17,6 @@
 
				 #define TYPE	float
			
 
				 
			
 
				 #define CUBLAS_GEMM cublasSgemm
			
 
				-#define MAGMABLAS_GEMM magmablas_sgemm
			
 
				 #define CPU_GEMM	SGEMM
			
 
				 #define CPU_ASUM	SASUM
			
 
				 #define CPU_IAMAX	ISAMAX
			
@@ -26,4 +25,3 @@
 
				 #define str(s) #s
			
 
				 #define xstr(s)        str(s)
			
 
				 #define STARPU_GEMM_STR(name)  xstr(STARPU_GEMM(name))
			
 
				-
			
--- a/examples/mult/xgemm.c
+++ b/examples/mult/xgemm.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				  *
			
@@ -16,37 +16,79 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				-#include "dw_mult.h"
			
 
				-
			
 
				-TYPE *A, *B, *C;
			
 
				-starpu_data_handle A_handle, B_handle, C_handle;
			
 
				-
			
 
				-/*
			
 
				- * This program computes C = A * B 
			
 
				- * 
			
 
				- *   A of size (z,y)
			
 
				- *   B of size (x,z)
			
 
				- *   C of size (x,y)
			
 
				-
			
 
				-              |---------------|
			
 
				-            z |       B       |
			
 
				-              |---------------|
			
 
				-       z              x
			
 
				-     |----|   |---------------|
			
 
				-     |    |   |               |
			
 
				-     |    |   |               |
			
 
				-     | A  | y |       C       |
			
 
				-     |    |   |               |
			
 
				-     |    |   |               |
			
 
				-     |----|   |---------------|
			
 
				+#include <string.h>
			
 
				+#include <math.h>
			
 
				+#include <sys/types.h>
			
 
				+#include <sys/time.h>
			
 
				+#include <starpu.h>
			
 
				 
			
 
				- */
			
 
				+#include <common/blas.h>
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+#include <cuda.h>
			
 
				+#include <cublas.h>
			
 
				+#endif
			
 
				+
			
 
				+static unsigned niter = 100;
			
 
				+static unsigned nslicesx = 4;
			
 
				+static unsigned nslicesy = 4;
			
 
				+static unsigned xdim = 256;
			
 
				+static unsigned ydim = 256;
			
 
				+static unsigned zdim = 64;
			
 
				+static unsigned check = 0;
			
 
				+
			
 
				+static TYPE *A, *B, *C;
			
 
				+static starpu_data_handle A_handle, B_handle, C_handle;
			
 
				+
			
 
				+/*
			
 
				+ * Parse the command line and update the static problem configuration.
			
 
				+ *
			
 
				+ * Recognised options (a numeric option consumes the argument that follows
			
 
				+ * it, converted with strtol in base 10):
			
 
				+ *   -nblocks N    set both nslicesx and nslicesy
			
 
				+ *   -nblocksx N   set nslicesx
			
 
				+ *   -nblocksy N   set nslicesy
			
 
				+ *   -x N, -y N, -z N   set xdim, ydim, zdim
			
 
				+ *   -iter N       set niter
			
 
				+ *   -check        set check to 1
			
 
				+ *
			
 
				+ * Unrecognised arguments are ignored. An argument consumed as the value of
			
 
				+ * an option is not examined again as an option name.
			
 
				+ */
			
 
				+static void parse_args(int argc, char **argv)
			
 
				+{
			
 
				+	int i;
			
 
				+	for (i = 1; i < argc; i++) {
			
 
				+		const char *opt = argv[i];
			
 
				+		unsigned *dst = NULL;
			
 
				+
			
 
				+		/* -check is the only option that takes no value */
			
 
				+		if (strcmp(opt, "-check") == 0) {
			
 
				+			check = 1;
			
 
				+			continue;
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(opt, "-nblocks") == 0 || strcmp(opt, "-nblocksx") == 0)
			
 
				+			dst = &nslicesx;
			
 
				+		else if (strcmp(opt, "-nblocksy") == 0)
			
 
				+			dst = &nslicesy;
			
 
				+		else if (strcmp(opt, "-x") == 0)
			
 
				+			dst = &xdim;
			
 
				+		else if (strcmp(opt, "-y") == 0)
			
 
				+			dst = &ydim;
			
 
				+		else if (strcmp(opt, "-z") == 0)
			
 
				+			dst = &zdim;
			
 
				+		else if (strcmp(opt, "-iter") == 0)
			
 
				+			dst = &niter;
			
 
				+
			
 
				+		/* Unknown argument: skip it */
			
 
				+		if (!dst)
			
 
				+			continue;
			
 
				+
			
 
				+		/* The option is the last argument and has no value: argv[argc]
			
 
				+		 * is a null pointer, which must not reach strtol or strcmp.
			
 
				+		 * Stop parsing and keep the current setting. */
			
 
				+		if (i + 1 >= argc)
			
 
				+			break;
			
 
				+
			
 
				+		*dst = strtol(argv[++i], NULL, 10);
			
 
				+
			
 
				+		/* -nblocks applies the same count to both dimensions */
			
 
				+		if (strcmp(opt, "-nblocks") == 0)
			
 
				+			nslicesy = nslicesx;
			
 
				+	}
			
 
				+}
			
 
				 
			
 
				 static void check_output(void)
			
 
				 {
			
 
				-	/* check results */
			
 
				 	/* compute C = C - AB */
			
 
				-
			
 
				 	CPU_GEMM("N", "N", ydim, xdim, zdim, (TYPE)-1.0f, A, ydim, B, zdim, (TYPE)1.0f, C, ydim);
			
 
				 		
			
 
				 	/* make sure C = 0 */
			
@@ -65,66 +107,24 @@ static void check_output(void)
 
				 	}
			
 
				 }
			
 
				 
			
 
				-void callback_func(void *arg)
			
 
				-{
			
 
				-	/* do some accounting */
			
 
				-	int id = starpu_worker_get_id();
			
 
				-	flop_per_worker[id] += BLAS3_FLOP(conf.m, conf.n, conf.k);
			
 
				-	ls_per_worker[id] += BLAS3_LS(conf.m, conf.n, conf.k);
			
 
				-}
			
 
				-
			
 
				 static void init_problem_data(void)
			
 
				 {
			
 
				 	unsigned i,j;
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	if (pin) {
			
 
				-		starpu_data_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(TYPE));
			
 
				-		starpu_data_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(TYPE));
			
 
				-		starpu_data_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(TYPE));
			
 
				-	} else
			
 
				-#endif
			
 
				-	{
			
 
				-#ifdef STARPU_HAVE_POSIX_MEMALIGN
			
 
				-		posix_memalign((void **)&A, 4096, zdim*ydim*sizeof(TYPE));
			
 
				-		posix_memalign((void **)&B, 4096, xdim*zdim*sizeof(TYPE));
			
 
				-		posix_memalign((void **)&C, 4096, xdim*ydim*sizeof(TYPE));
			
 
				-#else
			
 
				-		A = malloc(zdim*ydim*sizeof(TYPE));
			
 
				-		B = malloc(xdim*zdim*sizeof(TYPE));
			
 
				-		C = malloc(xdim*ydim*sizeof(TYPE));
			
 
				-#endif
			
 
				-	}
			
 
				+	starpu_data_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(TYPE));
			
 
				+	starpu_data_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(TYPE));
			
 
				+	starpu_data_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(TYPE));
			
 
				 
			
 
				 	/* fill the A and B matrices */
			
 
				-	if (norandom) {
			
 
				-		for (j=0; j < ydim; j++) {
			
 
				-			for (i=0; i < zdim; i++) {
			
 
				-				A[j+i*ydim] = (TYPE)(i);
			
 
				-			}
			
 
				-		}
			
 
				-	
			
 
				-		for (j=0; j < zdim; j++) {
			
 
				-			for (i=0; i < xdim; i++) {
			
 
				-				B[j+i*zdim] = (TYPE)(j);
			
 
				-			}
			
 
				-		}
			
 
				-	} 
			
 
				-	else {
			
 
				-#ifdef NORANDOM
			
 
				-		srand(2008);
			
 
				-		STARPU_ABORT();
			
 
				-#endif
			
 
				-		for (j=0; j < ydim; j++) {
			
 
				-			for (i=0; i < zdim; i++) {
			
 
				-				A[j+i*ydim] = (TYPE)(starpu_drand48());
			
 
				-			}
			
 
				+	for (j=0; j < ydim; j++) {
			
 
				+		for (i=0; i < zdim; i++) {
			
 
				+			A[j+i*ydim] = (TYPE)(starpu_drand48());
			
 
				 		}
			
 
				-	
			
 
				-		for (j=0; j < zdim; j++) {
			
 
				-			for (i=0; i < xdim; i++) {
			
 
				-				B[j+i*zdim] = (TYPE)(starpu_drand48());
			
 
				-			}
			
 
				+	}
			
 
				+
			
 
				+	for (j=0; j < zdim; j++) {
			
 
				+		for (i=0; i < xdim; i++) {
			
 
				+			B[j+i*zdim] = (TYPE)(starpu_drand48());
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -133,8 +133,6 @@ static void init_problem_data(void)
 
				 			C[j+i*ydim] = (TYPE)(0);
			
 
				 		}
			
 
				 	}
			
 
				-
			
 
				-	display_memory_consumption();
			
 
				 }
			
 
				 
			
 
				 static void partition_mult_data(void)
			
@@ -148,21 +146,15 @@ static void partition_mult_data(void)
 
				 
			
 
				 	starpu_data_set_wt_mask(C_handle, 1<<0);
			
 
				 
			
 
				-	conf.k = zdim;
			
 
				-	conf.m = ydim/nslicesy;
			
 
				-	conf.n = xdim/nslicesx;
			
 
				-
			
 
				 	struct starpu_data_filter f;
			
 
				+	memset(&f, 0, sizeof(f));
			
 
				 	f.filter_func = starpu_vertical_block_filter_func;
			
 
				 	f.nchildren = nslicesx;
			
 
				-	f.get_nchildren = NULL;
			
 
				-	f.get_child_ops = NULL;
			
 
				 		
			
 
				 	struct starpu_data_filter f2;
			
 
				+	memset(&f2, 0, sizeof(f2));
			
 
				 	f2.filter_func = starpu_block_filter_func;
			
 
				 	f2.nchildren = nslicesy;
			
 
				-	f2.get_nchildren = NULL;
			
 
				-	f2.get_child_ops = NULL;
			
 
				 		
			
 
				 	starpu_data_partition(B_handle, &f);
			
 
				 	starpu_data_partition(A_handle, &f2);
			
@@ -170,126 +162,110 @@ static void partition_mult_data(void)
 
				 	starpu_data_map_filters(C_handle, 2, &f, &f2);
			
 
				 }
			
 
				 
			
 
				-static void unpartition_mult_data(void)
			
 
				+static void mult_kernel_common(void *descr[], int type)
			
 
				 {
			
 
				-	fprintf(stderr, "unpartition !!\n");
			
 
				+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				+	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
			
 
				+	TYPE *subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);
			
 
				 
			
 
				-	starpu_data_unpartition(C_handle, 0);
			
 
				+	unsigned nxC = STARPU_MATRIX_GET_NX(descr[2]);
			
 
				+	unsigned nyC = STARPU_MATRIX_GET_NY(descr[2]);
			
 
				+	unsigned nyA = STARPU_MATRIX_GET_NY(descr[0]);
			
 
				 
			
 
				-	starpu_data_unregister(C_handle);
			
 
				-}
			
 
				+	unsigned ldA = STARPU_MATRIX_GET_LD(descr[0]);
			
 
				+	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
			
 
				+	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
			
 
				 
			
 
				-static starpu_codelet cl = {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA
			
 
				-#ifdef SPU_FUNC_SGEMM
			
 
				-		|STARPU_GORDON
			
 
				-#endif
			
 
				-		,
			
 
				-	.cpu_func = STARPU_GEMM(cpu_mult),
			
 
				+	if (type == STARPU_CPU) {
			
 
				+		CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
			
 
				+	}
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	.cuda_func = STARPU_GEMM(cublas_mult),
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_GORDON
			
 
				-#ifdef SPU_FUNC_SGEMM
			
 
				-	.gordon_func = SPU_FUNC_SGEMM,
			
 
				-#else
			
 
				-#warning SPU_FUNC_SGEMM is not available
			
 
				-#endif
			
 
				+	else {
			
 
				+		CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
			
 
				+					     (TYPE)0.0, subC, ldC);
			
 
				+		cudaThreadSynchronize();
			
 
				+	}
			
 
				 #endif
			
 
				-	.nbuffers = 3
			
 
				-};
			
 
				-
			
 
				-static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter)
			
 
				-{
			
 
				-	/* A B[task] = C[task] */
			
 
				-	struct starpu_task *task = starpu_task_create();
			
 
				-
			
 
				-	task->cl = &cl;
			
 
				-
			
 
				-	/* we have a callback to do some accounting */
			
 
				-	task->callback_func = callback_func;
			
 
				-	task->callback_arg = NULL;
			
 
				-
			
 
				-	task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, y);
			
 
				-	task->buffers[0].mode = STARPU_R;
			
 
				-	task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, x);
			
 
				-	task->buffers[1].mode = STARPU_R;
			
 
				-	task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, x, y);
			
 
				-	task->buffers[2].mode = STARPU_RW;
			
 
				-
			
 
				-	task->cl_arg = &conf;
			
 
				-	task->cl_arg_size = sizeof(struct block_conf);
			
 
				-	return task;
			
 
				 }
			
 
				 
			
 
				-static void submit_new_iter(unsigned x, unsigned y, unsigned iter)
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+static void cublas_mult(void *descr[], __attribute__((unused)) void *arg)
			
 
				 {
			
 
				-	unsigned z;
			
 
				-
			
 
				-	z = 0;
			
 
				-
			
 
				-	{
			
 
				-		struct starpu_task *task;
			
 
				-		task = construct_task(x, y, z, iter);
			
 
				-
			
 
				-		starpu_task_submit(task);
			
 
				-	}
			
 
				+	mult_kernel_common(descr, STARPU_CUDA);
			
 
				 }
			
 
				-
			
 
				-static void launch_codelets(void)
			
 
				-{
			
 
				-#ifdef STARPU_USE_FXT
			
 
				-	_starpu_fxt_register_thread(0);
			
 
				 #endif
			
 
				-	/* partition the work into slices */
			
 
				-	unsigned taskx, tasky;
			
 
				 
			
 
				-	srand(time(NULL));
			
 
				+static void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
			
 
				+{
			
 
				+	mult_kernel_common(descr, STARPU_CPU);
			
 
				+}
			
 
				 
			
 
				-	/* should we use a single performance model for all archs and use an
			
 
				- 	 * acceleration factor ? */
			
 
				-	if (use_common_model) {
			
 
				-		cl.model = &STARPU_GEMM(model_common);
			
 
				-	}
			
 
				-	else {
			
 
				-		cl.model = &STARPU_GEMM(model);
			
 
				-	}
			
 
				+static struct starpu_perfmodel_t starpu_gemm_model = {
			
 
				+	.type = STARPU_HISTORY_BASED,
			
 
				+	.symbol = STARPU_GEMM_STR(gemm)
			
 
				+};
			
 
				 
			
 
				-	for (taskx = 0; taskx < nslicesx; taskx++) 
			
 
				-	{
			
 
				-		for (tasky = 0; tasky < nslicesy; tasky++)
			
 
				-		{
			
 
				-			submit_new_iter(taskx, tasky, 0);
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				+static starpu_codelet cl = {
			
 
				+	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.cpu_func = cpu_mult,
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	.cuda_func = cublas_mult,
			
 
				+#endif
			
 
				+	.nbuffers = 3,
			
 
				+	.model = &starpu_gemm_model
			
 
				+};
			
 
				 
			
 
				-int main(__attribute__ ((unused)) int argc, 
			
 
				-	 __attribute__ ((unused)) char **argv)
			
 
				+int main(int argc, char **argv)
			
 
				 {
			
 
				+	struct timeval start;
			
 
				+	struct timeval end;
			
 
				 
			
 
				 	parse_args(argc, argv);
			
 
				 
			
 
				-	/* start the runtime */
			
 
				 	starpu_init(NULL);
			
 
				-
			
 
				 	starpu_helper_cublas_init();
			
 
				 
			
 
				 	init_problem_data();
			
 
				+	partition_mult_data();
			
 
				 
			
 
				 	gettimeofday(&start, NULL);
			
 
				 
			
 
				-	partition_mult_data();
			
 
				+	unsigned x, y, iter;
			
 
				+	for (iter = 0; iter < niter; iter++)
			
 
				+	{
			
 
				+		for (x = 0; x < nslicesx; x++) 
			
 
				+		for (y = 0; y < nslicesy; y++)
			
 
				+		{
			
 
				+			struct starpu_task *task = starpu_task_create();
			
 
				+	
			
 
				+			task->cl = &cl;
			
 
				+	
			
 
				+			task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, y);
			
 
				+			task->buffers[0].mode = STARPU_R;
			
 
				+			task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, x);
			
 
				+			task->buffers[1].mode = STARPU_R;
			
 
				+			task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, x, y);
			
 
				+			task->buffers[2].mode = STARPU_RW;
			
 
				+	
			
 
				+			int ret = starpu_task_submit(task);
			
 
				+			STARPU_ASSERT(!ret);
			
 
				+		}
			
 
				 
			
 
				-	launch_codelets();
			
 
				+		starpu_task_wait_for_all();
			
 
				+	}
			
 
				 
			
 
				-	starpu_task_wait_for_all();
			
 
				 
			
 
				 	gettimeofday(&end, NULL);
			
 
				 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				-	display_stats(timing);
			
 
				 
			
 
				-	unpartition_mult_data();
			
 
				+	fprintf(stderr, "Time: %2.2f ms\n", timing/1000.0);
			
 
				+
			
 
				+	double flops = 2.0*((unsigned long)niter)*((unsigned long)xdim)
			
 
				+				*((unsigned long)ydim)*((unsigned long)zdim);
			
 
				+	fprintf(stderr, "GFlop/s: %.2f\n", flops/timing/1000.0);
			
 
				+
			
 
				+	starpu_data_unpartition(C_handle, 0);
			
 
				+	starpu_data_unregister(C_handle);
			
 
				 	
			
 
				 	if (check)
			
 
				 		check_output();
			
--- a/examples/mult/xgemm_kernels.c
+++ b/examples/mult/xgemm_kernels.c
@@ -1,78 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include <starpu.h>
			
 
				-#include <starpu_cuda.h>
			
 
				-#include <common/blas.h>
			
 
				-
			
 
				-#define COMMON_CODE			\
			
 
				-	uint32_t nxC, nyC, nyA;		\
			
 
				-	uint32_t ldA, ldB, ldC;		\
			
 
				-					\
			
 
				-	TYPE *subA;			\
			
 
				-	TYPE *subB;			\
			
 
				-	TYPE *subC;			\
			
 
				-					\
			
 
				-	subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);	\
			
 
				-	subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);	\
			
 
				-	subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);	\
			
 
				-					\
			
 
				-	nxC = STARPU_MATRIX_GET_NX(descr[2]);		\
			
 
				-	nyC = STARPU_MATRIX_GET_NY(descr[2]);		\
			
 
				-	nyA = STARPU_MATRIX_GET_NY(descr[0]);		\
			
 
				-					\
			
 
				-	ldA = STARPU_MATRIX_GET_LD(descr[0]);		\
			
 
				-	ldB = STARPU_MATRIX_GET_LD(descr[1]);		\
			
 
				-	ldC = STARPU_MATRIX_GET_LD(descr[2]);
			
 
				-
			
 
				-
			
 
				-
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-
			
 
				-#ifdef STARPU_HAVE_MAGMA
			
 
				-#define GPU_GEMM MAGMABLAS_GEMM
			
 
				-#else
			
 
				-#define GPU_GEMM CUBLAS_GEMM
			
 
				-#endif
			
 
				-
			
 
				-void STARPU_GEMM(cublas_mult)(void *descr[], __attribute__((unused)) void *arg)
			
 
				-{
			
 
				-	COMMON_CODE
			
 
				-
			
 
				-	starpu_trace_user_event(0x42);
			
 
				-
			
 
				-	GPU_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
			
 
				-					     (TYPE)0.0, subC, ldC);
			
 
				-	cublasStatus st;
			
 
				-	st = cublasGetError();
			
 
				-	if (st != CUBLAS_STATUS_SUCCESS)
			
 
				-		STARPU_ABORT();
			
 
				-
			
 
				-	cudaThreadSynchronize();
			
 
				-
			
 
				-	starpu_trace_user_event(0x42);
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-void STARPU_GEMM(cpu_mult)(void *descr[], __attribute__((unused))  void *arg)
			
 
				-{
			
 
				-	COMMON_CODE
			
 
				-
			
 
				-	starpu_trace_user_event(0x42);
			
 
				-	CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
			
 
				-	starpu_trace_user_event(0x43);
			
 
				-}