15 years ago · 53c60beeb5
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -41,8 +41,6 @@ EXTRA_DIST = 					\
 
																 	spmv/spmv_cuda.cu			\
															
 
																 	gordon/null_kernel_gordon.c		\
															
 
																 	mult/xgemm.c				\
															
 
																-	mult/xgemm_kernels.c			\
															
 
																-	mult/gordon/func_sgemm_ibm.c		\
															
 
																 	lu/xlu.c				\
															
 
																 	lu/xlu_pivot.c				\
															
 
																 	lu/xlu_implicit.c			\
															
@@ -56,8 +54,7 @@ EXTRA_DIST = 					\
 
																 	filters/fblock_opencl_kernel.cl
															
 
																 CLEANFILES = 					\
															
 
																-	gordon/null_kernel_gordon.spuelf	\
															
 
																-	mult/gordon/func_sgemm_ibm.spuelf
															
 
																+	gordon/null_kernel_gordon.spuelf
															
 
																 CLEANFILES += *.gcno *.gcda *.linkinfo
															
@@ -88,8 +85,7 @@ SPULIBS = -lblas #-lc -lgloss -lc
 
																 	$(SPU_LD) $(SPULDFLAGS) $< -o $@ $(SPULIBS)
															
 
																 BUILT_SOURCES +=				\
															
 
																-	gordon/null_kernel_gordon.spuelf	\
															
 
																-	mult/gordon/func_sgemm_ibm.spuelf
															
 
																+	gordon/null_kernel_gordon.spuelf
															
 
																 endif
															
@@ -116,10 +112,8 @@ noinst_HEADERS = 				\
 
																 	cholesky/dw_cholesky.h			\
															
 
																 	common/blas_model.h			\
															
 
																 	common/blas.h				\
															
 
																-	mult/dw_mult.h				\
															
 
																 	mult/simple.h				\
															
 
																 	mult/double.h				\
															
 
																-	mult/gordon/func_gemm_ibm.h		\
															
 
																 	gordon/null.h				\
															
 
																 	fortran/bindings/StarPU_fortran.h	\
															
 
																 	ppm_downscaler/ppm_downscaler.h		\
															
@@ -332,29 +326,15 @@ if !NO_BLAS_LIB
 
																 examplebin_PROGRAMS += 				\
															
 
																 	mult/sgemm 				\
															
 
																-	mult/dgemm 				\
															
 
																-	mult/dw_mult_no_stride			\
															
 
																-	mult/dw_mult_no_stride_no_tag
															
 
																+	mult/dgemm
															
 
																 mult_sgemm_SOURCES = 				\
															
 
																 	mult/sgemm.c				\
															
 
																-	common/blas.c				\
															
 
																-	common/blas_model.c
															
 
																+	common/blas.c
															
 
																 mult_dgemm_SOURCES = 				\
															
 
																 	mult/dgemm.c				\
															
 
																-	common/blas.c				\
															
 
																-	common/blas_model.c
															
 
																-
															
 
																-mult_dw_mult_no_stride_SOURCES = 		\
															
 
																-	mult/dw_mult_no_stride.c		\
															
 
																-	common/blas.c				\
															
 
																-	common/blas_model.c
															
 
																-
															
 
																-mult_dw_mult_no_stride_no_tag_SOURCES =		\
															
 
																-	mult/dw_mult_no_stride_no_tag.c		\
															
 
																-	common/blas.c				\
															
 
																-	common/blas_model.c
															
 
																+	common/blas.c
															
 
																 endif
															
--- a/examples/mult/dgemm.c
+++ b/examples/mult/dgemm.c
@@ -16,6 +16,4 @@
 
																  */
															
 
																 #include "double.h"
															
 
																-
															
 
																-#include "xgemm_kernels.c"
															
 
																 #include "xgemm.c" 
															
--- a/examples/mult/double.h
+++ b/examples/mult/double.h
@@ -17,7 +17,6 @@
 
																 #define TYPE	double
															
 
																 #define CUBLAS_GEMM cublasDgemm
															
 
																-#define MAGMABLAS_GEMM magmablas_dgemm
															
 
																 #define CPU_GEMM	DGEMM
															
 
																 #define CPU_ASUM	DASUM
															
 
																 #define CPU_IAMAX	IDAMAX
															
--- a/examples/mult/dw_mult.h
+++ b/examples/mult/dw_mult.h
@@ -1,203 +0,0 @@
 
																-/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																- *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																- *
															
 
																- * StarPU is free software; you can redistribute it and/or modify
															
 
																- * it under the terms of the GNU Lesser General Public License as published by
															
 
																- * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																- * your option) any later version.
															
 
																- *
															
 
																- * StarPU is distributed in the hope that it will be useful, but
															
 
																- * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																- *
															
 
																- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																- */
															
 
																-
															
 
																-#ifndef __MULT_H__
															
 
																-#define __MULT_H__
															
 
																-
															
 
																-#include <string.h>
															
 
																-#include <math.h>
															
 
																-#include <sys/types.h>
															
 
																-#include <sys/time.h>
															
 
																-#include <pthread.h>
															
 
																-#include <signal.h>
															
 
																-
															
 
																-#include <common/blas.h>
															
 
																-#include <common/blas_model.h>
															
 
																-
															
 
																-#include <starpu.h>
															
 
																-
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-#include <cuda.h>
															
 
																-#include <cublas.h>
															
 
																-#endif
															
 
																-
															
 
																-#define MAXSLICESX	64
															
 
																-#define MAXSLICESY	64
															
 
																-#define MAXSLICESZ	64
															
 
																-
															
 
																-#define BLAS3_FLOP(n1,n2,n3)	\
															
 
																-	(2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
															
 
																-
															
 
																-#define BLAS3_LS(n1,n2,n3)    \
															
 
																-	((2*(n1)*(n3) + (n1)*(n2) + (n2)*(n3))*sizeof(float))
															
 
																-
															
 
																-struct block_conf {
															
 
																-	uint32_t m;
															
 
																-	uint32_t n;
															
 
																-	uint32_t k;
															
 
																-	uint32_t pad;
															
 
																-};
															
 
																-
															
 
																-#define NITER	100
															
 
																-
															
 
																-unsigned niter = NITER;
															
 
																-unsigned nslicesx = 4;
															
 
																-unsigned nslicesy = 4;
															
 
																-unsigned nslicesz = 4;
															
 
																-unsigned xdim = 256;
															
 
																-unsigned ydim = 256;
															
 
																-unsigned zdim = 64;
															
 
																-unsigned norandom = 0;
															
 
																-unsigned pin = 0;
															
 
																-unsigned use_common_model = 0;
															
 
																-unsigned check = 0;
															
 
																-
															
 
																-/* to compute MFlop/s */
															
 
																-uint64_t flop_cublas = 0;
															
 
																-uint64_t flop_atlas = 0;
															
 
																-uint64_t flop_per_worker[STARPU_NMAXWORKERS] = {0};
															
 
																-
															
 
																-/* to compute MB/s (load/store) */
															
 
																-uint64_t ls_cublas = 0;
															
 
																-uint64_t ls_atlas = 0;
															
 
																-uint64_t ls_per_worker[STARPU_NMAXWORKERS] = {0};
															
 
																-
															
 
																-
															
 
																-struct timeval start;
															
 
																-struct timeval end;
															
 
																-
															
 
																-static int taskcounter __attribute__ ((unused));
															
 
																-static struct block_conf conf __attribute__ ((aligned (128)));
															
 
																-
															
 
																-#define BLOCKSIZEX	(xdim / nslicesx)
															
 
																-#define BLOCKSIZEY	(ydim / nslicesy)
															
 
																-#define BLOCKSIZEZ	(zdim / nslicesz)
															
 
																-
															
 
																-static void display_stats(double timing)
															
 
																-{
															
 
																-	unsigned worker;
															
 
																-	unsigned nworkers = starpu_worker_get_count();
															
 
																-
															
 
																-	fprintf(stderr, "Computation took (ms):\n");
															
 
																-	printf("%2.2f\n", timing/1000);
															
 
																-
															
 
																-	uint64_t flop_total = 0, ls_total = 0;
															
 
																-	
															
 
																-	for (worker = 0; worker < nworkers; worker++)
															
 
																-	{
															
 
																-		flop_total += flop_per_worker[worker];
															
 
																-		ls_total += ls_per_worker[worker];
															
 
																-
															
 
																-		char name[32];
															
 
																-		starpu_worker_get_name(worker, name, 32);
															
 
																-
															
 
																-		fprintf(stderr, "\t%s -> %2.2f GFlop\t%2.2f GFlop/s\n", name, (double)flop_per_worker[worker]/1000000000.0f, (double)flop_per_worker[worker]/(double)timing/1000);
															
 
																-	}
															
 
																-
															
 
																-	fprintf(stderr, "Total: %2.2f GFlops\t%2.2f GFlop/s\n", (double)flop_total/1000000000.0f, (double)flop_total/(double)timing/1000);
															
 
																-}
															
 
																-
															
 
																-static void parse_args(int argc, char **argv)
															
 
																-{
															
 
																-	int i;
															
 
																-	for (i = 1; i < argc; i++) {
															
 
																-		if (strcmp(argv[i], "-nblocks") == 0) {
															
 
																-			char *argptr;
															
 
																-			nslicesx = strtol(argv[++i], &argptr, 10);
															
 
																-			nslicesy = nslicesx;
															
 
																-			nslicesz = nslicesx;
															
 
																-		}
															
 
																-
															
 
																-		if (strcmp(argv[i], "-nblocksx") == 0) {
															
 
																-			char *argptr;
															
 
																-			nslicesx = strtol(argv[++i], &argptr, 10);
															
 
																-		}
															
 
																-
															
 
																-		if (strcmp(argv[i], "-nblocksy") == 0) {
															
 
																-			char *argptr;
															
 
																-			nslicesy = strtol(argv[++i], &argptr, 10);
															
 
																-		}
															
 
																-
															
 
																-		if (strcmp(argv[i], "-nblocksz") == 0) {
															
 
																-			char *argptr;
															
 
																-			nslicesz = strtol(argv[++i], &argptr, 10);
															
 
																-		}
															
 
																-
															
 
																-		if (strcmp(argv[i], "-x") == 0) {
															
 
																-			char *argptr;
															
 
																-			xdim = strtol(argv[++i], &argptr, 10);
															
 
																-		}
															
 
																-
															
 
																-		if (strcmp(argv[i], "-y") == 0) {
															
 
																-			char *argptr;
															
 
																-			ydim = strtol(argv[++i], &argptr, 10);
															
 
																-		}
															
 
																-
															
 
																-		if (strcmp(argv[i], "-z") == 0) {
															
 
																-			char *argptr;
															
 
																-			zdim = strtol(argv[++i], &argptr, 10);
															
 
																-		}
															
 
																-
															
 
																-		if (strcmp(argv[i], "-iter") == 0) {
															
 
																-			char *argptr;
															
 
																-			niter = strtol(argv[++i], &argptr, 10);
															
 
																-		}
															
 
																-
															
 
																-
															
 
																-		if (strcmp(argv[i], "-no-random") == 0) {
															
 
																-			norandom = 1;
															
 
																-		}
															
 
																-
															
 
																-		if (strcmp(argv[i], "-pin") == 0) {
															
 
																-			pin = 1;
															
 
																-		}
															
 
																-
															
 
																-		if (strcmp(argv[i], "-check") == 0) {
															
 
																-			check = 1;
															
 
																-		}
															
 
																-
															
 
																-		if (strcmp(argv[i], "-common-model") == 0) {
															
 
																-			use_common_model = 1;
															
 
																-		}
															
 
																-	}
															
 
																-
															
 
																-	assert(nslicesx <= MAXSLICESX); 
															
 
																-	assert(nslicesy <= MAXSLICESY); 
															
 
																-	assert(nslicesz <= MAXSLICESZ); 
															
 
																-}
															
 
																-
															
 
																-static void display_memory_consumption(void)
															
 
																-{
															
 
																-	fprintf(stderr, "Total memory : %ld MB\n",
															
 
																-		(MAXSLICESY*MAXSLICESZ*sizeof(TYPE *) 
															
 
																-		+ MAXSLICESZ*MAXSLICESX*sizeof(TYPE *)
															
 
																-		+ MAXSLICESY*MAXSLICESX*sizeof(TYPE *)
															
 
																-		+ MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle)
															
 
																-		+ MAXSLICESZ*MAXSLICESX*sizeof(starpu_data_handle)
															
 
																-		+ MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle)
															
 
																-		+ ydim*zdim*sizeof(TYPE)
															
 
																-		+ zdim*xdim*sizeof(TYPE)
															
 
																-		+ ydim*xdim*sizeof(TYPE))/(1024*1024) );
															
 
																-}
															
 
																-
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-void cublas_mult(void *descr[], __attribute__((unused)) void *arg);
															
 
																-#endif
															
 
																-
															
 
																-void cpu_mult(void *descr[], __attribute__((unused))  void *arg);
															
 
																-
															
 
																-#endif // __MULT_H__
															
--- a/examples/mult/dw_mult_no_stride.c
+++ b/examples/mult/dw_mult_no_stride.c
@@ -1,465 +0,0 @@
 
																-/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																- *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																- *
															
 
																- * StarPU is free software; you can redistribute it and/or modify
															
 
																- * it under the terms of the GNU Lesser General Public License as published by
															
 
																- * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																- * your option) any later version.
															
 
																- *
															
 
																- * StarPU is distributed in the hope that it will be useful, but
															
 
																- * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																- *
															
 
																- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																- */
															
 
																-
															
 
																-#include "simple.h"
															
 
																-#include "dw_mult.h"
															
 
																-#ifdef STARPU_USE_GORDON
															
 
																-#include "gordon/func_sgemm_ibm.h"
															
 
																-#endif
															
 
																-#include "xgemm_kernels.c"
															
 
																-
															
 
																-TYPE *A[MAXSLICESY][MAXSLICESZ];
															
 
																-TYPE *B[MAXSLICESZ][MAXSLICESX];
															
 
																-TYPE *C[MAXSLICESY][MAXSLICESX];
															
 
																-
															
 
																-starpu_data_handle A_state[MAXSLICESY][MAXSLICESZ];
															
 
																-starpu_data_handle B_state[MAXSLICESZ][MAXSLICESX];
															
 
																-starpu_data_handle C_state[MAXSLICESY][MAXSLICESX];
															
 
																-
															
 
																-#define TAG(x,y,z,iter)	\
															
 
																-		((starpu_tag_t)((z) + (iter)*nslicesz + (x)*(nslicesz*niter) + (y)*(nslicesx*nslicesz*niter)))
															
 
																-
															
 
																-static void submit_new_iter(unsigned x, unsigned y, unsigned iter);
															
 
																-
															
 
																-/*
															
 
																- * This program computes C = A * B 
															
 
																- *
															
 
																- * The difference with xgemm.c is that matrices are here already split in
															
 
																- * blocks, and thus no data partitioning is needed.
															
 
																- * 
															
 
																- *   A of size (z,y)
															
 
																- *   B of size (x,z)
															
 
																- *   C of size (x,y)
															
 
																-
															
 
																-              |---------------|
															
 
																-            z |       B       |
															
 
																-              |---------------|
															
 
																-       z              x
															
 
																-     |----|   |---------------|
															
 
																-     |    |   |               |
															
 
																-     |    |   |               |
															
 
																-     | A  | y |       C       |
															
 
																-     |    |   |               |
															
 
																-     |    |   |               |
															
 
																-     |----|   |---------------|
															
 
																-
															
 
																- */
															
 
																-
															
 
																-#define MEM_ALIGNMENT	16
															
 
																-
															
 
																-static void init_problem_data(void)
															
 
																-{
															
 
																-	unsigned i,j;
															
 
																-
															
 
																-	/* debug ... */
															
 
																-	memset(A, 0, MAXSLICESY*MAXSLICESZ*sizeof(TYPE *));
															
 
																-	memset(B, 0, MAXSLICESZ*MAXSLICESZ*sizeof(TYPE *));
															
 
																-	memset(C, 0, MAXSLICESY*MAXSLICESX*sizeof(TYPE *));
															
 
																-	memset(&A_state, 0, MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle));
															
 
																-	memset(&B_state, 0, MAXSLICESZ*MAXSLICESZ*sizeof(starpu_data_handle));
															
 
																-	memset(&C_state, 0, MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle));
															
 
																-
															
 
																-	/* Allocate grids of buffer */
															
 
																-	/* TODO pin ... */
															
 
																-	unsigned z, y, x;
															
 
																-
															
 
																-	for (y = 0; y < nslicesy; y++)
															
 
																-	{
															
 
																-		for (z = 0; z < nslicesz; z++)
															
 
																-		{
															
 
																-#ifdef STARPU_HAVE_POSIX_MEMALIGN
															
 
																-			posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
															
 
																-#else
															
 
																-			A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
															
 
																-#endif
															
 
																-			assert(A[y][z]);
															
 
																-		}
															
 
																-	}
															
 
																-
															
 
																-	for (z = 0; z < nslicesz; z++)
															
 
																-	{
															
 
																-		for (x = 0; x < nslicesx; x++)
															
 
																-		{
															
 
																-#ifdef STARPU_HAVE_POSIX_MEMALIGN
															
 
																-			posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
															
 
																-#else
															
 
																-			B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
															
 
																-#endif
															
 
																-			assert(B[z][x]);
															
 
																-		}
															
 
																-	}
															
 
																-
															
 
																-	for (y = 0; y < nslicesy; y++)
															
 
																-	{
															
 
																-		for (x = 0; x < nslicesx; x++)
															
 
																-		{
															
 
																-#ifdef STARPU_HAVE_POSIX_MEMALIGN
															
 
																-			posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
															
 
																-#else
															
 
																-			C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
															
 
																-#endif
															
 
																-			assert(C[y][x]);
															
 
																-		}
															
 
																-	}
															
 
																-	
															
 
																-	/* fill the A and B matrices */
															
 
																-	unsigned blockx, blocky, blockz;
															
 
																-
															
 
																-	if (norandom) {
															
 
																-		for (blocky = 0; blocky < nslicesy; blocky++)
															
 
																-			for (blockz = 0; blockz < nslicesz; blockz++)
															
 
																-				for (j = 0; j < BLOCKSIZEY; j++)
															
 
																-					for (i = 0; i < BLOCKSIZEZ; i++)
															
 
																-					{
															
 
																-						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(1 + blockz + blocky*nslicesz);
															
 
																-					}
															
 
																-
															
 
																-		for (blockz = 0; blockz < nslicesz; blockz++)
															
 
																-			for (blockx = 0; blockx < nslicesx; blockx++)
															
 
																-				for (j = 0; j < BLOCKSIZEZ; j++)
															
 
																-					for (i = 0; i < BLOCKSIZEX; i++)
															
 
																-					{
															
 
																-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(1 + blockx + blockz*nslicesx);
															
 
																-					}
															
 
																-	} 
															
 
																-	else {
															
 
																-		for (blocky = 0; blocky < nslicesy; blocky++)
															
 
																-			for (blockz = 0; blockz < nslicesz; blockz++)
															
 
																-				for (j = 0; j < BLOCKSIZEY; j++)
															
 
																-					for (i = 0; i < BLOCKSIZEZ; i++)
															
 
																-					{
															
 
																-						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(starpu_drand48());
															
 
																-					}
															
 
																-
															
 
																-		for (blockz = 0; blockz < nslicesz; blockz++)
															
 
																-			for (blockx = 0; blockx < nslicesx; blockx++)
															
 
																-				for (j = 0; j < BLOCKSIZEZ; j++)
															
 
																-					for (i = 0; i < BLOCKSIZEX; i++)
															
 
																-					{
															
 
																-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(starpu_drand48());
															
 
																-					}
															
 
																-
															
 
																-	}
															
 
																-
															
 
																-	for (blocky = 0; blocky < nslicesy; blocky++)
															
 
																-		for (blockx = 0; blockx < nslicesx; blockx++)
															
 
																-			for (j = 0; j < BLOCKSIZEY; j++)
															
 
																-				for (i = 0; i < BLOCKSIZEX; i++)
															
 
																-				{
															
 
																-					C[blocky][blockx][i*BLOCKSIZEY + j] = (TYPE)(blockx + blocky*nslicesx + 1);
															
 
																-				}
															
 
																-
															
 
																-	/* TODO: aren't we supposed to set data consistency to relaxed, since
															
 
																-	 * tags are supposed to provide the correct dependencies? */
															
 
																-
															
 
																-	/* declare the StarPU data to monitor */
															
 
																-	for (y = 0; y < nslicesy; y++)
															
 
																-	{
															
 
																-		for (z = 0; z < nslicesz; z++)
															
 
																-		{
															
 
																-			starpu_matrix_data_register(&A_state[y][z], 0, (uintptr_t)A[y][z], 
															
 
																-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(TYPE));
															
 
																-		}
															
 
																-	}
															
 
																-
															
 
																-	for (z = 0; z < nslicesz; z++)
															
 
																-	{
															
 
																-		for (x = 0; x < nslicesx; x++)
															
 
																-		{
															
 
																-			starpu_matrix_data_register(&B_state[z][x], 0, (uintptr_t)B[z][x], 
															
 
																-				BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(TYPE));
															
 
																-		}
															
 
																-	}
															
 
																-
															
 
																-	for (y = 0; y < nslicesy; y++)
															
 
																-	{
															
 
																-		for (x = 0; x < nslicesx; x++)
															
 
																-		{
															
 
																-			starpu_matrix_data_register(&C_state[y][x], 0, (uintptr_t)C[y][x], 
															
 
																-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(TYPE));
															
 
																-		}
															
 
																-	}
															
 
																-
															
 
																-#ifdef STARPU_USE_GORDON
															
 
																-	conf.k = BLOCKSIZEZ;
															
 
																-	conf.m = BLOCKSIZEY;
															
 
																-	conf.n = BLOCKSIZEX;
															
 
																-#endif
															
 
																-
															
 
																-	fprintf(stderr, "block size : x %d y %d z %d\n", BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
															
 
																-
															
 
																-	display_memory_consumption();
															
 
																-}
															
 
																-
															
 
																-static void cleanup_problem(void)
															
 
																-{
															
 
																-	unsigned z, y, x;
															
 
																-
															
 
																-#ifdef CHECK_OUTPUT
															
 
																-	TYPE maxerr = 0.0;
															
 
																-	TYPE err;
															
 
																-	fprintf(stderr, "Checking results ....");
															
 
																-
															
 
																-	for (y = 0; y < nslicesy; y++)
															
 
																-	{
															
 
																-		for (x = 0; x < nslicesx; x++)
															
 
																-		{
															
 
																-			for (z = 0; z < nslicesz; z++)
															
 
																-			{
															
 
																-				SGEMM("N", "N", BLOCKSIZEY, BLOCKSIZEX, BLOCKSIZEZ, -(TYPE)(niter), A[y][z], BLOCKSIZEY, B[z][x], BLOCKSIZEZ, 1.0f, C[y][x], BLOCKSIZEY);
															
 
																-
															
 
																-			}
															
 
																-
															
 
																-			/* make sure C - niter AB = 0 */
															
 
																-			err = SASUM(BLOCKSIZEX*BLOCKSIZEY, C[y][x], 1);
															
 
																-
															
 
																-			if (err > BLOCKSIZEX*BLOCKSIZEY*niter*0.001) 
															
 
																-				fprintf(stderr, "\nerr = %f ( x = %d y = %d ) ... ", err/niter, x, y );
															
 
																-
															
 
																-			maxerr = STARPU_MAX(err, maxerr);
															
 
																-		}
															
 
																-	}
															
 
																-
															
 
																-	if (maxerr > BLOCKSIZEX*BLOCKSIZEY*niter*0.001)
															
 
																-	{
															
 
																-		fprintf(stderr, " maxerr = %f\n", maxerr/niter);
															
 
																-	}
															
 
																-	else {
															
 
																-		fprintf(stderr, " OK\n");
															
 
																-	}
															
 
																-	fflush(stderr);
															
 
																-#endif
															
 
																-
															
 
																-	for (y = 0; y < nslicesy; y++)
															
 
																-	{
															
 
																-		for (z = 0; z < nslicesz; z++)
															
 
																-		{
															
 
																-	//		free(A[y][z]);
															
 
																-		}
															
 
																-	}
															
 
																-
															
 
																-	for (z = 0; z < nslicesz; z++)
															
 
																-	{
															
 
																-		for (x = 0; x < nslicesx; x++)
															
 
																-		{
															
 
																-	//		free(B[z][x]);
															
 
																-		}
															
 
																-	}
															
 
																-
															
 
																-	for (y = 0; y < nslicesy; y++)
															
 
																-	{
															
 
																-		for (x = 0; x < nslicesx; x++)
															
 
																-		{
															
 
																-	//		free(C[y][x]);
															
 
																-			starpu_tag_remove(TAG(nslicesz - 1, y, x, niter - 1));
															
 
																-		}
															
 
																-	}
															
 
																-
															
 
																-	
															
 
																-	
															
 
																-}
															
 
																-
															
 
																-struct cb2_s {
															
 
																-	unsigned blockx;
															
 
																-	unsigned blocky;
															
 
																-	unsigned iter;
															
 
																-};
															
 
																-
															
 
																-
															
 
																-static starpu_codelet cl = {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA
															
 
																-#ifdef SPU_FUNC_SGEMM
															
 
																-		|STARPU_GORDON
															
 
																-#endif
															
 
																-		,
															
 
																-	.cpu_func = STARPU_GEMM(cpu_mult),
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-	.cuda_func = STARPU_GEMM(cublas_mult),
															
 
																-#endif
															
 
																-#ifdef STARPU_USE_GORDON
															
 
																-	/* .gordon_func will be set by load_elf_sgemm */
															
 
																-#endif
															
 
																-	.nbuffers = 3
															
 
																-};
															
 
																-
															
 
																-
															
 
																-#ifdef STARPU_USE_GORDON
															
 
																-static const char *spu_func_sgemm_elf_file = "./gordon/func_sgemm_ibm.spuelf";
															
 
																-static unsigned spu_func_sgemm_elf_id;
															
 
																-static unsigned spu_func_sgemm_ibm_id;
															
 
																-
															
 
																-static void load_elf_sgemm(void)
															
 
																-{
															
 
																-	spu_func_sgemm_elf_id =
															
 
																-		gordon_register_elf_plugin(spu_func_sgemm_elf_file);
															
 
																-
															
 
																-	spu_func_sgemm_ibm_id = gordon_register_kernel(spu_func_sgemm_elf_id, "func_sgemm_ibm");
															
 
																-
															
 
																-	gordon_load_plugin_on_all_spu(spu_func_sgemm_elf_id);
															
 
																-	gordon_load_kernel_on_all_spu(spu_func_sgemm_ibm_id);
															
 
																-
															
 
																-	cl.gordon_func = spu_func_sgemm_ibm_id;
															
 
																-}
															
 
																-#endif // STARPU_USE_GORDON
															
 
																-
															
 
																-static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter)
															
 
																-{
															
 
																-	/* A B[task] = C[task] */
															
 
																-	struct starpu_task *task = starpu_task_create();
															
 
																-
															
 
																-	task->cl = &cl;
															
 
																-
															
 
																-	task->use_tag = 1;
															
 
																-	task->tag_id = TAG(z, y, x, iter);
															
 
																-
															
 
																-	task->buffers[0].handle = A_state[y][z];
															
 
																-	task->buffers[0].mode = STARPU_R;
															
 
																-	task->buffers[1].handle = B_state[z][x];
															
 
																-	task->buffers[1].mode = STARPU_R;
															
 
																-	task->buffers[2].handle = C_state[y][x];
															
 
																-	task->buffers[2].mode = STARPU_RW;
															
 
																-
															
 
																-#ifdef STARPU_USE_GORDON
															
 
																-	task->cl_arg = &conf;
															
 
																-	task->cl_arg_size = sizeof(struct ibm_sgemm_block_conf);
															
 
																-#endif
															
 
																-
															
 
																-	return task;
															
 
																-}
															
 
																-
															
 
																-static void callback_func_2(void *arg)
															
 
																-{
															
 
																-	/* the argument is a pointer to a counter of the remaining tasks */
															
 
																-	struct cb2_s *cb2 = arg;
															
 
																-	unsigned x,y,z,iter;
															
 
																-
															
 
																-	iter = cb2->iter;
															
 
																-	x = cb2->blockx;
															
 
																-	y = cb2->blocky;
															
 
																-
															
 
																-	free(cb2);
															
 
																-
															
 
																-	/* do some accounting */
															
 
																-	int id = starpu_worker_get_id();
															
 
																-	flop_per_worker[id] += BLAS3_FLOP(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
															
 
																-	ls_per_worker[id] += BLAS3_LS(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
															
 
																-
															
 
																-	/* TAG(nslicesz - 1, y, x, iter) remains ... */
															
 
																-	for (z = 0; z < nslicesz - 1; z++)
															
 
																-	{
															
 
																-		starpu_tag_remove(TAG(z, y, x, iter));
															
 
																-	}
															
 
																-
															
 
																-	if (iter > 0)
															
 
																-	{
															
 
																-		starpu_tag_remove(TAG(nslicesz - 1, y, x, iter-1));
															
 
																-	}
															
 
																-	
															
 
																-	if (iter != niter - 1) {
															
 
																-		submit_new_iter(x, y, iter+1);
															
 
																-	}
															
 
																-}
															
 
																-
															
 
																-
															
 
																-
															
 
																-static void submit_new_iter(unsigned x, unsigned y, unsigned iter)
															
 
																-{
															
 
																-	unsigned z;
															
 
																-	for (z = 0; z < nslicesz; z++) 
															
 
																-	{
															
 
																-		struct starpu_task *task;
															
 
																-		task = construct_task(x, y, z, iter);
															
 
																-		
															
 
																-		if (z != 0) {
															
 
																-			starpu_tag_declare_deps(TAG(z, y, x, iter), 1, TAG(z-1, y, x, iter));
															
 
																-		}
															
 
																-
															
 
																-		if (z == nslicesz - 1) {
															
 
																-			struct cb2_s *cb2 = malloc(sizeof(struct cb2_s));
															
 
																-				cb2->blockx = x;
															
 
																-				cb2->blocky = y;
															
 
																-				cb2->iter = iter;
															
 
																-			task->callback_func = callback_func_2;
															
 
																-			task->callback_arg = cb2;
															
 
																-		}
															
 
																-
															
 
																-		starpu_task_submit(task);
															
 
																-	}
															
 
																-}
															
 
																-
															
 
																-static void launch_codelets(void)
															
 
																-{
															
 
																-#ifdef STARPU_USE_FXT
															
 
																-	_starpu_fxt_register_thread(0);
															
 
																-#endif
															
 
																-	/* partition the work into slices */
															
 
																-	unsigned taskx, tasky;
															
 
																-
															
 
																-	srand(time(NULL));
															
 
																-
															
 
																-	/* should we use a single performance model for all archs and use an
															
 
																- 	 * acceleration factor ? */
															
 
																-	if (use_common_model) {
															
 
																-		cl.model = &STARPU_GEMM(model_common);
															
 
																-	}
															
 
																-	else {
															
 
																-		cl.model = &STARPU_GEMM(model);
															
 
																-	}
															
 
																-
															
 
																-	for (taskx = 0; taskx < nslicesx; taskx++) 
															
 
																-	{
															
 
																-		for (tasky = 0; tasky < nslicesy; tasky++)
															
 
																-		{
															
 
																-			submit_new_iter(taskx, tasky, 0);
															
 
																-		}
															
 
																-	}
															
 
																-}
															
 
																-
															
 
																-int main(__attribute__ ((unused)) int argc, 
															
 
																-	 __attribute__ ((unused)) char **argv)
															
 
																-{
															
 
																-
															
 
																-	parse_args(argc, argv);
															
 
																-
															
 
																-	/* start the runtime */
															
 
																-	starpu_init(NULL);
															
 
																-
															
 
																-	starpu_helper_cublas_init();
															
 
																-
															
 
																-#ifdef STARPU_USE_GORDON
															
 
																-	load_elf_sgemm();
															
 
																-#endif
															
 
																-
															
 
																-	init_problem_data();
															
 
																-
															
 
																-	gettimeofday(&start, NULL);
															
 
																-
															
 
																-	launch_codelets();
															
 
																-
															
 
																-	starpu_task_wait_for_all();
															
 
																-
															
 
																-	gettimeofday(&end, NULL);
															
 
																-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
															
 
																-	display_stats(timing);
															
 
																-
															
 
																-	cleanup_problem();
															
 
																-
															
 
																-	starpu_helper_cublas_shutdown();
															
 
																-	starpu_shutdown();
															
 
																-
															
 
																-	return 0;
															
 
																-}
															
--- a/examples/mult/dw_mult_no_stride_no_tag.c
+++ b/examples/mult/dw_mult_no_stride_no_tag.c
@@ -1,447 +0,0 @@
 
																-/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																- *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																- *
															
 
																- * StarPU is free software; you can redistribute it and/or modify
															
 
																- * it under the terms of the GNU Lesser General Public License as published by
															
 
																- * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																- * your option) any later version.
															
 
																- *
															
 
																- * StarPU is distributed in the hope that it will be useful, but
															
 
																- * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																- *
															
 
																- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																- */
															
 
																-
															
 
																-#include "simple.h"
															
 
																-#include "dw_mult.h"
															
 
																-#ifdef STARPU_USE_GORDON
															
 
																-#include "gordon/func_sgemm_ibm.h"
															
 
																-#endif
															
 
																-#include "xgemm_kernels.c"
															
 
																-
															
 
																-
															
 
																-struct pos {
															
 
																-	unsigned x,y, z,iter;
															
 
																-};
															
 
																-
															
 
																-struct pos currentpos [MAXSLICESY][MAXSLICESX];
															
 
																-
															
 
																-TYPE *A[MAXSLICESY][MAXSLICESZ];
															
 
																-TYPE *B[MAXSLICESZ][MAXSLICESX];
															
 
																-TYPE *C[MAXSLICESY][MAXSLICESX];
															
 
																-
															
 
																-starpu_data_handle A_state[MAXSLICESY][MAXSLICESZ];
															
 
																-starpu_data_handle B_state[MAXSLICESZ][MAXSLICESX];
															
 
																-starpu_data_handle C_state[MAXSLICESY][MAXSLICESX];
															
 
																-
															
 
																-
															
 
																-static void callback_func_3(void *arg);
															
 
																-/*
															
 
																- * This program computes C = A * B 
															
 
																- * 
															
 
																- * The difference with dw_mult_no_stride.c is that here we do not use tags, and
															
 
																- * just rely on sequential data consistency.
															
 
																- *   A of size (z,y)
															
 
																- *   B of size (x,z)
															
 
																- *   C of size (x,y)
															
 
																-
															
 
																-              |---------------|
															
 
																-            z |       B       |
															
 
																-              |---------------|
															
 
																-       z              x
															
 
																-     |----|   |---------------|
															
 
																-     |    |   |               |
															
 
																-     |    |   |               |
															
 
																-     | A  | y |       C       |
															
 
																-     |    |   |               |
															
 
																-     |    |   |               |
															
 
																-     |----|   |---------------|
															
 
																-
															
 
																- */
															
 
																-
															
 
																-#define MEM_ALIGNMENT	16
															
 
																-
															
 
																-static void init_problem_data(void)
															
 
																-{
															
 
																-	unsigned i,j;
															
 
																-
															
 
																-	/* debug ... */
															
 
																-	memset(A, 0, MAXSLICESY*MAXSLICESZ*sizeof(TYPE *));
															
 
																-	memset(B, 0, MAXSLICESZ*MAXSLICESZ*sizeof(TYPE *));
															
 
																-	memset(C, 0, MAXSLICESY*MAXSLICESX*sizeof(TYPE *));
															
 
																-	memset(&A_state, 0, MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle));
															
 
																-	memset(&B_state, 0, MAXSLICESZ*MAXSLICESZ*sizeof(starpu_data_handle));
															
 
																-	memset(&C_state, 0, MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle));
															
 
																-
															
 
																-	/* Allocate grids of buffer */
															
 
																-	/* TODO pin ... */
															
 
																-	unsigned z, y, x;
															
 
																-
															
 
																-	for (y = 0; y < nslicesy; y++)
															
 
																-	{
															
 
																-		for (z = 0; z < nslicesz; z++)
															
 
																-		{
															
 
																-#ifdef STARPU_HAVE_POSIX_MEMALIGN
															
 
																-			posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
															
 
																-#else
															
 
																-			A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
															
 
																-#endif
															
 
																-			assert(A[y][z]);
															
 
																-		}
															
 
																-	}
															
 
																-
															
 
																-	for (z = 0; z < nslicesz; z++)
															
 
																-	{
															
 
																-		for (x = 0; x < nslicesx; x++)
															
 
																-		{
															
 
																-#ifdef STARPU_HAVE_POSIX_MEMALIGN
															
 
																-			posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
															
 
																-#else
															
 
																-			B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
															
 
																-#endif
															
 
																-			assert(B[z][x]);
															
 
																-		}
															
 
																-	}
															
 
																-
															
 
																-	for (y = 0; y < nslicesy; y++)
															
 
																-	{
															
 
																-		for (x = 0; x < nslicesx; x++)
															
 
																-		{
															
 
																-#ifdef STARPU_HAVE_POSIX_MEMALIGN
															
 
																-			posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
															
 
																-#else
															
 
																-			C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
															
 
																-#endif
															
 
																-			currentpos[y][x].x = x;
															
 
																-			currentpos[y][x].y = y;
															
 
																-			currentpos[y][x].z = 0;
															
 
																-			currentpos[y][x].iter = 0;
															
 
																-			assert(C[y][x]);
															
 
																-		}
															
 
																-	}
															
 
																-	
															
 
																-	/* fill the A and B matrices */
															
 
																-	unsigned blockx, blocky, blockz;
															
 
																-
															
 
																-	if (norandom) {
															
 
																-		for (blocky = 0; blocky < nslicesy; blocky++)
															
 
																-			for (blockz = 0; blockz < nslicesz; blockz++)
															
 
																-				for (j = 0; j < BLOCKSIZEY; j++)
															
 
																-					for (i = 0; i < BLOCKSIZEZ; i++)
															
 
																-					{
															
 
																-						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(1 + blockz + blocky*nslicesz);
															
 
																-					}
															
 
																-
															
 
																-		for (blockz = 0; blockz < nslicesz; blockz++)
															
 
																-			for (blockx = 0; blockx < nslicesx; blockx++)
															
 
																-				for (j = 0; j < BLOCKSIZEZ; j++)
															
 
																-					for (i = 0; i < BLOCKSIZEX; i++)
															
 
																-					{
															
 
																-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(1 + blockx + blockz*nslicesx);
															
 
																-					}
															
 
																-	} 
															
 
																-	else {
															
 
																-		for (blocky = 0; blocky < nslicesy; blocky++)
															
 
																-			for (blockz = 0; blockz < nslicesz; blockz++)
															
 
																-				for (j = 0; j < BLOCKSIZEY; j++)
															
 
																-					for (i = 0; i < BLOCKSIZEZ; i++)
															
 
																-					{
															
 
																-						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(starpu_drand48());
															
 
																-					}
															
 
																-
															
 
																-		for (blockz = 0; blockz < nslicesz; blockz++)
															
 
																-			for (blockx = 0; blockx < nslicesx; blockx++)
															
 
																-				for (j = 0; j < BLOCKSIZEZ; j++)
															
 
																-					for (i = 0; i < BLOCKSIZEX; i++)
															
 
																-					{
															
 
																-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(starpu_drand48());
															
 
																-					}
															
 
																-
															
 
																-	}
															
 
																-
															
 
																-	for (blocky = 0; blocky < nslicesy; blocky++)
															
 
																-		for (blockx = 0; blockx < nslicesx; blockx++)
															
 
																-			for (j = 0; j < BLOCKSIZEY; j++)
															
 
																-				for (i = 0; i < BLOCKSIZEX; i++)
															
 
																-				{
															
 
																-					C[blocky][blockx][i*BLOCKSIZEY + j] = (TYPE)0;
															
 
																-				}
															
 
																-
															
 
																-
															
 
																-	/* declare the StarPU data to monitor */
															
 
																-	for (y = 0; y < nslicesy; y++)
															
 
																-	{
															
 
																-		for (z = 0; z < nslicesz; z++)
															
 
																-		{
															
 
																-			starpu_matrix_data_register(&A_state[y][z], 0, (uintptr_t)A[y][z], 
															
 
																-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(TYPE));
															
 
																-		}
															
 
																-	}
															
 
																-
															
 
																-	for (z = 0; z < nslicesz; z++)
															
 
																-	{
															
 
																-		for (x = 0; x < nslicesx; x++)
															
 
																-		{
															
 
																-			starpu_matrix_data_register(&B_state[z][x], 0, (uintptr_t)B[z][x], 
															
 
																-				BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(TYPE));
															
 
																-		}
															
 
																-	}
															
 
																-
															
 
																-	for (y = 0; y < nslicesy; y++)
															
 
																-	{
															
 
																-		for (x = 0; x < nslicesx; x++)
															
 
																-		{
															
 
																-			starpu_matrix_data_register(&C_state[y][x], 0, (uintptr_t)C[y][x], 
															
 
																-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(TYPE));
															
 
																-		}
															
 
																-	}
															
 
																-
															
 
																-#ifdef STARPU_USE_GORDON
															
 
																-	conf.k = BLOCKSIZEZ;
															
 
																-	conf.m = BLOCKSIZEY;
															
 
																-	conf.n = BLOCKSIZEX;
															
 
																-#endif
															
 
																-
															
 
																-	fprintf(stderr, "block size : x %d y %d z %d\n", BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
															
 
																-
															
 
																-	display_memory_consumption();
															
 
																-}
															
 
																-
															
 
																-static void cleanup_problem(void)
															
 
																-{
															
 
																-	unsigned z, y, x;
															
 
																-
															
 
																-#ifdef CHECK_OUTPUT
															
 
																-	TYPE maxerr = 0.0;
															
 
																-	TYPE err;
															
 
																-	fprintf(stderr, "Checking results ....");
															
 
																-
															
 
																-	for (y = 0; y < nslicesy; y++)
															
 
																-	{
															
 
																-		for (x = 0; x < nslicesx; x++)
															
 
																-		{
															
 
																-			for (z = 0; z < nslicesz; z++)
															
 
																-			{
															
 
																-				SGEMM("N", "N", BLOCKSIZEY, BLOCKSIZEX, BLOCKSIZEZ, -(TYPE)(niter), A[y][z], BLOCKSIZEY, B[z][x], BLOCKSIZEZ, 1.0f, C[y][x], BLOCKSIZEY);
															
 
																-
															
 
																-			}
															
 
																-
															
 
																-			/* make sure C - niter AB = 0 */
															
 
																-			err = SASUM(BLOCKSIZEX*BLOCKSIZEY, C[y][x], 1);
															
 
																-
															
 
																-			if (err > BLOCKSIZEX*BLOCKSIZEY*niter*0.001) 
															
 
																-				fprintf(stderr, "\nerr = %f ( x = %d y = %d ) ... ", err/niter, x, y );
															
 
																-
															
 
																-			maxerr = STARPU_MAX(err, maxerr);
															
 
																-		}
															
 
																-	}
															
 
																-
															
 
																-	if (maxerr > BLOCKSIZEX*BLOCKSIZEY*niter*0.001)
															
 
																-	{
															
 
																-		fprintf(stderr, " maxerr = %f\n", maxerr/niter);
															
 
																-	}
															
 
																-	else {
															
 
																-		fprintf(stderr, " OK\n");
															
 
																-	}
															
 
																-	fflush(stderr);
															
 
																-#endif
															
 
																-
															
 
																-	for (y = 0; y < nslicesy; y++)
															
 
																-	{
															
 
																-		for (z = 0; z < nslicesz; z++)
															
 
																-		{
															
 
																-	//		free(A[y][z]);
															
 
																-		}
															
 
																-	}
															
 
																-
															
 
																-	for (z = 0; z < nslicesz; z++)
															
 
																-	{
															
 
																-		for (x = 0; x < nslicesx; x++)
															
 
																-		{
															
 
																-	//		free(B[z][x]);
															
 
																-		}
															
 
																-	}
															
 
																-
															
 
																-	for (y = 0; y < nslicesy; y++)
															
 
																-	{
															
 
																-		for (x = 0; x < nslicesx; x++)
															
 
																-		{
															
 
																-	//		free(C[y][x]);
															
 
																-		}
															
 
																-	}
															
 
																-
															
 
																-	
															
 
																-	
															
 
																-}
															
 
																-
															
 
																-struct cb2_s {
															
 
																-	unsigned blockx;
															
 
																-	unsigned blocky;
															
 
																-	unsigned iter;
															
 
																-};
															
 
																-
															
 
																-
															
 
																-static starpu_codelet cl = {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA
															
 
																-#ifdef SPU_FUNC_SGEMM
															
 
																-		|STARPU_GORDON
															
 
																-#endif
															
 
																-		,
															
 
																-	.cpu_func = STARPU_GEMM(cpu_mult),
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-	.cuda_func = STARPU_GEMM(cublas_mult),
															
 
																-#endif
															
 
																-#ifdef STARPU_USE_GORDON
															
 
																-	/* .gordon_func will be set by load_elf_sgemm */
															
 
																-#endif
															
 
																-	.nbuffers = 3
															
 
																-};
															
 
																-
															
 
																-
															
 
																-#ifdef STARPU_USE_GORDON
															
 
																-static const char *spu_func_sgemm_elf_file = "./gordon/func_sgemm_ibm.spuelf";
															
 
																-static unsigned spu_func_sgemm_elf_id;
															
 
																-static unsigned spu_func_sgemm_ibm_id;
															
 
																-
															
 
																-static void load_elf_sgemm(void)
															
 
																-{
															
 
																-	spu_func_sgemm_elf_id =
															
 
																-		gordon_register_elf_plugin(spu_func_sgemm_elf_file);
															
 
																-
															
 
																-	spu_func_sgemm_ibm_id = gordon_register_kernel(spu_func_sgemm_elf_id, "func_sgemm_ibm");
															
 
																-	
															
 
																-	gordon_load_plugin_on_all_spu(spu_func_sgemm_elf_id);
															
 
																-	gordon_load_kernel_on_all_spu(spu_func_sgemm_ibm_id);
															
 
																-
															
 
																-	cl.gordon_func = spu_func_sgemm_ibm_id;
															
 
																-}
															
 
																-#endif // STARPU_USE_GORDON
															
 
																-
															
 
																-static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter, struct pos *posp)
															
 
																-{
															
 
																-	struct starpu_task *task = starpu_task_create();
															
 
																-
															
 
																-	task->cl = &cl;
															
 
																-
															
 
																-	task->buffers[0].handle = A_state[y][z];
															
 
																-	task->buffers[0].mode = STARPU_R;
															
 
																-	task->buffers[1].handle = B_state[z][x];
															
 
																-	task->buffers[1].mode = STARPU_R;
															
 
																-	task->buffers[2].handle = C_state[y][x];
															
 
																-	task->buffers[2].mode = STARPU_RW;
															
 
																-
															
 
																-	task->callback_func = callback_func_3;
															
 
																-	task->callback_arg = posp;
															
 
																-
															
 
																-#ifdef STARPU_USE_GORDON
															
 
																-	task->cl_arg = &conf;
															
 
																-	task->cl_arg_size = sizeof(struct ibm_sgemm_block_conf);
															
 
																-#endif
															
 
																-
															
 
																-	posp->z = z;
															
 
																-	posp->iter = iter;
															
 
																-
															
 
																-	return task;
															
 
																-}
															
 
																-
															
 
																-
															
 
																-static void callback_func_3(void *arg)
															
 
																-{
															
 
																-	/* do some accounting */
															
 
																-	int id = starpu_worker_get_id();
															
 
																-	flop_per_worker[id] += BLAS3_FLOP(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
															
 
																-	ls_per_worker[id] += BLAS3_LS(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
															
 
																-
															
 
																-	/* the argument is a pointer to a counter of the remaining tasks */
															
 
																-	struct pos *posp = arg;
															
 
																-	unsigned x,y,z,iter;
															
 
																-
															
 
																-	iter = posp->iter;
															
 
																-	x = posp->x;
															
 
																-	y = posp->y;
															
 
																-	z = posp->z;
															
 
																-
															
 
																-	if (z < nslicesz - 1)
															
 
																-	{
															
 
																-		struct starpu_task *task = construct_task(x, y, z+1, iter, posp);
															
 
																-		starpu_task_submit(task);
															
 
																-	}
															
 
																-	else
															
 
																-	{
															
 
																-		if (iter < niter - 1)
															
 
																-		{
															
 
																-			struct starpu_task *task = construct_task(x, y, 0, iter+1, posp);
															
 
																-			starpu_task_submit(task);
															
 
																-		}
															
 
																-	}
															
 
																-}
															
 
																-
															
 
																-
															
 
																-
															
 
																-
															
 
																-static void launch_codelets(void)
															
 
																-{
															
 
																-#ifdef STARPU_USE_FXT
															
 
																-	_starpu_fxt_register_thread(0);
															
 
																-#endif
															
 
																-	/* partition the work into slices */
															
 
																-	unsigned taskx, tasky;
															
 
																-
															
 
																-	srand(time(NULL));
															
 
																-
															
 
																-	/* should we use a single performance model for all archs and use an
															
 
																- 	 * acceleration factor ? */
															
 
																-	if (use_common_model) {
															
 
																-		cl.model = &STARPU_GEMM(model_common);
															
 
																-	}
															
 
																-	else {
															
 
																-		cl.model = &STARPU_GEMM(model);
															
 
																-	}
															
 
																-
															
 
																-	for (taskx = 0; taskx < nslicesx; taskx++) 
															
 
																-	{
															
 
																-		for (tasky = 0; tasky < nslicesy; tasky++)
															
 
																-		{
															
 
																-			struct starpu_task *task = construct_task(taskx, tasky, 0, 0, &currentpos[tasky][taskx]);
															
 
																-			starpu_task_submit(task);
															
 
																-		}
															
 
																-	}
															
 
																-}
															
 
																-
															
 
																-int main(__attribute__ ((unused)) int argc, 
															
 
																-	 __attribute__ ((unused)) char **argv)
															
 
																-{
															
 
																-
															
 
																-	parse_args(argc, argv);
															
 
																-
															
 
																-	/* start the runtime */
															
 
																-	starpu_init(NULL);
															
 
																-
															
 
																-	starpu_helper_cublas_init();
															
 
																-
															
 
																-#ifdef STARPU_USE_GORDON
															
 
																-	load_elf_sgemm();
															
 
																-#endif
															
 
																-
															
 
																-	init_problem_data();
															
 
																-
															
 
																-	gettimeofday(&start, NULL);
															
 
																-
															
 
																-	launch_codelets();
															
 
																-
															
 
																-	starpu_task_wait_for_all();
															
 
																-
															
 
																-	gettimeofday(&end, NULL);
															
 
																-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
															
 
																-	display_stats(timing);
															
 
																-
															
 
																-	cleanup_problem();
															
 
																-
															
 
																-	starpu_helper_cublas_shutdown();
															
 
																-	starpu_shutdown();
															
 
																-
															
 
																-	return 0;
															
 
																-}
															
--- a/examples/mult/gordon/func_dgemm_ibm.c
+++ b/examples/mult/gordon/func_dgemm_ibm.c
@@ -1,42 +0,0 @@
 
																-/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																- *
															
 
																- * Copyright (C) 2010  Université de Bordeaux 1
															
 
																- *
															
 
																- * StarPU is free software; you can redistribute it and/or modify
															
 
																- * it under the terms of the GNU Lesser General Public License as published by
															
 
																- * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																- * your option) any later version.
															
 
																- *
															
 
																- * StarPU is distributed in the hope that it will be useful, but
															
 
																- * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																- *
															
 
																- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																- */
															
 
																-
															
 
																-#include "func_gemm_ibm.h"
															
 
																-
															
 
																-#include <blas_s.h>
															
 
																-
															
 
																-void func_dgemm_ibm(__attribute__ ((unused)) void **alloc,
															
 
																-		__attribute__ ((unused)) void **in,
															
 
																-		__attribute__ ((unused)) void **inout,
															
 
																-		__attribute__ ((unused)) void **out)
															
 
																-{
															
 
																-	/* we assume data will be in A:R,B:R,C:RW mode
															
 
																- 	 *  -> in[0] : describe problem
															
 
																- 	 *  -> in[1] : A
															
 
																- 	 *  -> in[2] : B
															
 
																- 	 *  -> inout[0] : C
															
 
																- 	 *
															
 
																- 	 *   C = AB + C
															
 
																- 	 *   but, being in fortran ordering, we compute
															
 
																- 	 *   t(C) = t(B)t(A) + t(C) instead
															
 
																- 	 */
															
 
																-	struct ibm_gemm_block_conf *conf = in[0];
															
 
																-	double *A = in[1];
															
 
																-	double *B = in[2];
															
 
																-	double *C = inout[0];
															
 
																-
															
 
																-	dgemm_spu(conf->m, conf->n, conf->k, B, A, C);
															
 
																-}
															
--- a/examples/mult/gordon/func_gemm_ibm.h
+++ b/examples/mult/gordon/func_gemm_ibm.h
@@ -1,29 +0,0 @@
 
																-/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																- *
															
 
																- * Copyright (C) 2010  Université de Bordeaux 1
															
 
																- *
															
 
																- * StarPU is free software; you can redistribute it and/or modify
															
 
																- * it under the terms of the GNU Lesser General Public License as published by
															
 
																- * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																- * your option) any later version.
															
 
																- *
															
 
																- * StarPU is distributed in the hope that it will be useful, but
															
 
																- * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																- *
															
 
																- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																- */
															
 
																-
															
 
																-#ifndef __FUNC_SGEMM_IBM_H__
															
 
																-#define __FUNC_SGEMM_IBM_H__
															
 
																-
															
 
																-#include <stdint.h>
															
 
																-
															
 
																-struct ibm_sgemm_block_conf {
															
 
																-	uint32_t m;
															
 
																-	uint32_t n;
															
 
																-	uint32_t k;
															
 
																-	uint32_t pad;
															
 
																-};
															
 
																-
															
 
																-#endif // __FUNC_SGEMM_IBM_H__
															
--- a/examples/mult/gordon/func_sgemm_ibm.c
+++ b/examples/mult/gordon/func_sgemm_ibm.c
@@ -1,43 +0,0 @@
 
																-/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																- *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																- *
															
 
																- * StarPU is free software; you can redistribute it and/or modify
															
 
																- * it under the terms of the GNU Lesser General Public License as published by
															
 
																- * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																- * your option) any later version.
															
 
																- *
															
 
																- * StarPU is distributed in the hope that it will be useful, but
															
 
																- * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																- *
															
 
																- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																- */
															
 
																-
															
 
																-#include "func_gemm_ibm.h"
															
 
																-
															
 
																-#include <blas_s.h>
															
 
																-
															
 
																-void func_sgemm_ibm(__attribute__ ((unused)) void **alloc,
															
 
																-		__attribute__ ((unused)) void **in,
															
 
																-		__attribute__ ((unused)) void **inout,
															
 
																-		__attribute__ ((unused)) void **out)
															
 
																-{
															
 
																-	/* we assume data will be in A:R,B:R,C:RW mode
															
 
																- 	 *  -> in[0] : describe problem
															
 
																- 	 *  -> in[1] : A
															
 
																- 	 *  -> in[2] : B
															
 
																- 	 *  -> inout[0] : C
															
 
																- 	 *
															
 
																- 	 *   C = AB + C
															
 
																- 	 *   but, being in fortran ordering, we compute
															
 
																- 	 *   t(C) = t(B)t(A) + t(C) instead
															
 
																- 	 */
															
 
																-	struct ibm_gemm_block_conf *conf = in[0];
															
 
																-	float *A = in[1];
															
 
																-	float *B = in[2];
															
 
																-	float *C = inout[0];
															
 
																-
															
 
																-	sgemm_spu(conf->m, conf->n, conf->k, B, A, C);
															
 
																-}
															
--- a/examples/mult/sgemm.c
+++ b/examples/mult/sgemm.c
@@ -16,6 +16,4 @@
 
																  */
															
 
																 #include "simple.h"
															
 
																-
															
 
																-#include "xgemm_kernels.c"
															
 
																 #include "xgemm.c" 
															
--- a/examples/mult/simple.h
+++ b/examples/mult/simple.h
@@ -17,7 +17,6 @@
 
																 #define TYPE	float
															
 
																 #define CUBLAS_GEMM cublasSgemm
															
 
																-#define MAGMABLAS_GEMM magmablas_sgemm
															
 
																 #define CPU_GEMM	SGEMM
															
 
																 #define CPU_ASUM	SASUM
															
 
																 #define CPU_IAMAX	ISAMAX
															
@@ -26,4 +25,3 @@
 
																 #define str(s) #s
															
 
																 #define xstr(s)        str(s)
															
 
																 #define STARPU_GEMM_STR(name)  xstr(STARPU_GEMM(name))
															
 
																-
															
--- a/examples/mult/xgemm.c
+++ b/examples/mult/xgemm.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
															
 
																  * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																  *
															
@@ -16,37 +16,79 @@
 
																  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																  */
															
 
																-#include "dw_mult.h"
															
 
																-
															
 
																-TYPE *A, *B, *C;
															
 
																-starpu_data_handle A_handle, B_handle, C_handle;
															
 
																-
															
 
																-/*
															
 
																- * This program computes C = A * B 
															
 
																- * 
															
 
																- *   A of size (z,y)
															
 
																- *   B of size (x,z)
															
 
																- *   C of size (x,y)
															
 
																-
															
 
																-              |---------------|
															
 
																-            z |       B       |
															
 
																-              |---------------|
															
 
																-       z              x
															
 
																-     |----|   |---------------|
															
 
																-     |    |   |               |
															
 
																-     |    |   |               |
															
 
																-     | A  | y |       C       |
															
 
																-     |    |   |               |
															
 
																-     |    |   |               |
															
 
																-     |----|   |---------------|
															
 
																+#include <string.h>
															
 
																+#include <math.h>
															
 
																+#include <sys/types.h>
															
 
																+#include <sys/time.h>
															
 
																+#include <starpu.h>
															
 
																- */
															
 
																+#include <common/blas.h>
															
 
																+
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+#include <cuda.h>
															
 
																+#include <cublas.h>
															
 
																+#endif
															
 
																+
															
 
																+static unsigned niter = 100;
															
 
																+static unsigned nslicesx = 4;
															
 
																+static unsigned nslicesy = 4;
															
 
																+static unsigned xdim = 256;
															
 
																+static unsigned ydim = 256;
															
 
																+static unsigned zdim = 64;
															
 
																+static unsigned check = 0;
															
 
																+
															
 
																+static TYPE *A, *B, *C;
															
 
																+static starpu_data_handle A_handle, B_handle, C_handle;
															
 
																+
															
 
																+static void parse_args(int argc, char **argv)
															
 
																+{
															
 
																+	int i;
															
 
																+	for (i = 1; i < argc; i++) {
															
 
																+		if (strcmp(argv[i], "-nblocks") == 0) {
															
 
																+			char *argptr;
															
 
																+			nslicesx = strtol(argv[++i], &argptr, 10);
															
 
																+			nslicesy = nslicesx;
															
 
																+		}
															
 
																+
															
 
																+		if (strcmp(argv[i], "-nblocksx") == 0) {
															
 
																+			char *argptr;
															
 
																+			nslicesx = strtol(argv[++i], &argptr, 10);
															
 
																+		}
															
 
																+
															
 
																+		if (strcmp(argv[i], "-nblocksy") == 0) {
															
 
																+			char *argptr;
															
 
																+			nslicesy = strtol(argv[++i], &argptr, 10);
															
 
																+		}
															
 
																+
															
 
																+		if (strcmp(argv[i], "-x") == 0) {
															
 
																+			char *argptr;
															
 
																+			xdim = strtol(argv[++i], &argptr, 10);
															
 
																+		}
															
 
																+
															
 
																+		if (strcmp(argv[i], "-y") == 0) {
															
 
																+			char *argptr;
															
 
																+			ydim = strtol(argv[++i], &argptr, 10);
															
 
																+		}
															
 
																+
															
 
																+		if (strcmp(argv[i], "-z") == 0) {
															
 
																+			char *argptr;
															
 
																+			zdim = strtol(argv[++i], &argptr, 10);
															
 
																+		}
															
 
																+
															
 
																+		if (strcmp(argv[i], "-iter") == 0) {
															
 
																+			char *argptr;
															
 
																+			niter = strtol(argv[++i], &argptr, 10);
															
 
																+		}
															
 
																+
															
 
																+		if (strcmp(argv[i], "-check") == 0) {
															
 
																+			check = 1;
															
 
																+		}
															
 
																+	}
															
 
																+}
															
 
																 static void check_output(void)
															
 
																 {
															
 
																-	/* check results */
															
 
																 	/* compute C = C - AB */
															
 
																-
															
 
																 	CPU_GEMM("N", "N", ydim, xdim, zdim, (TYPE)-1.0f, A, ydim, B, zdim, (TYPE)1.0f, C, ydim);
															
 
																 	/* make sure C = 0 */
															
@@ -65,66 +107,24 @@ static void check_output(void)
 
																 	}
															
 
																 }
															
 
																-void callback_func(void *arg)
															
 
																-{
															
 
																-	/* do some accounting */
															
 
																-	int id = starpu_worker_get_id();
															
 
																-	flop_per_worker[id] += BLAS3_FLOP(conf.m, conf.n, conf.k);
															
 
																-	ls_per_worker[id] += BLAS3_LS(conf.m, conf.n, conf.k);
															
 
																-}
															
 
																-
															
 
																 static void init_problem_data(void)
															
 
																 {
															
 
																 	unsigned i,j;
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-	if (pin) {
															
 
																-		starpu_data_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(TYPE));
															
 
																-		starpu_data_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(TYPE));
															
 
																-		starpu_data_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(TYPE));
															
 
																-	} else
															
 
																-#endif
															
 
																-	{
															
 
																-#ifdef STARPU_HAVE_POSIX_MEMALIGN
															
 
																-		posix_memalign((void **)&A, 4096, zdim*ydim*sizeof(TYPE));
															
 
																-		posix_memalign((void **)&B, 4096, xdim*zdim*sizeof(TYPE));
															
 
																-		posix_memalign((void **)&C, 4096, xdim*ydim*sizeof(TYPE));
															
 
																-#else
															
 
																-		A = malloc(zdim*ydim*sizeof(TYPE));
															
 
																-		B = malloc(xdim*zdim*sizeof(TYPE));
															
 
																-		C = malloc(xdim*ydim*sizeof(TYPE));
															
 
																-#endif
															
 
																-	}
															
 
																+	starpu_data_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(TYPE));
															
 
																+	starpu_data_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(TYPE));
															
 
																+	starpu_data_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(TYPE));
															
 
																 	/* fill the A and B matrices */
															
 
																-	if (norandom) {
															
 
																-		for (j=0; j < ydim; j++) {
															
 
																-			for (i=0; i < zdim; i++) {
															
 
																-				A[j+i*ydim] = (TYPE)(i);
															
 
																-			}
															
 
																-		}
															
 
																-	
															
 
																-		for (j=0; j < zdim; j++) {
															
 
																-			for (i=0; i < xdim; i++) {
															
 
																-				B[j+i*zdim] = (TYPE)(j);
															
 
																-			}
															
 
																-		}
															
 
																-	} 
															
 
																-	else {
															
 
																-#ifdef NORANDOM
															
 
																-		srand(2008);
															
 
																-		STARPU_ABORT();
															
 
																-#endif
															
 
																-		for (j=0; j < ydim; j++) {
															
 
																-			for (i=0; i < zdim; i++) {
															
 
																-				A[j+i*ydim] = (TYPE)(starpu_drand48());
															
 
																-			}
															
 
																+	for (j=0; j < ydim; j++) {
															
 
																+		for (i=0; i < zdim; i++) {
															
 
																+			A[j+i*ydim] = (TYPE)(starpu_drand48());
															
 
																 		}
															
 
																-	
															
 
																-		for (j=0; j < zdim; j++) {
															
 
																-			for (i=0; i < xdim; i++) {
															
 
																-				B[j+i*zdim] = (TYPE)(starpu_drand48());
															
 
																-			}
															
 
																+	}
															
 
																+
															
 
																+	for (j=0; j < zdim; j++) {
															
 
																+		for (i=0; i < xdim; i++) {
															
 
																+			B[j+i*zdim] = (TYPE)(starpu_drand48());
															
 
																 		}
															
 
																 	}
															
@@ -133,8 +133,6 @@ static void init_problem_data(void)
 
																 			C[j+i*ydim] = (TYPE)(0);
															
 
																 		}
															
 
																 	}
															
 
																-
															
 
																-	display_memory_consumption();
															
 
																 }
															
 
																 static void partition_mult_data(void)
															
@@ -148,21 +146,15 @@ static void partition_mult_data(void)
 
																 	starpu_data_set_wt_mask(C_handle, 1<<0);
															
 
																-	conf.k = zdim;
															
 
																-	conf.m = ydim/nslicesy;
															
 
																-	conf.n = xdim/nslicesx;
															
 
																-
															
 
																 	struct starpu_data_filter f;
															
 
																+	memset(&f, 0, sizeof(f));
															
 
																 	f.filter_func = starpu_vertical_block_filter_func;
															
 
																 	f.nchildren = nslicesx;
															
 
																-	f.get_nchildren = NULL;
															
 
																-	f.get_child_ops = NULL;
															
 
																 	struct starpu_data_filter f2;
															
 
																+	memset(&f2, 0, sizeof(f2));
															
 
																 	f2.filter_func = starpu_block_filter_func;
															
 
																 	f2.nchildren = nslicesy;
															
 
																-	f2.get_nchildren = NULL;
															
 
																-	f2.get_child_ops = NULL;
															
 
																 	starpu_data_partition(B_handle, &f);
															
 
																 	starpu_data_partition(A_handle, &f2);
															
@@ -170,126 +162,110 @@ static void partition_mult_data(void)
 
																 	starpu_data_map_filters(C_handle, 2, &f, &f2);
															
 
																 }
															
 
																-static void unpartition_mult_data(void)
															
 
																+static void mult_kernel_common(void *descr[], int type)
															
 
																 {
															
 
																-	fprintf(stderr, "unpartition !!\n");
															
 
																+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
															
 
																+	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
															
 
																+	TYPE *subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);
															
 
																-	starpu_data_unpartition(C_handle, 0);
															
 
																+	unsigned nxC = STARPU_MATRIX_GET_NX(descr[2]);
															
 
																+	unsigned nyC = STARPU_MATRIX_GET_NY(descr[2]);
															
 
																+	unsigned nyA = STARPU_MATRIX_GET_NY(descr[0]);
															
 
																-	starpu_data_unregister(C_handle);
															
 
																-}
															
 
																+	unsigned ldA = STARPU_MATRIX_GET_LD(descr[0]);
															
 
																+	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
															
 
																+	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
															
 
																-static starpu_codelet cl = {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA
															
 
																-#ifdef SPU_FUNC_SGEMM
															
 
																-		|STARPU_GORDON
															
 
																-#endif
															
 
																-		,
															
 
																-	.cpu_func = STARPU_GEMM(cpu_mult),
															
 
																+	if (type == STARPU_CPU) {
															
 
																+		CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
															
 
																+	}
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-	.cuda_func = STARPU_GEMM(cublas_mult),
															
 
																-#endif
															
 
																-#ifdef STARPU_USE_GORDON
															
 
																-#ifdef SPU_FUNC_SGEMM
															
 
																-	.gordon_func = SPU_FUNC_SGEMM,
															
 
																-#else
															
 
																-#warning SPU_FUNC_SGEMM is not available
															
 
																-#endif
															
 
																+	else {
															
 
																+		CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
															
 
																+					     (TYPE)0.0, subC, ldC);
															
 
																+		cudaThreadSynchronize();
															
 
																+	}
															
 
																 #endif
															
 
																-	.nbuffers = 3
															
 
																-};
															
 
																-
															
 
																-static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter)
															
 
																-{
															
 
																-	/* A B[task] = C[task] */
															
 
																-	struct starpu_task *task = starpu_task_create();
															
 
																-
															
 
																-	task->cl = &cl;
															
 
																-
															
 
																-	/* we have a callback to do some accounting */
															
 
																-	task->callback_func = callback_func;
															
 
																-	task->callback_arg = NULL;
															
 
																-
															
 
																-	task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, y);
															
 
																-	task->buffers[0].mode = STARPU_R;
															
 
																-	task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, x);
															
 
																-	task->buffers[1].mode = STARPU_R;
															
 
																-	task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, x, y);
															
 
																-	task->buffers[2].mode = STARPU_RW;
															
 
																-
															
 
																-	task->cl_arg = &conf;
															
 
																-	task->cl_arg_size = sizeof(struct block_conf);
															
 
																-	return task;
															
 
																 }
															
 
																-static void submit_new_iter(unsigned x, unsigned y, unsigned iter)
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+static void cublas_mult(void *descr[], __attribute__((unused)) void *arg)
															
 
																 {
															
 
																-	unsigned z;
															
 
																-
															
 
																-	z = 0;
															
 
																-
															
 
																-	{
															
 
																-		struct starpu_task *task;
															
 
																-		task = construct_task(x, y, z, iter);
															
 
																-
															
 
																-		starpu_task_submit(task);
															
 
																-	}
															
 
																+	mult_kernel_common(descr, STARPU_CUDA);
															
 
																 }
															
 
																-
															
 
																-static void launch_codelets(void)
															
 
																-{
															
 
																-#ifdef STARPU_USE_FXT
															
 
																-	_starpu_fxt_register_thread(0);
															
 
																 #endif
															
 
																-	/* partition the work into slices */
															
 
																-	unsigned taskx, tasky;
															
 
																-	srand(time(NULL));
															
 
																+static void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
															
 
																+{
															
 
																+	mult_kernel_common(descr, STARPU_CPU);
															
 
																+}
															
 
																-	/* should we use a single performance model for all archs and use an
															
 
																- 	 * acceleration factor ? */
															
 
																-	if (use_common_model) {
															
 
																-		cl.model = &STARPU_GEMM(model_common);
															
 
																-	}
															
 
																-	else {
															
 
																-		cl.model = &STARPU_GEMM(model);
															
 
																-	}
															
 
																+static struct starpu_perfmodel_t starpu_gemm_model = {
															
 
																+	.type = STARPU_HISTORY_BASED,
															
 
																+	.symbol = STARPU_GEMM_STR(gemm)
															
 
																+};
															
 
																-	for (taskx = 0; taskx < nslicesx; taskx++) 
															
 
																-	{
															
 
																-		for (tasky = 0; tasky < nslicesy; tasky++)
															
 
																-		{
															
 
																-			submit_new_iter(taskx, tasky, 0);
															
 
																-		}
															
 
																-	}
															
 
																-}
															
 
																+static starpu_codelet cl = {
															
 
																+	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.cpu_func = cpu_mult,
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	.cuda_func = cublas_mult,
															
 
																+#endif
															
 
																+	.nbuffers = 3,
															
 
																+	.model = &starpu_gemm_model
															
 
																+};
															
 
																-int main(__attribute__ ((unused)) int argc, 
															
 
																-	 __attribute__ ((unused)) char **argv)
															
 
																+int main(int argc, char **argv)
															
 
																 {
															
 
																+	struct timeval start;
															
 
																+	struct timeval end;
															
 
																 	parse_args(argc, argv);
															
 
																-	/* start the runtime */
															
 
																 	starpu_init(NULL);
															
 
																-
															
 
																 	starpu_helper_cublas_init();
															
 
																 	init_problem_data();
															
 
																+	partition_mult_data();
															
 
																 	gettimeofday(&start, NULL);
															
 
																-	partition_mult_data();
															
 
																+	unsigned x, y, iter;
															
 
																+	for (iter = 0; iter < niter; iter++)
															
 
																+	{
															
 
																+		for (x = 0; x < nslicesx; x++) 
															
 
																+		for (y = 0; y < nslicesy; y++)
															
 
																+		{
															
 
																+			struct starpu_task *task = starpu_task_create();
															
 
																+	
															
 
																+			task->cl = &cl;
															
 
																+	
															
 
																+			task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, y);
															
 
																+			task->buffers[0].mode = STARPU_R;
															
 
																+			task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, x);
															
 
																+			task->buffers[1].mode = STARPU_R;
															
 
																+			task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, x, y);
															
 
																+			task->buffers[2].mode = STARPU_RW;
															
 
																+	
															
 
																+			int ret = starpu_task_submit(task);
															
 
																+			STARPU_ASSERT(!ret);
															
 
																+		}
															
 
																-	launch_codelets();
															
 
																+		starpu_task_wait_for_all();
															
 
																+	}
															
 
																-	starpu_task_wait_for_all();
															
 
																 	gettimeofday(&end, NULL);
															
 
																 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
															
 
																-	display_stats(timing);
															
 
																-	unpartition_mult_data();
															
 
																+	fprintf(stderr, "Time: %2.2f ms\n", timing/1000.0);
															
 
																+
															
 
																+	double flops = 2.0*((unsigned long)niter)*((unsigned long)xdim)
															
 
																+				*((unsigned long)ydim)*((unsigned long)zdim);
															
 
																+	fprintf(stderr, "GFlop/s: %.2f\n", flops/timing/1000.0);
															
 
																+
															
 
																+	starpu_data_unpartition(C_handle, 0);
															
 
																+	starpu_data_unregister(C_handle);
															
 
																 	if (check)
															
 
																 		check_output();
															
--- a/examples/mult/xgemm_kernels.c
+++ b/examples/mult/xgemm_kernels.c
@@ -1,78 +0,0 @@
 
																-/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																- *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																- *
															
 
																- * StarPU is free software; you can redistribute it and/or modify
															
 
																- * it under the terms of the GNU Lesser General Public License as published by
															
 
																- * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																- * your option) any later version.
															
 
																- *
															
 
																- * StarPU is distributed in the hope that it will be useful, but
															
 
																- * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																- *
															
 
																- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																- */
															
 
																-
															
 
																-#include <starpu.h>
															
 
																-#include <starpu_cuda.h>
															
 
																-#include <common/blas.h>
															
 
																-
															
 
																-#define COMMON_CODE			\
															
 
																-	uint32_t nxC, nyC, nyA;		\
															
 
																-	uint32_t ldA, ldB, ldC;		\
															
 
																-					\
															
 
																-	TYPE *subA;			\
															
 
																-	TYPE *subB;			\
															
 
																-	TYPE *subC;			\
															
 
																-					\
															
 
																-	subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);	\
															
 
																-	subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);	\
															
 
																-	subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);	\
															
 
																-					\
															
 
																-	nxC = STARPU_MATRIX_GET_NX(descr[2]);		\
															
 
																-	nyC = STARPU_MATRIX_GET_NY(descr[2]);		\
															
 
																-	nyA = STARPU_MATRIX_GET_NY(descr[0]);		\
															
 
																-					\
															
 
																-	ldA = STARPU_MATRIX_GET_LD(descr[0]);		\
															
 
																-	ldB = STARPU_MATRIX_GET_LD(descr[1]);		\
															
 
																-	ldC = STARPU_MATRIX_GET_LD(descr[2]);
															
 
																-
															
 
																-
															
 
																-
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-
															
 
																-#ifdef STARPU_HAVE_MAGMA
															
 
																-#define GPU_GEMM MAGMABLAS_GEMM
															
 
																-#else
															
 
																-#define GPU_GEMM CUBLAS_GEMM
															
 
																-#endif
															
 
																-
															
 
																-void STARPU_GEMM(cublas_mult)(void *descr[], __attribute__((unused)) void *arg)
															
 
																-{
															
 
																-	COMMON_CODE
															
 
																-
															
 
																-	starpu_trace_user_event(0x42);
															
 
																-
															
 
																-	GPU_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
															
 
																-					     (TYPE)0.0, subC, ldC);
															
 
																-	cublasStatus st;
															
 
																-	st = cublasGetError();
															
 
																-	if (st != CUBLAS_STATUS_SUCCESS)
															
 
																-		STARPU_ABORT();
															
 
																-
															
 
																-	cudaThreadSynchronize();
															
 
																-
															
 
																-	starpu_trace_user_event(0x42);
															
 
																-}
															
 
																-#endif
															
 
																-
															
 
																-void STARPU_GEMM(cpu_mult)(void *descr[], __attribute__((unused))  void *arg)
															
 
																-{
															
 
																-	COMMON_CODE
															
 
																-
															
 
																-	starpu_trace_user_event(0x42);
															
 
																-	CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
															
 
																-	starpu_trace_user_event(0x43);
															
 
																-}