Bladeren bron

Clean up the examples/mult/ directory to keep only sgemm and dgemm (the rest of
the code is unreadable anyway, so it should not be used as an example).

Cédric Augonnet 14 jaren geleden
bovenliggende
commit
53c60beeb5

+ 5 - 25
examples/Makefile.am

@@ -41,8 +41,6 @@ EXTRA_DIST = 					\
 	spmv/spmv_cuda.cu			\
 	gordon/null_kernel_gordon.c		\
 	mult/xgemm.c				\
-	mult/xgemm_kernels.c			\
-	mult/gordon/func_sgemm_ibm.c		\
 	lu/xlu.c				\
 	lu/xlu_pivot.c				\
 	lu/xlu_implicit.c			\
@@ -56,8 +54,7 @@ EXTRA_DIST = 					\
 	filters/fblock_opencl_kernel.cl
 
 CLEANFILES = 					\
-	gordon/null_kernel_gordon.spuelf	\
-	mult/gordon/func_sgemm_ibm.spuelf
+	gordon/null_kernel_gordon.spuelf
 
 
 CLEANFILES += *.gcno *.gcda *.linkinfo
@@ -88,8 +85,7 @@ SPULIBS = -lblas #-lc -lgloss -lc
 	$(SPU_LD) $(SPULDFLAGS) $< -o $@ $(SPULIBS)
 
 BUILT_SOURCES +=				\
-	gordon/null_kernel_gordon.spuelf	\
-	mult/gordon/func_sgemm_ibm.spuelf
+	gordon/null_kernel_gordon.spuelf
 
 endif
 
@@ -116,10 +112,8 @@ noinst_HEADERS = 				\
 	cholesky/dw_cholesky.h			\
 	common/blas_model.h			\
 	common/blas.h				\
-	mult/dw_mult.h				\
 	mult/simple.h				\
 	mult/double.h				\
-	mult/gordon/func_gemm_ibm.h		\
 	gordon/null.h				\
 	fortran/bindings/StarPU_fortran.h	\
 	ppm_downscaler/ppm_downscaler.h		\
@@ -332,29 +326,15 @@ if !NO_BLAS_LIB
 
 examplebin_PROGRAMS += 				\
 	mult/sgemm 				\
-	mult/dgemm 				\
-	mult/dw_mult_no_stride			\
-	mult/dw_mult_no_stride_no_tag
+	mult/dgemm
 
 mult_sgemm_SOURCES = 				\
 	mult/sgemm.c				\
-	common/blas.c				\
-	common/blas_model.c
+	common/blas.c
 
 mult_dgemm_SOURCES = 				\
 	mult/dgemm.c				\
-	common/blas.c				\
-	common/blas_model.c
-
-mult_dw_mult_no_stride_SOURCES = 		\
-	mult/dw_mult_no_stride.c		\
-	common/blas.c				\
-	common/blas_model.c
-
-mult_dw_mult_no_stride_no_tag_SOURCES =		\
-	mult/dw_mult_no_stride_no_tag.c		\
-	common/blas.c				\
-	common/blas_model.c
+	common/blas.c
 
 endif
 

+ 0 - 2
examples/mult/dgemm.c

@@ -16,6 +16,4 @@
  */
 
 #include "double.h"
-
-#include "xgemm_kernels.c"
 #include "xgemm.c" 

+ 0 - 1
examples/mult/double.h

@@ -17,7 +17,6 @@
 #define TYPE	double
 
 #define CUBLAS_GEMM cublasDgemm
-#define MAGMABLAS_GEMM magmablas_dgemm
 #define CPU_GEMM	DGEMM
 #define CPU_ASUM	DASUM
 #define CPU_IAMAX	IDAMAX

+ 0 - 203
examples/mult/dw_mult.h

@@ -1,203 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#ifndef __MULT_H__
-#define __MULT_H__
-
-#include <string.h>
-#include <math.h>
-#include <sys/types.h>
-#include <sys/time.h>
-#include <pthread.h>
-#include <signal.h>
-
-#include <common/blas.h>
-#include <common/blas_model.h>
-
-#include <starpu.h>
-
-#ifdef STARPU_USE_CUDA
-#include <cuda.h>
-#include <cublas.h>
-#endif
-
-#define MAXSLICESX	64
-#define MAXSLICESY	64
-#define MAXSLICESZ	64
-
-#define BLAS3_FLOP(n1,n2,n3)	\
-	(2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
-
-#define BLAS3_LS(n1,n2,n3)    \
-	((2*(n1)*(n3) + (n1)*(n2) + (n2)*(n3))*sizeof(float))
-
-struct block_conf {
-	uint32_t m;
-	uint32_t n;
-	uint32_t k;
-	uint32_t pad;
-};
-
-#define NITER	100
-
-unsigned niter = NITER;
-unsigned nslicesx = 4;
-unsigned nslicesy = 4;
-unsigned nslicesz = 4;
-unsigned xdim = 256;
-unsigned ydim = 256;
-unsigned zdim = 64;
-unsigned norandom = 0;
-unsigned pin = 0;
-unsigned use_common_model = 0;
-unsigned check = 0;
-
-/* to compute MFlop/s */
-uint64_t flop_cublas = 0;
-uint64_t flop_atlas = 0;
-uint64_t flop_per_worker[STARPU_NMAXWORKERS] = {0};
-
-/* to compute MB/s (load/store) */
-uint64_t ls_cublas = 0;
-uint64_t ls_atlas = 0;
-uint64_t ls_per_worker[STARPU_NMAXWORKERS] = {0};
-
-
-struct timeval start;
-struct timeval end;
-
-static int taskcounter __attribute__ ((unused));
-static struct block_conf conf __attribute__ ((aligned (128)));
-
-#define BLOCKSIZEX	(xdim / nslicesx)
-#define BLOCKSIZEY	(ydim / nslicesy)
-#define BLOCKSIZEZ	(zdim / nslicesz)
-
-static void display_stats(double timing)
-{
-	unsigned worker;
-	unsigned nworkers = starpu_worker_get_count();
-
-	fprintf(stderr, "Computation took (ms):\n");
-	printf("%2.2f\n", timing/1000);
-
-	uint64_t flop_total = 0, ls_total = 0;
-	
-	for (worker = 0; worker < nworkers; worker++)
-	{
-		flop_total += flop_per_worker[worker];
-		ls_total += ls_per_worker[worker];
-
-		char name[32];
-		starpu_worker_get_name(worker, name, 32);
-
-		fprintf(stderr, "\t%s -> %2.2f GFlop\t%2.2f GFlop/s\n", name, (double)flop_per_worker[worker]/1000000000.0f, (double)flop_per_worker[worker]/(double)timing/1000);
-	}
-
-	fprintf(stderr, "Total: %2.2f GFlops\t%2.2f GFlop/s\n", (double)flop_total/1000000000.0f, (double)flop_total/(double)timing/1000);
-}
-
-static void parse_args(int argc, char **argv)
-{
-	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-nblocks") == 0) {
-			char *argptr;
-			nslicesx = strtol(argv[++i], &argptr, 10);
-			nslicesy = nslicesx;
-			nslicesz = nslicesx;
-		}
-
-		if (strcmp(argv[i], "-nblocksx") == 0) {
-			char *argptr;
-			nslicesx = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-nblocksy") == 0) {
-			char *argptr;
-			nslicesy = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-nblocksz") == 0) {
-			char *argptr;
-			nslicesz = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-x") == 0) {
-			char *argptr;
-			xdim = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-y") == 0) {
-			char *argptr;
-			ydim = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-z") == 0) {
-			char *argptr;
-			zdim = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-iter") == 0) {
-			char *argptr;
-			niter = strtol(argv[++i], &argptr, 10);
-		}
-
-
-		if (strcmp(argv[i], "-no-random") == 0) {
-			norandom = 1;
-		}
-
-		if (strcmp(argv[i], "-pin") == 0) {
-			pin = 1;
-		}
-
-		if (strcmp(argv[i], "-check") == 0) {
-			check = 1;
-		}
-
-		if (strcmp(argv[i], "-common-model") == 0) {
-			use_common_model = 1;
-		}
-	}
-
-	assert(nslicesx <= MAXSLICESX); 
-	assert(nslicesy <= MAXSLICESY); 
-	assert(nslicesz <= MAXSLICESZ); 
-}
-
-static void display_memory_consumption(void)
-{
-	fprintf(stderr, "Total memory : %ld MB\n",
-		(MAXSLICESY*MAXSLICESZ*sizeof(TYPE *) 
-		+ MAXSLICESZ*MAXSLICESX*sizeof(TYPE *)
-		+ MAXSLICESY*MAXSLICESX*sizeof(TYPE *)
-		+ MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle)
-		+ MAXSLICESZ*MAXSLICESX*sizeof(starpu_data_handle)
-		+ MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle)
-		+ ydim*zdim*sizeof(TYPE)
-		+ zdim*xdim*sizeof(TYPE)
-		+ ydim*xdim*sizeof(TYPE))/(1024*1024) );
-}
-
-#ifdef STARPU_USE_CUDA
-void cublas_mult(void *descr[], __attribute__((unused)) void *arg);
-#endif
-
-void cpu_mult(void *descr[], __attribute__((unused))  void *arg);
-
-#endif // __MULT_H__

+ 0 - 465
examples/mult/dw_mult_no_stride.c

@@ -1,465 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "simple.h"
-#include "dw_mult.h"
-#ifdef STARPU_USE_GORDON
-#include "gordon/func_sgemm_ibm.h"
-#endif
-#include "xgemm_kernels.c"
-
-TYPE *A[MAXSLICESY][MAXSLICESZ];
-TYPE *B[MAXSLICESZ][MAXSLICESX];
-TYPE *C[MAXSLICESY][MAXSLICESX];
-
-starpu_data_handle A_state[MAXSLICESY][MAXSLICESZ];
-starpu_data_handle B_state[MAXSLICESZ][MAXSLICESX];
-starpu_data_handle C_state[MAXSLICESY][MAXSLICESX];
-
-#define TAG(x,y,z,iter)	\
-		((starpu_tag_t)((z) + (iter)*nslicesz + (x)*(nslicesz*niter) + (y)*(nslicesx*nslicesz*niter)))
-
-static void submit_new_iter(unsigned x, unsigned y, unsigned iter);
-
-/*
- * This program computes C = A * B 
- *
- * The difference with xgemm.c is that matrices are here already split in
- * blocks, and thus no data partitioning is needed.
- * 
- *   A of size (z,y)
- *   B of size (x,z)
- *   C of size (x,y)
-
-              |---------------|
-            z |       B       |
-              |---------------|
-       z              x
-     |----|   |---------------|
-     |    |   |               |
-     |    |   |               |
-     | A  | y |       C       |
-     |    |   |               |
-     |    |   |               |
-     |----|   |---------------|
-
- */
-
-#define MEM_ALIGNMENT	16
-
-static void init_problem_data(void)
-{
-	unsigned i,j;
-
-	/* debug ... */
-	memset(A, 0, MAXSLICESY*MAXSLICESZ*sizeof(TYPE *));
-	memset(B, 0, MAXSLICESZ*MAXSLICESZ*sizeof(TYPE *));
-	memset(C, 0, MAXSLICESY*MAXSLICESX*sizeof(TYPE *));
-	memset(&A_state, 0, MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle));
-	memset(&B_state, 0, MAXSLICESZ*MAXSLICESZ*sizeof(starpu_data_handle));
-	memset(&C_state, 0, MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle));
-
-	/* Allocate grids of buffer */
-	/* TODO pin ... */
-	unsigned z, y, x;
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (z = 0; z < nslicesz; z++)
-		{
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
-#else
-			A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
-#endif
-			assert(A[y][z]);
-		}
-	}
-
-	for (z = 0; z < nslicesz; z++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
-#else
-			B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
-#endif
-			assert(B[z][x]);
-		}
-	}
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
-#else
-			C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
-#endif
-			assert(C[y][x]);
-		}
-	}
-	
-	/* fill the A and B matrices */
-	unsigned blockx, blocky, blockz;
-
-	if (norandom) {
-		for (blocky = 0; blocky < nslicesy; blocky++)
-			for (blockz = 0; blockz < nslicesz; blockz++)
-				for (j = 0; j < BLOCKSIZEY; j++)
-					for (i = 0; i < BLOCKSIZEZ; i++)
-					{
-						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(1 + blockz + blocky*nslicesz);
-					}
-
-		for (blockz = 0; blockz < nslicesz; blockz++)
-			for (blockx = 0; blockx < nslicesx; blockx++)
-				for (j = 0; j < BLOCKSIZEZ; j++)
-					for (i = 0; i < BLOCKSIZEX; i++)
-					{
-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(1 + blockx + blockz*nslicesx);
-					}
-	} 
-	else {
-		for (blocky = 0; blocky < nslicesy; blocky++)
-			for (blockz = 0; blockz < nslicesz; blockz++)
-				for (j = 0; j < BLOCKSIZEY; j++)
-					for (i = 0; i < BLOCKSIZEZ; i++)
-					{
-						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(starpu_drand48());
-					}
-
-		for (blockz = 0; blockz < nslicesz; blockz++)
-			for (blockx = 0; blockx < nslicesx; blockx++)
-				for (j = 0; j < BLOCKSIZEZ; j++)
-					for (i = 0; i < BLOCKSIZEX; i++)
-					{
-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(starpu_drand48());
-					}
-
-	}
-
-	for (blocky = 0; blocky < nslicesy; blocky++)
-		for (blockx = 0; blockx < nslicesx; blockx++)
-			for (j = 0; j < BLOCKSIZEY; j++)
-				for (i = 0; i < BLOCKSIZEX; i++)
-				{
-					C[blocky][blockx][i*BLOCKSIZEY + j] = (TYPE)(blockx + blocky*nslicesx + 1);
-				}
-
-	/* TODO: aren't we supposed to set data consistency to relaxed, since
-	 * tags are supposed to provide the correct dependencies? */
-
-	/* declare the StarPU data to monitor */
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (z = 0; z < nslicesz; z++)
-		{
-			starpu_matrix_data_register(&A_state[y][z], 0, (uintptr_t)A[y][z], 
-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(TYPE));
-		}
-	}
-
-	for (z = 0; z < nslicesz; z++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-			starpu_matrix_data_register(&B_state[z][x], 0, (uintptr_t)B[z][x], 
-				BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(TYPE));
-		}
-	}
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-			starpu_matrix_data_register(&C_state[y][x], 0, (uintptr_t)C[y][x], 
-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(TYPE));
-		}
-	}
-
-#ifdef STARPU_USE_GORDON
-	conf.k = BLOCKSIZEZ;
-	conf.m = BLOCKSIZEY;
-	conf.n = BLOCKSIZEX;
-#endif
-
-	fprintf(stderr, "block size : x %d y %d z %d\n", BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
-
-	display_memory_consumption();
-}
-
-static void cleanup_problem(void)
-{
-	unsigned z, y, x;
-
-#ifdef CHECK_OUTPUT
-	TYPE maxerr = 0.0;
-	TYPE err;
-	fprintf(stderr, "Checking results ....");
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-			for (z = 0; z < nslicesz; z++)
-			{
-				SGEMM("N", "N", BLOCKSIZEY, BLOCKSIZEX, BLOCKSIZEZ, -(TYPE)(niter), A[y][z], BLOCKSIZEY, B[z][x], BLOCKSIZEZ, 1.0f, C[y][x], BLOCKSIZEY);
-
-			}
-
-			/* make sure C - niter AB = 0 */
-			err = SASUM(BLOCKSIZEX*BLOCKSIZEY, C[y][x], 1);
-
-			if (err > BLOCKSIZEX*BLOCKSIZEY*niter*0.001) 
-				fprintf(stderr, "\nerr = %f ( x = %d y = %d ) ... ", err/niter, x, y );
-
-			maxerr = STARPU_MAX(err, maxerr);
-		}
-	}
-
-	if (maxerr > BLOCKSIZEX*BLOCKSIZEY*niter*0.001)
-	{
-		fprintf(stderr, " maxerr = %f\n", maxerr/niter);
-	}
-	else {
-		fprintf(stderr, " OK\n");
-	}
-	fflush(stderr);
-#endif
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (z = 0; z < nslicesz; z++)
-		{
-	//		free(A[y][z]);
-		}
-	}
-
-	for (z = 0; z < nslicesz; z++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-	//		free(B[z][x]);
-		}
-	}
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-	//		free(C[y][x]);
-			starpu_tag_remove(TAG(nslicesz - 1, y, x, niter - 1));
-		}
-	}
-
-	
-	
-}
-
-struct cb2_s {
-	unsigned blockx;
-	unsigned blocky;
-	unsigned iter;
-};
-
-
-static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA
-#ifdef SPU_FUNC_SGEMM
-		|STARPU_GORDON
-#endif
-		,
-	.cpu_func = STARPU_GEMM(cpu_mult),
-#ifdef STARPU_USE_CUDA
-	.cuda_func = STARPU_GEMM(cublas_mult),
-#endif
-#ifdef STARPU_USE_GORDON
-	/* .gordon_func will be set by load_elf_sgemm */
-#endif
-	.nbuffers = 3
-};
-
-
-#ifdef STARPU_USE_GORDON
-static const char *spu_func_sgemm_elf_file = "./gordon/func_sgemm_ibm.spuelf";
-static unsigned spu_func_sgemm_elf_id;
-static unsigned spu_func_sgemm_ibm_id;
-
-static void load_elf_sgemm(void)
-{
-	spu_func_sgemm_elf_id =
-		gordon_register_elf_plugin(spu_func_sgemm_elf_file);
-
-	spu_func_sgemm_ibm_id = gordon_register_kernel(spu_func_sgemm_elf_id, "func_sgemm_ibm");
-
-	gordon_load_plugin_on_all_spu(spu_func_sgemm_elf_id);
-	gordon_load_kernel_on_all_spu(spu_func_sgemm_ibm_id);
-
-	cl.gordon_func = spu_func_sgemm_ibm_id;
-}
-#endif // STARPU_USE_GORDON
-
-static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter)
-{
-	/* A B[task] = C[task] */
-	struct starpu_task *task = starpu_task_create();
-
-	task->cl = &cl;
-
-	task->use_tag = 1;
-	task->tag_id = TAG(z, y, x, iter);
-
-	task->buffers[0].handle = A_state[y][z];
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = B_state[z][x];
-	task->buffers[1].mode = STARPU_R;
-	task->buffers[2].handle = C_state[y][x];
-	task->buffers[2].mode = STARPU_RW;
-
-#ifdef STARPU_USE_GORDON
-	task->cl_arg = &conf;
-	task->cl_arg_size = sizeof(struct ibm_sgemm_block_conf);
-#endif
-
-	return task;
-}
-
-static void callback_func_2(void *arg)
-{
-	/* the argument is a pointer to a counter of the remaining tasks */
-	struct cb2_s *cb2 = arg;
-	unsigned x,y,z,iter;
-
-	iter = cb2->iter;
-	x = cb2->blockx;
-	y = cb2->blocky;
-
-	free(cb2);
-
-	/* do some accounting */
-	int id = starpu_worker_get_id();
-	flop_per_worker[id] += BLAS3_FLOP(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
-	ls_per_worker[id] += BLAS3_LS(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
-
-	/* TAG(nslicesz - 1, y, x, iter) remains ... */
-	for (z = 0; z < nslicesz - 1; z++)
-	{
-		starpu_tag_remove(TAG(z, y, x, iter));
-	}
-
-	if (iter > 0)
-	{
-		starpu_tag_remove(TAG(nslicesz - 1, y, x, iter-1));
-	}
-	
-	if (iter != niter - 1) {
-		submit_new_iter(x, y, iter+1);
-	}
-}
-
-
-
-static void submit_new_iter(unsigned x, unsigned y, unsigned iter)
-{
-	unsigned z;
-	for (z = 0; z < nslicesz; z++) 
-	{
-		struct starpu_task *task;
-		task = construct_task(x, y, z, iter);
-		
-		if (z != 0) {
-			starpu_tag_declare_deps(TAG(z, y, x, iter), 1, TAG(z-1, y, x, iter));
-		}
-
-		if (z == nslicesz - 1) {
-			struct cb2_s *cb2 = malloc(sizeof(struct cb2_s));
-				cb2->blockx = x;
-				cb2->blocky = y;
-				cb2->iter = iter;
-			task->callback_func = callback_func_2;
-			task->callback_arg = cb2;
-		}
-
-		starpu_task_submit(task);
-	}
-}
-
-static void launch_codelets(void)
-{
-#ifdef STARPU_USE_FXT
-	_starpu_fxt_register_thread(0);
-#endif
-	/* partition the work into slices */
-	unsigned taskx, tasky;
-
-	srand(time(NULL));
-
-	/* should we use a single performance model for all archs and use an
- 	 * acceleration factor ? */
-	if (use_common_model) {
-		cl.model = &STARPU_GEMM(model_common);
-	}
-	else {
-		cl.model = &STARPU_GEMM(model);
-	}
-
-	for (taskx = 0; taskx < nslicesx; taskx++) 
-	{
-		for (tasky = 0; tasky < nslicesy; tasky++)
-		{
-			submit_new_iter(taskx, tasky, 0);
-		}
-	}
-}
-
-int main(__attribute__ ((unused)) int argc, 
-	 __attribute__ ((unused)) char **argv)
-{
-
-	parse_args(argc, argv);
-
-	/* start the runtime */
-	starpu_init(NULL);
-
-	starpu_helper_cublas_init();
-
-#ifdef STARPU_USE_GORDON
-	load_elf_sgemm();
-#endif
-
-	init_problem_data();
-
-	gettimeofday(&start, NULL);
-
-	launch_codelets();
-
-	starpu_task_wait_for_all();
-
-	gettimeofday(&end, NULL);
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	display_stats(timing);
-
-	cleanup_problem();
-
-	starpu_helper_cublas_shutdown();
-	starpu_shutdown();
-
-	return 0;
-}

+ 0 - 447
examples/mult/dw_mult_no_stride_no_tag.c

@@ -1,447 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "simple.h"
-#include "dw_mult.h"
-#ifdef STARPU_USE_GORDON
-#include "gordon/func_sgemm_ibm.h"
-#endif
-#include "xgemm_kernels.c"
-
-
-struct pos {
-	unsigned x,y, z,iter;
-};
-
-struct pos currentpos [MAXSLICESY][MAXSLICESX];
-
-TYPE *A[MAXSLICESY][MAXSLICESZ];
-TYPE *B[MAXSLICESZ][MAXSLICESX];
-TYPE *C[MAXSLICESY][MAXSLICESX];
-
-starpu_data_handle A_state[MAXSLICESY][MAXSLICESZ];
-starpu_data_handle B_state[MAXSLICESZ][MAXSLICESX];
-starpu_data_handle C_state[MAXSLICESY][MAXSLICESX];
-
-
-static void callback_func_3(void *arg);
-/*
- * This program computes C = A * B 
- * 
- * The difference with dw_mult_no_stride.c is that here we do not use tags, and
- * just rely on sequential data consistency.
- *   A of size (z,y)
- *   B of size (x,z)
- *   C of size (x,y)
-
-              |---------------|
-            z |       B       |
-              |---------------|
-       z              x
-     |----|   |---------------|
-     |    |   |               |
-     |    |   |               |
-     | A  | y |       C       |
-     |    |   |               |
-     |    |   |               |
-     |----|   |---------------|
-
- */
-
-#define MEM_ALIGNMENT	16
-
-static void init_problem_data(void)
-{
-	unsigned i,j;
-
-	/* debug ... */
-	memset(A, 0, MAXSLICESY*MAXSLICESZ*sizeof(TYPE *));
-	memset(B, 0, MAXSLICESZ*MAXSLICESZ*sizeof(TYPE *));
-	memset(C, 0, MAXSLICESY*MAXSLICESX*sizeof(TYPE *));
-	memset(&A_state, 0, MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle));
-	memset(&B_state, 0, MAXSLICESZ*MAXSLICESZ*sizeof(starpu_data_handle));
-	memset(&C_state, 0, MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle));
-
-	/* Allocate grids of buffer */
-	/* TODO pin ... */
-	unsigned z, y, x;
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (z = 0; z < nslicesz; z++)
-		{
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
-#else
-			A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
-#endif
-			assert(A[y][z]);
-		}
-	}
-
-	for (z = 0; z < nslicesz; z++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
-#else
-			B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
-#endif
-			assert(B[z][x]);
-		}
-	}
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
-#else
-			C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
-#endif
-			currentpos[y][x].x = x;
-			currentpos[y][x].y = y;
-			currentpos[y][x].z = 0;
-			currentpos[y][x].iter = 0;
-			assert(C[y][x]);
-		}
-	}
-	
-	/* fill the A and B matrices */
-	unsigned blockx, blocky, blockz;
-
-	if (norandom) {
-		for (blocky = 0; blocky < nslicesy; blocky++)
-			for (blockz = 0; blockz < nslicesz; blockz++)
-				for (j = 0; j < BLOCKSIZEY; j++)
-					for (i = 0; i < BLOCKSIZEZ; i++)
-					{
-						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(1 + blockz + blocky*nslicesz);
-					}
-
-		for (blockz = 0; blockz < nslicesz; blockz++)
-			for (blockx = 0; blockx < nslicesx; blockx++)
-				for (j = 0; j < BLOCKSIZEZ; j++)
-					for (i = 0; i < BLOCKSIZEX; i++)
-					{
-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(1 + blockx + blockz*nslicesx);
-					}
-	} 
-	else {
-		for (blocky = 0; blocky < nslicesy; blocky++)
-			for (blockz = 0; blockz < nslicesz; blockz++)
-				for (j = 0; j < BLOCKSIZEY; j++)
-					for (i = 0; i < BLOCKSIZEZ; i++)
-					{
-						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(starpu_drand48());
-					}
-
-		for (blockz = 0; blockz < nslicesz; blockz++)
-			for (blockx = 0; blockx < nslicesx; blockx++)
-				for (j = 0; j < BLOCKSIZEZ; j++)
-					for (i = 0; i < BLOCKSIZEX; i++)
-					{
-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(starpu_drand48());
-					}
-
-	}
-
-	for (blocky = 0; blocky < nslicesy; blocky++)
-		for (blockx = 0; blockx < nslicesx; blockx++)
-			for (j = 0; j < BLOCKSIZEY; j++)
-				for (i = 0; i < BLOCKSIZEX; i++)
-				{
-					C[blocky][blockx][i*BLOCKSIZEY + j] = (TYPE)0;
-				}
-
-
-	/* declare the StarPU data to monitor */
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (z = 0; z < nslicesz; z++)
-		{
-			starpu_matrix_data_register(&A_state[y][z], 0, (uintptr_t)A[y][z], 
-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(TYPE));
-		}
-	}
-
-	for (z = 0; z < nslicesz; z++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-			starpu_matrix_data_register(&B_state[z][x], 0, (uintptr_t)B[z][x], 
-				BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(TYPE));
-		}
-	}
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-			starpu_matrix_data_register(&C_state[y][x], 0, (uintptr_t)C[y][x], 
-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(TYPE));
-		}
-	}
-
-#ifdef STARPU_USE_GORDON
-	conf.k = BLOCKSIZEZ;
-	conf.m = BLOCKSIZEY;
-	conf.n = BLOCKSIZEX;
-#endif
-
-	fprintf(stderr, "block size : x %d y %d z %d\n", BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
-
-	display_memory_consumption();
-}
-
-static void cleanup_problem(void)
-{
-	unsigned z, y, x;
-
-#ifdef CHECK_OUTPUT
-	TYPE maxerr = 0.0;
-	TYPE err;
-	fprintf(stderr, "Checking results ....");
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-			for (z = 0; z < nslicesz; z++)
-			{
-				SGEMM("N", "N", BLOCKSIZEY, BLOCKSIZEX, BLOCKSIZEZ, -(TYPE)(niter), A[y][z], BLOCKSIZEY, B[z][x], BLOCKSIZEZ, 1.0f, C[y][x], BLOCKSIZEY);
-
-			}
-
-			/* make sure C - niter AB = 0 */
-			err = SASUM(BLOCKSIZEX*BLOCKSIZEY, C[y][x], 1);
-
-			if (err > BLOCKSIZEX*BLOCKSIZEY*niter*0.001) 
-				fprintf(stderr, "\nerr = %f ( x = %d y = %d ) ... ", err/niter, x, y );
-
-			maxerr = STARPU_MAX(err, maxerr);
-		}
-	}
-
-	if (maxerr > BLOCKSIZEX*BLOCKSIZEY*niter*0.001)
-	{
-		fprintf(stderr, " maxerr = %f\n", maxerr/niter);
-	}
-	else {
-		fprintf(stderr, " OK\n");
-	}
-	fflush(stderr);
-#endif
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (z = 0; z < nslicesz; z++)
-		{
-	//		free(A[y][z]);
-		}
-	}
-
-	for (z = 0; z < nslicesz; z++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-	//		free(B[z][x]);
-		}
-	}
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-	//		free(C[y][x]);
-		}
-	}
-
-	
-	
-}
-
-struct cb2_s {
-	unsigned blockx;
-	unsigned blocky;
-	unsigned iter;
-};
-
-
-static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA
-#ifdef SPU_FUNC_SGEMM
-		|STARPU_GORDON
-#endif
-		,
-	.cpu_func = STARPU_GEMM(cpu_mult),
-#ifdef STARPU_USE_CUDA
-	.cuda_func = STARPU_GEMM(cublas_mult),
-#endif
-#ifdef STARPU_USE_GORDON
-	/* .gordon_func will be set by load_elf_sgemm */
-#endif
-	.nbuffers = 3
-};
-
-
-#ifdef STARPU_USE_GORDON
-static const char *spu_func_sgemm_elf_file = "./gordon/func_sgemm_ibm.spuelf";
-static unsigned spu_func_sgemm_elf_id;
-static unsigned spu_func_sgemm_ibm_id;
-
-static void load_elf_sgemm(void)
-{
-	spu_func_sgemm_elf_id =
-		gordon_register_elf_plugin(spu_func_sgemm_elf_file);
-
-	spu_func_sgemm_ibm_id = gordon_register_kernel(spu_func_sgemm_elf_id, "func_sgemm_ibm");
-	
-	gordon_load_plugin_on_all_spu(spu_func_sgemm_elf_id);
-	gordon_load_kernel_on_all_spu(spu_func_sgemm_ibm_id);
-
-	cl.gordon_func = spu_func_sgemm_ibm_id;
-}
-#endif // STARPU_USE_GORDON
-
-static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter, struct pos *posp)
-{
-	struct starpu_task *task = starpu_task_create();
-
-	task->cl = &cl;
-
-	task->buffers[0].handle = A_state[y][z];
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = B_state[z][x];
-	task->buffers[1].mode = STARPU_R;
-	task->buffers[2].handle = C_state[y][x];
-	task->buffers[2].mode = STARPU_RW;
-
-	task->callback_func = callback_func_3;
-	task->callback_arg = posp;
-
-#ifdef STARPU_USE_GORDON
-	task->cl_arg = &conf;
-	task->cl_arg_size = sizeof(struct ibm_sgemm_block_conf);
-#endif
-
-	posp->z = z;
-	posp->iter = iter;
-
-	return task;
-}
-
-
-static void callback_func_3(void *arg)
-{
-	/* do some accounting */
-	int id = starpu_worker_get_id();
-	flop_per_worker[id] += BLAS3_FLOP(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
-	ls_per_worker[id] += BLAS3_LS(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
-
-	/* the argument is a pointer to a counter of the remaining tasks */
-	struct pos *posp = arg;
-	unsigned x,y,z,iter;
-
-	iter = posp->iter;
-	x = posp->x;
-	y = posp->y;
-	z = posp->z;
-
-	if (z < nslicesz - 1)
-	{
-		struct starpu_task *task = construct_task(x, y, z+1, iter, posp);
-		starpu_task_submit(task);
-	}
-	else
-	{
-		if (iter < niter - 1)
-		{
-			struct starpu_task *task = construct_task(x, y, 0, iter+1, posp);
-			starpu_task_submit(task);
-		}
-	}
-}
-
-
-
-
-static void launch_codelets(void)
-{
-#ifdef STARPU_USE_FXT
-	_starpu_fxt_register_thread(0);
-#endif
-	/* partition the work into slices */
-	unsigned taskx, tasky;
-
-	srand(time(NULL));
-
-	/* should we use a single performance model for all archs and use an
- 	 * acceleration factor ? */
-	if (use_common_model) {
-		cl.model = &STARPU_GEMM(model_common);
-	}
-	else {
-		cl.model = &STARPU_GEMM(model);
-	}
-
-	for (taskx = 0; taskx < nslicesx; taskx++) 
-	{
-		for (tasky = 0; tasky < nslicesy; tasky++)
-		{
-			struct starpu_task *task = construct_task(taskx, tasky, 0, 0, &currentpos[tasky][taskx]);
-			starpu_task_submit(task);
-		}
-	}
-}
-
-int main(__attribute__ ((unused)) int argc, 
-	 __attribute__ ((unused)) char **argv)
-{
-
-	parse_args(argc, argv);
-
-	/* start the runtime */
-	starpu_init(NULL);
-
-	starpu_helper_cublas_init();
-
-#ifdef STARPU_USE_GORDON
-	load_elf_sgemm();
-#endif
-
-	init_problem_data();
-
-	gettimeofday(&start, NULL);
-
-	launch_codelets();
-
-	starpu_task_wait_for_all();
-
-	gettimeofday(&end, NULL);
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	display_stats(timing);
-
-	cleanup_problem();
-
-	starpu_helper_cublas_shutdown();
-	starpu_shutdown();
-
-	return 0;
-}

+ 0 - 42
examples/mult/gordon/func_dgemm_ibm.c

@@ -1,42 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "func_gemm_ibm.h"
-
-#include <blas_s.h>
-
-void func_dgemm_ibm(__attribute__ ((unused)) void **alloc,
-		__attribute__ ((unused)) void **in,
-		__attribute__ ((unused)) void **inout,
-		__attribute__ ((unused)) void **out)
-{
-	/* we assume data will be in A:R,B:R,C:RW mode
- 	 *  -> in[0] : describe problem
- 	 *  -> in[1] : A
- 	 *  -> in[2] : B
- 	 *  -> inout[0] : C
- 	 *
- 	 *   C = AB + C
- 	 *   but, being in fortran ordering, we compute
- 	 *   t(C) = t(B)t(A) + t(C) instead
- 	 */
-	struct ibm_gemm_block_conf *conf = in[0];
-	double *A = in[1];
-	double *B = in[2];
-	double *C = inout[0];
-
-	dgemm_spu(conf->m, conf->n, conf->k, B, A, C);
-}

+ 0 - 29
examples/mult/gordon/func_gemm_ibm.h

@@ -1,29 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#ifndef __FUNC_SGEMM_IBM_H__
-#define __FUNC_SGEMM_IBM_H__
-
-#include <stdint.h>
-
-struct ibm_sgemm_block_conf {
-	uint32_t m;
-	uint32_t n;
-	uint32_t k;
-	uint32_t pad;
-};
-
-#endif // __FUNC_SGEMM_IBM_H__

+ 0 - 43
examples/mult/gordon/func_sgemm_ibm.c

@@ -1,43 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "func_gemm_ibm.h"
-
-#include <blas_s.h>
-
-void func_sgemm_ibm(__attribute__ ((unused)) void **alloc,
-		__attribute__ ((unused)) void **in,
-		__attribute__ ((unused)) void **inout,
-		__attribute__ ((unused)) void **out)
-{
-	/* we assume data will be in A:R,B:R,C:RW mode
- 	 *  -> in[0] : describe problem
- 	 *  -> in[1] : A
- 	 *  -> in[2] : B
- 	 *  -> inout[0] : C
- 	 *
- 	 *   C = AB + C
- 	 *   but, being in fortran ordering, we compute
- 	 *   t(C) = t(B)t(A) + t(C) instead
- 	 */
-	struct ibm_gemm_block_conf *conf = in[0];
-	float *A = in[1];
-	float *B = in[2];
-	float *C = inout[0];
-
-	sgemm_spu(conf->m, conf->n, conf->k, B, A, C);
-}

+ 0 - 2
examples/mult/sgemm.c

@@ -16,6 +16,4 @@
  */
 
 #include "simple.h"
-
-#include "xgemm_kernels.c"
 #include "xgemm.c" 

+ 0 - 2
examples/mult/simple.h

@@ -17,7 +17,6 @@
 #define TYPE	float
 
 #define CUBLAS_GEMM cublasSgemm
-#define MAGMABLAS_GEMM magmablas_sgemm
 #define CPU_GEMM	SGEMM
 #define CPU_ASUM	SASUM
 #define CPU_IAMAX	ISAMAX
@@ -26,4 +25,3 @@
 #define str(s) #s
 #define xstr(s)        str(s)
 #define STARPU_GEMM_STR(name)  xstr(STARPU_GEMM(name))
-

+ 154 - 178
examples/mult/xgemm.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
@@ -16,37 +16,79 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "dw_mult.h"
-
-TYPE *A, *B, *C;
-starpu_data_handle A_handle, B_handle, C_handle;
-
-/*
- * This program computes C = A * B 
- * 
- *   A of size (z,y)
- *   B of size (x,z)
- *   C of size (x,y)
-
-              |---------------|
-            z |       B       |
-              |---------------|
-       z              x
-     |----|   |---------------|
-     |    |   |               |
-     |    |   |               |
-     | A  | y |       C       |
-     |    |   |               |
-     |    |   |               |
-     |----|   |---------------|
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <starpu.h>
 
- */
+#include <common/blas.h>
+
+#ifdef STARPU_USE_CUDA
+#include <cuda.h>
+#include <cublas.h>
+#endif
+
+static unsigned niter = 100;
+static unsigned nslicesx = 4;
+static unsigned nslicesy = 4;
+static unsigned xdim = 256;
+static unsigned ydim = 256;
+static unsigned zdim = 64;
+static unsigned check = 0;
+
+static TYPE *A, *B, *C;
+static starpu_data_handle A_handle, B_handle, C_handle;
+
+/*
+ * Read the value that follows the option at argv[*pos] and return it as an
+ * unsigned integer. On success *pos is advanced onto the value so that the
+ * caller's loop skips it.
+ *
+ * Prints a diagnostic on stderr and exits with EXIT_FAILURE when the value is
+ * missing, is not a plain decimal number, is negative, or does not survive
+ * the conversion to unsigned.
+ */
+static unsigned parse_unsigned_arg(int argc, char **argv, int *pos)
+{
+	const char *opt = argv[*pos];
+	char *endptr;
+	long val;
+
+	/* argv[argc] is NULL: never hand it to strtol */
+	if (*pos + 1 >= argc) {
+		fprintf(stderr, "%s: missing value\n", opt);
+		exit(EXIT_FAILURE);
+	}
+
+	(*pos)++;
+	val = strtol(argv[*pos], &endptr, 10);
+
+	/* reject empty input, trailing garbage, negative values and values
+	 * that would be truncated when stored in an unsigned */
+	if (endptr == argv[*pos] || *endptr != '\0' || val < 0
+			|| (long)(unsigned)val != val) {
+		fprintf(stderr, "%s: invalid value '%s'\n", opt, argv[*pos]);
+		exit(EXIT_FAILURE);
+	}
+
+	return (unsigned)val;
+}
+
+/*
+ * Parse the command line and update the file-level problem parameters.
+ *
+ *   -nblocks N   set both nslicesx and nslicesy to N
+ *   -nblocksx N  set nslicesx
+ *   -nblocksy N  set nslicesy
+ *   -x N         set xdim
+ *   -y N         set ydim
+ *   -z N         set zdim
+ *   -iter N      set niter
+ *   -check       set check to 1
+ *
+ * Unrecognised arguments are ignored. An option that takes a value aborts the
+ * program (see parse_unsigned_arg) when that value is missing or malformed.
+ */
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		/* else-if chain: once an option has consumed its value, the
+		 * value itself must not be matched against option names */
+		if (strcmp(argv[i], "-nblocks") == 0) {
+			nslicesx = parse_unsigned_arg(argc, argv, &i);
+			nslicesy = nslicesx;
+		}
+		else if (strcmp(argv[i], "-nblocksx") == 0) {
+			nslicesx = parse_unsigned_arg(argc, argv, &i);
+		}
+		else if (strcmp(argv[i], "-nblocksy") == 0) {
+			nslicesy = parse_unsigned_arg(argc, argv, &i);
+		}
+		else if (strcmp(argv[i], "-x") == 0) {
+			xdim = parse_unsigned_arg(argc, argv, &i);
+		}
+		else if (strcmp(argv[i], "-y") == 0) {
+			ydim = parse_unsigned_arg(argc, argv, &i);
+		}
+		else if (strcmp(argv[i], "-z") == 0) {
+			zdim = parse_unsigned_arg(argc, argv, &i);
+		}
+		else if (strcmp(argv[i], "-iter") == 0) {
+			niter = parse_unsigned_arg(argc, argv, &i);
+		}
+		else if (strcmp(argv[i], "-check") == 0) {
+			check = 1;
+		}
+	}
+}
 
 static void check_output(void)
 {
-	/* check results */
 	/* compute C = C - AB */
-
 	CPU_GEMM("N", "N", ydim, xdim, zdim, (TYPE)-1.0f, A, ydim, B, zdim, (TYPE)1.0f, C, ydim);
 		
 	/* make sure C = 0 */
@@ -65,66 +107,24 @@ static void check_output(void)
 	}
 }
 
-void callback_func(void *arg)
-{
-	/* do some accounting */
-	int id = starpu_worker_get_id();
-	flop_per_worker[id] += BLAS3_FLOP(conf.m, conf.n, conf.k);
-	ls_per_worker[id] += BLAS3_LS(conf.m, conf.n, conf.k);
-}
-
 static void init_problem_data(void)
 {
 	unsigned i,j;
 
-#ifdef STARPU_USE_CUDA
-	if (pin) {
-		starpu_data_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(TYPE));
-		starpu_data_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(TYPE));
-		starpu_data_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(TYPE));
-	} else
-#endif
-	{
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
-		posix_memalign((void **)&A, 4096, zdim*ydim*sizeof(TYPE));
-		posix_memalign((void **)&B, 4096, xdim*zdim*sizeof(TYPE));
-		posix_memalign((void **)&C, 4096, xdim*ydim*sizeof(TYPE));
-#else
-		A = malloc(zdim*ydim*sizeof(TYPE));
-		B = malloc(xdim*zdim*sizeof(TYPE));
-		C = malloc(xdim*ydim*sizeof(TYPE));
-#endif
-	}
+	starpu_data_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(TYPE));
+	starpu_data_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(TYPE));
+	starpu_data_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(TYPE));
 
 	/* fill the A and B matrices */
-	if (norandom) {
-		for (j=0; j < ydim; j++) {
-			for (i=0; i < zdim; i++) {
-				A[j+i*ydim] = (TYPE)(i);
-			}
-		}
-	
-		for (j=0; j < zdim; j++) {
-			for (i=0; i < xdim; i++) {
-				B[j+i*zdim] = (TYPE)(j);
-			}
-		}
-	} 
-	else {
-#ifdef NORANDOM
-		srand(2008);
-		STARPU_ABORT();
-#endif
-		for (j=0; j < ydim; j++) {
-			for (i=0; i < zdim; i++) {
-				A[j+i*ydim] = (TYPE)(starpu_drand48());
-			}
+	for (j=0; j < ydim; j++) {
+		for (i=0; i < zdim; i++) {
+			A[j+i*ydim] = (TYPE)(starpu_drand48());
 		}
-	
-		for (j=0; j < zdim; j++) {
-			for (i=0; i < xdim; i++) {
-				B[j+i*zdim] = (TYPE)(starpu_drand48());
-			}
+	}
+
+	for (j=0; j < zdim; j++) {
+		for (i=0; i < xdim; i++) {
+			B[j+i*zdim] = (TYPE)(starpu_drand48());
 		}
 	}
 
@@ -133,8 +133,6 @@ static void init_problem_data(void)
 			C[j+i*ydim] = (TYPE)(0);
 		}
 	}
-
-	display_memory_consumption();
 }
 
 static void partition_mult_data(void)
@@ -148,21 +146,15 @@ static void partition_mult_data(void)
 
 	starpu_data_set_wt_mask(C_handle, 1<<0);
 
-	conf.k = zdim;
-	conf.m = ydim/nslicesy;
-	conf.n = xdim/nslicesx;
-
 	struct starpu_data_filter f;
+	memset(&f, 0, sizeof(f));
 	f.filter_func = starpu_vertical_block_filter_func;
 	f.nchildren = nslicesx;
-	f.get_nchildren = NULL;
-	f.get_child_ops = NULL;
 		
 	struct starpu_data_filter f2;
+	memset(&f2, 0, sizeof(f2));
 	f2.filter_func = starpu_block_filter_func;
 	f2.nchildren = nslicesy;
-	f2.get_nchildren = NULL;
-	f2.get_child_ops = NULL;
 		
 	starpu_data_partition(B_handle, &f);
 	starpu_data_partition(A_handle, &f2);
@@ -170,126 +162,110 @@ static void partition_mult_data(void)
 	starpu_data_map_filters(C_handle, 2, &f, &f2);
 }
 
-static void unpartition_mult_data(void)
+static void mult_kernel_common(void *descr[], int type)
 {
-	fprintf(stderr, "unpartition !!\n");
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+	TYPE *subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);
 
-	starpu_data_unpartition(C_handle, 0);
+	unsigned nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	unsigned nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	unsigned nyA = STARPU_MATRIX_GET_NY(descr[0]);
 
-	starpu_data_unregister(C_handle);
-}
+	unsigned ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
 
-static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA
-#ifdef SPU_FUNC_SGEMM
-		|STARPU_GORDON
-#endif
-		,
-	.cpu_func = STARPU_GEMM(cpu_mult),
+	if (type == STARPU_CPU) {
+		CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
+	}
 #ifdef STARPU_USE_CUDA
-	.cuda_func = STARPU_GEMM(cublas_mult),
-#endif
-#ifdef STARPU_USE_GORDON
-#ifdef SPU_FUNC_SGEMM
-	.gordon_func = SPU_FUNC_SGEMM,
-#else
-#warning SPU_FUNC_SGEMM is not available
-#endif
+	else {
+		CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
+					     (TYPE)0.0, subC, ldC);
+		cudaThreadSynchronize();
+	}
 #endif
-	.nbuffers = 3
-};
-
-static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter)
-{
-	/* A B[task] = C[task] */
-	struct starpu_task *task = starpu_task_create();
-
-	task->cl = &cl;
-
-	/* we have a callback to do some accounting */
-	task->callback_func = callback_func;
-	task->callback_arg = NULL;
-
-	task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, y);
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, x);
-	task->buffers[1].mode = STARPU_R;
-	task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, x, y);
-	task->buffers[2].mode = STARPU_RW;
-
-	task->cl_arg = &conf;
-	task->cl_arg_size = sizeof(struct block_conf);
-	return task;
 }
 
-static void submit_new_iter(unsigned x, unsigned y, unsigned iter)
+#ifdef STARPU_USE_CUDA
+static void cublas_mult(void *descr[], __attribute__((unused)) void *arg)
 {
-	unsigned z;
-
-	z = 0;
-
-	{
-		struct starpu_task *task;
-		task = construct_task(x, y, z, iter);
-
-		starpu_task_submit(task);
-	}
+	mult_kernel_common(descr, STARPU_CUDA);
 }
-
-static void launch_codelets(void)
-{
-#ifdef STARPU_USE_FXT
-	_starpu_fxt_register_thread(0);
 #endif
-	/* partition the work into slices */
-	unsigned taskx, tasky;
 
-	srand(time(NULL));
+/* CPU entry point of the codelet: forwards the buffer descriptors to
+ * mult_kernel_common with the STARPU_CPU type. The arg parameter is unused. */
+static void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
+{
+	mult_kernel_common(descr, STARPU_CPU);
+}
 
-	/* should we use a single performance model for all archs and use an
- 	 * acceleration factor ? */
-	if (use_common_model) {
-		cl.model = &STARPU_GEMM(model_common);
-	}
-	else {
-		cl.model = &STARPU_GEMM(model);
-	}
+/* Performance model attached to the codelet: history-based, identified by
+ * the stringified expansion of STARPU_GEMM(gemm). */
+static struct starpu_perfmodel_t starpu_gemm_model = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = STARPU_GEMM_STR(gemm)
+};
 
-	for (taskx = 0; taskx < nslicesx; taskx++) 
-	{
-		for (tasky = 0; tasky < nslicesy; tasky++)
-		{
-			submit_new_iter(taskx, tasky, 0);
-		}
-	}
-}
+/* Codelet used for every task submitted by main: it takes three buffers,
+ * always provides the CPU implementation, and provides the CUBLAS one only
+ * when the file is built with STARPU_USE_CUDA. */
+static starpu_codelet cl = {
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_func = cpu_mult,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = cublas_mult,
+#endif
+	/* buffers 0, 1 and 2 are filled in main from A_handle, B_handle and C_handle */
+	.nbuffers = 3,
+	.model = &starpu_gemm_model
+};
 
-int main(__attribute__ ((unused)) int argc, 
-	 __attribute__ ((unused)) char **argv)
+int main(int argc, char **argv)
 {
+	struct timeval start;
+	struct timeval end;
 
 	parse_args(argc, argv);
 
-	/* start the runtime */
 	starpu_init(NULL);
-
 	starpu_helper_cublas_init();
 
 	init_problem_data();
+	partition_mult_data();
 
 	gettimeofday(&start, NULL);
 
-	partition_mult_data();
+	unsigned x, y, iter;
+	for (iter = 0; iter < niter; iter++)
+	{
+		for (x = 0; x < nslicesx; x++) 
+		for (y = 0; y < nslicesy; y++)
+		{
+			struct starpu_task *task = starpu_task_create();
+	
+			task->cl = &cl;
+	
+			task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, y);
+			task->buffers[0].mode = STARPU_R;
+			task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, x);
+			task->buffers[1].mode = STARPU_R;
+			task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, x, y);
+			task->buffers[2].mode = STARPU_RW;
+	
+			int ret = starpu_task_submit(task);
+			STARPU_ASSERT(!ret);
+		}
 
-	launch_codelets();
+		starpu_task_wait_for_all();
+	}
 
-	starpu_task_wait_for_all();
 
 	gettimeofday(&end, NULL);
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	display_stats(timing);
 
-	unpartition_mult_data();
+	fprintf(stderr, "Time: %2.2f ms\n", timing/1000.0);
+
+	double flops = 2.0*((unsigned long)niter)*((unsigned long)xdim)
+				*((unsigned long)ydim)*((unsigned long)zdim);
+	fprintf(stderr, "GFlop/s: %.2f\n", flops/timing/1000.0);
+
+	starpu_data_unpartition(C_handle, 0);
+	starpu_data_unregister(C_handle);
 	
 	if (check)
 		check_output();

+ 0 - 78
examples/mult/xgemm_kernels.c

@@ -1,78 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <starpu.h>
-#include <starpu_cuda.h>
-#include <common/blas.h>
-
-#define COMMON_CODE			\
-	uint32_t nxC, nyC, nyA;		\
-	uint32_t ldA, ldB, ldC;		\
-					\
-	TYPE *subA;			\
-	TYPE *subB;			\
-	TYPE *subC;			\
-					\
-	subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);	\
-	subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);	\
-	subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);	\
-					\
-	nxC = STARPU_MATRIX_GET_NX(descr[2]);		\
-	nyC = STARPU_MATRIX_GET_NY(descr[2]);		\
-	nyA = STARPU_MATRIX_GET_NY(descr[0]);		\
-					\
-	ldA = STARPU_MATRIX_GET_LD(descr[0]);		\
-	ldB = STARPU_MATRIX_GET_LD(descr[1]);		\
-	ldC = STARPU_MATRIX_GET_LD(descr[2]);
-
-
-
-#ifdef STARPU_USE_CUDA
-
-#ifdef STARPU_HAVE_MAGMA
-#define GPU_GEMM MAGMABLAS_GEMM
-#else
-#define GPU_GEMM CUBLAS_GEMM
-#endif
-
-void STARPU_GEMM(cublas_mult)(void *descr[], __attribute__((unused)) void *arg)
-{
-	COMMON_CODE
-
-	starpu_trace_user_event(0x42);
-
-	GPU_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
-					     (TYPE)0.0, subC, ldC);
-	cublasStatus st;
-	st = cublasGetError();
-	if (st != CUBLAS_STATUS_SUCCESS)
-		STARPU_ABORT();
-
-	cudaThreadSynchronize();
-
-	starpu_trace_user_event(0x42);
-}
-#endif
-
-void STARPU_GEMM(cpu_mult)(void *descr[], __attribute__((unused))  void *arg)
-{
-	COMMON_CODE
-
-	starpu_trace_user_event(0x42);
-	CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
-	starpu_trace_user_event(0x43);
-}