ソースを参照

Clean up the examples/mult/ directory to keep only sgemm and dgemm (the rest of
the code is unreadable anyway, so it should not be used as an example).

Cédric Augonnet 14 年 前
コミット
53c60beeb5

+ 5 - 25
examples/Makefile.am

@@ -41,8 +41,6 @@ EXTRA_DIST = 					\
 	spmv/spmv_cuda.cu			\
 	gordon/null_kernel_gordon.c		\
 	mult/xgemm.c				\
-	mult/xgemm_kernels.c			\
-	mult/gordon/func_sgemm_ibm.c		\
 	lu/xlu.c				\
 	lu/xlu_pivot.c				\
 	lu/xlu_implicit.c			\
@@ -56,8 +54,7 @@ EXTRA_DIST = 					\
 	filters/fblock_opencl_kernel.cl
 
 CLEANFILES = 					\
-	gordon/null_kernel_gordon.spuelf	\
-	mult/gordon/func_sgemm_ibm.spuelf
+	gordon/null_kernel_gordon.spuelf
 
 
 CLEANFILES += *.gcno *.gcda *.linkinfo
@@ -88,8 +85,7 @@ SPULIBS = -lblas #-lc -lgloss -lc
 	$(SPU_LD) $(SPULDFLAGS) $< -o $@ $(SPULIBS)
 
 BUILT_SOURCES +=				\
-	gordon/null_kernel_gordon.spuelf	\
-	mult/gordon/func_sgemm_ibm.spuelf
+	gordon/null_kernel_gordon.spuelf
 
 endif
 
@@ -116,10 +112,8 @@ noinst_HEADERS = 				\
 	cholesky/dw_cholesky.h			\
 	common/blas_model.h			\
 	common/blas.h				\
-	mult/dw_mult.h				\
 	mult/simple.h				\
 	mult/double.h				\
-	mult/gordon/func_gemm_ibm.h		\
 	gordon/null.h				\
 	fortran/bindings/StarPU_fortran.h	\
 	ppm_downscaler/ppm_downscaler.h		\
@@ -332,29 +326,15 @@ if !NO_BLAS_LIB
 
 examplebin_PROGRAMS += 				\
 	mult/sgemm 				\
-	mult/dgemm 				\
-	mult/dw_mult_no_stride			\
-	mult/dw_mult_no_stride_no_tag
+	mult/dgemm
 
 mult_sgemm_SOURCES = 				\
 	mult/sgemm.c				\
-	common/blas.c				\
-	common/blas_model.c
+	common/blas.c
 
 mult_dgemm_SOURCES = 				\
 	mult/dgemm.c				\
-	common/blas.c				\
-	common/blas_model.c
-
-mult_dw_mult_no_stride_SOURCES = 		\
-	mult/dw_mult_no_stride.c		\
-	common/blas.c				\
-	common/blas_model.c
-
-mult_dw_mult_no_stride_no_tag_SOURCES =		\
-	mult/dw_mult_no_stride_no_tag.c		\
-	common/blas.c				\
-	common/blas_model.c
+	common/blas.c
 
 endif
 

+ 0 - 2
examples/mult/dgemm.c

@@ -16,6 +16,4 @@
  */
 
 #include "double.h"
-
-#include "xgemm_kernels.c"
 #include "xgemm.c" 

+ 0 - 1
examples/mult/double.h

@@ -17,7 +17,6 @@
 #define TYPE	double
 
 #define CUBLAS_GEMM cublasDgemm
-#define MAGMABLAS_GEMM magmablas_dgemm
 #define CPU_GEMM	DGEMM
 #define CPU_ASUM	DASUM
 #define CPU_IAMAX	IDAMAX

+ 0 - 203
examples/mult/dw_mult.h

@@ -1,203 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#ifndef __MULT_H__
-#define __MULT_H__
-
-#include <string.h>
-#include <math.h>
-#include <sys/types.h>
-#include <sys/time.h>
-#include <pthread.h>
-#include <signal.h>
-
-#include <common/blas.h>
-#include <common/blas_model.h>
-
-#include <starpu.h>
-
-#ifdef STARPU_USE_CUDA
-#include <cuda.h>
-#include <cublas.h>
-#endif
-
-#define MAXSLICESX	64
-#define MAXSLICESY	64
-#define MAXSLICESZ	64
-
-#define BLAS3_FLOP(n1,n2,n3)	\
-	(2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
-
-#define BLAS3_LS(n1,n2,n3)    \
-	((2*(n1)*(n3) + (n1)*(n2) + (n2)*(n3))*sizeof(float))
-
-struct block_conf {
-	uint32_t m;
-	uint32_t n;
-	uint32_t k;
-	uint32_t pad;
-};
-
-#define NITER	100
-
-unsigned niter = NITER;
-unsigned nslicesx = 4;
-unsigned nslicesy = 4;
-unsigned nslicesz = 4;
-unsigned xdim = 256;
-unsigned ydim = 256;
-unsigned zdim = 64;
-unsigned norandom = 0;
-unsigned pin = 0;
-unsigned use_common_model = 0;
-unsigned check = 0;
-
-/* to compute MFlop/s */
-uint64_t flop_cublas = 0;
-uint64_t flop_atlas = 0;
-uint64_t flop_per_worker[STARPU_NMAXWORKERS] = {0};
-
-/* to compute MB/s (load/store) */
-uint64_t ls_cublas = 0;
-uint64_t ls_atlas = 0;
-uint64_t ls_per_worker[STARPU_NMAXWORKERS] = {0};
-
-
-struct timeval start;
-struct timeval end;
-
-static int taskcounter __attribute__ ((unused));
-static struct block_conf conf __attribute__ ((aligned (128)));
-
-#define BLOCKSIZEX	(xdim / nslicesx)
-#define BLOCKSIZEY	(ydim / nslicesy)
-#define BLOCKSIZEZ	(zdim / nslicesz)
-
-static void display_stats(double timing)
-{
-	unsigned worker;
-	unsigned nworkers = starpu_worker_get_count();
-
-	fprintf(stderr, "Computation took (ms):\n");
-	printf("%2.2f\n", timing/1000);
-
-	uint64_t flop_total = 0, ls_total = 0;
-	
-	for (worker = 0; worker < nworkers; worker++)
-	{
-		flop_total += flop_per_worker[worker];
-		ls_total += ls_per_worker[worker];
-
-		char name[32];
-		starpu_worker_get_name(worker, name, 32);
-
-		fprintf(stderr, "\t%s -> %2.2f GFlop\t%2.2f GFlop/s\n", name, (double)flop_per_worker[worker]/1000000000.0f, (double)flop_per_worker[worker]/(double)timing/1000);
-	}
-
-	fprintf(stderr, "Total: %2.2f GFlops\t%2.2f GFlop/s\n", (double)flop_total/1000000000.0f, (double)flop_total/(double)timing/1000);
-}
-
-static void parse_args(int argc, char **argv)
-{
-	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-nblocks") == 0) {
-			char *argptr;
-			nslicesx = strtol(argv[++i], &argptr, 10);
-			nslicesy = nslicesx;
-			nslicesz = nslicesx;
-		}
-
-		if (strcmp(argv[i], "-nblocksx") == 0) {
-			char *argptr;
-			nslicesx = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-nblocksy") == 0) {
-			char *argptr;
-			nslicesy = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-nblocksz") == 0) {
-			char *argptr;
-			nslicesz = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-x") == 0) {
-			char *argptr;
-			xdim = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-y") == 0) {
-			char *argptr;
-			ydim = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-z") == 0) {
-			char *argptr;
-			zdim = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-iter") == 0) {
-			char *argptr;
-			niter = strtol(argv[++i], &argptr, 10);
-		}
-
-
-		if (strcmp(argv[i], "-no-random") == 0) {
-			norandom = 1;
-		}
-
-		if (strcmp(argv[i], "-pin") == 0) {
-			pin = 1;
-		}
-
-		if (strcmp(argv[i], "-check") == 0) {
-			check = 1;
-		}
-
-		if (strcmp(argv[i], "-common-model") == 0) {
-			use_common_model = 1;
-		}
-	}
-
-	assert(nslicesx <= MAXSLICESX); 
-	assert(nslicesy <= MAXSLICESY); 
-	assert(nslicesz <= MAXSLICESZ); 
-}
-
-static void display_memory_consumption(void)
-{
-	fprintf(stderr, "Total memory : %ld MB\n",
-		(MAXSLICESY*MAXSLICESZ*sizeof(TYPE *) 
-		+ MAXSLICESZ*MAXSLICESX*sizeof(TYPE *)
-		+ MAXSLICESY*MAXSLICESX*sizeof(TYPE *)
-		+ MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle)
-		+ MAXSLICESZ*MAXSLICESX*sizeof(starpu_data_handle)
-		+ MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle)
-		+ ydim*zdim*sizeof(TYPE)
-		+ zdim*xdim*sizeof(TYPE)
-		+ ydim*xdim*sizeof(TYPE))/(1024*1024) );
-}
-
-#ifdef STARPU_USE_CUDA
-void cublas_mult(void *descr[], __attribute__((unused)) void *arg);
-#endif
-
-void cpu_mult(void *descr[], __attribute__((unused))  void *arg);
-
-#endif // __MULT_H__

+ 0 - 465
examples/mult/dw_mult_no_stride.c

@@ -1,465 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "simple.h"
-#include "dw_mult.h"
-#ifdef STARPU_USE_GORDON
-#include "gordon/func_sgemm_ibm.h"
-#endif
-#include "xgemm_kernels.c"
-
-TYPE *A[MAXSLICESY][MAXSLICESZ];
-TYPE *B[MAXSLICESZ][MAXSLICESX];
-TYPE *C[MAXSLICESY][MAXSLICESX];
-
-starpu_data_handle A_state[MAXSLICESY][MAXSLICESZ];
-starpu_data_handle B_state[MAXSLICESZ][MAXSLICESX];
-starpu_data_handle C_state[MAXSLICESY][MAXSLICESX];
-
-#define TAG(x,y,z,iter)	\
-		((starpu_tag_t)((z) + (iter)*nslicesz + (x)*(nslicesz*niter) + (y)*(nslicesx*nslicesz*niter)))
-
-static void submit_new_iter(unsigned x, unsigned y, unsigned iter);
-
-/*
- * This program computes C = A * B 
- *
- * The difference with xgemm.c is that matrices are here already split in
- * blocks, and thus no data partitioning is needed.
- * 
- *   A of size (z,y)
- *   B of size (x,z)
- *   C of size (x,y)
-
-              |---------------|
-            z |       B       |
-              |---------------|
-       z              x
-     |----|   |---------------|
-     |    |   |               |
-     |    |   |               |
-     | A  | y |       C       |
-     |    |   |               |
-     |    |   |               |
-     |----|   |---------------|
-
- */
-
-#define MEM_ALIGNMENT	16
-
-static void init_problem_data(void)
-{
-	unsigned i,j;
-
-	/* debug ... */
-	memset(A, 0, MAXSLICESY*MAXSLICESZ*sizeof(TYPE *));
-	memset(B, 0, MAXSLICESZ*MAXSLICESZ*sizeof(TYPE *));
-	memset(C, 0, MAXSLICESY*MAXSLICESX*sizeof(TYPE *));
-	memset(&A_state, 0, MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle));
-	memset(&B_state, 0, MAXSLICESZ*MAXSLICESZ*sizeof(starpu_data_handle));
-	memset(&C_state, 0, MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle));
-
-	/* Allocate grids of buffer */
-	/* TODO pin ... */
-	unsigned z, y, x;
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (z = 0; z < nslicesz; z++)
-		{
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
-#else
-			A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
-#endif
-			assert(A[y][z]);
-		}
-	}
-
-	for (z = 0; z < nslicesz; z++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
-#else
-			B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
-#endif
-			assert(B[z][x]);
-		}
-	}
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
-#else
-			C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
-#endif
-			assert(C[y][x]);
-		}
-	}
-	
-	/* fill the A and B matrices */
-	unsigned blockx, blocky, blockz;
-
-	if (norandom) {
-		for (blocky = 0; blocky < nslicesy; blocky++)
-			for (blockz = 0; blockz < nslicesz; blockz++)
-				for (j = 0; j < BLOCKSIZEY; j++)
-					for (i = 0; i < BLOCKSIZEZ; i++)
-					{
-						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(1 + blockz + blocky*nslicesz);
-					}
-
-		for (blockz = 0; blockz < nslicesz; blockz++)
-			for (blockx = 0; blockx < nslicesx; blockx++)
-				for (j = 0; j < BLOCKSIZEZ; j++)
-					for (i = 0; i < BLOCKSIZEX; i++)
-					{
-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(1 + blockx + blockz*nslicesx);
-					}
-	} 
-	else {
-		for (blocky = 0; blocky < nslicesy; blocky++)
-			for (blockz = 0; blockz < nslicesz; blockz++)
-				for (j = 0; j < BLOCKSIZEY; j++)
-					for (i = 0; i < BLOCKSIZEZ; i++)
-					{
-						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(starpu_drand48());
-					}
-
-		for (blockz = 0; blockz < nslicesz; blockz++)
-			for (blockx = 0; blockx < nslicesx; blockx++)
-				for (j = 0; j < BLOCKSIZEZ; j++)
-					for (i = 0; i < BLOCKSIZEX; i++)
-					{
-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(starpu_drand48());
-					}
-
-	}
-
-	for (blocky = 0; blocky < nslicesy; blocky++)
-		for (blockx = 0; blockx < nslicesx; blockx++)
-			for (j = 0; j < BLOCKSIZEY; j++)
-				for (i = 0; i < BLOCKSIZEX; i++)
-				{
-					C[blocky][blockx][i*BLOCKSIZEY + j] = (TYPE)(blockx + blocky*nslicesx + 1);
-				}
-
-	/* TODO: aren't we supposed to set data consistency to relaxed, since
-	 * tags are supposed to provide the correct dependencies? */
-
-	/* declare the StarPU data to monitor */
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (z = 0; z < nslicesz; z++)
-		{
-			starpu_matrix_data_register(&A_state[y][z], 0, (uintptr_t)A[y][z], 
-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(TYPE));
-		}
-	}
-
-	for (z = 0; z < nslicesz; z++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-			starpu_matrix_data_register(&B_state[z][x], 0, (uintptr_t)B[z][x], 
-				BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(TYPE));
-		}
-	}
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-			starpu_matrix_data_register(&C_state[y][x], 0, (uintptr_t)C[y][x], 
-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(TYPE));
-		}
-	}
-
-#ifdef STARPU_USE_GORDON
-	conf.k = BLOCKSIZEZ;
-	conf.m = BLOCKSIZEY;
-	conf.n = BLOCKSIZEX;
-#endif
-
-	fprintf(stderr, "block size : x %d y %d z %d\n", BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
-
-	display_memory_consumption();
-}
-
-static void cleanup_problem(void)
-{
-	unsigned z, y, x;
-
-#ifdef CHECK_OUTPUT
-	TYPE maxerr = 0.0;
-	TYPE err;
-	fprintf(stderr, "Checking results ....");
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-			for (z = 0; z < nslicesz; z++)
-			{
-				SGEMM("N", "N", BLOCKSIZEY, BLOCKSIZEX, BLOCKSIZEZ, -(TYPE)(niter), A[y][z], BLOCKSIZEY, B[z][x], BLOCKSIZEZ, 1.0f, C[y][x], BLOCKSIZEY);
-
-			}
-
-			/* make sure C - niter AB = 0 */
-			err = SASUM(BLOCKSIZEX*BLOCKSIZEY, C[y][x], 1);
-
-			if (err > BLOCKSIZEX*BLOCKSIZEY*niter*0.001) 
-				fprintf(stderr, "\nerr = %f ( x = %d y = %d ) ... ", err/niter, x, y );
-
-			maxerr = STARPU_MAX(err, maxerr);
-		}
-	}
-
-	if (maxerr > BLOCKSIZEX*BLOCKSIZEY*niter*0.001)
-	{
-		fprintf(stderr, " maxerr = %f\n", maxerr/niter);
-	}
-	else {
-		fprintf(stderr, " OK\n");
-	}
-	fflush(stderr);
-#endif
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (z = 0; z < nslicesz; z++)
-		{
-	//		free(A[y][z]);
-		}
-	}
-
-	for (z = 0; z < nslicesz; z++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-	//		free(B[z][x]);
-		}
-	}
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-	//		free(C[y][x]);
-			starpu_tag_remove(TAG(nslicesz - 1, y, x, niter - 1));
-		}
-	}
-
-	
-	
-}
-
-struct cb2_s {
-	unsigned blockx;
-	unsigned blocky;
-	unsigned iter;
-};
-
-
-static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA
-#ifdef SPU_FUNC_SGEMM
-		|STARPU_GORDON
-#endif
-		,
-	.cpu_func = STARPU_GEMM(cpu_mult),
-#ifdef STARPU_USE_CUDA
-	.cuda_func = STARPU_GEMM(cublas_mult),
-#endif
-#ifdef STARPU_USE_GORDON
-	/* .gordon_func will be set by load_elf_sgemm */
-#endif
-	.nbuffers = 3
-};
-
-
-#ifdef STARPU_USE_GORDON
-static const char *spu_func_sgemm_elf_file = "./gordon/func_sgemm_ibm.spuelf";
-static unsigned spu_func_sgemm_elf_id;
-static unsigned spu_func_sgemm_ibm_id;
-
-static void load_elf_sgemm(void)
-{
-	spu_func_sgemm_elf_id =
-		gordon_register_elf_plugin(spu_func_sgemm_elf_file);
-
-	spu_func_sgemm_ibm_id = gordon_register_kernel(spu_func_sgemm_elf_id, "func_sgemm_ibm");
-
-	gordon_load_plugin_on_all_spu(spu_func_sgemm_elf_id);
-	gordon_load_kernel_on_all_spu(spu_func_sgemm_ibm_id);
-
-	cl.gordon_func = spu_func_sgemm_ibm_id;
-}
-#endif // STARPU_USE_GORDON
-
-static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter)
-{
-	/* A B[task] = C[task] */
-	struct starpu_task *task = starpu_task_create();
-
-	task->cl = &cl;
-
-	task->use_tag = 1;
-	task->tag_id = TAG(z, y, x, iter);
-
-	task->buffers[0].handle = A_state[y][z];
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = B_state[z][x];
-	task->buffers[1].mode = STARPU_R;
-	task->buffers[2].handle = C_state[y][x];
-	task->buffers[2].mode = STARPU_RW;
-
-#ifdef STARPU_USE_GORDON
-	task->cl_arg = &conf;
-	task->cl_arg_size = sizeof(struct ibm_sgemm_block_conf);
-#endif
-
-	return task;
-}
-
-static void callback_func_2(void *arg)
-{
-	/* the argument is a pointer to a counter of the remaining tasks */
-	struct cb2_s *cb2 = arg;
-	unsigned x,y,z,iter;
-
-	iter = cb2->iter;
-	x = cb2->blockx;
-	y = cb2->blocky;
-
-	free(cb2);
-
-	/* do some accounting */
-	int id = starpu_worker_get_id();
-	flop_per_worker[id] += BLAS3_FLOP(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
-	ls_per_worker[id] += BLAS3_LS(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
-
-	/* TAG(nslicesz - 1, y, x, iter) remains ... */
-	for (z = 0; z < nslicesz - 1; z++)
-	{
-		starpu_tag_remove(TAG(z, y, x, iter));
-	}
-
-	if (iter > 0)
-	{
-		starpu_tag_remove(TAG(nslicesz - 1, y, x, iter-1));
-	}
-	
-	if (iter != niter - 1) {
-		submit_new_iter(x, y, iter+1);
-	}
-}
-
-
-
-static void submit_new_iter(unsigned x, unsigned y, unsigned iter)
-{
-	unsigned z;
-	for (z = 0; z < nslicesz; z++) 
-	{
-		struct starpu_task *task;
-		task = construct_task(x, y, z, iter);
-		
-		if (z != 0) {
-			starpu_tag_declare_deps(TAG(z, y, x, iter), 1, TAG(z-1, y, x, iter));
-		}
-
-		if (z == nslicesz - 1) {
-			struct cb2_s *cb2 = malloc(sizeof(struct cb2_s));
-				cb2->blockx = x;
-				cb2->blocky = y;
-				cb2->iter = iter;
-			task->callback_func = callback_func_2;
-			task->callback_arg = cb2;
-		}
-
-		starpu_task_submit(task);
-	}
-}
-
-static void launch_codelets(void)
-{
-#ifdef STARPU_USE_FXT
-	_starpu_fxt_register_thread(0);
-#endif
-	/* partition the work into slices */
-	unsigned taskx, tasky;
-
-	srand(time(NULL));
-
-	/* should we use a single performance model for all archs and use an
- 	 * acceleration factor ? */
-	if (use_common_model) {
-		cl.model = &STARPU_GEMM(model_common);
-	}
-	else {
-		cl.model = &STARPU_GEMM(model);
-	}
-
-	for (taskx = 0; taskx < nslicesx; taskx++) 
-	{
-		for (tasky = 0; tasky < nslicesy; tasky++)
-		{
-			submit_new_iter(taskx, tasky, 0);
-		}
-	}
-}
-
-int main(__attribute__ ((unused)) int argc, 
-	 __attribute__ ((unused)) char **argv)
-{
-
-	parse_args(argc, argv);
-
-	/* start the runtime */
-	starpu_init(NULL);
-
-	starpu_helper_cublas_init();
-
-#ifdef STARPU_USE_GORDON
-	load_elf_sgemm();
-#endif
-
-	init_problem_data();
-
-	gettimeofday(&start, NULL);
-
-	launch_codelets();
-
-	starpu_task_wait_for_all();
-
-	gettimeofday(&end, NULL);
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	display_stats(timing);
-
-	cleanup_problem();
-
-	starpu_helper_cublas_shutdown();
-	starpu_shutdown();
-
-	return 0;
-}

+ 0 - 447
examples/mult/dw_mult_no_stride_no_tag.c

@@ -1,447 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "simple.h"
-#include "dw_mult.h"
-#ifdef STARPU_USE_GORDON
-#include "gordon/func_sgemm_ibm.h"
-#endif
-#include "xgemm_kernels.c"
-
-
-struct pos {
-	unsigned x,y, z,iter;
-};
-
-struct pos currentpos [MAXSLICESY][MAXSLICESX];
-
-TYPE *A[MAXSLICESY][MAXSLICESZ];
-TYPE *B[MAXSLICESZ][MAXSLICESX];
-TYPE *C[MAXSLICESY][MAXSLICESX];
-
-starpu_data_handle A_state[MAXSLICESY][MAXSLICESZ];
-starpu_data_handle B_state[MAXSLICESZ][MAXSLICESX];
-starpu_data_handle C_state[MAXSLICESY][MAXSLICESX];
-
-
-static void callback_func_3(void *arg);
-/*
- * This program computes C = A * B 
- * 
- * The difference with dw_mult_no_stride.c is that here we do not use tags, and
- * just rely on sequential data consistency.
- *   A of size (z,y)
- *   B of size (x,z)
- *   C of size (x,y)
-
-              |---------------|
-            z |       B       |
-              |---------------|
-       z              x
-     |----|   |---------------|
-     |    |   |               |
-     |    |   |               |
-     | A  | y |       C       |
-     |    |   |               |
-     |    |   |               |
-     |----|   |---------------|
-
- */
-
-#define MEM_ALIGNMENT	16
-
-static void init_problem_data(void)
-{
-	unsigned i,j;
-
-	/* debug ... */
-	memset(A, 0, MAXSLICESY*MAXSLICESZ*sizeof(TYPE *));
-	memset(B, 0, MAXSLICESZ*MAXSLICESZ*sizeof(TYPE *));
-	memset(C, 0, MAXSLICESY*MAXSLICESX*sizeof(TYPE *));
-	memset(&A_state, 0, MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle));
-	memset(&B_state, 0, MAXSLICESZ*MAXSLICESZ*sizeof(starpu_data_handle));
-	memset(&C_state, 0, MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle));
-
-	/* Allocate grids of buffer */
-	/* TODO pin ... */
-	unsigned z, y, x;
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (z = 0; z < nslicesz; z++)
-		{
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
-#else
-			A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
-#endif
-			assert(A[y][z]);
-		}
-	}
-
-	for (z = 0; z < nslicesz; z++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
-#else
-			B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
-#endif
-			assert(B[z][x]);
-		}
-	}
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
-#else
-			C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
-#endif
-			currentpos[y][x].x = x;
-			currentpos[y][x].y = y;
-			currentpos[y][x].z = 0;
-			currentpos[y][x].iter = 0;
-			assert(C[y][x]);
-		}
-	}
-	
-	/* fill the A and B matrices */
-	unsigned blockx, blocky, blockz;
-
-	if (norandom) {
-		for (blocky = 0; blocky < nslicesy; blocky++)
-			for (blockz = 0; blockz < nslicesz; blockz++)
-				for (j = 0; j < BLOCKSIZEY; j++)
-					for (i = 0; i < BLOCKSIZEZ; i++)
-					{
-						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(1 + blockz + blocky*nslicesz);
-					}
-
-		for (blockz = 0; blockz < nslicesz; blockz++)
-			for (blockx = 0; blockx < nslicesx; blockx++)
-				for (j = 0; j < BLOCKSIZEZ; j++)
-					for (i = 0; i < BLOCKSIZEX; i++)
-					{
-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(1 + blockx + blockz*nslicesx);
-					}
-	} 
-	else {
-		for (blocky = 0; blocky < nslicesy; blocky++)
-			for (blockz = 0; blockz < nslicesz; blockz++)
-				for (j = 0; j < BLOCKSIZEY; j++)
-					for (i = 0; i < BLOCKSIZEZ; i++)
-					{
-						A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(starpu_drand48());
-					}
-
-		for (blockz = 0; blockz < nslicesz; blockz++)
-			for (blockx = 0; blockx < nslicesx; blockx++)
-				for (j = 0; j < BLOCKSIZEZ; j++)
-					for (i = 0; i < BLOCKSIZEX; i++)
-					{
-						B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(starpu_drand48());
-					}
-
-	}
-
-	for (blocky = 0; blocky < nslicesy; blocky++)
-		for (blockx = 0; blockx < nslicesx; blockx++)
-			for (j = 0; j < BLOCKSIZEY; j++)
-				for (i = 0; i < BLOCKSIZEX; i++)
-				{
-					C[blocky][blockx][i*BLOCKSIZEY + j] = (TYPE)0;
-				}
-
-
-	/* declare the StarPU data to monitor */
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (z = 0; z < nslicesz; z++)
-		{
-			starpu_matrix_data_register(&A_state[y][z], 0, (uintptr_t)A[y][z], 
-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(TYPE));
-		}
-	}
-
-	for (z = 0; z < nslicesz; z++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-			starpu_matrix_data_register(&B_state[z][x], 0, (uintptr_t)B[z][x], 
-				BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(TYPE));
-		}
-	}
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-			starpu_matrix_data_register(&C_state[y][x], 0, (uintptr_t)C[y][x], 
-				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(TYPE));
-		}
-	}
-
-#ifdef STARPU_USE_GORDON
-	conf.k = BLOCKSIZEZ;
-	conf.m = BLOCKSIZEY;
-	conf.n = BLOCKSIZEX;
-#endif
-
-	fprintf(stderr, "block size : x %d y %d z %d\n", BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
-
-	display_memory_consumption();
-}
-
-static void cleanup_problem(void)
-{
-	unsigned z, y, x;
-
-#ifdef CHECK_OUTPUT
-	TYPE maxerr = 0.0;
-	TYPE err;
-	fprintf(stderr, "Checking results ....");
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-			for (z = 0; z < nslicesz; z++)
-			{
-				SGEMM("N", "N", BLOCKSIZEY, BLOCKSIZEX, BLOCKSIZEZ, -(TYPE)(niter), A[y][z], BLOCKSIZEY, B[z][x], BLOCKSIZEZ, 1.0f, C[y][x], BLOCKSIZEY);
-
-			}
-
-			/* make sure C - niter AB = 0 */
-			err = SASUM(BLOCKSIZEX*BLOCKSIZEY, C[y][x], 1);
-
-			if (err > BLOCKSIZEX*BLOCKSIZEY*niter*0.001) 
-				fprintf(stderr, "\nerr = %f ( x = %d y = %d ) ... ", err/niter, x, y );
-
-			maxerr = STARPU_MAX(err, maxerr);
-		}
-	}
-
-	if (maxerr > BLOCKSIZEX*BLOCKSIZEY*niter*0.001)
-	{
-		fprintf(stderr, " maxerr = %f\n", maxerr/niter);
-	}
-	else {
-		fprintf(stderr, " OK\n");
-	}
-	fflush(stderr);
-#endif
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (z = 0; z < nslicesz; z++)
-		{
-	//		free(A[y][z]);
-		}
-	}
-
-	for (z = 0; z < nslicesz; z++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-	//		free(B[z][x]);
-		}
-	}
-
-	for (y = 0; y < nslicesy; y++)
-	{
-		for (x = 0; x < nslicesx; x++)
-		{
-	//		free(C[y][x]);
-		}
-	}
-
-	
-	
-}
-
-struct cb2_s {
-	unsigned blockx;
-	unsigned blocky;
-	unsigned iter;
-};
-
-
-static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA
-#ifdef SPU_FUNC_SGEMM
-		|STARPU_GORDON
-#endif
-		,
-	.cpu_func = STARPU_GEMM(cpu_mult),
-#ifdef STARPU_USE_CUDA
-	.cuda_func = STARPU_GEMM(cublas_mult),
-#endif
-#ifdef STARPU_USE_GORDON
-	/* .gordon_func will be set by load_elf_sgemm */
-#endif
-	.nbuffers = 3
-};
-
-
-#ifdef STARPU_USE_GORDON
-static const char *spu_func_sgemm_elf_file = "./gordon/func_sgemm_ibm.spuelf";
-static unsigned spu_func_sgemm_elf_id;
-static unsigned spu_func_sgemm_ibm_id;
-
-static void load_elf_sgemm(void)
-{
-	spu_func_sgemm_elf_id =
-		gordon_register_elf_plugin(spu_func_sgemm_elf_file);
-
-	spu_func_sgemm_ibm_id = gordon_register_kernel(spu_func_sgemm_elf_id, "func_sgemm_ibm");
-	
-	gordon_load_plugin_on_all_spu(spu_func_sgemm_elf_id);
-	gordon_load_kernel_on_all_spu(spu_func_sgemm_ibm_id);
-
-	cl.gordon_func = spu_func_sgemm_ibm_id;
-}
-#endif // STARPU_USE_GORDON
-
-static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter, struct pos *posp)
-{
-	struct starpu_task *task = starpu_task_create();
-
-	task->cl = &cl;
-
-	task->buffers[0].handle = A_state[y][z];
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = B_state[z][x];
-	task->buffers[1].mode = STARPU_R;
-	task->buffers[2].handle = C_state[y][x];
-	task->buffers[2].mode = STARPU_RW;
-
-	task->callback_func = callback_func_3;
-	task->callback_arg = posp;
-
-#ifdef STARPU_USE_GORDON
-	task->cl_arg = &conf;
-	task->cl_arg_size = sizeof(struct ibm_sgemm_block_conf);
-#endif
-
-	posp->z = z;
-	posp->iter = iter;
-
-	return task;
-}
-
-
-static void callback_func_3(void *arg)
-{
-	/* do some accounting */
-	int id = starpu_worker_get_id();
-	flop_per_worker[id] += BLAS3_FLOP(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
-	ls_per_worker[id] += BLAS3_LS(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
-
-	/* the argument is a pointer to a counter of the remaining tasks */
-	struct pos *posp = arg;
-	unsigned x,y,z,iter;
-
-	iter = posp->iter;
-	x = posp->x;
-	y = posp->y;
-	z = posp->z;
-
-	if (z < nslicesz - 1)
-	{
-		struct starpu_task *task = construct_task(x, y, z+1, iter, posp);
-		starpu_task_submit(task);
-	}
-	else
-	{
-		if (iter < niter - 1)
-		{
-			struct starpu_task *task = construct_task(x, y, 0, iter+1, posp);
-			starpu_task_submit(task);
-		}
-	}
-}
-
-
-
-
-static void launch_codelets(void)
-{
-#ifdef STARPU_USE_FXT
-	_starpu_fxt_register_thread(0);
-#endif
-	/* partition the work into slices */
-	unsigned taskx, tasky;
-
-	srand(time(NULL));
-
-	/* should we use a single performance model for all archs and use an
- 	 * acceleration factor ? */
-	if (use_common_model) {
-		cl.model = &STARPU_GEMM(model_common);
-	}
-	else {
-		cl.model = &STARPU_GEMM(model);
-	}
-
-	for (taskx = 0; taskx < nslicesx; taskx++) 
-	{
-		for (tasky = 0; tasky < nslicesy; tasky++)
-		{
-			struct starpu_task *task = construct_task(taskx, tasky, 0, 0, &currentpos[tasky][taskx]);
-			starpu_task_submit(task);
-		}
-	}
-}
-
-int main(__attribute__ ((unused)) int argc, 
-	 __attribute__ ((unused)) char **argv)
-{
-
-	parse_args(argc, argv);
-
-	/* start the runtime */
-	starpu_init(NULL);
-
-	starpu_helper_cublas_init();
-
-#ifdef STARPU_USE_GORDON
-	load_elf_sgemm();
-#endif
-
-	init_problem_data();
-
-	gettimeofday(&start, NULL);
-
-	launch_codelets();
-
-	starpu_task_wait_for_all();
-
-	gettimeofday(&end, NULL);
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	display_stats(timing);
-
-	cleanup_problem();
-
-	starpu_helper_cublas_shutdown();
-	starpu_shutdown();
-
-	return 0;
-}

+ 0 - 42
examples/mult/gordon/func_dgemm_ibm.c

@@ -1,42 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "func_gemm_ibm.h"
-
-#include <blas_s.h>
-
-void func_dgemm_ibm(__attribute__ ((unused)) void **alloc,
-		__attribute__ ((unused)) void **in,
-		__attribute__ ((unused)) void **inout,
-		__attribute__ ((unused)) void **out)
-{
-	/* we assume data will be in A:R,B:R,C:RW mode
- 	 *  -> in[0] : describe problem
- 	 *  -> in[1] : A
- 	 *  -> in[2] : B
- 	 *  -> inout[0] : C
- 	 *
- 	 *   C = AB + C
- 	 *   but, being in fortran ordering, we compute
- 	 *   t(C) = t(B)t(A) + t(C) instead
- 	 */
-	struct ibm_gemm_block_conf *conf = in[0];
-	double *A = in[1];
-	double *B = in[2];
-	double *C = inout[0];
-
-	dgemm_spu(conf->m, conf->n, conf->k, B, A, C);
-}

+ 0 - 29
examples/mult/gordon/func_gemm_ibm.h

@@ -1,29 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#ifndef __FUNC_SGEMM_IBM_H__
-#define __FUNC_SGEMM_IBM_H__
-
-#include <stdint.h>
-
-struct ibm_sgemm_block_conf {
-	uint32_t m;
-	uint32_t n;
-	uint32_t k;
-	uint32_t pad;
-};
-
-#endif // __FUNC_SGEMM_IBM_H__

+ 0 - 43
examples/mult/gordon/func_sgemm_ibm.c

@@ -1,43 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "func_gemm_ibm.h"
-
-#include <blas_s.h>
-
-void func_sgemm_ibm(__attribute__ ((unused)) void **alloc,
-		__attribute__ ((unused)) void **in,
-		__attribute__ ((unused)) void **inout,
-		__attribute__ ((unused)) void **out)
-{
-	/* we assume data will be in A:R,B:R,C:RW mode
- 	 *  -> in[0] : describe problem
- 	 *  -> in[1] : A
- 	 *  -> in[2] : B
- 	 *  -> inout[0] : C
- 	 *
- 	 *   C = AB + C
- 	 *   but, being in fortran ordering, we compute
- 	 *   t(C) = t(B)t(A) + t(C) instead
- 	 */
-	struct ibm_gemm_block_conf *conf = in[0];
-	float *A = in[1];
-	float *B = in[2];
-	float *C = inout[0];
-
-	sgemm_spu(conf->m, conf->n, conf->k, B, A, C);
-}

+ 0 - 2
examples/mult/sgemm.c

@@ -16,6 +16,4 @@
  */
  */
 
 
 #include "simple.h"
 #include "simple.h"
-
-#include "xgemm_kernels.c"
 #include "xgemm.c" 
 #include "xgemm.c" 

+ 0 - 2
examples/mult/simple.h

@@ -17,7 +17,6 @@
 #define TYPE	float
 #define TYPE	float
 
 
 #define CUBLAS_GEMM cublasSgemm
 #define CUBLAS_GEMM cublasSgemm
-#define MAGMABLAS_GEMM magmablas_sgemm
 #define CPU_GEMM	SGEMM
 #define CPU_GEMM	SGEMM
 #define CPU_ASUM	SASUM
 #define CPU_ASUM	SASUM
 #define CPU_IAMAX	ISAMAX
 #define CPU_IAMAX	ISAMAX
@@ -26,4 +25,3 @@
 #define str(s) #s
 #define str(s) #s
 #define xstr(s)        str(s)
 #define xstr(s)        str(s)
 #define STARPU_GEMM_STR(name)  xstr(STARPU_GEMM(name))
 #define STARPU_GEMM_STR(name)  xstr(STARPU_GEMM(name))
-

+ 154 - 178
examples/mult/xgemm.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  *
@@ -16,37 +16,79 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
  */
 
 
-#include "dw_mult.h"
-
-TYPE *A, *B, *C;
-starpu_data_handle A_handle, B_handle, C_handle;
-
-/*
- * This program computes C = A * B 
- * 
- *   A of size (z,y)
- *   B of size (x,z)
- *   C of size (x,y)
-
-              |---------------|
-            z |       B       |
-              |---------------|
-       z              x
-     |----|   |---------------|
-     |    |   |               |
-     |    |   |               |
-     | A  | y |       C       |
-     |    |   |               |
-     |    |   |               |
-     |----|   |---------------|
+#include <string.h>
+#include <math.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <starpu.h>
 
 
- */
+#include <common/blas.h>
+
+#ifdef STARPU_USE_CUDA
+#include <cuda.h>
+#include <cublas.h>
+#endif
+
+static unsigned niter = 100;
+static unsigned nslicesx = 4;
+static unsigned nslicesy = 4;
+static unsigned xdim = 256;
+static unsigned ydim = 256;
+static unsigned zdim = 64;
+static unsigned check = 0;
+
+static TYPE *A, *B, *C;
+static starpu_data_handle A_handle, B_handle, C_handle;
+
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-nblocks") == 0) {
+			char *argptr;
+			nslicesx = strtol(argv[++i], &argptr, 10);
+			nslicesy = nslicesx;
+		}
+
+		if (strcmp(argv[i], "-nblocksx") == 0) {
+			char *argptr;
+			nslicesx = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocksy") == 0) {
+			char *argptr;
+			nslicesy = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-x") == 0) {
+			char *argptr;
+			xdim = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-y") == 0) {
+			char *argptr;
+			ydim = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-z") == 0) {
+			char *argptr;
+			zdim = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-iter") == 0) {
+			char *argptr;
+			niter = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-check") == 0) {
+			check = 1;
+		}
+	}
+}
 
 
 static void check_output(void)
 static void check_output(void)
 {
 {
-	/* check results */
 	/* compute C = C - AB */
 	/* compute C = C - AB */
-
 	CPU_GEMM("N", "N", ydim, xdim, zdim, (TYPE)-1.0f, A, ydim, B, zdim, (TYPE)1.0f, C, ydim);
 	CPU_GEMM("N", "N", ydim, xdim, zdim, (TYPE)-1.0f, A, ydim, B, zdim, (TYPE)1.0f, C, ydim);
 		
 		
 	/* make sure C = 0 */
 	/* make sure C = 0 */
@@ -65,66 +107,24 @@ static void check_output(void)
 	}
 	}
 }
 }
 
 
-void callback_func(void *arg)
-{
-	/* do some accounting */
-	int id = starpu_worker_get_id();
-	flop_per_worker[id] += BLAS3_FLOP(conf.m, conf.n, conf.k);
-	ls_per_worker[id] += BLAS3_LS(conf.m, conf.n, conf.k);
-}
-
 static void init_problem_data(void)
 static void init_problem_data(void)
 {
 {
 	unsigned i,j;
 	unsigned i,j;
 
 
-#ifdef STARPU_USE_CUDA
-	if (pin) {
-		starpu_data_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(TYPE));
-		starpu_data_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(TYPE));
-		starpu_data_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(TYPE));
-	} else
-#endif
-	{
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
-		posix_memalign((void **)&A, 4096, zdim*ydim*sizeof(TYPE));
-		posix_memalign((void **)&B, 4096, xdim*zdim*sizeof(TYPE));
-		posix_memalign((void **)&C, 4096, xdim*ydim*sizeof(TYPE));
-#else
-		A = malloc(zdim*ydim*sizeof(TYPE));
-		B = malloc(xdim*zdim*sizeof(TYPE));
-		C = malloc(xdim*ydim*sizeof(TYPE));
-#endif
-	}
+	starpu_data_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(TYPE));
+	starpu_data_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(TYPE));
+	starpu_data_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(TYPE));
 
 
 	/* fill the A and B matrices */
 	/* fill the A and B matrices */
-	if (norandom) {
-		for (j=0; j < ydim; j++) {
-			for (i=0; i < zdim; i++) {
-				A[j+i*ydim] = (TYPE)(i);
-			}
-		}
-	
-		for (j=0; j < zdim; j++) {
-			for (i=0; i < xdim; i++) {
-				B[j+i*zdim] = (TYPE)(j);
-			}
-		}
-	} 
-	else {
-#ifdef NORANDOM
-		srand(2008);
-		STARPU_ABORT();
-#endif
-		for (j=0; j < ydim; j++) {
-			for (i=0; i < zdim; i++) {
-				A[j+i*ydim] = (TYPE)(starpu_drand48());
-			}
+	for (j=0; j < ydim; j++) {
+		for (i=0; i < zdim; i++) {
+			A[j+i*ydim] = (TYPE)(starpu_drand48());
 		}
 		}
-	
-		for (j=0; j < zdim; j++) {
-			for (i=0; i < xdim; i++) {
-				B[j+i*zdim] = (TYPE)(starpu_drand48());
-			}
+	}
+
+	for (j=0; j < zdim; j++) {
+		for (i=0; i < xdim; i++) {
+			B[j+i*zdim] = (TYPE)(starpu_drand48());
 		}
 		}
 	}
 	}
 
 
@@ -133,8 +133,6 @@ static void init_problem_data(void)
 			C[j+i*ydim] = (TYPE)(0);
 			C[j+i*ydim] = (TYPE)(0);
 		}
 		}
 	}
 	}
-
-	display_memory_consumption();
 }
 }
 
 
 static void partition_mult_data(void)
 static void partition_mult_data(void)
@@ -148,21 +146,15 @@ static void partition_mult_data(void)
 
 
 	starpu_data_set_wt_mask(C_handle, 1<<0);
 	starpu_data_set_wt_mask(C_handle, 1<<0);
 
 
-	conf.k = zdim;
-	conf.m = ydim/nslicesy;
-	conf.n = xdim/nslicesx;
-
 	struct starpu_data_filter f;
 	struct starpu_data_filter f;
+	memset(&f, 0, sizeof(f));
 	f.filter_func = starpu_vertical_block_filter_func;
 	f.filter_func = starpu_vertical_block_filter_func;
 	f.nchildren = nslicesx;
 	f.nchildren = nslicesx;
-	f.get_nchildren = NULL;
-	f.get_child_ops = NULL;
 		
 		
 	struct starpu_data_filter f2;
 	struct starpu_data_filter f2;
+	memset(&f2, 0, sizeof(f2));
 	f2.filter_func = starpu_block_filter_func;
 	f2.filter_func = starpu_block_filter_func;
 	f2.nchildren = nslicesy;
 	f2.nchildren = nslicesy;
-	f2.get_nchildren = NULL;
-	f2.get_child_ops = NULL;
 		
 		
 	starpu_data_partition(B_handle, &f);
 	starpu_data_partition(B_handle, &f);
 	starpu_data_partition(A_handle, &f2);
 	starpu_data_partition(A_handle, &f2);
@@ -170,126 +162,110 @@ static void partition_mult_data(void)
 	starpu_data_map_filters(C_handle, 2, &f, &f2);
 	starpu_data_map_filters(C_handle, 2, &f, &f2);
 }
 }
 
 
-static void unpartition_mult_data(void)
+static void mult_kernel_common(void *descr[], int type)
 {
 {
-	fprintf(stderr, "unpartition !!\n");
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+	TYPE *subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);
 
 
-	starpu_data_unpartition(C_handle, 0);
+	unsigned nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	unsigned nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	unsigned nyA = STARPU_MATRIX_GET_NY(descr[0]);
 
 
-	starpu_data_unregister(C_handle);
-}
+	unsigned ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
 
 
-static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA
-#ifdef SPU_FUNC_SGEMM
-		|STARPU_GORDON
-#endif
-		,
-	.cpu_func = STARPU_GEMM(cpu_mult),
+	if (type == STARPU_CPU) {
+		CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
+	}
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	.cuda_func = STARPU_GEMM(cublas_mult),
-#endif
-#ifdef STARPU_USE_GORDON
-#ifdef SPU_FUNC_SGEMM
-	.gordon_func = SPU_FUNC_SGEMM,
-#else
-#warning SPU_FUNC_SGEMM is not available
-#endif
+	else {
+		CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
+					     (TYPE)0.0, subC, ldC);
+		cudaThreadSynchronize();
+	}
 #endif
 #endif
-	.nbuffers = 3
-};
-
-static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter)
-{
-	/* A B[task] = C[task] */
-	struct starpu_task *task = starpu_task_create();
-
-	task->cl = &cl;
-
-	/* we have a callback to do some accounting */
-	task->callback_func = callback_func;
-	task->callback_arg = NULL;
-
-	task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, y);
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, x);
-	task->buffers[1].mode = STARPU_R;
-	task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, x, y);
-	task->buffers[2].mode = STARPU_RW;
-
-	task->cl_arg = &conf;
-	task->cl_arg_size = sizeof(struct block_conf);
-	return task;
 }
 }
 
 
-static void submit_new_iter(unsigned x, unsigned y, unsigned iter)
+#ifdef STARPU_USE_CUDA
+static void cublas_mult(void *descr[], __attribute__((unused)) void *arg)
 {
 {
-	unsigned z;
-
-	z = 0;
-
-	{
-		struct starpu_task *task;
-		task = construct_task(x, y, z, iter);
-
-		starpu_task_submit(task);
-	}
+	mult_kernel_common(descr, STARPU_CUDA);
 }
 }
-
-static void launch_codelets(void)
-{
-#ifdef STARPU_USE_FXT
-	_starpu_fxt_register_thread(0);
 #endif
 #endif
-	/* partition the work into slices */
-	unsigned taskx, tasky;
 
 
-	srand(time(NULL));
+static void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
+{
+	mult_kernel_common(descr, STARPU_CPU);
+}
 
 
-	/* should we use a single performance model for all archs and use an
- 	 * acceleration factor ? */
-	if (use_common_model) {
-		cl.model = &STARPU_GEMM(model_common);
-	}
-	else {
-		cl.model = &STARPU_GEMM(model);
-	}
+static struct starpu_perfmodel_t starpu_gemm_model = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = STARPU_GEMM_STR(gemm)
+};
 
 
-	for (taskx = 0; taskx < nslicesx; taskx++) 
-	{
-		for (tasky = 0; tasky < nslicesy; tasky++)
-		{
-			submit_new_iter(taskx, tasky, 0);
-		}
-	}
-}
+static starpu_codelet cl = {
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_func = cpu_mult,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = cublas_mult,
+#endif
+	.nbuffers = 3,
+	.model = &starpu_gemm_model
+};
 
 
-int main(__attribute__ ((unused)) int argc, 
-	 __attribute__ ((unused)) char **argv)
+int main(int argc, char **argv)
 {
 {
+	struct timeval start;
+	struct timeval end;
 
 
 	parse_args(argc, argv);
 	parse_args(argc, argv);
 
 
-	/* start the runtime */
 	starpu_init(NULL);
 	starpu_init(NULL);
-
 	starpu_helper_cublas_init();
 	starpu_helper_cublas_init();
 
 
 	init_problem_data();
 	init_problem_data();
+	partition_mult_data();
 
 
 	gettimeofday(&start, NULL);
 	gettimeofday(&start, NULL);
 
 
-	partition_mult_data();
+	unsigned x, y, iter;
+	for (iter = 0; iter < niter; iter++)
+	{
+		for (x = 0; x < nslicesx; x++) 
+		for (y = 0; y < nslicesy; y++)
+		{
+			struct starpu_task *task = starpu_task_create();
+	
+			task->cl = &cl;
+	
+			task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, y);
+			task->buffers[0].mode = STARPU_R;
+			task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, x);
+			task->buffers[1].mode = STARPU_R;
+			task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, x, y);
+			task->buffers[2].mode = STARPU_RW;
+	
+			int ret = starpu_task_submit(task);
+			STARPU_ASSERT(!ret);
+		}
 
 
-	launch_codelets();
+		starpu_task_wait_for_all();
+	}
 
 
-	starpu_task_wait_for_all();
 
 
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	display_stats(timing);
 
 
-	unpartition_mult_data();
+	fprintf(stderr, "Time: %2.2f ms\n", timing/1000.0);
+
+	double flops = 2.0*((unsigned long)niter)*((unsigned long)xdim)
+				*((unsigned long)ydim)*((unsigned long)zdim);
+	fprintf(stderr, "GFlop/s: %.2f\n", flops/timing/1000.0);
+
+	starpu_data_unpartition(C_handle, 0);
+	starpu_data_unregister(C_handle);
 	
 	
 	if (check)
 	if (check)
 		check_output();
 		check_output();

+ 0 - 78
examples/mult/xgemm_kernels.c

@@ -1,78 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <starpu.h>
-#include <starpu_cuda.h>
-#include <common/blas.h>
-
-#define COMMON_CODE			\
-	uint32_t nxC, nyC, nyA;		\
-	uint32_t ldA, ldB, ldC;		\
-					\
-	TYPE *subA;			\
-	TYPE *subB;			\
-	TYPE *subC;			\
-					\
-	subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);	\
-	subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);	\
-	subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);	\
-					\
-	nxC = STARPU_MATRIX_GET_NX(descr[2]);		\
-	nyC = STARPU_MATRIX_GET_NY(descr[2]);		\
-	nyA = STARPU_MATRIX_GET_NY(descr[0]);		\
-					\
-	ldA = STARPU_MATRIX_GET_LD(descr[0]);		\
-	ldB = STARPU_MATRIX_GET_LD(descr[1]);		\
-	ldC = STARPU_MATRIX_GET_LD(descr[2]);
-
-
-
-#ifdef STARPU_USE_CUDA
-
-#ifdef STARPU_HAVE_MAGMA
-#define GPU_GEMM MAGMABLAS_GEMM
-#else
-#define GPU_GEMM CUBLAS_GEMM
-#endif
-
-void STARPU_GEMM(cublas_mult)(void *descr[], __attribute__((unused)) void *arg)
-{
-	COMMON_CODE
-
-	starpu_trace_user_event(0x42);
-
-	GPU_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
-					     (TYPE)0.0, subC, ldC);
-	cublasStatus st;
-	st = cublasGetError();
-	if (st != CUBLAS_STATUS_SUCCESS)
-		STARPU_ABORT();
-
-	cudaThreadSynchronize();
-
-	starpu_trace_user_event(0x42);
-}
-#endif
-
-void STARPU_GEMM(cpu_mult)(void *descr[], __attribute__((unused))  void *arg)
-{
-	COMMON_CODE
-
-	starpu_trace_user_event(0x42);
-	CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
-	starpu_trace_user_event(0x43);
-}