Browse Source

gcc-plugin: new cholesky example

Nathalie Furmento 13 years ago
parent
commit
c703771abd

+ 11 - 1
gcc-plugin/examples/Makefile.am

@@ -14,12 +14,22 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 noinst_PROGRAMS =				\
-  matrix-mult stencil5
+  matrix-mult stencil5 cholesky
 
 AM_LDFLAGS = $(top_builddir)/src/libstarpu.la
 
 AM_CPPFLAGS =						\
   -I$(top_srcdir)/include				\
+  -I$(top_srcdir)/examples				\
   $(STARPU_OPENCL_CPPFLAGS) $(STARPU_CUDA_CPPFLAGS)
 
 AM_CFLAGS = -fplugin="$(builddir)/../src/.libs/starpu.so" -Wall
+
+cholesky_SOURCES	=		\
+	cholesky/cholesky.c		\
+	cholesky/cholesky_models.c	\
+	cholesky/cholesky_kernels.c	\
+	$(top_srcdir)/examples/common/blas.c
+
+cholesky_LDADD	=	\
+	$(STARPU_BLAS_LDFLAGS)

+ 267 - 0
gcc-plugin/examples/cholesky/cholesky.c

@@ -0,0 +1,267 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "cholesky.h"
+#include "cholesky_kernels.h"
+
+/*
+ *	code to bootstrap the factorization
+ *	and construct the DAG
+ */
+static void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks)
+{
+	struct timeval start;
+	struct timeval end;
+        int x, y;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+        for(x = 0; x < nblocks ;  x++) {
+                for (y = 0; y < nblocks; y++) {
+#pragma starpu register matA[x][y] size/nblocks*size/nblocks
+		}
+        }
+
+	gettimeofday(&start, NULL);
+
+	for (k = 0; k < nblocks; k++)
+        {
+#warning deal with prio and models
+//                int prio = STARPU_DEFAULT_PRIO;
+//                if (!noprio) prio = STARPU_MAX_PRIO;
+
+		chol_codelet_update_u11(matA[k][k], size/nblocks, ld);
+
+		for (j = k+1; j<nblocks; j++)
+		{
+//                        prio = STARPU_DEFAULT_PRIO;
+//                        if (!noprio&& (j == k+1)) prio = STARPU_MAX_PRIO;
+
+			chol_codelet_update_u21(matA[k][k], matA[k][j], ld, ld, size/nblocks, size/nblocks);
+
+			for (i = k+1; i<nblocks; i++)
+			{
+				if (i <= j)
+                                {
+//                                        prio = STARPU_DEFAULT_PRIO;
+//                                        if (!noprio && (i == k + 1) && (j == k +1) ) prio = STARPU_MAX_PRIO;
+
+					chol_codelet_update_u22(matA[k][i],
+								matA[k][j],
+								matA[i][j],
+								size/nblocks, size/nblocks, size/nblocks, ld, ld, ld);
+                                }
+			}
+		}
+        }
+
+#pragma starpu wait
+
+        for(x = 0; x < nblocks ;  x++) {
+                for (y = 0; y < nblocks; y++) {
+#pragma starpu unregister matA[x][y]
+                }
+        }
+
+#pragma starpu wait
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	fprintf(stderr, "Computation took (in ms)\n");
+	fprintf(stdout, "%2.2f\n", timing/1000);
+	double flop = (1.0f*size*size*size)/3.0f;
+	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+}
+
+int main(int argc, char **argv)
+{
+	/* create a simple definite positive symetric matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	float ***bmat;
+
+	parse_args(argc, argv);
+
+#warning todo
+//	struct starpu_conf conf;
+//	starpu_conf_init(&conf);
+//	conf.sched_policy_name = "heft";
+//	conf.calibrate = 1;
+
+#pragma starpu initialize
+
+	unsigned i,j,x,y;
+        bmat = malloc(nblocks * sizeof(float *));
+        for(x=0 ; x<nblocks ; x++)
+	{
+                bmat[x] = malloc(nblocks * sizeof(float *));
+                for(y=0 ; y<nblocks ; y++)
+		{
+                        starpu_malloc((void **)&bmat[x][y], BLOCKSIZE*BLOCKSIZE*sizeof(float));
+			for (i = 0; i < BLOCKSIZE; i++)
+			{
+				for (j = 0; j < BLOCKSIZE; j++)
+				{
+                                        bmat[x][y][j +i*BLOCKSIZE] = (1.0f/(1.0f+(i+(x*BLOCKSIZE)+j+(y*BLOCKSIZE)))) + ((i+(x*BLOCKSIZE) == j+(y*BLOCKSIZE))?1.0f*size:0.0f);
+					//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
+				}
+			}
+		}
+	}
+
+
+        if (display) {
+		for(y=0 ; y<nblocks ; y++)
+		{
+			for(x=0 ; x<nblocks ; x++)
+			{
+                                printf("Block %d,%d :\n", x, y);
+				for (j = 0; j < BLOCKSIZE; j++)
+				{
+					for (i = 0; i < BLOCKSIZE; i++)
+					{
+						if (i <= j) {
+							printf("%2.2f\t", bmat[y][x][j +i*BLOCKSIZE]);
+						}
+						else {
+							printf(".\t");
+						}
+					}
+					printf("\n");
+				}
+			}
+		}
+	}
+
+	dw_cholesky(bmat, size, size/nblocks, nblocks);
+
+#pragma starpu shutdown
+
+        if (display) {
+                printf("Results:\n");
+		for(y=0 ; y<nblocks ; y++)
+		{
+			for(x=0 ; x<nblocks ; x++)
+			{
+                                printf("Block %d,%d :\n", x, y);
+				for (j = 0; j < BLOCKSIZE; j++)
+				{
+					for (i = 0; i < BLOCKSIZE; i++)
+					{
+						if (i <= j) {
+							printf("%2.2f\t", bmat[y][x][j +i*BLOCKSIZE]);
+						}
+						else {
+							printf(".\t");
+						}
+					}
+					printf("\n");
+				}
+			}
+		}
+	}
+
+	float *rmat = malloc(size*size*sizeof(float));
+        for(x=0 ; x<nblocks ; x++) {
+                for(y=0 ; y<nblocks ; y++) {
+                        for (i = 0; i < BLOCKSIZE; i++) {
+                                for (j = 0; j < BLOCKSIZE; j++) {
+                                        rmat[j+(y*BLOCKSIZE)+(i+(x*BLOCKSIZE))*size] = bmat[x][y][j +i*BLOCKSIZE];
+                                }
+                        }
+                }
+        }
+
+	fprintf(stderr, "compute explicit LLt ...\n");
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i > j) {
+				rmat[j+i*size] = 0.0f; // debug
+			}
+		}
+	}
+	float *test_mat = malloc(size*size*sizeof(float));
+	assert(test_mat);
+
+	SSYRK("L", "N", size, size, 1.0f,
+	      rmat, size, 0.0f, test_mat, size);
+
+	fprintf(stderr, "comparing results ...\n");
+        if (display) {
+                for (j = 0; j < size; j++)
+		{
+                        for (i = 0; i < size; i++)
+			{
+                                if (i <= j) {
+                                        printf("%2.2f\t", test_mat[j +i*size]);
+                                }
+                                else {
+                                        printf(".\t");
+                                }
+                        }
+                        printf("\n");
+                }
+        }
+
+	int correctness = 1;
+        for(x = 0; x < nblocks ;  x++)
+	{
+                for (y = 0; y < nblocks; y++)
+		{
+			for (i = (size/nblocks)*x ; i < (size/nblocks)*x+(size/nblocks); i++)
+                                {
+                                        for (j = (size/nblocks)*y ; j < (size/nblocks)*y+(size/nblocks); j++)
+						{
+							if (i <= j)
+								{
+									float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+									float err = abs(test_mat[j +i*size] - orig);
+									if (err > 0.00001) {
+										fprintf(stderr, "Error[%d, %d] --> %2.2f != %2.2f (err %2.2f)\n", i, j, test_mat[j +i*size], orig, err);
+										correctness = 0;
+										break;
+									}
+								}
+						}
+                                }
+		}
+        }
+
+        for(x=0 ; x<nblocks ; x++)
+	{
+                for(y=0 ; y<nblocks ; y++)
+		{
+                        starpu_free((void *)bmat[x][y]);
+		}
+		free(bmat[x]);
+	}
+	free(bmat);
+	free(rmat);
+	free(test_mat);
+
+        starpu_helper_cublas_shutdown();
+#pragma starpu shutdown
+
+	assert(correctness);
+	return 0;
+}

+ 78 - 0
gcc-plugin/examples/cholesky/cholesky.h

@@ -0,0 +1,78 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_CHOLESKY_H__
+#define __DW_CHOLESKY_H__
+
+#include <semaphore.h>
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+#ifdef STARPU_USE_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cublas.h>
+#endif
+
+#include <common/blas.h>
+#include <starpu.h>
+
+#define NMAXBLOCKS	32
+
+#define BLOCKSIZE	(size/nblocks)
+
+static unsigned size = 4*1024;
+static unsigned nblocks = 16;
+static unsigned nbigblocks = 8;
+static unsigned noprio = 0;
+static unsigned display = 0;
+
+static void __attribute__((unused)) parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-size") == 0) {
+		        char *argptr;
+			size = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks") == 0) {
+		        char *argptr;
+			nblocks = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nbigblocks") == 0) {
+		        char *argptr;
+			nbigblocks = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-no-prio") == 0) {
+			noprio = 1;
+		}
+
+		if (strcmp(argv[i], "-display") == 0) {
+			display = 1;
+		}
+
+		if (strcmp(argv[i], "-h") == 0) {
+			printf("usage : %s [-display] [-size size] [-nblocks nblocks]\n", argv[0]);
+		}
+	}
+	if (nblocks > size) nblocks = size;
+}
+
+#endif // __DW_CHOLESKY_H__

+ 210 - 0
gcc-plugin/examples/cholesky/cholesky_kernels.c

@@ -0,0 +1,210 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "cholesky.h"
+#include "cholesky_kernels.h"
+#include "common/blas.h"
+#ifdef STARPU_USE_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cublas.h>
+#ifdef STARPU_HAVE_MAGMA
+#include "magma.h"
+#include "magma_lapack.h"
+#endif
+#endif
+
+/*
+ *   U22
+ */
+
+static inline void chol_common_cpu_codelet_update_u22(const float *left, const float *right, float *center, const unsigned dx, const unsigned dy, const unsigned dz,
+						      const unsigned ld21, const unsigned ld12, const unsigned ld22, int s)
+{
+	//printf("22\n");
+#ifdef STARPU_USE_CUDA
+	cublasStatus st;
+#endif
+
+	switch (s) {
+		case 0:
+			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
+				right, ld12, 1.0f, center, ld22);
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+			cublasSgemm('n', 't', dy, dx, dz,
+					-1.0f, left, ld21, right, ld12,
+					 1.0f, center, ld22);
+			st = cublasGetError();
+			STARPU_ASSERT(!st);
+
+			cudaThreadSynchronize();
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+void chol_cpu_codelet_update_u22(const float *left, const float *right, float *center, const unsigned dx, const unsigned dy, const unsigned dz,
+				 const unsigned ld21, const unsigned ld12, const unsigned ld22)
+	__attribute__ ((task_implementation ("cpu", chol_codelet_update_u22)));
+
+void chol_cpu_codelet_update_u22(const float *left, const float *right, float *center, const unsigned dx, const unsigned dy, const unsigned dz,
+				 const unsigned ld21, const unsigned ld12, const unsigned ld22)
+{
+	chol_common_cpu_codelet_update_u22(left, right, center, dx, dx, dz, ld21, ld12, ld22, 0);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u22(void *descr[], void *_args)
+{
+	chol_common_cpu_codelet_update_u22(descr, 1, _args);
+}
+#endif// STARPU_USE_CUDA
+
+/*
+ * U21
+ */
+
+static inline void chol_common_codelet_update_u21(const float *sub11, float *sub21, const unsigned ld11, const unsigned ld21, const unsigned nx21, const unsigned ny21, int s)
+{
+	switch (s) {
+		case 0:
+			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			cudaThreadSynchronize();
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+void chol_cpu_codelet_update_u21(const float *sub11, float *sub21, const unsigned ld11, const unsigned ld21, const unsigned nx21, const unsigned ny21)
+	__attribute__ ((task_implementation ("cpu", chol_codelet_update_u21)));
+
+void chol_cpu_codelet_update_u21(const float *sub11, float *sub21, const unsigned ld11, const unsigned ld21, const unsigned nx21, const unsigned ny21)
+{
+	chol_common_codelet_update_u21(sub11, sub21, ld11, ld21, nx21, ny21, 0);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u21(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u21(descr, 1, _args);
+}
+#endif
+
+/*
+ *	U11
+ */
+
+static inline void chol_common_codelet_update_u11(float *sub11, unsigned nx, unsigned ld, int s)
+{
+	unsigned z;
+
+	switch (s) {
+		case 0:
+
+			/*
+			 *	- alpha 11 <- lambda 11 = sqrt(alpha11)
+			 *	- alpha 21 <- l 21	= alpha 21 / lambda 11
+			 *	- A22 <- A22 - l21 trans(l21)
+			 */
+
+			for (z = 0; z < nx; z++)
+			{
+				float lambda11;
+				lambda11 = sqrt(sub11[z+z*ld]);
+				sub11[z+z*ld] = lambda11;
+
+				STARPU_ASSERT(lambda11 != 0.0f);
+
+				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+
+				SSYR("L", nx - z - 1, -1.0f,
+							&sub11[(z+1)+z*ld], 1,
+							&sub11[(z+1)+(z+1)*ld], ld);
+			}
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+#ifdef STARPU_HAVE_MAGMA
+			{
+				int ret;
+				int info;
+				ret = magma_spotrf_gpu('L', nx, sub11, ld, &info);
+				if (ret != MAGMA_SUCCESS) {
+					fprintf(stderr, "Error in Magma: %d\n", ret);
+					STARPU_ABORT();
+				}
+				cudaError_t cures = cudaThreadSynchronize();
+				STARPU_ASSERT(!cures);
+			}
+#else
+			for (z = 0; z < nx; z++)
+			{
+				float lambda11;
+				cudaMemcpy(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost);
+				cudaStreamSynchronize(0);
+
+				STARPU_ASSERT(lambda11 != 0.0f);
+
+				lambda11 = sqrt(lambda11);
+
+				cublasSetVector(1, sizeof(float), &lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
+
+				cublasSscal(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+
+				cublasSsyr('U', nx - z - 1, -1.0f,
+							&sub11[(z+1)+z*ld], 1,
+							&sub11[(z+1)+(z+1)*ld], ld);
+			}
+
+			cudaThreadSynchronize();
+#endif
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+
+void chol_cpu_codelet_update_u11(float *mat, const unsigned nx, const unsigned ld)
+	__attribute__ ((task_implementation ("cpu", chol_codelet_update_u11)));
+
+void chol_cpu_codelet_update_u11(float *mat, const unsigned nx, const unsigned ld)
+{
+	chol_common_codelet_update_u11(mat, nx, ld, 0);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u11(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u11(descr, 1, _args);
+}
+#endif// STARPU_USE_CUDA

+ 31 - 0
gcc-plugin/examples/cholesky/cholesky_kernels.h

@@ -0,0 +1,31 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_CHOLESKY_MODELS_H__
+#define __DW_CHOLESKY_MODELS_H__
+
+void chol_codelet_update_u11(float* mat, const unsigned nx, const unsigned ld)
+	__attribute__ ((task));
+
+void chol_codelet_update_u21(const float *sub11, float *sub21, const unsigned ld11, const unsigned ld21, const unsigned nx21, const unsigned ny21)
+	__attribute__ ((task));
+
+void chol_codelet_update_u22(const float *left, const float *right, float *center, const unsigned dx, const unsigned dy, const unsigned dz,
+			     const unsigned ld21, const unsigned ld12, const unsigned ld22)
+	__attribute__ ((task));
+
+#endif // __DW_CHOLESKY_MODELS_H__

+ 40 - 0
gcc-plugin/examples/cholesky/cholesky_models.c

@@ -0,0 +1,40 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * As a convention, in that file, descr[0] is represented by A,
+ * 				  descr[1] is B ...
+ */
+
+/*
+ *	Number of flops of Gemm 
+ */
+
+struct starpu_perfmodel_t chol_model_11 = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_11"
+};
+
+struct starpu_perfmodel_t chol_model_21 = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_21"
+};
+
+struct starpu_perfmodel_t chol_model_22 = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_22"
+};