13 lat temu · c703771abd
--- a/gcc-plugin/examples/Makefile.am
+++ b/gcc-plugin/examples/Makefile.am
@@ -14,12 +14,22 @@
 
				 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				 
			
 
				 noinst_PROGRAMS =				\
			
 
				-  matrix-mult stencil5
			
 
				+  matrix-mult stencil5 cholesky
			
 
				 
			
 
				 AM_LDFLAGS = $(top_builddir)/src/libstarpu.la
			
 
				 
			
 
				 AM_CPPFLAGS =						\
			
 
				   -I$(top_srcdir)/include				\
			
 
				+  -I$(top_srcdir)/examples				\
			
 
				   $(STARPU_OPENCL_CPPFLAGS) $(STARPU_CUDA_CPPFLAGS)
			
 
				 
			
 
				 AM_CFLAGS = -fplugin="$(builddir)/../src/.libs/starpu.so" -Wall
			
 
				+
			
 
				+cholesky_SOURCES	=		\
			
 
				+	cholesky/cholesky.c		\
			
 
				+	cholesky/cholesky_models.c	\
			
 
				+	cholesky/cholesky_kernels.c	\
			
 
				+	$(top_srcdir)/examples/common/blas.c
			
 
				+
			
 
				+cholesky_LDADD	=	\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
--- a/gcc-plugin/examples/cholesky/cholesky.c
+++ b/gcc-plugin/examples/cholesky/cholesky.c
@@ -0,0 +1,267 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "cholesky.h"
			
 
				+#include "cholesky_kernels.h"
			
 
				+
			
 
				+/*
			
 
				+ *	code to bootstrap the factorization
			
 
				+ *	and construct the DAG
			
 
				+ */
			
 
				+static void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks)
			
 
				+{
			
 
				+	struct timeval start;
			
 
				+	struct timeval end;
			
 
				+        int x, y;
			
 
				+
			
 
				+	/* create all the DAG nodes */
			
 
				+	unsigned i,j,k;
			
 
				+
			
 
				+        for(x = 0; x < nblocks ;  x++) {
			
 
				+                for (y = 0; y < nblocks; y++) {
			
 
				+#pragma starpu register matA[x][y] size/nblocks*size/nblocks
			
 
				+		}
			
 
				+        }
			
 
				+
			
 
				+	gettimeofday(&start, NULL);
			
 
				+
			
 
				+	for (k = 0; k < nblocks; k++)
			
 
				+        {
			
 
				+#warning deal with prio and models
			
 
				+//                int prio = STARPU_DEFAULT_PRIO;
			
 
				+//                if (!noprio) prio = STARPU_MAX_PRIO;
			
 
				+
			
 
				+		chol_codelet_update_u11(matA[k][k], size/nblocks, ld);
			
 
				+
			
 
				+		for (j = k+1; j<nblocks; j++)
			
 
				+		{
			
 
				+//                        prio = STARPU_DEFAULT_PRIO;
			
 
				+//                        if (!noprio&& (j == k+1)) prio = STARPU_MAX_PRIO;
			
 
				+
			
 
				+			chol_codelet_update_u21(matA[k][k], matA[k][j], ld, ld, size/nblocks, size/nblocks);
			
 
				+
			
 
				+			for (i = k+1; i<nblocks; i++)
			
 
				+			{
			
 
				+				if (i <= j)
			
 
				+                                {
			
 
				+//                                        prio = STARPU_DEFAULT_PRIO;
			
 
				+//                                        if (!noprio && (i == k + 1) && (j == k +1) ) prio = STARPU_MAX_PRIO;
			
 
				+
			
 
				+					chol_codelet_update_u22(matA[k][i],
			
 
				+								matA[k][j],
			
 
				+								matA[i][j],
			
 
				+								size/nblocks, size/nblocks, size/nblocks, ld, ld, ld);
			
 
				+                                }
			
 
				+			}
			
 
				+		}
			
 
				+        }
			
 
				+
			
 
				+#pragma starpu wait
			
 
				+
			
 
				+        for(x = 0; x < nblocks ;  x++) {
			
 
				+                for (y = 0; y < nblocks; y++) {
			
 
				+#pragma starpu unregister matA[x][y]
			
 
				+                }
			
 
				+        }
			
 
				+
			
 
				+#pragma starpu wait
			
 
				+	gettimeofday(&end, NULL);
			
 
				+
			
 
				+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				+	fprintf(stderr, "Computation took (in ms)\n");
			
 
				+	fprintf(stdout, "%2.2f\n", timing/1000);
			
 
				+	double flop = (1.0f*size*size*size)/3.0f;
			
 
				+	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	/* create a simple definite positive symetric matrix example
			
 
				+	 *
			
 
				+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
			
 
				+	 * */
			
 
				+
			
 
				+	float ***bmat;
			
 
				+
			
 
				+	parse_args(argc, argv);
			
 
				+
			
 
				+#warning todo
			
 
				+//	struct starpu_conf conf;
			
 
				+//	starpu_conf_init(&conf);
			
 
				+//	conf.sched_policy_name = "heft";
			
 
				+//	conf.calibrate = 1;
			
 
				+
			
 
				+#pragma starpu initialize
			
 
				+
			
 
				+	unsigned i,j,x,y;
			
 
				+        bmat = malloc(nblocks * sizeof(float *));
			
 
				+        for(x=0 ; x<nblocks ; x++)
			
 
				+	{
			
 
				+                bmat[x] = malloc(nblocks * sizeof(float *));
			
 
				+                for(y=0 ; y<nblocks ; y++)
			
 
				+		{
			
 
				+                        starpu_malloc((void **)&bmat[x][y], BLOCKSIZE*BLOCKSIZE*sizeof(float));
			
 
				+			for (i = 0; i < BLOCKSIZE; i++)
			
 
				+			{
			
 
				+				for (j = 0; j < BLOCKSIZE; j++)
			
 
				+				{
			
 
				+                                        bmat[x][y][j +i*BLOCKSIZE] = (1.0f/(1.0f+(i+(x*BLOCKSIZE)+j+(y*BLOCKSIZE)))) + ((i+(x*BLOCKSIZE) == j+(y*BLOCKSIZE))?1.0f*size:0.0f);
			
 
				+					//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
			
 
				+				}
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+        if (display) {
			
 
				+		for(y=0 ; y<nblocks ; y++)
			
 
				+		{
			
 
				+			for(x=0 ; x<nblocks ; x++)
			
 
				+			{
			
 
				+                                printf("Block %d,%d :\n", x, y);
			
 
				+				for (j = 0; j < BLOCKSIZE; j++)
			
 
				+				{
			
 
				+					for (i = 0; i < BLOCKSIZE; i++)
			
 
				+					{
			
 
				+						if (i <= j) {
			
 
				+							printf("%2.2f\t", bmat[y][x][j +i*BLOCKSIZE]);
			
 
				+						}
			
 
				+						else {
			
 
				+							printf(".\t");
			
 
				+						}
			
 
				+					}
			
 
				+					printf("\n");
			
 
				+				}
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	dw_cholesky(bmat, size, size/nblocks, nblocks);
			
 
				+
			
 
				+#pragma starpu shutdown
			
 
				+
			
 
				+        if (display) {
			
 
				+                printf("Results:\n");
			
 
				+		for(y=0 ; y<nblocks ; y++)
			
 
				+		{
			
 
				+			for(x=0 ; x<nblocks ; x++)
			
 
				+			{
			
 
				+                                printf("Block %d,%d :\n", x, y);
			
 
				+				for (j = 0; j < BLOCKSIZE; j++)
			
 
				+				{
			
 
				+					for (i = 0; i < BLOCKSIZE; i++)
			
 
				+					{
			
 
				+						if (i <= j) {
			
 
				+							printf("%2.2f\t", bmat[y][x][j +i*BLOCKSIZE]);
			
 
				+						}
			
 
				+						else {
			
 
				+							printf(".\t");
			
 
				+						}
			
 
				+					}
			
 
				+					printf("\n");
			
 
				+				}
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	float *rmat = malloc(size*size*sizeof(float));
			
 
				+        for(x=0 ; x<nblocks ; x++) {
			
 
				+                for(y=0 ; y<nblocks ; y++) {
			
 
				+                        for (i = 0; i < BLOCKSIZE; i++) {
			
 
				+                                for (j = 0; j < BLOCKSIZE; j++) {
			
 
				+                                        rmat[j+(y*BLOCKSIZE)+(i+(x*BLOCKSIZE))*size] = bmat[x][y][j +i*BLOCKSIZE];
			
 
				+                                }
			
 
				+                        }
			
 
				+                }
			
 
				+        }
			
 
				+
			
 
				+	fprintf(stderr, "compute explicit LLt ...\n");
			
 
				+	for (j = 0; j < size; j++)
			
 
				+	{
			
 
				+		for (i = 0; i < size; i++)
			
 
				+		{
			
 
				+			if (i > j) {
			
 
				+				rmat[j+i*size] = 0.0f; // debug
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+	float *test_mat = malloc(size*size*sizeof(float));
			
 
				+	assert(test_mat);
			
 
				+
			
 
				+	SSYRK("L", "N", size, size, 1.0f,
			
 
				+	      rmat, size, 0.0f, test_mat, size);
			
 
				+
			
 
				+	fprintf(stderr, "comparing results ...\n");
			
 
				+        if (display) {
			
 
				+                for (j = 0; j < size; j++)
			
 
				+		{
			
 
				+                        for (i = 0; i < size; i++)
			
 
				+			{
			
 
				+                                if (i <= j) {
			
 
				+                                        printf("%2.2f\t", test_mat[j +i*size]);
			
 
				+                                }
			
 
				+                                else {
			
 
				+                                        printf(".\t");
			
 
				+                                }
			
 
				+                        }
			
 
				+                        printf("\n");
			
 
				+                }
			
 
				+        }
			
 
				+
			
 
				+	int correctness = 1;
			
 
				+        for(x = 0; x < nblocks ;  x++)
			
 
				+	{
			
 
				+                for (y = 0; y < nblocks; y++)
			
 
				+		{
			
 
				+			for (i = (size/nblocks)*x ; i < (size/nblocks)*x+(size/nblocks); i++)
			
 
				+                                {
			
 
				+                                        for (j = (size/nblocks)*y ; j < (size/nblocks)*y+(size/nblocks); j++)
			
 
				+						{
			
 
				+							if (i <= j)
			
 
				+								{
			
 
				+									float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
			
 
				+									float err = abs(test_mat[j +i*size] - orig);
			
 
				+									if (err > 0.00001) {
			
 
				+										fprintf(stderr, "Error[%d, %d] --> %2.2f != %2.2f (err %2.2f)\n", i, j, test_mat[j +i*size], orig, err);
			
 
				+										correctness = 0;
			
 
				+										break;
			
 
				+									}
			
 
				+								}
			
 
				+						}
			
 
				+                                }
			
 
				+		}
			
 
				+        }
			
 
				+
			
 
				+        for(x=0 ; x<nblocks ; x++)
			
 
				+	{
			
 
				+                for(y=0 ; y<nblocks ; y++)
			
 
				+		{
			
 
				+                        starpu_free((void *)bmat[x][y]);
			
 
				+		}
			
 
				+		free(bmat[x]);
			
 
				+	}
			
 
				+	free(bmat);
			
 
				+	free(rmat);
			
 
				+	free(test_mat);
			
 
				+
			
 
				+        starpu_helper_cublas_shutdown();
			
 
				+#pragma starpu shutdown
			
 
				+
			
 
				+	assert(correctness);
			
 
				+	return 0;
			
 
				+}
			
--- a/gcc-plugin/examples/cholesky/cholesky.h
+++ b/gcc-plugin/examples/cholesky/cholesky.h
@@ -0,0 +1,78 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __DW_CHOLESKY_H__
			
 
				+#define __DW_CHOLESKY_H__
			
 
				+
			
 
				+#include <semaphore.h>
			
 
				+#include <string.h>
			
 
				+#include <math.h>
			
 
				+#include <sys/time.h>
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+#include <cuda.h>
			
 
				+#include <cuda_runtime.h>
			
 
				+#include <cublas.h>
			
 
				+#endif
			
 
				+
			
 
				+#include <common/blas.h>
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+#define NMAXBLOCKS	32
			
 
				+
			
 
				+#define BLOCKSIZE	(size/nblocks)
			
 
				+
			
 
				+static unsigned size = 4*1024;
			
 
				+static unsigned nblocks = 16;
			
 
				+static unsigned nbigblocks = 8;
			
 
				+static unsigned noprio = 0;
			
 
				+static unsigned display = 0;
			
 
				+
			
 
				+static void __attribute__((unused)) parse_args(int argc, char **argv)
			
 
				+{
			
 
				+	int i;
			
 
				+	for (i = 1; i < argc; i++) {
			
 
				+		if (strcmp(argv[i], "-size") == 0) {
			
 
				+		        char *argptr;
			
 
				+			size = strtol(argv[++i], &argptr, 10);
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-nblocks") == 0) {
			
 
				+		        char *argptr;
			
 
				+			nblocks = strtol(argv[++i], &argptr, 10);
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-nbigblocks") == 0) {
			
 
				+		        char *argptr;
			
 
				+			nbigblocks = strtol(argv[++i], &argptr, 10);
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-no-prio") == 0) {
			
 
				+			noprio = 1;
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-display") == 0) {
			
 
				+			display = 1;
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-h") == 0) {
			
 
				+			printf("usage : %s [-display] [-size size] [-nblocks nblocks]\n", argv[0]);
			
 
				+		}
			
 
				+	}
			
 
				+	if (nblocks > size) nblocks = size;
			
 
				+}
			
 
				+
			
 
				+#endif // __DW_CHOLESKY_H__
			
--- a/gcc-plugin/examples/cholesky/cholesky_kernels.c
+++ b/gcc-plugin/examples/cholesky/cholesky_kernels.c
@@ -0,0 +1,210 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "cholesky.h"
			
 
				+#include "cholesky_kernels.h"
			
 
				+#include "common/blas.h"
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+#include <cuda.h>
			
 
				+#include <cuda_runtime.h>
			
 
				+#include <cublas.h>
			
 
				+#ifdef STARPU_HAVE_MAGMA
			
 
				+#include "magma.h"
			
 
				+#include "magma_lapack.h"
			
 
				+#endif
			
 
				+#endif
			
 
				+
			
 
				+/*
			
 
				+ *   U22
			
 
				+ */
			
 
				+
			
 
				+static inline void chol_common_cpu_codelet_update_u22(const float *left, const float *right, float *center, const unsigned dx, const unsigned dy, const unsigned dz,
			
 
				+						      const unsigned ld21, const unsigned ld12, const unsigned ld22, int s)
			
 
				+{
			
 
				+	//printf("22\n");
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	cublasStatus st;
			
 
				+#endif
			
 
				+
			
 
				+	switch (s) {
			
 
				+		case 0:
			
 
				+			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
			
 
				+				right, ld12, 1.0f, center, ld22);
			
 
				+			break;
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+		case 1:
			
 
				+			cublasSgemm('n', 't', dy, dx, dz,
			
 
				+					-1.0f, left, ld21, right, ld12,
			
 
				+					 1.0f, center, ld22);
			
 
				+			st = cublasGetError();
			
 
				+			STARPU_ASSERT(!st);
			
 
				+
			
 
				+			cudaThreadSynchronize();
			
 
				+
			
 
				+			break;
			
 
				+#endif
			
 
				+		default:
			
 
				+			STARPU_ABORT();
			
 
				+			break;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void chol_cpu_codelet_update_u22(const float *left, const float *right, float *center, const unsigned dx, const unsigned dy, const unsigned dz,
			
 
				+				 const unsigned ld21, const unsigned ld12, const unsigned ld22)
			
 
				+	__attribute__ ((task_implementation ("cpu", chol_codelet_update_u22)));
			
 
				+
			
 
				+void chol_cpu_codelet_update_u22(const float *left, const float *right, float *center, const unsigned dx, const unsigned dy, const unsigned dz,
			
 
				+				 const unsigned ld21, const unsigned ld12, const unsigned ld22)
			
 
				+{
			
 
				+	chol_common_cpu_codelet_update_u22(left, right, center, dx, dx, dz, ld21, ld12, ld22, 0);
			
 
				+}
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+void chol_cublas_codelet_update_u22(void *descr[], void *_args)
			
 
				+{
			
 
				+	chol_common_cpu_codelet_update_u22(descr, 1, _args);
			
 
				+}
			
 
				+#endif// STARPU_USE_CUDA
			
 
				+
			
 
				+/*
			
 
				+ * U21
			
 
				+ */
			
 
				+
			
 
				+static inline void chol_common_codelet_update_u21(const float *sub11, float *sub21, const unsigned ld11, const unsigned ld21, const unsigned nx21, const unsigned ny21, int s)
			
 
				+{
			
 
				+	switch (s) {
			
 
				+		case 0:
			
 
				+			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
			
 
				+			break;
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+		case 1:
			
 
				+			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
			
 
				+			cudaThreadSynchronize();
			
 
				+			break;
			
 
				+#endif
			
 
				+		default:
			
 
				+			STARPU_ABORT();
			
 
				+			break;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void chol_cpu_codelet_update_u21(const float *sub11, float *sub21, const unsigned ld11, const unsigned ld21, const unsigned nx21, const unsigned ny21)
			
 
				+	__attribute__ ((task_implementation ("cpu", chol_codelet_update_u21)));
			
 
				+
			
 
				+void chol_cpu_codelet_update_u21(const float *sub11, float *sub21, const unsigned ld11, const unsigned ld21, const unsigned nx21, const unsigned ny21)
			
 
				+{
			
 
				+	chol_common_codelet_update_u21(sub11, sub21, ld11, ld21, nx21, ny21, 0);
			
 
				+}
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+void chol_cublas_codelet_update_u21(void *descr[], void *_args)
			
 
				+{
			
 
				+	chol_common_codelet_update_u21(descr, 1, _args);
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+/*
			
 
				+ *	U11
			
 
				+ */
			
 
				+
			
 
				+static inline void chol_common_codelet_update_u11(float *sub11, unsigned nx, unsigned ld, int s)
			
 
				+{
			
 
				+	unsigned z;
			
 
				+
			
 
				+	switch (s) {
			
 
				+		case 0:
			
 
				+
			
 
				+			/*
			
 
				+			 *	- alpha 11 <- lambda 11 = sqrt(alpha11)
			
 
				+			 *	- alpha 21 <- l 21	= alpha 21 / lambda 11
			
 
				+			 *	- A22 <- A22 - l21 trans(l21)
			
 
				+			 */
			
 
				+
			
 
				+			for (z = 0; z < nx; z++)
			
 
				+			{
			
 
				+				float lambda11;
			
 
				+				lambda11 = sqrt(sub11[z+z*ld]);
			
 
				+				sub11[z+z*ld] = lambda11;
			
 
				+
			
 
				+				STARPU_ASSERT(lambda11 != 0.0f);
			
 
				+
			
 
				+				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
			
 
				+
			
 
				+				SSYR("L", nx - z - 1, -1.0f,
			
 
				+							&sub11[(z+1)+z*ld], 1,
			
 
				+							&sub11[(z+1)+(z+1)*ld], ld);
			
 
				+			}
			
 
				+			break;
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+		case 1:
			
 
				+#ifdef STARPU_HAVE_MAGMA
			
 
				+			{
			
 
				+				int ret;
			
 
				+				int info;
			
 
				+				ret = magma_spotrf_gpu('L', nx, sub11, ld, &info);
			
 
				+				if (ret != MAGMA_SUCCESS) {
			
 
				+					fprintf(stderr, "Error in Magma: %d\n", ret);
			
 
				+					STARPU_ABORT();
			
 
				+				}
			
 
				+				cudaError_t cures = cudaThreadSynchronize();
			
 
				+				STARPU_ASSERT(!cures);
			
 
				+			}
			
 
				+#else
			
 
				+			for (z = 0; z < nx; z++)
			
 
				+			{
			
 
				+				float lambda11;
			
 
				+				cudaMemcpy(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost);
			
 
				+				cudaStreamSynchronize(0);
			
 
				+
			
 
				+				STARPU_ASSERT(lambda11 != 0.0f);
			
 
				+
			
 
				+				lambda11 = sqrt(lambda11);
			
 
				+
			
 
				+				cublasSetVector(1, sizeof(float), &lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
			
 
				+
			
 
				+				cublasSscal(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
			
 
				+
			
 
				+				cublasSsyr('U', nx - z - 1, -1.0f,
			
 
				+							&sub11[(z+1)+z*ld], 1,
			
 
				+							&sub11[(z+1)+(z+1)*ld], ld);
			
 
				+			}
			
 
				+
			
 
				+			cudaThreadSynchronize();
			
 
				+#endif
			
 
				+			break;
			
 
				+#endif
			
 
				+		default:
			
 
				+			STARPU_ABORT();
			
 
				+			break;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void chol_cpu_codelet_update_u11(float *mat, const unsigned nx, const unsigned ld)
			
 
				+	__attribute__ ((task_implementation ("cpu", chol_codelet_update_u11)));
			
 
				+
			
 
				+void chol_cpu_codelet_update_u11(float *mat, const unsigned nx, const unsigned ld)
			
 
				+{
			
 
				+	chol_common_codelet_update_u11(mat, nx, ld, 0);
			
 
				+}
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+void chol_cublas_codelet_update_u11(void *descr[], void *_args)
			
 
				+{
			
 
				+	chol_common_codelet_update_u11(descr, 1, _args);
			
 
				+}
			
 
				+#endif// STARPU_USE_CUDA
			
--- a/gcc-plugin/examples/cholesky/cholesky_kernels.h
+++ b/gcc-plugin/examples/cholesky/cholesky_kernels.h
@@ -0,0 +1,31 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __DW_CHOLESKY_MODELS_H__
			
 
				+#define __DW_CHOLESKY_MODELS_H__
			
 
				+
			
 
				+void chol_codelet_update_u11(float* mat, const unsigned nx, const unsigned ld)
			
 
				+	__attribute__ ((task));
			
 
				+
			
 
				+void chol_codelet_update_u21(const float *sub11, float *sub21, const unsigned ld11, const unsigned ld21, const unsigned nx21, const unsigned ny21)
			
 
				+	__attribute__ ((task));
			
 
				+
			
 
				+void chol_codelet_update_u22(const float *left, const float *right, float *center, const unsigned dx, const unsigned dy, const unsigned dz,
			
 
				+			     const unsigned ld21, const unsigned ld12, const unsigned ld22)
			
 
				+	__attribute__ ((task));
			
 
				+
			
 
				+#endif // __DW_CHOLESKY_MODELS_H__
			
--- a/gcc-plugin/examples/cholesky/cholesky_models.c
+++ b/gcc-plugin/examples/cholesky/cholesky_models.c
@@ -0,0 +1,40 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * As a convention, in that file, descr[0] is represented by A,
			
 
				+ * 				  descr[1] is B ...
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ *	Number of flops of Gemm 
			
 
				+ */
			
 
				+
			
 
				+struct starpu_perfmodel_t chol_model_11 = {
			
 
				+	.type = STARPU_HISTORY_BASED,
			
 
				+	.symbol = "chol_model_11"
			
 
				+};
			
 
				+
			
 
				+struct starpu_perfmodel_t chol_model_21 = {
			
 
				+	.type = STARPU_HISTORY_BASED,
			
 
				+	.symbol = "chol_model_21"
			
 
				+};
			
 
				+
			
 
				+struct starpu_perfmodel_t chol_model_22 = {
			
 
				+	.type = STARPU_HISTORY_BASED,
			
 
				+	.symbol = "chol_model_22"
			
 
				+};