14 years ago · 7f6e65add0
--- a/mpi/Makefile.am
+++ b/mpi/Makefile.am
@@ -121,6 +121,24 @@ examples_mpi_lu_plu_example_double_SOURCES =	\
 
				 	$(top_srcdir)/examples/common/blas.c
			
 
				 endif
			
 
				 
			
 
				+########################
			
 
				+# MPI Cholesky example #
			
 
				+########################
			
 
				+
			
 
				+if !NO_BLAS_LIB
			
 
				+examplebin_PROGRAMS +=		\
			
 
				+	examples/cholesky/mpi_cholesky
			
 
				+
			
 
				+examples_cholesky_mpi_cholesky_SOURCES	=		\
			
 
				+	examples/cholesky/mpi_cholesky.c		\
			
 
				+	examples/cholesky/mpi_cholesky_models.c		\
			
 
				+	examples/cholesky/mpi_cholesky_kernels.c	\
			
 
				+	$(top_srcdir)/examples/common/blas.c
			
 
				+
			
 
				+examples_cholesky_mpi_cholesky_LDADD =	\
			
 
				+	libstarpumpi.la
			
 
				+endif
			
 
				+
			
 
				 check_PROGRAMS +=					\
			
 
				 	tests/pingpong					\
			
 
				 	tests/mpi_test					\
			
--- a/mpi/examples/cholesky/mpi_cholesky.c
+++ b/mpi/examples/cholesky/mpi_cholesky.c
@@ -0,0 +1,261 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu_mpi.h>
			
 
				+#include "mpi_cholesky.h"
			
 
				+#include "mpi_cholesky_models.h"
			
 
				+
			
 
				+/*
			
 
				+ *	Create the codelets
			
 
				+ */
			
 
				+
			
 
				+static starpu_codelet cl11 =
			
 
				+{
			
 
				+	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.cpu_func = chol_cpu_codelet_update_u11,
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	.cuda_func = chol_cublas_codelet_update_u11,
			
 
				+#endif
			
 
				+	.nbuffers = 1,
			
 
				+	.model = &chol_model_11
			
 
				+};
			
 
				+
			
 
				+static starpu_codelet cl21 =
			
 
				+{
			
 
				+	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.cpu_func = chol_cpu_codelet_update_u21,
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	.cuda_func = chol_cublas_codelet_update_u21,
			
 
				+#endif
			
 
				+	.nbuffers = 2,
			
 
				+	.model = &chol_model_21
			
 
				+};
			
 
				+
			
 
				+static starpu_codelet cl22 =
			
 
				+{
			
 
				+	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.cpu_func = chol_cpu_codelet_update_u22,
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	.cuda_func = chol_cublas_codelet_update_u22,
			
 
				+#endif
			
 
				+	.nbuffers = 3,
			
 
				+	.model = &chol_model_22
			
 
				+};
			
 
				+
			
 
				/* Return the rank of the MPI node holding the data of index x:
				 * indices are dealt out cyclically over the nb_nodes nodes. */
				int my_distrib(int x, int nb_nodes)
				{
					const int owner = x % nb_nodes;

					return owner;
				}
			
 
				+
			
 
				+/*
			
 
				+ *	code to bootstrap the factorization
			
 
				+ *	and construct the DAG
			
 
				+ */
			
 
				+static void dw_cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks, int rank, int nodes)
			
 
				+{
			
 
				+	struct timeval start;
			
 
				+	struct timeval end;
			
 
				+        starpu_data_handle **data_handles; //[size][size];
			
 
				+        int x, y;
			
 
				+
			
 
				+	/* create all the DAG nodes */
			
 
				+	unsigned i,j,k;
			
 
				+
			
 
				+        data_handles = malloc(size*sizeof(starpu_data_handle *));
			
 
				+        for(x=0 ; x<size ; x++) data_handles[x] = malloc(size*sizeof(starpu_data_handle));
			
 
				+
			
 
				+	gettimeofday(&start, NULL);
			
 
				+        for(x = 0; x < nblocks ;  x++) {
			
 
				+                for (y = 0; y < nblocks; y++) {
			
 
				+                        int mpi_rank = my_distrib(x, nodes);
			
 
				+                        if (mpi_rank == rank) {
			
 
				+                                //fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
			
 
				+                                starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)&(matA[((size/nblocks)*x) + ((size/nblocks)*y) * ld]),
			
 
				+                                                            ld, size/nblocks, size/nblocks, sizeof(float));
			
 
				+                        }
			
 
				+                        else if (rank == mpi_rank+1 || rank == mpi_rank-1) {
			
 
				+                                /* I don't own that index, but will need it for my computations */
			
 
				+                                //fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
			
 
				+                                starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)&(matA[((size/nblocks)*x) + ((size/nblocks)*y) * ld]),
			
 
				+                                                            ld, size/nblocks, size/nblocks, sizeof(float));
			
 
				+                        }
			
 
				+                        else {
			
 
				+                                /* I know it's useless to allocate anything for this */
			
 
				+                                data_handles[x][y] = NULL;
			
 
				+                        }
			
 
				+                        if (data_handles[x][y])
			
 
				+                                starpu_data_set_rank(data_handles[x][y], mpi_rank);
			
 
				+                }
			
 
				+        }
			
 
				+
			
 
				+	for (k = 0; k < nblocks; k++)
			
 
				+        {
			
 
				+                int prio = STARPU_DEFAULT_PRIO;
			
 
				+                if (!noprio) prio = STARPU_MAX_PRIO;
			
 
				+
			
 
				+                starpu_mpi_insert_task(MPI_COMM_WORLD, &cl11,
			
 
				+                                       STARPU_PRIORITY, prio,
			
 
				+                                       STARPU_RW, data_handles[k][k],
			
 
				+                                       0);
			
 
				+
			
 
				+		for (j = k+1; j<nblocks; j++)
			
 
				+		{
			
 
				+                        prio = STARPU_DEFAULT_PRIO;
			
 
				+                        if (!noprio&& (j == k+1)) prio = STARPU_MAX_PRIO;
			
 
				+                        starpu_mpi_insert_task(MPI_COMM_WORLD, &cl21,
			
 
				+                                               STARPU_PRIORITY, prio,
			
 
				+                                               STARPU_R, data_handles[k][k],
			
 
				+                                               STARPU_RW, data_handles[k][j],
			
 
				+                                               0);
			
 
				+
			
 
				+			for (i = k+1; i<nblocks; i++)
			
 
				+			{
			
 
				+				if (i <= j)
			
 
				+                                {
			
 
				+                                        prio = STARPU_DEFAULT_PRIO;
			
 
				+                                        if (!noprio && (i == k + 1) && (j == k +1) ) prio = STARPU_MAX_PRIO;
			
 
				+                                        starpu_mpi_insert_task(MPI_COMM_WORLD, &cl22,
			
 
				+                                                               STARPU_PRIORITY, prio,
			
 
				+                                                               STARPU_R, data_handles[k][i],
			
 
				+                                                               STARPU_R, data_handles[k][j],
			
 
				+                                                               STARPU_RW, data_handles[i][j],
			
 
				+                                                               0);
			
 
				+                                }
			
 
				+			}
			
 
				+		}
			
 
				+        }
			
 
				+
			
 
				+        starpu_task_wait_for_all();
			
 
				+
			
 
				+	gettimeofday(&end, NULL);
			
 
				+
			
 
				+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				+	fprintf(stderr, "Computation took (in ms)\n");
			
 
				+	printf("%2.2f\n", timing/1000);
			
 
				+
			
 
				+	double flop = (1.0f*size*size*size)/3.0f;
			
 
				+	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				+}
			
 
				+
			
 
				/*
				 *	Start StarPU, its MPI layer and the CUBLAS helper, then allocate a
				 *	dim x dim matrix of floats.
				 *
				 *	A	: receives the address of the allocated matrix
				 *	dim	: number of rows/columns of the matrix
				 *	pinned	: when non-zero the matrix is allocated with
				 *		  starpu_data_malloc_pinned_if_possible(), otherwise with malloc()
				 *	rank	: forwarded to starpu_mpi_initialize_extended()
				 *	nodes	: forwarded to starpu_mpi_initialize_extended()
				 */
				void initialize_system(float **A, unsigned dim, unsigned pinned, int *rank, int *nodes)
				{
					/* computed once in size_t: dim*dim would wrap in unsigned arithmetic */
					const size_t bytes = (size_t)dim*dim*sizeof(float);

					starpu_init(NULL);
					starpu_mpi_initialize_extended(1, rank, nodes);
					starpu_helper_cublas_init();

					if (pinned)
						starpu_data_malloc_pinned_if_possible((void **)A, bytes);
					else
						*A = malloc(bytes);

					/* a zero-sized request may legitimately yield NULL */
					STARPU_ASSERT(*A || bytes == 0);
				}
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	/* create a simple definite positive symetric matrix example
			
 
				+	 *
			
 
				+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
			
 
				+	 * */
			
 
				+
			
 
				+	float *mat;
			
 
				+        int rank, nodes;
			
 
				+
			
 
				+	parse_args(argc, argv);
			
 
				+	mat = malloc(size*size*sizeof(float));
			
 
				+	initialize_system(&mat, size, pinned, &rank, &nodes);
			
 
				+
			
 
				+	unsigned i,j;
			
 
				+	for (i = 0; i < size; i++)
			
 
				+	{
			
 
				+		for (j = 0; j < size; j++)
			
 
				+		{
			
 
				+			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
			
 
				+			//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+#ifdef CHECK_OUTPUT
			
 
				+	printf("Input :\n");
			
 
				+
			
 
				+	for (j = 0; j < size; j++)
			
 
				+	{
			
 
				+		for (i = 0; i < size; i++)
			
 
				+		{
			
 
				+			if (i <= j) {
			
 
				+				printf("%2.2f\t", mat[j +i*size]);
			
 
				+			}
			
 
				+			else {
			
 
				+				printf(".\t");
			
 
				+			}
			
 
				+		}
			
 
				+		printf("\n");
			
 
				+	}
			
 
				+#endif
			
 
				+
			
 
				+	dw_cholesky(mat, size, size, nblocks, rank, nodes);
			
 
				+
			
 
				+        starpu_helper_cublas_shutdown();
			
 
				+	starpu_mpi_shutdown();
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+#ifdef CHECK_OUTPUT
			
 
				+	printf("Results :\n");
			
 
				+
			
 
				+	for (j = 0; j < size; j++)
			
 
				+	{
			
 
				+		for (i = 0; i < size; i++)
			
 
				+		{
			
 
				+			if (i <= j) {
			
 
				+				printf("%2.2f\t", mat[j +i*size]);
			
 
				+			}
			
 
				+			else {
			
 
				+				printf(".\t");
			
 
				+				mat[j+i*size] = 0.0f; // debug
			
 
				+			}
			
 
				+		}
			
 
				+		printf("\n");
			
 
				+	}
			
 
				+
			
 
				+	fprintf(stderr, "compute explicit LLt ...\n");
			
 
				+	float *test_mat = malloc(size*size*sizeof(float));
			
 
				+	STARPU_ASSERT(test_mat);
			
 
				+
			
 
				+	SSYRK("L", "N", size, size, 1.0f,
			
 
				+				mat, size, 0.0f, test_mat, size);
			
 
				+
			
 
				+	fprintf(stderr, "comparing results ...\n");
			
 
				+	for (j = 0; j < size; j++)
			
 
				+	{
			
 
				+		for (i = 0; i < size; i++)
			
 
				+		{
			
 
				+			if (i <= j) {
			
 
				+				printf("%2.2f\t", test_mat[j +i*size]);
			
 
				+			}
			
 
				+			else {
			
 
				+				printf(".\t");
			
 
				+			}
			
 
				+		}
			
 
				+		printf("\n");
			
 
				+	}
			
 
				+#endif
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
--- a/mpi/examples/cholesky/mpi_cholesky.h
+++ b/mpi/examples/cholesky/mpi_cholesky.h
@@ -0,0 +1,126 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __DW_CHOLESKY_H__
			
 
				+#define __DW_CHOLESKY_H__
			
 
				+
			
 
				+#include <semaphore.h>
			
 
				+#include <string.h>
			
 
				+#include <math.h>
			
 
				+#include <sys/time.h>
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+#include <cuda.h>
			
 
				+#include <cuda_runtime.h>
			
 
				+#include <cublas.h>
			
 
				+#endif
			
 
				+
			
 
				+#include <common/blas.h>
			
 
				+#include <starpu.h>
			
 
				+
			
 
				#define NMAXBLOCKS	32

				/*
				 * 64-bit tag builders.  The task kind is stored from bit 60 upwards
				 * (1, 3 or 4), k from bit 32, i (TAG22 only) from bit 16 and the last
				 * index in the low bits.
				 */
				#define TAG11(k)							\
					((starpu_tag_t)((unsigned long long)(k) | (1ULL<<60)))
				#define TAG21(k,j)							\
					((starpu_tag_t)((unsigned long long)(j)				\
							| (((unsigned long long)(k))<<32)		\
							| (3ULL<<60)))
				#define TAG22(k,i,j)							\
					((starpu_tag_t)((unsigned long long)(j)				\
							| ((unsigned long long)(i)<<16)			\
							| ((unsigned long long)(k)<<32)			\
							| (4ULL<<60)))

				/*
				 * Same builders with a caller-supplied prefix stored from bit 60
				 * upwards; the task kind then sits from bit 56 upwards.
				 */
				#define TAG11_AUX(k, prefix)						\
					((starpu_tag_t)((unsigned long long)(k)				\
							| (1ULL<<56)					\
							| (((unsigned long long)(prefix))<<60)))
				#define TAG21_AUX(k,j, prefix)						\
					((starpu_tag_t)((unsigned long long)(j)				\
							| (((unsigned long long)(k))<<32)		\
							| (3ULL<<56)					\
							| (((unsigned long long)(prefix))<<60)))
				#define TAG22_AUX(k,i,j, prefix)					\
					((starpu_tag_t)((unsigned long long)(j)				\
							| ((unsigned long long)(i)<<16)			\
							| ((unsigned long long)(k)<<32)			\
							| (4ULL<<56)					\
							| (((unsigned long long)(prefix))<<60)))

				/* Rows/columns per block; expands to the `size` and `nblocks` in scope. */
				#define BLOCKSIZE	(size/nblocks)

				/* 2*n1*n2*n3, evaluated in 64-bit unsigned arithmetic. */
				#define BLAS3_FLOP(n1,n2,n3)	\
					(2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
			
 
				+
			
 
				+typedef struct {
			
 
				+	starpu_data_handle dataA;
			
 
				+	unsigned i;
			
 
				+	unsigned j;
			
 
				+	unsigned k;
			
 
				+	unsigned nblocks;
			
 
				+	unsigned *remaining;
			
 
				+	sem_t *sem;
			
 
				+} cl_args;
			
 
				+
			
 
				/* Default run parameters, overridden by parse_args().  Being static in a
				 * header, each translation unit including it gets its own copy. */
				static unsigned size = 4096;		/* -size: 4*1024 by default */
				static unsigned nblocks = 16;		/* -nblocks */
				static unsigned nbigblocks = 8;		/* -nbigblocks */
				static unsigned pinned = 0;		/* set by -pin */
				static unsigned noprio = 0;		/* set by -no-prio */
			
 
				+
			
 
				+void chol_cpu_codelet_update_u11(void **, void *);
			
 
				+void chol_cpu_codelet_update_u21(void **, void *);
			
 
				+void chol_cpu_codelet_update_u22(void **, void *);
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+void chol_cublas_codelet_update_u11(void *descr[], void *_args);
			
 
				+void chol_cublas_codelet_update_u21(void *descr[], void *_args);
			
 
				+void chol_cublas_codelet_update_u22(void *descr[], void *_args);
			
 
				+#endif
			
 
				+
			
 
				+void initialize_system(float **A, unsigned dim, unsigned pinned, int *rank, int *nodes);
			
 
				+//void dw_cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks);
			
 
				+
			
 
				+extern struct starpu_perfmodel_t chol_model_11;
			
 
				+extern struct starpu_perfmodel_t chol_model_21;
			
 
				+extern struct starpu_perfmodel_t chol_model_22;
			
 
				+
			
 
				+static void __attribute__((unused)) parse_args(int argc, char **argv)
			
 
				+{
			
 
				+	int i;
			
 
				+	for (i = 1; i < argc; i++) {
			
 
				+		if (strcmp(argv[i], "-size") == 0) {
			
 
				+		        char *argptr;
			
 
				+			size = strtol(argv[++i], &argptr, 10);
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-nblocks") == 0) {
			
 
				+		        char *argptr;
			
 
				+			nblocks = strtol(argv[++i], &argptr, 10);
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-nbigblocks") == 0) {
			
 
				+		        char *argptr;
			
 
				+			nbigblocks = strtol(argv[++i], &argptr, 10);
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-pin") == 0) {
			
 
				+			pinned = 1;
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-no-prio") == 0) {
			
 
				+			noprio = 1;
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-h") == 0) {
			
 
				+			printf("usage : %s [-pin] [-size size] [-nblocks nblocks]\n", argv[0]);
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+#endif // __DW_CHOLESKY_H__
			
--- a/mpi/examples/cholesky/mpi_cholesky_kernels.c
+++ b/mpi/examples/cholesky/mpi_cholesky_kernels.c
@@ -0,0 +1,214 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu_config.h>
			
 
				+#include "mpi_cholesky.h"
			
 
				+#include "common/blas.h"
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+#include <cuda.h>
			
 
				+#include <cuda_runtime.h>
			
 
				+#include <cublas.h>
			
 
				+#endif
			
 
				+
			
 
				+/*
			
 
				+ *   U22
			
 
				+ */
			
 
				+
			
 
				/*
				 * Common body of the U22 codelets:
				 *	center <- center - left * trans(right)
				 * with left = descr[0], right = descr[1] and center = descr[2].
				 *
				 * s selects the implementation: 0 calls SGEMM, 1 calls cublasSgemm
				 * (only when built with STARPU_USE_CUDA); any other value aborts.
				 */
				static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)
				{
					float *left 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
					float *right 	= (float *)STARPU_MATRIX_GET_PTR(descr[1]);
					float *center 	= (float *)STARPU_MATRIX_GET_PTR(descr[2]);

					unsigned dx = STARPU_MATRIX_GET_NY(descr[2]);
					unsigned dy = STARPU_MATRIX_GET_NX(descr[2]);
					unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);

					unsigned ld21 = STARPU_MATRIX_GET_LD(descr[0]);
					unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);
					unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);

					switch (s) {
						case 0:
							SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
								right, ld12, 1.0f, center, ld22);
							break;
				#ifdef STARPU_USE_CUDA
						case 1:
						{
							cublasStatus st;

							cublasSgemm('n', 't', dy, dx, dz,
									-1.0f, left, ld21, right, ld12,
									 1.0f, center, ld22);
							st = cublasGetError();
							STARPU_ASSERT(!st);

							/* the kernel was only queued: wait for it to finish */
							cudaThreadSynchronize();

							break;
						}
				#endif
						default:
							STARPU_ABORT();
							break;
					}
				}

				/* CPU entry point of the U22 codelet. */
				void chol_cpu_codelet_update_u22(void *descr[], void *_args)
				{
					chol_common_cpu_codelet_update_u22(descr, 0, _args);
				}

				#ifdef STARPU_USE_CUDA
				/* CUDA entry point of the U22 codelet. */
				void chol_cublas_codelet_update_u22(void *descr[], void *_args)
				{
					chol_common_cpu_codelet_update_u22(descr, 1, _args);
				}
				#endif// STARPU_USE_CUDA
			
 
				+
			
 
				+/*
			
 
				+ * U21
			
 
				+ */
			
 
				+
			
 
				/*
				 * Common body of the U21 codelets: triangular solve of sub21
				 * (descr[1]) against sub11 (descr[0]), i.e. STRSM with the
				 * "R", "L", "T", "N" options and alpha = 1.
				 *
				 * s selects the implementation: 0 calls STRSM, 1 calls cublasStrsm
				 * (only when built with STARPU_USE_CUDA); any other value aborts.
				 */
				static inline void chol_common_codelet_update_u21(void *descr[], int s, __attribute__((unused)) void *_args)
				{
					float *sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
					float *sub21 = (float *)STARPU_MATRIX_GET_PTR(descr[1]);

					unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
					unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);

					unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
					unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);

					switch (s) {
						case 0:
							STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
							break;
				#ifdef STARPU_USE_CUDA
						case 1:
							cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
							/* the kernel was only queued: wait for it to finish */
							cudaThreadSynchronize();
							break;
				#endif
						default:
							STARPU_ABORT();
							break;
					}
				}

				/* CPU entry point of the U21 codelet. */
				void chol_cpu_codelet_update_u21(void *descr[], void *_args)
				{
					chol_common_codelet_update_u21(descr, 0, _args);
				}

				#ifdef STARPU_USE_CUDA
				/* CUDA entry point of the U21 codelet. */
				void chol_cublas_codelet_update_u21(void *descr[], void *_args)
				{
					chol_common_codelet_update_u21(descr, 1, _args);
				}
				#endif
			
 
				+
			
 
				+/*
			
 
				+ *	U11
			
 
				+ */
			
 
				+
			
 
				/*
				 * Common body of the U11 codelets: in-place factorization of the
				 * block sub11 (descr[0]).  For every column z:
				 *	- alpha 11 <- lambda 11 = sqrt(alpha11)
				 *	- alpha 21 <- l 21	= alpha 21 / lambda 11
				 *	- A22 <- A22 - l21 trans(l21)
				 *
				 * s selects the implementation: 0 uses BLAS on the host, 1 uses CUBLAS
				 * (only when built with STARPU_USE_CUDA); any other value aborts.
				 */
				static inline void chol_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args)
				{
					float *sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);

					unsigned nx = STARPU_MATRIX_GET_NY(descr[0]);
					unsigned ld = STARPU_MATRIX_GET_LD(descr[0]);

					unsigned z;

					switch (s) {
						case 0:
							for (z = 0; z < nx; z++)
							{
								float lambda11;
								lambda11 = sqrtf(sub11[z+z*ld]);
								sub11[z+z*ld] = lambda11;

								/* a null pivot would make the scaling below divide by zero */
								STARPU_ASSERT(lambda11 != 0.0f);

								SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);

								SSYR("L", nx - z - 1, -1.0f,
											&sub11[(z+1)+z*ld], 1,
											&sub11[(z+1)+(z+1)*ld], ld);
							}
							break;
				#ifdef STARPU_USE_CUDA
						case 1:
							for (z = 0; z < nx; z++)
							{
								float lambda11;

								/* the pivot is copied back to compute its square root on the host */
								cudaMemcpy(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost);
								cudaStreamSynchronize(0);

								STARPU_ASSERT(lambda11 != 0.0f);

								lambda11 = sqrtf(lambda11);

								cublasSetVector(1, sizeof(float), &lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));

								cublasSscal(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);

								cublasSsyr('U', nx - z - 1, -1.0f,
											&sub11[(z+1)+z*ld], 1,
											&sub11[(z+1)+(z+1)*ld], ld);
							}

							cudaThreadSynchronize();

							break;
				#endif
						default:
							STARPU_ABORT();
							break;
					}
				}


				/* CPU entry point of the U11 codelet. */
				void chol_cpu_codelet_update_u11(void *descr[], void *_args)
				{
					chol_common_codelet_update_u11(descr, 0, _args);
				}

				#ifdef STARPU_USE_CUDA
				/* CUDA entry point of the U11 codelet. */
				void chol_cublas_codelet_update_u11(void *descr[], void *_args)
				{
					chol_common_codelet_update_u11(descr, 1, _args);
				}
				#endif// STARPU_USE_CUDA
			
--- a/mpi/examples/cholesky/mpi_cholesky_models.c
+++ b/mpi/examples/cholesky/mpi_cholesky_models.c
@@ -0,0 +1,153 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "mpi_cholesky_models.h"
			
 
				+
			
 
				+/*
			
 
				+ * As a convention, in that file, descr[0] is represented by A,
			
 
				+ * 				  descr[1] is B ...
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ *	Number of flops of Gemm 
			
 
				+ */
			
 
				+
			
 
				/* Uncomment to scale every estimated cost by a random factor. */
				//#define USE_PERTURBATION	1

				#ifndef USE_PERTURBATION
				/* costs are returned untouched */
				#define PERTURBATE(a)	(a)
				#else
				/* multiply the cost by a random factor taken in [1-AMPL, 1+AMPL] */
				#define PERTURBATE(a)	((starpu_drand48()*2.0f*(AMPL) + 1.0f - (AMPL))*(a))
				#endif
			
 
				+
			
 
				+static double cpu_chol_task_11_cost(starpu_buffer_descr *descr)
			
 
				+{
			
 
				+	uint32_t n;
			
 
				+
			
 
				+	n = starpu_matrix_get_nx(descr[0].handle);
			
 
				+
			
 
				+	double cost = (((double)(n)*n*n)/1000.0f*0.894/0.79176);
			
 
				+
			
 
				+#ifdef STARPU_MODEL_DEBUG
			
 
				+	printf("cpu_chol_task_11_cost n %d cost %e\n", n, cost);
			
 
				+#endif
			
 
				+
			
 
				+	return PERTURBATE(cost);
			
 
				+}
			
 
				+
			
 
				+static double cuda_chol_task_11_cost(starpu_buffer_descr *descr)
			
 
				+{
			
 
				+	uint32_t n;
			
 
				+
			
 
				+	n = starpu_matrix_get_nx(descr[0].handle);
			
 
				+
			
 
				+	double cost = (((double)(n)*n*n)/50.0f/10.75/5.088633/0.9883);
			
 
				+
			
 
				+#ifdef STARPU_MODEL_DEBUG
			
 
				+	printf("cuda_chol_task_11_cost n %d cost %e\n", n, cost);
			
 
				+#endif
			
 
				+
			
 
				+	return PERTURBATE(cost);
			
 
				+}
			
 
				+
			
 
				+static double cpu_chol_task_21_cost(starpu_buffer_descr *descr)
			
 
				+{
			
 
				+	uint32_t n;
			
 
				+
			
 
				+	n = starpu_matrix_get_nx(descr[0].handle);
			
 
				+
			
 
				+	double cost = (((double)(n)*n*n)/7706.674/0.95/0.9965);
			
 
				+
			
 
				+#ifdef STARPU_MODEL_DEBUG
			
 
				+	printf("cpu_chol_task_21_cost n %d cost %e\n", n, cost);
			
 
				+#endif
			
 
				+
			
 
				+	return PERTURBATE(cost);
			
 
				+}
			
 
				+
			
 
				+static double cuda_chol_task_21_cost(starpu_buffer_descr *descr)
			
 
				+{
			
 
				+	uint32_t n;
			
 
				+
			
 
				+	n = starpu_matrix_get_nx(descr[0].handle);
			
 
				+
			
 
				+	double cost = (((double)(n)*n*n)/50.0f/10.75/87.29520);
			
 
				+
			
 
				+#ifdef STARPU_MODEL_DEBUG
			
 
				+	printf("cuda_chol_task_21_cost n %d cost %e\n", n, cost);
			
 
				+#endif
			
 
				+
			
 
				+	return PERTURBATE(cost);
			
 
				+}
			
 
				+
			
 
				+static double cpu_chol_task_22_cost(starpu_buffer_descr *descr)
			
 
				+{
			
 
				+	uint32_t n;
			
 
				+
			
 
				+	n = starpu_matrix_get_nx(descr[0].handle);
			
 
				+
			
 
				+	double cost = (((double)(n)*n*n)/50.0f/10.75/8.0760);
			
 
				+
			
 
				+#ifdef STARPU_MODEL_DEBUG
			
 
				+	printf("cpu_chol_task_22_cost n %d cost %e\n", n, cost);
			
 
				+#endif
			
 
				+
			
 
				+	return PERTURBATE(cost);
			
 
				+}
			
 
				+
			
 
				+static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
			
 
				+{
			
 
				+	uint32_t n;
			
 
				+
			
 
				+	n = starpu_matrix_get_nx(descr[0].handle);
			
 
				+
			
 
				+	double cost = (((double)(n)*n*n)/50.0f/10.75/76.30666);
			
 
				+
			
 
				+#ifdef STARPU_MODEL_DEBUG
			
 
				+	printf("cuda_chol_task_22_cost n %d cost %e\n", n, cost);
			
 
				+#endif
			
 
				+
			
 
				+	return PERTURBATE(cost);
			
 
				+}
			
 
				+
			
 
				+struct starpu_perfmodel_t chol_model_11 = {
			
 
				+	.per_arch = { 
			
 
				+		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_11_cost },
			
 
				+		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_11_cost }
			
 
				+	},
			
 
				+	.type = STARPU_HISTORY_BASED,
			
 
				+	.symbol = "chol_model_11"
			
 
				+};
			
 
				+
			
 
				+struct starpu_perfmodel_t chol_model_21 = {
			
 
				+	.per_arch = { 
			
 
				+		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_21_cost },
			
 
				+		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_21_cost }
			
 
				+	},
			
 
				+	.type = STARPU_HISTORY_BASED,
			
 
				+	.symbol = "chol_model_21"
			
 
				+};
			
 
				+
			
 
				+struct starpu_perfmodel_t chol_model_22 = {
			
 
				+	.per_arch = { 
			
 
				+		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_22_cost },
			
 
				+		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_22_cost }
			
 
				+	},
			
 
				+	.type = STARPU_HISTORY_BASED,
			
 
				+	.symbol = "chol_model_22"
			
 
				+};
			
--- a/mpi/examples/cholesky/mpi_cholesky_models.h
+++ b/mpi/examples/cholesky/mpi_cholesky_models.h
@@ -0,0 +1,23 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __DW_CHOLESKY_MODELS_H__
			
 
				+#define __DW_CHOLESKY_MODELS_H__
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+#endif // __DW_CHOLESKY_MODELS_H__