Browse Source

StarPU-MPI: add cholesky example using starpu_mpi_insert_task

Nathalie Furmento 14 years ago
parent
commit
7f6e65add0

+ 18 - 0
mpi/Makefile.am

@@ -121,6 +121,24 @@ examples_mpi_lu_plu_example_double_SOURCES =	\
 	$(top_srcdir)/examples/common/blas.c
 endif
 
+########################
+# MPI Cholesky example #
+########################
+
+if !NO_BLAS_LIB
+examplebin_PROGRAMS +=		\
+	examples/cholesky/mpi_cholesky
+
+examples_cholesky_mpi_cholesky_SOURCES	=		\
+	examples/cholesky/mpi_cholesky.c		\
+	examples/cholesky/mpi_cholesky_models.c		\
+	examples/cholesky/mpi_cholesky_kernels.c	\
+	$(top_srcdir)/examples/common/blas.c
+
+examples_cholesky_mpi_cholesky_LDADD =	\
+	libstarpumpi.la
+endif
+
 check_PROGRAMS +=					\
 	tests/pingpong					\
 	tests/mpi_test					\

+ 261 - 0
mpi/examples/cholesky/mpi_cholesky.c

@@ -0,0 +1,261 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "mpi_cholesky.h"
+#include "mpi_cholesky_models.h"
+
+/*
+ *	Create the codelets
+ */
+
+/* Codelet submitted on the diagonal tile [k][k]: a single buffer. */
+static starpu_codelet cl11 =
+{
+	.model = &chol_model_11,
+	.nbuffers = 1,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u11,
+#endif
+	.cpu_func = chol_cpu_codelet_update_u11,
+	.where = STARPU_CPU|STARPU_CUDA
+};
+
+/* Codelet submitted with tile [k][k] (read) and tile [k][j] (read-write): two buffers. */
+static starpu_codelet cl21 =
+{
+	.model = &chol_model_21,
+	.nbuffers = 2,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u21,
+#endif
+	.cpu_func = chol_cpu_codelet_update_u21,
+	.where = STARPU_CPU|STARPU_CUDA
+};
+
+/* Codelet submitted with tiles [k][i] and [k][j] (read) and tile [i][j] (read-write): three buffers. */
+static starpu_codelet cl22 =
+{
+	.model = &chol_model_22,
+	.nbuffers = 3,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u22,
+#endif
+	.cpu_func = chol_cpu_codelet_update_u22,
+	.where = STARPU_CPU|STARPU_CUDA
+};
+
+/* Returns the number of the MPI node that owns the data at index x,
+ * computed as x modulo nb_nodes (nb_nodes must not be zero) */
+int my_distrib(int x, int nb_nodes) {
+        return x % nb_nodes;
+}
+
+/*
+ *	Bootstrap the factorization and construct the DAG.
+ *
+ *	matA    : matrix storage, addressed as nblocks x nblocks tiles of
+ *	          (size/nblocks) x (size/nblocks) floats
+ *	size    : matrix order
+ *	ld      : leading dimension of matA
+ *	nblocks : number of tiles per dimension (must not be zero)
+ *	rank    : MPI rank of the calling process
+ *	nodes   : number of MPI nodes
+ *
+ *	A tile is registered when its owner (my_distrib of its first index)
+ *	is this rank or a rank adjacent to it. The tasks are then submitted
+ *	with starpu_mpi_insert_task and the elapsed time is printed.
+ */
+static void dw_cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks, int rank, int nodes)
+{
+	struct timeval start;
+	struct timeval end;
+	starpu_data_handle **data_handles; /* [nblocks][nblocks] */
+	unsigned x, y;
+	unsigned i,j,k;
+	unsigned block_dim;
+
+	STARPU_ASSERT(nblocks > 0);
+	block_dim = size/nblocks;
+
+	/* one handle per tile: the table is nblocks x nblocks, not size x size */
+	data_handles = malloc(nblocks*sizeof(starpu_data_handle *));
+	STARPU_ASSERT(data_handles);
+	for (x = 0; x < nblocks; x++)
+	{
+		data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle));
+		STARPU_ASSERT(data_handles[x]);
+	}
+
+	gettimeofday(&start, NULL);
+	for (x = 0; x < nblocks; x++)
+	{
+		/* the owner only depends on the first index */
+		int mpi_rank = my_distrib((int)x, nodes);
+
+		for (y = 0; y < nblocks; y++)
+		{
+			/* offset computed in size_t so that large matrices do not wrap */
+			float *tile = &matA[(size_t)block_dim*x + (size_t)block_dim*y*ld];
+
+			if (mpi_rank == rank)
+			{
+				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)tile,
+							    ld, block_dim, block_dim, sizeof(float));
+			}
+			else if (rank == mpi_rank+1 || rank == mpi_rank-1)
+			{
+				/* I don't own that index, but will need it for my computations */
+				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)tile,
+							    ld, block_dim, block_dim, sizeof(float));
+			}
+			else
+			{
+				/* I know it's useless to allocate anything for this */
+				/* NOTE(review): the loops below hand every tile to
+				 * starpu_mpi_insert_task on every rank, including the
+				 * ones left NULL here (possible with three or more
+				 * nodes) -- confirm that NULL handles are accepted. */
+				data_handles[x][y] = NULL;
+			}
+			if (data_handles[x][y])
+				starpu_data_set_rank(data_handles[x][y], mpi_rank);
+		}
+	}
+
+	for (k = 0; k < nblocks; k++)
+	{
+		int prio = STARPU_DEFAULT_PRIO;
+		if (!noprio) prio = STARPU_MAX_PRIO;
+
+		starpu_mpi_insert_task(MPI_COMM_WORLD, &cl11,
+				       STARPU_PRIORITY, prio,
+				       STARPU_RW, data_handles[k][k],
+				       0);
+
+		for (j = k+1; j<nblocks; j++)
+		{
+			prio = STARPU_DEFAULT_PRIO;
+			if (!noprio && (j == k+1)) prio = STARPU_MAX_PRIO;
+			starpu_mpi_insert_task(MPI_COMM_WORLD, &cl21,
+					       STARPU_PRIORITY, prio,
+					       STARPU_R, data_handles[k][k],
+					       STARPU_RW, data_handles[k][j],
+					       0);
+
+			/* only the tiles with i <= j are updated */
+			for (i = k+1; i <= j; i++)
+			{
+				prio = STARPU_DEFAULT_PRIO;
+				if (!noprio && (i == k + 1) && (j == k +1) ) prio = STARPU_MAX_PRIO;
+				starpu_mpi_insert_task(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, prio,
+						       STARPU_R, data_handles[k][i],
+						       STARPU_R, data_handles[k][j],
+						       STARPU_RW, data_handles[i][j],
+						       0);
+			}
+		}
+	}
+
+	starpu_task_wait_for_all();
+
+	gettimeofday(&end, NULL);
+
+	/* elapsed time in microseconds, accumulated in double to avoid integer overflow */
+	double timing = (double)(end.tv_sec - start.tv_sec)*1000000.0 + (double)(end.tv_usec - start.tv_usec);
+	fprintf(stderr, "Computation took (in ms)\n");
+	printf("%2.2f\n", timing/1000);
+
+	double flop = ((double)size*size*size)/3.0;
+	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+
+	/* NOTE(review): the handles themselves are never unregistered; only the
+	 * table that stored them is released here. */
+	for (x = 0; x < nblocks; x++)
+		free(data_handles[x]);
+	free(data_handles);
+}
+
+/*
+ * Start StarPU, StarPU-MPI and the cublas helper, then allocate the matrix.
+ *
+ * A      : receives the address of a dim x dim array of floats
+ * dim    : matrix order
+ * pinned : when non-zero, allocate through
+ *          starpu_data_malloc_pinned_if_possible instead of malloc
+ * rank   : forwarded to starpu_mpi_initialize_extended
+ * nodes  : forwarded to starpu_mpi_initialize_extended
+ */
+void initialize_system(float **A, unsigned dim, unsigned pinned, int *rank, int *nodes)
+{
+	/* computed in size_t in both branches so that dim*dim cannot wrap in unsigned */
+	size_t bytes = (size_t)dim*dim*sizeof(float);
+
+	starpu_init(NULL);
+	starpu_mpi_initialize_extended(1, rank, nodes);
+	starpu_helper_cublas_init();
+
+	if (pinned)
+	{
+		starpu_data_malloc_pinned_if_possible((void **)A, bytes);
+	}
+	else {
+		*A = malloc(bytes);
+	}
+	STARPU_ASSERT(*A);
+}
+
+#ifdef CHECK_OUTPUT
+/* Print the n x n matrix m, one line per j: the entry m[j + i*n] where
+ * i <= j, and "." elsewhere. */
+static void print_triangle(const float *m, unsigned n)
+{
+	unsigned i, j;
+
+	for (j = 0; j < n; j++)
+	{
+		for (i = 0; i < n; i++)
+		{
+			if (i <= j) {
+				printf("%2.2f\t", m[j +i*n]);
+			}
+			else {
+				printf(".\t");
+			}
+		}
+		printf("\n");
+	}
+}
+#endif
+
+/*
+ * Parse the options, build the test matrix, run the factorization and shut
+ * StarPU down. With CHECK_OUTPUT defined, the input, the result and the
+ * product computed by SSYRK are printed as well.
+ */
+int main(int argc, char **argv)
+{
+	/* create a simple symmetric positive definite matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 *	(size is added on the diagonal)
+	 * */
+
+	float *mat = NULL;
+	int rank, nodes;
+	unsigned i,j;
+
+	parse_args(argc, argv);
+	/* initialize_system allocates the matrix; allocating it here as well
+	 * would leak the first buffer */
+	initialize_system(&mat, size, pinned, &rank, &nodes);
+
+	for (i = 0; i < size; i++)
+	{
+		for (j = 0; j < size; j++)
+		{
+			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+			//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
+		}
+	}
+
+
+#ifdef CHECK_OUTPUT
+	printf("Input :\n");
+	print_triangle(mat, size);
+#endif
+
+	dw_cholesky(mat, size, size, nblocks, rank, nodes);
+
+	starpu_helper_cublas_shutdown();
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+#ifdef CHECK_OUTPUT
+	printf("Results :\n");
+	print_triangle(mat, size);
+
+	/* clear the entries that are not displayed before computing LLt */
+	for (j = 0; j < size; j++)
+	{
+		for (i = j+1; i < size; i++)
+		{
+			mat[j+i*size] = 0.0f; // debug
+		}
+	}
+
+	fprintf(stderr, "compute explicit LLt ...\n");
+	float *test_mat = malloc((size_t)size*size*sizeof(float));
+	STARPU_ASSERT(test_mat);
+
+	SSYRK("L", "N", size, size, 1.0f,
+				mat, size, 0.0f, test_mat, size);
+
+	fprintf(stderr, "comparing results ...\n");
+	print_triangle(test_mat, size);
+
+	free(test_mat);
+#endif
+
+	return 0;
+}

+ 126 - 0
mpi/examples/cholesky/mpi_cholesky.h

@@ -0,0 +1,126 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_CHOLESKY_H__
+#define __DW_CHOLESKY_H__
+
+#include <semaphore.h>
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+#ifdef STARPU_USE_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cublas.h>
+#endif
+
+#include <common/blas.h>
+#include <starpu.h>
+
+#define NMAXBLOCKS	32	/* not referenced by the sources visible in this change */
+/*
+ * Task tags. Each one OR-s a kind code shifted left by 60 (1 for TAG11,
+ * 3 for TAG21, 4 for TAG22) with its indices: k alone for TAG11; k shifted
+ * left by 32 and j for TAG21; k shifted left by 32, i shifted left by 16
+ * and j for TAG22. The fields are combined without masking.
+ * NOTE(review): not used by the sources visible here -- presumably kept
+ * from a tag-based variant of the example; confirm before relying on them.
+ */
+#define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
+#define TAG21(k,j)	((starpu_tag_t)(((3ULL<<60) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG22(k,i,j)	((starpu_tag_t)(((4ULL<<60) | ((unsigned long long)(k)<<32) 	\
+					| ((unsigned long long)(i)<<16)	\
+					| (unsigned long long)(j))))
+
+
+/*
+ * Same layout as above, except that the caller-supplied prefix is shifted
+ * left by 60 and the kind code is shifted left by 56 instead.
+ */
+#define TAG11_AUX(k, prefix)	((starpu_tag_t)( (((unsigned long long)(prefix))<<60)  |  (1ULL<<56) | (unsigned long long)(k)))
+#define TAG21_AUX(k,j, prefix)	((starpu_tag_t)( (((unsigned long long)(prefix))<<60)  			\
+					|  ((3ULL<<56) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG22_AUX(k,i,j, prefix)    ((starpu_tag_t)(  (((unsigned long long)(prefix))<<60)	\
+					|  ((4ULL<<56) | ((unsigned long long)(k)<<32)  	\
+					| ((unsigned long long)(i)<<16) 			\
+					| (unsigned long long)(j))))
+/* Expands to size/nblocks using whatever `size` and `nblocks` name at the point of use. */
+#define BLOCKSIZE	(size/nblocks)
+
+/* 2*n1*n2*n3 evaluated in uint64_t. The arguments are cast without being
+ * parenthesised, so pass plain operands rather than compound expressions. */
+#define BLAS3_FLOP(n1,n2,n3)    \
+        (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
+
+typedef struct {	/* argument bundle; not referenced by the sources visible in this change */
+	starpu_data_handle dataA;
+	unsigned i;
+	unsigned j;
+	unsigned k;
+	unsigned nblocks;
+	unsigned *remaining;
+	sem_t *sem;
+} cl_args;
+
+static unsigned size = 4*1024;	/* matrix order, set by -size. These settings are static
+ * definitions in a header, so every file including it gets its own copy. */
+static unsigned nblocks = 16;	/* number of tiles per dimension, set by -nblocks */
+static unsigned nbigblocks = 8;	/* set by -nbigblocks; not read by the sources visible here */
+static unsigned pinned = 0;	/* set to 1 by -pin */
+static unsigned noprio = 0;	/* set to 1 by -no-prio */
+
+void chol_cpu_codelet_update_u11(void **, void *);	/* CPU entry points of the three codelets */
+void chol_cpu_codelet_update_u21(void **, void *);
+void chol_cpu_codelet_update_u22(void **, void *);
+/* CUDA entry points, declared only when STARPU_USE_CUDA is defined */
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u11(void *descr[], void *_args);
+void chol_cublas_codelet_update_u21(void *descr[], void *_args);
+void chol_cublas_codelet_update_u22(void *descr[], void *_args);
+#endif
+/* Starts StarPU, StarPU-MPI and the cublas helper, then stores a newly
+ * allocated dim x dim float array in *A (through the pinned allocator when
+ * pinned is non-zero); rank and nodes are handed to the MPI initialization. */
+void initialize_system(float **A, unsigned dim, unsigned pinned, int *rank, int *nodes);
+//void dw_cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks);
+/* Performance models referenced by the codelets */
+extern struct starpu_perfmodel_t chol_model_11;
+extern struct starpu_perfmodel_t chol_model_21;
+extern struct starpu_perfmodel_t chol_model_22;
+
+/*
+ * Parse the command line into the settings above.
+ *
+ *   -size n, -nblocks n, -nbigblocks n : numeric settings (base 10)
+ *   -pin                               : sets pinned to 1
+ *   -no-prio                           : sets noprio to 1
+ *   -h                                 : prints the usage line
+ *
+ * An option that expects a value but is the last argument is reported on
+ * stderr and parsing stops, so argv is never read past argc. A consumed
+ * value is not re-examined as an option name.
+ */
+static void __attribute__((unused)) parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		unsigned *value = NULL;
+
+		if (strcmp(argv[i], "-size") == 0) {
+			value = &size;
+		}
+		else if (strcmp(argv[i], "-nblocks") == 0) {
+			value = &nblocks;
+		}
+		else if (strcmp(argv[i], "-nbigblocks") == 0) {
+			value = &nbigblocks;
+		}
+
+		if (value) {
+			if (i + 1 >= argc) {
+				fprintf(stderr, "missing value after %s\n", argv[i]);
+				break;
+			}
+			*value = strtol(argv[++i], NULL, 10);
+			continue;
+		}
+
+		if (strcmp(argv[i], "-pin") == 0) {
+			pinned = 1;
+		}
+		else if (strcmp(argv[i], "-no-prio") == 0) {
+			noprio = 1;
+		}
+		else if (strcmp(argv[i], "-h") == 0) {
+			printf("usage : %s [-pin] [-size size] [-nblocks nblocks]\n", argv[0]);
+		}
+	}
+}
+
+#endif // __DW_CHOLESKY_H__

+ 214 - 0
mpi/examples/cholesky/mpi_cholesky_kernels.c

@@ -0,0 +1,214 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_config.h>
+#include "mpi_cholesky.h"
+#include "common/blas.h"
+#ifdef STARPU_USE_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cublas.h>
+#endif
+
+/*
+ *   U22
+ *
+ * Body shared by the CPU and CUDA entry points below. It reads the pointer,
+ * dimensions and leading dimension of descr[0], descr[1] and descr[2];
+ * s is 0 when called from the CPU entry point and 1 from the CUDA one.
+ *
+ * NOTE(review): the SGEMM / cublasSgemm calls are commented out, so as
+ * written this kernel does not modify any data -- confirm whether that is
+ * intentional before trusting the timings or CHECK_OUTPUT results.
+ */
+
+static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)
+{
+	//printf("22\n");
+	float *left 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	float *right 	= (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+	float *center 	= (float *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	unsigned dx = STARPU_MATRIX_GET_NY(descr[2]);
+	unsigned dy = STARPU_MATRIX_GET_NX(descr[2]);
+	unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);
+
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);
+	unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);
+
+//#ifdef STARPU_USE_CUDA
+//	cublasStatus st;
+//#endif
+//
+//	switch (s) {
+//		case 0:
+//			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
+//				right, ld12, 1.0f, center, ld22);
+//			break;
+//#ifdef STARPU_USE_CUDA
+//		case 1:
+//			cublasSgemm('n', 't', dy, dx, dz,
+//					-1.0f, left, ld21, right, ld12,
+//					 1.0f, center, ld22);
+//			st = cublasGetError();
+//			STARPU_ASSERT(!st);
+//
+//			cudaThreadSynchronize();
+//
+//			break;
+//#endif
+//		default:
+//			STARPU_ABORT();
+//			break;
+//	}
+}
+
+void chol_cpu_codelet_update_u22(void *descr[], void *_args)
+{
+	chol_common_cpu_codelet_update_u22(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u22(void *descr[], void *_args)
+{
+	chol_common_cpu_codelet_update_u22(descr, 1, _args);
+}
+#endif// STARPU_USE_CUDA
+
+/*
+ * U21
+ *
+ * Body shared by the CPU and CUDA entry points below. It reads the pointer
+ * and leading dimension of descr[0] and descr[1] and the dimensions of
+ * descr[1]; s is 0 when called from the CPU entry point and 1 from the
+ * CUDA one.
+ *
+ * NOTE(review): the STRSM / cublasStrsm calls are commented out, so as
+ * written this kernel does not modify any data -- confirm whether that is
+ * intentional.
+ */
+
+static inline void chol_common_codelet_update_u21(void *descr[], int s, __attribute__((unused)) void *_args)
+{
+//	printf("21\n");
+	float *sub11;
+	float *sub21;
+
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	sub21 = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+
+	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
+
+	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
+	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
+
+//	switch (s) {
+//		case 0:
+//			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+//			break;
+//#ifdef STARPU_USE_CUDA
+//		case 1:
+//			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+//			cudaThreadSynchronize();
+//			break;
+//#endif
+//		default:
+//			STARPU_ABORT();
+//			break;
+//	}
+}
+
+void chol_cpu_codelet_update_u21(void *descr[], void *_args)
+{
+	 chol_common_codelet_update_u21(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u21(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u21(descr, 1, _args);
+}
+#endif
+
+/*
+ *	U11
+ *
+ *	Body shared by the CPU and CUDA entry points below. It reads the
+ *	pointer, NY dimension and leading dimension of descr[0]; s is 0 when
+ *	called from the CPU entry point and 1 from the CUDA one.
+ *
+ *	NOTE(review): the factorization loops are commented out, so as
+ *	written this kernel does not modify any data -- confirm whether that
+ *	is intentional.
+ */
+
+static inline void chol_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args)
+{
+//	printf("11\n");
+	float *sub11;
+
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+
+	unsigned nx = STARPU_MATRIX_GET_NY(descr[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(descr[0]);
+
+	unsigned z;
+
+//	switch (s) {
+//		case 0:
+//
+//			/*
+//			 *	- alpha 11 <- lambda 11 = sqrt(alpha11)
+//			 *	- alpha 21 <- l 21	= alpha 21 / lambda 11
+//			 *	- A22 <- A22 - l21 trans(l21)
+//			 */
+//
+//			for (z = 0; z < nx; z++)
+//			{
+//				float lambda11;
+//				lambda11 = sqrt(sub11[z+z*ld]);
+//				sub11[z+z*ld] = lambda11;
+//
+//				STARPU_ASSERT(lambda11 != 0.0f);
+//
+//				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+//
+//				SSYR("L", nx - z - 1, -1.0f,
+//							&sub11[(z+1)+z*ld], 1,
+//							&sub11[(z+1)+(z+1)*ld], ld);
+//			}
+//			break;
+//#ifdef STARPU_USE_CUDA
+//		case 1:
+//			for (z = 0; z < nx; z++)
+//			{
+//				float lambda11;
+//				cudaMemcpy(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost);
+//				cudaStreamSynchronize(0);
+//
+//				STARPU_ASSERT(lambda11 != 0.0f);
+//
+//				lambda11 = sqrt(lambda11);
+//
+//				cublasSetVector(1, sizeof(float), &lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
+//
+//				cublasSscal(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+//
+//				cublasSsyr('U', nx - z - 1, -1.0f,
+//							&sub11[(z+1)+z*ld], 1,
+//							&sub11[(z+1)+(z+1)*ld], ld);
+//			}
+//
+//			cudaThreadSynchronize();
+//
+//			break;
+//#endif
+//		default:
+//			STARPU_ABORT();
+//			break;
+//	}
+}
+
+
+void chol_cpu_codelet_update_u11(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u11(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u11(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u11(descr, 1, _args);
+}
+#endif// STARPU_USE_CUDA

+ 153 - 0
mpi/examples/cholesky/mpi_cholesky_models.c

@@ -0,0 +1,153 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_cholesky_models.h"
+
+/*
+ * As a convention, in that file, descr[0] is represented by A,
+ * 				  descr[1] is B ...
+ */
+
+/*
+ *	Cost models of the Cholesky kernels (11, 21 and 22), for CPU and CUDA workers
+ */
+
+//#define USE_PERTURBATION	1
+
+
+#ifdef USE_PERTURBATION	/* NOTE(review): AMPL is not defined in the visible sources; it must be provided to enable this */
+#define PERTURBATE(a)	((starpu_drand48()*2.0f*(AMPL) + 1.0f - (AMPL))*(a))
+#else
+#define PERTURBATE(a)	(a)	/* default: the cost is returned unchanged */
+#endif
+
+/* CPU cost of the 11 kernel: proportional to the cube of the nx dimension
+ * of the first buffer, passed through PERTURBATE. */
+static double cpu_chol_task_11_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n = starpu_matrix_get_nx(descr[0].handle);
+
+	double cost = (((double)(n)*n*n)/1000.0f*0.894/0.79176);
+
+#ifdef STARPU_MODEL_DEBUG
+	/* n is unsigned: print it with %u rather than %d */
+	printf("cpu_chol_task_11_cost n %u cost %e\n", (unsigned)n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/* CUDA cost of the 11 kernel: proportional to the cube of the nx dimension
+ * of the first buffer, passed through PERTURBATE. */
+static double cuda_chol_task_11_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n = starpu_matrix_get_nx(descr[0].handle);
+
+	double cost = (((double)(n)*n*n)/50.0f/10.75/5.088633/0.9883);
+
+#ifdef STARPU_MODEL_DEBUG
+	/* n is unsigned: print it with %u rather than %d */
+	printf("cuda_chol_task_11_cost n %u cost %e\n", (unsigned)n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/* CPU cost of the 21 kernel: proportional to the cube of the nx dimension
+ * of the first buffer, passed through PERTURBATE. */
+static double cpu_chol_task_21_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n = starpu_matrix_get_nx(descr[0].handle);
+
+	double cost = (((double)(n)*n*n)/7706.674/0.95/0.9965);
+
+#ifdef STARPU_MODEL_DEBUG
+	/* n is unsigned: print it with %u rather than %d */
+	printf("cpu_chol_task_21_cost n %u cost %e\n", (unsigned)n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/* CUDA cost of the 21 kernel: proportional to the cube of the nx dimension
+ * of the first buffer, passed through PERTURBATE. */
+static double cuda_chol_task_21_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n = starpu_matrix_get_nx(descr[0].handle);
+
+	double cost = (((double)(n)*n*n)/50.0f/10.75/87.29520);
+
+#ifdef STARPU_MODEL_DEBUG
+	/* n is unsigned: print it with %u rather than %d */
+	printf("cuda_chol_task_21_cost n %u cost %e\n", (unsigned)n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/* CPU cost of the 22 kernel: proportional to the cube of the nx dimension
+ * of the first buffer, passed through PERTURBATE. */
+static double cpu_chol_task_22_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n = starpu_matrix_get_nx(descr[0].handle);
+
+	double cost = (((double)(n)*n*n)/50.0f/10.75/8.0760);
+
+#ifdef STARPU_MODEL_DEBUG
+	/* n is unsigned: print it with %u rather than %d */
+	printf("cpu_chol_task_22_cost n %u cost %e\n", (unsigned)n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/* CUDA cost of the 22 kernel: proportional to the cube of the nx dimension
+ * of the first buffer, passed through PERTURBATE. */
+static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n = starpu_matrix_get_nx(descr[0].handle);
+
+	double cost = (((double)(n)*n*n)/50.0f/10.75/76.30666);
+
+#ifdef STARPU_MODEL_DEBUG
+	/* n is unsigned: print it with %u rather than %d */
+	printf("cuda_chol_task_22_cost n %u cost %e\n", (unsigned)n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/* Performance model attached to cl11: history-based, with one cost function per architecture. */
+struct starpu_perfmodel_t chol_model_11 = {
+	.symbol = "chol_model_11",
+	.type = STARPU_HISTORY_BASED,
+	.per_arch = {
+		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_11_cost },
+		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_11_cost }
+	}
+};
+
+/* Performance model attached to cl21: history-based, with one cost function per architecture. */
+struct starpu_perfmodel_t chol_model_21 = {
+	.symbol = "chol_model_21",
+	.type = STARPU_HISTORY_BASED,
+	.per_arch = {
+		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_21_cost },
+		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_21_cost }
+	}
+};
+
+/* Performance model attached to cl22: history-based, with one cost function per architecture. */
+struct starpu_perfmodel_t chol_model_22 = {
+	.symbol = "chol_model_22",
+	.type = STARPU_HISTORY_BASED,
+	.per_arch = {
+		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_22_cost },
+		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_22_cost }
+	}
+};

+ 23 - 0
mpi/examples/cholesky/mpi_cholesky_models.h

@@ -0,0 +1,23 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_CHOLESKY_MODELS_H__
+#define __DW_CHOLESKY_MODELS_H__
+
+#include <starpu.h>
+
+#endif // __DW_CHOLESKY_MODELS_H__