Explorar el Código

Introduce an MPI version of the LU decomposition (work in progress !)

Cédric Augonnet hace 15 años
padre
commit
c46518553b

+ 2 - 0
mpi/Makefile.am

@@ -14,6 +14,8 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
 
+SUBDIRS = examples
+
 CC=$(MPICC)
 
 if USE_CUDA

+ 66 - 0
mpi/examples/Makefile.am

@@ -0,0 +1,66 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+CC=$(MPICC)
+
+LIBS = $(top_builddir)/src/libstarpu.la $(top_builddir)/mpi/libstarpumpi.la @LIBS@
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include  -I$(top_srcdir)/mpi/
+
+TESTS = $(check_PROGRAMS)
+
+check_PROGRAMS =
+
+BUILT_SOURCES =
+
+CLEANFILES = *.gcno *.gcda *.linkinfo
+
+if USE_CUDA
+
+# TODO define NVCCFLAGS
+NVCC ?= nvcc
+
+.cu.o:
+	$(NVCC) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I$(top_srcdir)/include/ -I$(top_builddir)/include/
+endif
+
+examplebindir = $(libdir)/starpu/mpi/examples/
+
+examplebin_PROGRAMS =
+
+##################
+# MPI LU example #
+##################
+
+if !NO_BLAS_LIB
+
+examplebin_PROGRAMS += 				\
+	mpi_lu/plu_example_float		\
+	mpi_lu/plu_example_double
+
+mpi_lu_plu_example_float_SOURCES =		\
+	mpi_lu/plu_example_float.c		\
+	mpi_lu/plu_solve_float.c		\
+	mpi_lu/pslu_kernels.c			\
+	mpi_lu/pslu.c				\
+	$(top_srcdir)/examples/common/blas.c
+
+mpi_lu_plu_example_double_SOURCES =		\
+	mpi_lu/plu_example_double.c		\
+	mpi_lu/plu_solve_double.c		\
+	mpi_lu/pdlu_kernels.c			\
+	mpi_lu/pdlu.c				\
+	$(top_srcdir)/examples/common/blas.c
+endif

+ 0 - 0
mpi/examples/mpi_lu/.dirstamp


+ 41 - 0
mpi/examples/mpi_lu/double.h

@@ -0,0 +1,41 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define TYPE double
+#define MPI_TYPE	MPI_DOUBLE
+
+#define STARPU_PLU(name)       starpu_pdlu_##name
+
+#define CUBLAS_GEMM	cublasDgemm
+#define CUBLAS_TRSM	cublasDtrsm
+#define CUBLAS_SCAL	cublasDscal
+#define CUBLAS_GER	cublasDger
+#define CUBLAS_SWAP	cublasDswap
+#define CUBLAS_IAMAX	cublasIdamax
+
+#define CPU_GEMM	DGEMM
+#define CPU_GEMV	DGEMV
+#define CPU_TRSM	DTRSM
+#define CPU_SCAL	DSCAL
+#define CPU_GER		DGER
+#define CPU_SWAP	DSWAP
+
+#define CPU_TRMM	DTRMM
+#define CPU_AXPY	DAXPY
+#define CPU_ASUM	DASUM
+#define CPU_IAMAX	IDAMAX
+
+#define PIVOT_THRESHHOLD	10e-10

+ 41 - 0
mpi/examples/mpi_lu/float.h

@@ -0,0 +1,41 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define TYPE float
+#define MPI_TYPE	MPI_FLOAT
+
+#define STARPU_PLU(name)       starpu_pslu_##name
+
+#define CUBLAS_GEMM	cublasSgemm
+#define CUBLAS_TRSM	cublasStrsm
+#define CUBLAS_SCAL	cublasSscal
+#define CUBLAS_GER	cublasSger
+#define CUBLAS_SWAP	cublasSswap
+#define CUBLAS_IAMAX	cublasIsamax
+
+#define CPU_GEMM	SGEMM
+#define CPU_GEMV	SGEMV
+#define CPU_TRSM	STRSM
+#define CPU_SCAL	SSCAL
+#define CPU_GER		SGER
+#define CPU_SWAP	SSWAP
+
+#define CPU_TRMM	STRMM
+#define CPU_AXPY	SAXPY
+#define CPU_ASUM	SASUM
+#define CPU_IAMAX	ISAMAX
+
+#define PIVOT_THRESHHOLD	10e-5

+ 18 - 0
mpi/examples/mpi_lu/pdlu.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "pxlu.c"

+ 18 - 0
mpi/examples/mpi_lu/pdlu_kernels.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "pxlu_kernels.c"

+ 260 - 0
mpi/examples/mpi_lu/plu_example.c

@@ -0,0 +1,260 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include <starpu.h>
+
+#include "pxlu.h"
+//#include "pxlu_kernels.h"
+
+static unsigned long size = 16384;
+static unsigned nblocks = 16;
+static unsigned check = 0;
+static unsigned p = 1;
+static unsigned q = 1;
+
+static starpu_data_handle *dataA_handles;
+static TYPE **dataA;
+
+/* In order to implement the distributed LU decomposition, we allocate
+ * temporary buffers */
+static starpu_data_handle tmp_11_block_handle;
+static TYPE **tmp_11_block;
+static starpu_data_handle *tmp_12_block_handles;
+static TYPE **tmp_12_block;
+static starpu_data_handle *tmp_21_block_handles;
+static TYPE *tmp_21_block;
+
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-size") == 0) {
+			char *argptr;
+			size = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks") == 0) {
+			char *argptr;
+			nblocks = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-check") == 0) {
+			check = 1;
+		}
+
+		if (strcmp(argv[i], "-p") == 0) {
+			char *argptr;
+			p = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-q") == 0) {
+			char *argptr;
+			q = strtol(argv[++i], &argptr, 10);
+		}
+	}
+}
+
+static void fill_block_with_random(TYPE *blockptr, unsigned size, unsigned nblocks)
+{
+	const unsigned block_size = (size/nblocks);
+
+	unsigned i, j;
+	for (j = 0; j < block_size; j++)
+	for (i = 0; i < block_size; i++)
+	{
+		blockptr[i+j*block_size] = (TYPE)drand48();
+//		blockptr[i+j*block_size] = (i == j)?2.0:0.0;
+	}
+}
+
+starpu_data_handle STARPU_PLU(get_block_handle)(unsigned j, unsigned i)
+{
+	return dataA_handles[i+j*nblocks];
+}
+
+starpu_data_handle STARPU_PLU(get_tmp_11_block_handle)(void)
+{
+	return tmp_11_block_handle;
+}
+
+starpu_data_handle STARPU_PLU(get_tmp_12_block_handle)(unsigned j)
+{
+	return tmp_12_block_handles[j];
+}
+
+starpu_data_handle STARPU_PLU(get_tmp_21_block_handle)(unsigned i)
+{
+	return tmp_21_block_handles[i];
+}
+
+TYPE *STARPU_PLU(get_block)(unsigned j, unsigned i)
+{
+	return dataA[i+j*nblocks];
+}
+
+static void init_matrix(int rank)
+{
+	/* Allocate a grid of data handles, not all of them have to be allocated later on */
+	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle));
+	dataA = calloc(nblocks*nblocks, sizeof(TYPE *));
+
+	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
+
+	/* Allocate all the blocks that belong to this mpi node */
+	unsigned long i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			if (get_block_rank(i, j) == rank)
+			{
+				/* This blocks should be treated by the current MPI process */
+				/* Allocate and fill it */
+				starpu_malloc_pinned_if_possible((void **)&dataA[i+j*nblocks], blocksize);
+
+				fill_block_with_random(STARPU_PLU(get_block)(j, i), size, nblocks);
+
+				/* Register it to StarPU */
+				starpu_register_blas_data(&dataA_handles[i+nblocks*j], 0,
+					(uintptr_t)dataA[i+nblocks*j], size/nblocks,
+					size/nblocks, size/nblocks, sizeof(TYPE));
+			} 
+		}
+	}
+
+	/* Allocate the temporary buffers required for the distributed algorithm */
+
+	/* tmp buffer 11 */
+	starpu_malloc_pinned_if_possible((void **)&tmp_11_block, blocksize);
+	starpu_register_blas_data(&tmp_11_block_handle, 0, (uintptr_t)tmp_11_block,
+			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+
+	/* tmp buffers 12 and 21 */
+	tmp_12_block_handles = malloc(nblocks*sizeof(starpu_data_handle));
+	tmp_21_block_handles = malloc(nblocks*sizeof(starpu_data_handle));
+	tmp_12_block = malloc(nblocks*sizeof(TYPE *));
+	tmp_21_block = malloc(nblocks*sizeof(TYPE *));
+	
+	unsigned k;
+	for (k = 0; k < nblocks; k++)
+	{
+		starpu_malloc_pinned_if_possible((void **)&tmp_12_block[k], blocksize);
+		starpu_register_blas_data(&tmp_12_block_handles[k], 0,
+			(uintptr_t)tmp_12_block[k],
+			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+
+		starpu_malloc_pinned_if_possible((void **)&tmp_21_block[k], blocksize);
+		starpu_register_blas_data(&tmp_21_block_handles[k], 0,
+			(uintptr_t)tmp_21_block[k],
+			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+	}
+}
+
+int get_block_rank(unsigned i, unsigned j)
+{
+	/* Take a 2D block cyclic distribution */
+	/* NB: p (resp. q) is for "direction" i (resp. j) */
+	return (j % q) * p + (i % p);
+}
+
+int main(int argc, char **argv)
+{
+	int rank;
+	int world_size;
+
+	/*
+	 *	Initialization
+	 */
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+
+	srand48((long int)time(NULL));
+
+	parse_args(argc, argv);
+
+	STARPU_ASSERT(p*q == world_size);
+
+	starpu_init(NULL);
+	starpu_mpi_initialize();
+	starpu_helper_init_cublas();
+
+	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+
+	/*
+	 * 	Problem Init
+	 */
+
+	init_matrix(rank);
+
+	TYPE *x, *y;
+
+	if (check)
+	{
+		if (rank == 0)
+			fprintf(stderr, "Compute AX = B\n");
+
+		x = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(x);
+
+		y = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+		
+		unsigned ind;
+		for (ind = 0; ind < size; ind++)
+		{
+			//x[ind] = (TYPE)1.0;
+			x[ind] = (TYPE)drand48();
+			y[ind] = (TYPE)0.0;
+		}
+
+		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
+
+		if (rank == 0)
+		for (ind = 0; ind < 10; ind++)
+		{
+			fprintf(stderr, "y[%d] = %f\n", ind, (float)y[ind]);
+		}
+		
+	}
+
+	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+	fprintf(stderr, "Rank %d PID %d\n", rank, getpid());
+	sleep(10);
+
+	STARPU_PLU(plu_main)(nblocks, rank, world_size);
+
+	/*
+	 * 	Termination
+	 */
+	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+
+	starpu_helper_shutdown_cublas();
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 18 - 0
mpi/examples/mpi_lu/plu_example_double.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "plu_example.c"

+ 18 - 0
mpi/examples/mpi_lu/plu_example_float.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "plu_example.c"

+ 55 - 0
mpi/examples/mpi_lu/plu_solve.c

@@ -0,0 +1,55 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "pxlu.h"
+
+static STARPU_PLU(compute_ax_block)(unsigned size, unsigned nblocks,
+				 TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
+{
+	CPU_GEMV("N", size/nblocks, size/nblocks, 1.0, block_data, size/nblocks, sub_x, 1, 1.0, sub_y, 1);
+}
+
+/* y is only valid on node 0 */
+void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
+{
+	/* Create temporary buffers where all MPI processes are going to
+	 * compute Ai x = yi where Ai is the matrix containing the blocks of A
+	 * affected to process i, and 0 everywhere else. We then have y as the
+	 * sum of all yi. */
+	TYPE *yi = calloc(size, sizeof(TYPE));
+
+	/* Compute Aix = yi */
+	unsigned long i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			if (get_block_rank(i, j) == rank)
+			{
+				/* That block belongs to the current MPI process */
+				TYPE *block_data = STARPU_PLU(get_block)(j, i);
+				TYPE *sub_x = &x[i*(size/nblocks)];
+				TYPE *sub_yi = &yi[j*(size/nblocks)];
+
+				STARPU_PLU(compute_ax_block)(size, nblocks, block_data, sub_x, sub_yi);
+			}
+		}
+	}
+
+	/* Compute the Sum of all yi = y */
+	MPI_Reduce(yi, y, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+}

+ 18 - 0
mpi/examples/mpi_lu/plu_solve_double.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "plu_solve.c"

+ 18 - 0
mpi/examples/mpi_lu/plu_solve_float.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "plu_solve.c"

+ 18 - 0
mpi/examples/mpi_lu/pslu.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "pxlu.c"

+ 18 - 0
mpi/examples/mpi_lu/pslu_kernels.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "pxlu_kernels.c"

+ 635 - 0
mpi/examples/mpi_lu/pxlu.c

@@ -0,0 +1,635 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "pxlu.h"
+#include "pxlu_kernels.h"
+
+#define MPI_TAG11(k)	((1U << 30) | (k))
+#define MPI_TAG12(k, j)	((2U << 30) | (j)*nblocks | (k))
+#define MPI_TAG21(k, i)	((3U << 30) | (i)*nblocks | (k))
+
+#define TAG11(k)	((starpu_tag_t)( (1ULL<<50) | (unsigned long long)(k)))
+#define TAG12(k,i)	((starpu_tag_t)(((2ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+#define TAG21(k,j)	((starpu_tag_t)(((3ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG22(k,i,j)	((starpu_tag_t)(((4ULL<<50) | ((unsigned long long)(k)<<32) 	\
+					| ((unsigned long long)(i)<<16)	\
+					| (unsigned long long)(j))))
+#define TAG11_SAVE(k)	((starpu_tag_t)( (5ULL<<50) | (unsigned long long)(k)))
+#define TAG12_SAVE(k,i)	((starpu_tag_t)(((6ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+#define TAG21_SAVE(k,j)	((starpu_tag_t)(((7ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+
+#define TAG11_SAVE_PARTIAL(k)	((starpu_tag_t)( (8ULL<<50) | (unsigned long long)(k)))
+#define TAG12_SAVE_PARTIAL(k,i)	((starpu_tag_t)(((9ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+#define TAG21_SAVE_PARTIAL(k,j)	((starpu_tag_t)(((10ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+
+#define STARPU_TAG_INIT	((starpu_tag_t)(11ULL<<50))
+
+
+static unsigned no_prio = 0;
+
+static unsigned nblocks = 0;
+static int rank = -1;
+static int world_size = -1;
+
+struct callback_arg {
+	unsigned i, j, k;
+};
+
+/*
+ *	Various
+ */
+
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *task = starpu_task_create();
+		task->cl_arg = NULL;
+
+	task->use_tag = 1;
+	task->tag_id = id;
+
+	return task;
+}
+
+/* Send handle to every node appearing in the mask, and unlock tag once the
+ * transfers are done. */
+static void send_data_to_mask(starpu_data_handle handle, int *rank_mask, int mpi_tag, starpu_tag_t tag)
+{
+	unsigned cnt = 0;
+
+	int rank_array[world_size];
+	int comm_array[world_size];
+	int mpi_tag_array[world_size];
+	starpu_data_handle handle_array[world_size];
+
+	unsigned r;
+	for (r = 0; r < world_size; r++)
+	{
+		if (rank_mask[r])
+			rank_array[cnt++] = r;
+
+		comm_array[r] = MPI_COMM_WORLD;
+		mpi_tag_array[r] = mpi_tag;
+		handle_array[r] = handle;
+	}
+
+	if (cnt == 0)
+	{
+		/* In case there is no message to send, we release the tag at
+		 * once */
+		starpu_tag_notify_from_apps(tag);
+	}
+	else {
+		starpu_mpi_isend_array_detached_unlock_tag(cnt, handle_array,
+				rank_array, mpi_tag_array, comm_array, tag);
+	}
+}
+
+/* Initiate a receive request once all dependencies are fulfilled and unlock
+ * tag 'unlocked_tag' once it's done. */
+
+struct recv_when_done_callback_arg {
+	int source;
+	int mpi_tag;
+	starpu_data_handle handle;
+	starpu_tag_t unlocked_tag;
+};
+
+static void callback_receive_when_done(void *_arg)
+{
+	struct recv_when_done_callback_arg *arg = _arg;
+
+	starpu_mpi_irecv_detached_unlock_tag(arg->handle, arg->source,
+			arg->mpi_tag, MPI_COMM_WORLD, arg->unlocked_tag);
+
+	free(arg);
+}
+
+static void receive_when_deps_are_done(unsigned ndeps, starpu_tag_t *deps_tags,
+				int source, int mpi_tag,
+				starpu_data_handle handle,
+				starpu_tag_t partial_tag,
+				starpu_tag_t unlocked_tag)
+{
+	if (ndeps == 0)
+	{
+		starpu_tag_notify_from_apps(unlocked_tag);
+		return;
+	}
+
+	struct recv_when_done_callback_arg *arg =
+		malloc(sizeof(struct recv_when_done_callback_arg));
+	
+	arg->source = source;
+	arg->mpi_tag = mpi_tag;
+	arg->handle = handle;
+	arg->unlocked_tag = unlocked_tag;
+
+	starpu_create_sync_task(partial_tag, ndeps, deps_tags,
+					callback_receive_when_done, arg);
+}
+
+/*
+ *	Task 11 (diagonal factorization)
+ */
+
+static void create_task_11_recv(unsigned k)
+{
+	unsigned i, j;
+
+	/* The current node is not computing that task, so we receive the block
+	 * with MPI */
+
+	/* We don't issue a MPI receive request until everyone using the
+	 * temporary buffer is done : 11_(k-1) can be used by 12_(k-1)j and
+	 * 21(k-1)i with i,j >= k */
+	unsigned ndeps = 0;
+	starpu_tag_t tag_array[2*nblocks];
+	
+	if (k > 0)
+	for (i = k; i < nblocks; i++)
+	{
+		if (rank == get_block_rank(i, k))
+			tag_array[ndeps++] = TAG21(k-1, i);
+	}
+
+	if (k > 0)
+	for (j = k; j < nblocks; j++)
+	{
+		if (rank == get_block_rank(k, j))
+			tag_array[ndeps++] = TAG12(k-1, j);
+	}
+	
+	
+	int source = get_block_rank(k, j);
+	starpu_data_handle block_handle = STARPU_PLU(get_tmp_11_block_handle)();
+	int mpi_tag = MPI_TAG11(k);
+	starpu_tag_t partial_tag = TAG11_SAVE_PARTIAL(k);
+	starpu_tag_t unlocked_tag = TAG11_SAVE(k);
+
+	receive_when_deps_are_done(ndeps, tag_array, source, mpi_tag, block_handle, partial_tag, unlocked_tag);
+}
+
+static void find_nodes_using_11(unsigned k, int *rank_mask)
+{
+	memset(rank_mask, 0, world_size*sizeof(int));
+
+	/* Block 11_k is used to compute 12_kj + 12ki with i,j > k */
+	unsigned i;
+	for (i = k; i < nblocks; i++)
+	{
+		int r = get_block_rank(i, k);
+		rank_mask[r] = 1;
+	}
+
+	unsigned j;
+	for (j = k; j < nblocks; j++)
+	{
+		int r = get_block_rank(k, j);
+		rank_mask[r] = 1;
+	}
+}
+
+static void callback_task_11_real(void *_arg)
+{
+	struct callback_arg *arg = _arg;
+
+	unsigned k = arg->k;
+
+	/* Find all the nodes potentially requiring this block */
+	int rank_mask[world_size];
+	find_nodes_using_11(k, rank_mask);
+	rank_mask[rank] = 0;
+
+	/* Send the block to those nodes */
+	starpu_data_handle block_handle = STARPU_PLU(get_block_handle)(k, k);
+	starpu_tag_t tag = TAG11_SAVE(k);
+	int mpi_tag = MPI_TAG11(k);
+	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
+	
+	free(arg);
+}
+
+static void create_task_11_real(unsigned k)
+{
+	struct starpu_task *task = create_task(TAG11(k));
+
+	task->cl = &STARPU_PLU(cl11);
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = STARPU_PLU(get_block_handle)(k, k);
+	task->buffers[0].mode = STARPU_RW;
+
+	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
+		arg->k = k;
+
+	task->callback_func = callback_task_11_real;
+	task->callback_arg = arg;
+
+	/* this is an important task */
+	if (!no_prio)
+		task->priority = MAX_PRIO;
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
+	}
+	else {
+		starpu_tag_declare_deps(TAG11(k), 1, STARPU_TAG_INIT);
+	}
+
+	starpu_submit_task(task);
+}
+
+static void create_task_11(unsigned k)
+{
+	if (get_block_rank(k, k) == rank)
+	{
+		create_task_11_real(k);
+	}
+	else {
+		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
+		int rank_mask[world_size];
+		find_nodes_using_11(k, rank_mask);
+		
+		if (rank_mask[rank])
+			create_task_11_recv(k);
+	}
+}
+
+
+
+/*
+ *	Task 12 (Update lower left (TRSM))
+ */
+
+static void create_task_12_recv(unsigned k, unsigned j)
+{
+	unsigned i;
+
+	/* The current node is not computing that task, so we receive the block
+	 * with MPI */
+
+	/* We don't issue a MPI receive request until everyone using the
+	 * temporary buffer is done : 12_(k-1)j can be used by 22_(k-1)ij with
+	 * i >= k */
+	unsigned ndeps = 0;
+	starpu_tag_t tag_array[nblocks];
+	
+	if (k > 0)
+	for (i = k; i < nblocks; i++)
+	{
+		if (rank == get_block_rank(i, j))
+			tag_array[ndeps++] = TAG22(k-1, i, j);
+	}
+	
+	int source = get_block_rank(k, j);
+	starpu_data_handle block_handle = STARPU_PLU(get_tmp_12_block_handle)(j);
+	int mpi_tag = MPI_TAG21(j, k);
+	starpu_tag_t partial_tag = TAG21_SAVE_PARTIAL(j, k);
+	starpu_tag_t unlocked_tag = TAG21_SAVE(j, k);
+
+	receive_when_deps_are_done(ndeps, tag_array, source, mpi_tag, block_handle, partial_tag, unlocked_tag);
+}
+
+static void find_nodes_using_12(unsigned k, unsigned j, int *rank_mask)
+{
+	memset(rank_mask, 0, world_size*sizeof(int));
+
+	/* Block 12_kj is used to compute 22_kij with i > k */
+	unsigned i;
+	for (i = k; i < nblocks; i++)
+	{
+		int r = get_block_rank(i, j);
+		rank_mask[r] = 1;
+	}
+}
+
+static void callback_task_12_real(void *_arg)
+{
+	struct callback_arg *arg = _arg;
+
+	unsigned k = arg->k;
+	unsigned j = arg->j;
+
+	/* Find all the nodes potentially requiring this block */
+	int rank_mask[world_size];
+	find_nodes_using_12(k, j, rank_mask);
+	rank_mask[rank] = 0;
+
+	/* Send the block to those nodes */
+	starpu_data_handle block_handle = STARPU_PLU(get_block_handle)(j, k);
+	starpu_tag_t tag = TAG12_SAVE(k, j);
+	int mpi_tag = MPI_TAG12(k, j);
+	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
+	
+	free(arg);
+}
+
+static void create_task_12_real(unsigned k, unsigned j)
+{
+	struct starpu_task *task = create_task(TAG12(k, j));
+	
+	task->cl = &STARPU_PLU(cl12);
+
+	/* which sub-data is manipulated ? */
+	starpu_data_handle diag_block;
+	if (get_block_rank(k, k) == rank)
+		diag_block = STARPU_PLU(get_block_handle)(k, k);
+	else 
+		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
+
+	task->buffers[0].handle = diag_block; 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = STARPU_PLU(get_block_handle)(j, k); 
+	task->buffers[1].mode = STARPU_RW;
+
+	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
+		arg->j = j;
+		arg->k = k;
+
+	task->callback_func = callback_task_12_real;
+	task->callback_arg = arg;
+
+	if (!no_prio && (j == k+1)) {
+		task->priority = MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG12(k, j), 2, TAG11_SAVE(k), TAG22(k-1, k, j));
+	}
+	else {
+		starpu_tag_declare_deps(TAG12(k, j), 1, TAG11_SAVE(k));
+	}
+
+	starpu_submit_task(task);
+}
+
+static void create_task_12(unsigned k, unsigned j)
+{
+	if (get_block_rank(k, j) == rank)
+	{
+		create_task_12_real(k, j);
+	}
+	else {
+		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
+		int rank_mask[world_size];
+		find_nodes_using_12(k, j, rank_mask);
+		
+		if (rank_mask[rank])
+			create_task_12_recv(k, j);
+	}
+}
+
+/*
+ *	Task 21 (Update upper right (TRSM))
+ */
+
+static void create_task_21_recv(unsigned k, unsigned i)
+{
+	unsigned j;
+
+	/* The current node is not computing that task, so we receive the block
+	 * with MPI */
+
+	/* We don't issue a MPI receive request until everyone using the
+	 * temporary buffer is done : 21_(k-1)i can be used by 22_(k-1)ij with
+	 * j >= k */
+	unsigned ndeps = 0;
+	starpu_tag_t tag_array[nblocks];
+	
+	if (k > 0)
+	for (j = k; j < nblocks; j++)
+	{
+		if (rank == get_block_rank(i, j))
+			tag_array[ndeps++] = TAG22(k-1, i, j);
+	}
+	
+	int source = get_block_rank(i, k);
+	starpu_data_handle block_handle = STARPU_PLU(get_tmp_21_block_handle)(i);
+	int mpi_tag = MPI_TAG21(k, i);
+	starpu_tag_t partial_tag = TAG21_SAVE_PARTIAL(k, i);
+	starpu_tag_t unlocked_tag = TAG21_SAVE(k, i);
+
+	receive_when_deps_are_done(ndeps, tag_array, source, mpi_tag, block_handle, partial_tag, unlocked_tag);
+}
+
+static void find_nodes_using_21(unsigned k, unsigned i, int *rank_mask)
+{
+	memset(rank_mask, 0, world_size*sizeof(int));
+
+	/* Block 21_ki is used to compute 22_kij with j > k */
+	unsigned j;
+	for (j = k; j < nblocks; j++)
+	{
+		int r = get_block_rank(i, j);
+		rank_mask[r] = 1;
+	}
+}
+
+static void callback_task_21_real(void *_arg)
+{
+	struct callback_arg *arg = _arg;
+
+	unsigned k = arg->k;
+	unsigned i = arg->i;
+
+	/* Find all the nodes potentially requiring this block */
+	int rank_mask[world_size];
+	find_nodes_using_21(k, i, rank_mask);
+	rank_mask[rank] = 0;
+
+	/* Send the block to those nodes */
+	starpu_data_handle block_handle = STARPU_PLU(get_block_handle)(k, i);
+	starpu_tag_t tag = TAG21_SAVE(k, i);
+	int mpi_tag = MPI_TAG21(k, i);
+	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
+	
+	free(arg);
+}
+
+static void create_task_21_real(unsigned k, unsigned i)
+{
+	struct starpu_task *task = create_task(TAG21(k, i));
+
+	task->cl = &STARPU_PLU(cl21);
+	
+	/* which sub-data is manipulated ? */
+	starpu_data_handle diag_block;
+	if (get_block_rank(k, k) == rank)
+		diag_block = STARPU_PLU(get_block_handle)(k, k);
+	else 
+		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
+
+	task->buffers[0].handle = diag_block; 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = STARPU_PLU(get_block_handle)(k, i);
+	task->buffers[1].mode = STARPU_RW;
+
+	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
+		arg->i = i;
+		arg->k = k;
+
+	task->callback_func = callback_task_21_real;
+	task->callback_arg = arg;
+
+	if (!no_prio && (i == k+1)) {
+		task->priority = MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG21(k, i), 2, TAG11_SAVE(k), TAG22(k-1, i, k));
+	}
+	else {
+		starpu_tag_declare_deps(TAG21(k, i), 1, TAG11_SAVE(k));
+	}
+
+	starpu_submit_task(task);
+}
+
+static void create_task_21(unsigned k, unsigned i)
+{
+	if (get_block_rank(i, k) == rank)
+	{
+		create_task_21_real(k, i);
+	}
+	else {
+		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
+		int rank_mask[world_size];
+		find_nodes_using_21(k, i, rank_mask);
+		
+		if (rank_mask[rank])
+			create_task_21_recv(k, i);
+	}
+}
+
+/*
+ *	Task 22 (GEMM)
+ */
+
+static void create_task_22(unsigned k, unsigned i, unsigned j)
+{
+//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
+
+	struct starpu_task *task = create_task(TAG22(k, i, j));
+
+	task->cl = &STARPU_PLU(cl22);
+
+	/* which sub-data is manipulated ? */
+
+	/* produced by TAG21_SAVE(k, i) */ 
+	starpu_data_handle block21;
+	if (get_block_rank(i, k) == rank)
+		block21 = STARPU_PLU(get_block_handle)(k, i);
+	else 
+		block21 = STARPU_PLU(get_tmp_21_block_handle)(i);
+
+	task->buffers[0].handle = block21;
+	task->buffers[0].mode = STARPU_R;
+
+	/* produced by TAG12_SAVE(k, j) */
+	starpu_data_handle block12;
+	if (get_block_rank(k, j) == rank)
+		block12 = STARPU_PLU(get_block_handle)(j, k);
+	else 
+		block12 = STARPU_PLU(get_tmp_12_block_handle)(j);
+
+	task->buffers[1].handle = block12;
+	task->buffers[1].mode = STARPU_R;
+
+	/* produced by TAG22(k-1, i, j) */
+	task->buffers[2].handle = STARPU_PLU(get_block_handle)(j, i);
+	task->buffers[2].mode = STARPU_RW;
+
+	if (!no_prio &&  (i == k + 1) && (j == k +1) ) {
+		task->priority = MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG12_SAVE(k, j), TAG21_SAVE(k, i));
+	}
+	else {
+		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG12_SAVE(k, j), TAG21_SAVE(k, i));
+	}
+
+	starpu_submit_task(task);
+}
+
+/*
+ *	code to bootstrap the factorization 
+ */
+
+void STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
+{
+	struct timeval start;
+	struct timeval end;
+
+	nblocks = _nblocks;
+	rank = _rank;
+	world_size = _world_size;
+
+	struct starpu_task *entry_task = NULL;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	for (k = 0; k < nblocks; k++)
+	{
+		create_task_11(k);
+
+		for (i = k+1; i<nblocks; i++)
+		{
+			create_task_12(k, i);
+			create_task_21(k, i);
+		}
+
+		for (i = k+1; i<nblocks; i++)
+		{
+			for (j = k+1; j<nblocks; j++)
+			{
+				create_task_22(k, i, j);
+			}
+		}
+	}
+
+	/* schedule the codelet */
+	gettimeofday(&start, NULL);
+
+	fprintf(stderr, "Rank %d GO\n", rank);
+	starpu_tag_notify_from_apps(STARPU_TAG_INIT);
+
+	/* stall the application until the end of computations : note that it
+	 * may be liberated explicitely by MPI */
+	starpu_tag_wait(TAG11(nblocks-1));
+
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	fprintf(stderr, "Computation took (in ms)\n");
+	printf("%2.2f\n", timing/1000);
+
+//	unsigned n = size;
+//	double flop = (2.0f*n*n*n)/3.0f;
+//	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+}

+ 37 - 0
mpi/examples/mpi_lu/pxlu.h

@@ -0,0 +1,37 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __PXLU_H__
+#define __PXLU_H__
+
+/* for USE_CUDA */
+#include <starpu_config.h>
+#include <starpu.h>
+
+#include <common/blas.h>
+
+#include <starpu_mpi.h>
+
+#define BLAS3_FLOP(n1,n2,n3)    \
+        (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
+
+starpu_data_handle STARPU_PLU(get_block_handle)(unsigned j, unsigned i);
+TYPE *STARPU_PLU(get_block)(unsigned j, unsigned i);
+starpu_data_handle STARPU_PLU(get_tmp_11_block_handle)(void);
+starpu_data_handle STARPU_PLU(get_tmp_12_block_handle)(unsigned j);
+starpu_data_handle STARPU_PLU(get_tmp_21_block_handle)(unsigned i);
+
+#endif // __PXLU_H__

+ 383 - 0
mpi/examples/mpi_lu/pxlu_kernels.c

@@ -0,0 +1,383 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "pxlu.h"
+#include "pxlu_kernels.h"
+#include <math.h>
+
+/*
+ *   U22 
+ */
+
+static inline void STARPU_PLU(common_u22)(void *descr[],
+				int s, __attribute__((unused)) void *_args)
+{
+	TYPE *right 	= (TYPE *)GET_BLAS_PTR(descr[0]);
+	TYPE *left 	= (TYPE *)GET_BLAS_PTR(descr[1]);
+	TYPE *center 	= (TYPE *)GET_BLAS_PTR(descr[2]);
+
+	unsigned dx = GET_BLAS_NX(descr[2]);
+	unsigned dy = GET_BLAS_NY(descr[2]);
+	unsigned dz = GET_BLAS_NY(descr[0]);
+
+	unsigned ld12 = GET_BLAS_LD(descr[0]);
+	unsigned ld21 = GET_BLAS_LD(descr[1]);
+	unsigned ld22 = GET_BLAS_LD(descr[2]);
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	fprintf(stderr, "KERNEL 22 %d\n", rank);
+
+#ifdef USE_CUDA
+	cublasStatus status;
+	cudaError_t cures;
+#endif
+
+	switch (s) {
+		case 0:
+			CPU_GEMM("N", "N", dy, dx, dz, 
+				(TYPE)-1.0, right, ld21, left, ld12,
+				(TYPE)1.0, center, ld22);
+			break;
+
+#ifdef USE_CUDA
+		case 1:
+			CUBLAS_GEMM('n', 'n', dx, dy, dz,
+				(TYPE)-1.0, right, ld21, left, ld12,
+				(TYPE)1.0f, center, ld22);
+
+			status = cublasGetError();
+			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
+				STARPU_ABORT();
+
+			if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
+				CUDA_REPORT_ERROR(cures);
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+static void STARPU_PLU(cpu_u22)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u22)(descr, 0, _args);
+}
+
+#ifdef USE_CUDA
+static void STARPU_PLU(cublas_u22)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u22)(descr, 1, _args);
+}
+#endif// USE_CUDA
+
+static struct starpu_perfmodel_t STARPU_PLU(model_22) = {
+	.type = HISTORY_BASED,
+#ifdef ATLAS
+	.symbol = STARPU_PLU_STR(lu_model_22_atlas)
+#elif defined(GOTO)
+	.symbol = STARPU_PLU_STR(lu_model_22_goto)
+#else
+	.symbol = STARPU_PLU_STR(lu_model_22)
+#endif
+};
+
+starpu_codelet STARPU_PLU(cl22) = {
+	.where = CORE|CUDA,
+	.core_func = STARPU_PLU(cpu_u22),
+#ifdef USE_CUDA
+	.cuda_func = STARPU_PLU(cublas_u22),
+#endif
+	.nbuffers = 3,
+	.model = &STARPU_PLU(model_22)
+};
+
+
+/*
+ * U12
+ */
+
+static inline void STARPU_PLU(common_u12)(void *descr[],
+				int s, __attribute__((unused)) void *_args)
+{
+	TYPE *sub11;
+	TYPE *sub12;
+
+	sub11 = (TYPE *)GET_BLAS_PTR(descr[0]);	
+	sub12 = (TYPE *)GET_BLAS_PTR(descr[1]);
+
+	unsigned ld11 = GET_BLAS_LD(descr[0]);
+	unsigned ld12 = GET_BLAS_LD(descr[1]);
+
+	unsigned nx12 = GET_BLAS_NX(descr[1]);
+	unsigned ny12 = GET_BLAS_NY(descr[1]);
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	fprintf(stderr, "KERNEL 12 %d\n", rank);
+
+#ifdef USE_CUDA
+	cublasStatus status;
+	cudaError_t cures;
+#endif
+
+	/* solve L11 U12 = A12 (find U12) */
+	switch (s) {
+		case 0:
+			CPU_TRSM("L", "L", "N", "N", nx12, ny12,
+					(TYPE)1.0, sub11, ld11, sub12, ld12);
+			break;
+#ifdef USE_CUDA
+		case 1:
+			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
+					(TYPE)1.0, sub11, ld11, sub12, ld12);
+
+			status = cublasGetError();
+			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
+				STARPU_ABORT();
+
+			if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
+				CUDA_REPORT_ERROR(cures);
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+static void STARPU_PLU(cpu_u12)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u12)(descr, 0, _args);
+}
+
+#ifdef USE_CUDA
+static void STARPU_PLU(cublas_u12)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u12)(descr, 1, _args);
+}
+#endif // USE_CUDA
+
+static struct starpu_perfmodel_t STARPU_PLU(model_12) = {
+	.type = HISTORY_BASED,
+#ifdef ATLAS
+	.symbol = STARPU_PLU_STR(lu_model_12_atlas)
+#elif defined(GOTO)
+	.symbol = STARPU_PLU_STR(lu_model_12_goto)
+#else
+	.symbol = STARPU_PLU_STR(lu_model_12)
+#endif
+};
+
+starpu_codelet STARPU_PLU(cl12) = {
+	.where = CORE|CUDA,
+	.core_func = STARPU_PLU(cpu_u12),
+#ifdef USE_CUDA
+	.cuda_func = STARPU_PLU(cublas_u12),
+#endif
+	.nbuffers = 2,
+	.model = &STARPU_PLU(model_12)
+};
+
+
+/* 
+ * U21
+ */
+
+static inline void STARPU_PLU(common_u21)(void *descr[],
+				int s, __attribute__((unused)) void *_args)
+{
+	TYPE *sub11;
+	TYPE *sub21;
+
+	sub11 = (TYPE *)GET_BLAS_PTR(descr[0]);
+	sub21 = (TYPE *)GET_BLAS_PTR(descr[1]);
+
+	unsigned ld11 = GET_BLAS_LD(descr[0]);
+	unsigned ld21 = GET_BLAS_LD(descr[1]);
+
+	unsigned nx21 = GET_BLAS_NX(descr[1]);
+	unsigned ny21 = GET_BLAS_NY(descr[1]);
+	
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	fprintf(stderr, "KERNEL 21 %d\n", rank);
+
+
+#ifdef USE_CUDA
+	cublasStatus status;
+	cudaError_t cures;
+#endif
+
+	switch (s) {
+		case 0:
+			CPU_TRSM("R", "U", "N", "U", nx21, ny21,
+					(TYPE)1.0, sub11, ld11, sub21, ld21);
+			break;
+#ifdef USE_CUDA
+		case 1:
+			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
+					(TYPE)1.0, sub11, ld11, sub21, ld21);
+
+			status = cublasGetError();
+			if (status != CUBLAS_STATUS_SUCCESS)
+				STARPU_ABORT();
+
+			cudaThreadSynchronize();
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+static void STARPU_PLU(cpu_u21)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u21)(descr, 0, _args);
+}
+
+#ifdef USE_CUDA
+static void STARPU_PLU(cublas_u21)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u21)(descr, 1, _args);
+}
+#endif 
+
+static struct starpu_perfmodel_t STARPU_PLU(model_21) = {
+	.type = HISTORY_BASED,
+#ifdef ATLAS
+	.symbol = STARPU_PLU_STR(lu_model_21_atlas)
+#elif defined(GOTO)
+	.symbol = STARPU_PLU_STR(lu_model_21_goto)
+#else
+	.symbol = STARPU_PLU_STR(lu_model_21)
+#endif
+};
+
+starpu_codelet STARPU_PLU(cl21) = {
+	.where = CORE|CUDA,
+	.core_func = STARPU_PLU(cpu_u21),
+#ifdef USE_CUDA
+	.cuda_func = STARPU_PLU(cublas_u21),
+#endif
+	.nbuffers = 2,
+	.model = &STARPU_PLU(model_21)
+};
+
+
+/*
+ *	U11
+ */
+
+static inline void STARPU_PLU(common_u11)(void *descr[],
+				int s, __attribute__((unused)) void *_args)
+{
+	TYPE *sub11;
+
+	sub11 = (TYPE *)GET_BLAS_PTR(descr[0]); 
+
+	unsigned long nx = GET_BLAS_NX(descr[0]);
+	unsigned long ld = GET_BLAS_LD(descr[0]);
+
+	unsigned long z;
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	fprintf(stderr, "KERNEL 11 %d\n", rank);
+
+	switch (s) {
+		case 0:
+			for (z = 0; z < nx; z++)
+			{
+				TYPE pivot;
+				pivot = sub11[z+z*ld];
+				STARPU_ASSERT(pivot != 0.0);
+		
+				CPU_SCAL(nx - z - 1, (1.0/pivot), &sub11[z+(z+1)*ld], ld);
+		
+				CPU_GER(nx - z - 1, nx - z - 1, -1.0,
+						&sub11[(z+1)+z*ld], 1,
+						&sub11[z+(z+1)*ld], ld,
+						&sub11[(z+1) + (z+1)*ld],ld);
+			}
+			break;
+#ifdef USE_CUDA
+		case 1:
+			for (z = 0; z < nx; z++)
+			{
+				TYPE pivot;
+				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
+				cudaStreamSynchronize(0);
+
+				STARPU_ASSERT(pivot != 0.0);
+				
+				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
+				
+				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
+						&sub11[(z+1)+z*ld], 1,
+						&sub11[z+(z+1)*ld], ld,
+						&sub11[(z+1) + (z+1)*ld],ld);
+			}
+			
+			cudaThreadSynchronize();
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+static void STARPU_PLU(cpu_u11)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u11)(descr, 0, _args);
+}
+
+#ifdef USE_CUDA
+static void STARPU_PLU(cublas_u11)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u11)(descr, 1, _args);
+}
+#endif// USE_CUDA
+
+static struct starpu_perfmodel_t STARPU_PLU(model_11) = {
+	.type = HISTORY_BASED,
+#ifdef ATLAS
+	.symbol = STARPU_PLU_STR(lu_model_11_atlas)
+#elif defined(GOTO)
+	.symbol = STARPU_PLU_STR(lu_model_11_goto)
+#else
+	.symbol = STARPU_PLU_STR(lu_model_11)
+#endif
+};
+
+starpu_codelet STARPU_PLU(cl11) = {
+	.where = CORE|CUDA,
+	.core_func = STARPU_PLU(cpu_u11),
+#ifdef USE_CUDA
+	.cuda_func = STARPU_PLU(cublas_u11),
+#endif
+	.nbuffers = 1,
+	.model = &STARPU_PLU(model_11)
+};
+
+

+ 31 - 0
mpi/examples/mpi_lu/pxlu_kernels.h

@@ -0,0 +1,31 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __PXLU_KERNELS_H__
+#define __PXLU_KERNELS_H__
+
+#include <starpu.h>
+
+#define str(s) #s
+#define xstr(s)        str(s)
+#define STARPU_PLU_STR(name)  xstr(STARPU_PLU(name))
+
+starpu_codelet STARPU_PLU(cl11);
+starpu_codelet STARPU_PLU(cl12);
+starpu_codelet STARPU_PLU(cl21);
+starpu_codelet STARPU_PLU(cl22);
+
+#endif // __PXLU_KERNELS_H__

+ 18 - 0
mpi/examples/mpi_lu/slu_kernels.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "xlu_kernels.c"