hace 15 años · c46518553b
--- a/mpi/Makefile.am
+++ b/mpi/Makefile.am
@@ -14,6 +14,8 @@
 
				 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				 #
			
 
				 
			
 
				+SUBDIRS = examples
			
 
				+
			
 
				 CC=$(MPICC)
			
 
				 
			
 
				 if USE_CUDA
			
--- a/mpi/examples/Makefile.am
+++ b/mpi/examples/Makefile.am
@@ -0,0 +1,66 @@
 
				+#
			
 
				+# StarPU
			
 
				+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
			
 
				+#
			
 
				+# This program is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# This program is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+#
			
 
				+
			
 
				+CC=$(MPICC)
			
 
				+
			
 
				+LIBS = $(top_builddir)/src/libstarpu.la $(top_builddir)/mpi/libstarpumpi.la @LIBS@
			
 
				+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include  -I$(top_srcdir)/mpi/
			
 
				+
			
 
				+TESTS = $(check_PROGRAMS)
			
 
				+
			
 
				+check_PROGRAMS =
			
 
				+
			
 
				+BUILT_SOURCES =
			
 
				+
			
 
				+CLEANFILES = *.gcno *.gcda *.linkinfo
			
 
				+
			
 
				+if USE_CUDA
			
 
				+
			
 
				+# TODO define NVCCFLAGS
			
 
				+NVCC ?= nvcc
			
 
				+
			
 
				+.cu.o:
			
 
				+	$(NVCC) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I$(top_srcdir)/include/ -I$(top_builddir)/include/
			
 
				+endif
			
 
				+
			
 
				+examplebindir = $(libdir)/starpu/mpi/examples/
			
 
				+
			
 
				+examplebin_PROGRAMS =
			
 
				+
			
 
				+##################
			
 
				+# MPI LU example #
			
 
				+##################
			
 
				+
			
 
				+if !NO_BLAS_LIB
			
 
				+
			
 
				+examplebin_PROGRAMS += 				\
			
 
				+	mpi_lu/plu_example_float		\
			
 
				+	mpi_lu/plu_example_double
			
 
				+
			
 
				+mpi_lu_plu_example_float_SOURCES =		\
			
 
				+	mpi_lu/plu_example_float.c		\
			
 
				+	mpi_lu/plu_solve_float.c		\
			
 
				+	mpi_lu/pslu_kernels.c			\
			
 
				+	mpi_lu/pslu.c				\
			
 
				+	$(top_srcdir)/examples/common/blas.c
			
 
				+
			
 
				+mpi_lu_plu_example_double_SOURCES =		\
			
 
				+	mpi_lu/plu_example_double.c		\
			
 
				+	mpi_lu/plu_solve_double.c		\
			
 
				+	mpi_lu/pdlu_kernels.c			\
			
 
				+	mpi_lu/pdlu.c				\
			
 
				+	$(top_srcdir)/examples/common/blas.c
			
 
				+endif
			
--- a/mpi/examples/mpi_lu/.dirstamp
+++ b/mpi/examples/mpi_lu/.dirstamp
--- a/mpi/examples/mpi_lu/double.h
+++ b/mpi/examples/mpi_lu/double.h
@@ -0,0 +1,41 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#define TYPE double
			
 
				+#define MPI_TYPE	MPI_DOUBLE
			
 
				+
			
 
				+#define STARPU_PLU(name)       starpu_pdlu_##name
			
 
				+
			
 
				+#define CUBLAS_GEMM	cublasDgemm
			
 
				+#define CUBLAS_TRSM	cublasDtrsm
			
 
				+#define CUBLAS_SCAL	cublasDscal
			
 
				+#define CUBLAS_GER	cublasDger
			
 
				+#define CUBLAS_SWAP	cublasDswap
			
 
				+#define CUBLAS_IAMAX	cublasIdamax
			
 
				+
			
 
				+#define CPU_GEMM	DGEMM
			
 
				+#define CPU_GEMV	DGEMV
			
 
				+#define CPU_TRSM	DTRSM
			
 
				+#define CPU_SCAL	DSCAL
			
 
				+#define CPU_GER		DGER
			
 
				+#define CPU_SWAP	DSWAP
			
 
				+
			
 
				+#define CPU_TRMM	DTRMM
			
 
				+#define CPU_AXPY	DAXPY
			
 
				+#define CPU_ASUM	DASUM
			
 
				+#define CPU_IAMAX	IDAMAX
			
 
				+
			
 
				+#define PIVOT_THRESHHOLD	10e-10
			
--- a/mpi/examples/mpi_lu/float.h
+++ b/mpi/examples/mpi_lu/float.h
@@ -0,0 +1,41 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#define TYPE float
			
 
				+#define MPI_TYPE	MPI_FLOAT
			
 
				+
			
 
				+#define STARPU_PLU(name)       starpu_pslu_##name
			
 
				+
			
 
				+#define CUBLAS_GEMM	cublasSgemm
			
 
				+#define CUBLAS_TRSM	cublasStrsm
			
 
				+#define CUBLAS_SCAL	cublasSscal
			
 
				+#define CUBLAS_GER	cublasSger
			
 
				+#define CUBLAS_SWAP	cublasSswap
			
 
				+#define CUBLAS_IAMAX	cublasIsamax
			
 
				+
			
 
				+#define CPU_GEMM	SGEMM
			
 
				+#define CPU_GEMV	SGEMV
			
 
				+#define CPU_TRSM	STRSM
			
 
				+#define CPU_SCAL	SSCAL
			
 
				+#define CPU_GER		SGER
			
 
				+#define CPU_SWAP	SSWAP
			
 
				+
			
 
				+#define CPU_TRMM	STRMM
			
 
				+#define CPU_AXPY	SAXPY
			
 
				+#define CPU_ASUM	SASUM
			
 
				+#define CPU_IAMAX	ISAMAX
			
 
				+
			
 
				+#define PIVOT_THRESHHOLD	10e-5
			
--- a/mpi/examples/mpi_lu/pdlu.c
+++ b/mpi/examples/mpi_lu/pdlu.c
@@ -0,0 +1,18 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "double.h"
			
 
				+#include "pxlu.c"
			
--- a/mpi/examples/mpi_lu/pdlu_kernels.c
+++ b/mpi/examples/mpi_lu/pdlu_kernels.c
@@ -0,0 +1,18 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "double.h"
			
 
				+#include "pxlu_kernels.c"
			
--- a/mpi/examples/mpi_lu/plu_example.c
+++ b/mpi/examples/mpi_lu/plu_example.c
@@ -0,0 +1,260 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <stdlib.h>
			
 
				+#include <stdio.h>
			
 
				+#include <string.h>
			
 
				+#include <time.h>
			
 
				+#include <math.h>
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+#include "pxlu.h"
			
 
				+//#include "pxlu_kernels.h"
			
 
				+
			
 
				+static unsigned long size = 16384;
			
 
				+static unsigned nblocks = 16;
			
 
				+static unsigned check = 0;
			
 
				+static unsigned p = 1;
			
 
				+static unsigned q = 1;
			
 
				+
			
 
				+static starpu_data_handle *dataA_handles;
			
 
				+static TYPE **dataA;
			
 
				+
			
 
				+/* In order to implement the distributed LU decomposition, we allocate
			
 
				+ * temporary buffers */
			
 
				+static starpu_data_handle tmp_11_block_handle;
			
 
				+static TYPE **tmp_11_block;
			
 
				+static starpu_data_handle *tmp_12_block_handles;
			
 
				+static TYPE **tmp_12_block;
			
 
				+static starpu_data_handle *tmp_21_block_handles;
			
 
				+static TYPE *tmp_21_block;
			
 
				+
			
 
				+static void parse_args(int argc, char **argv)
			
 
				+{
			
 
				+	int i;
			
 
				+	for (i = 1; i < argc; i++) {
			
 
				+		if (strcmp(argv[i], "-size") == 0) {
			
 
				+			char *argptr;
			
 
				+			size = strtol(argv[++i], &argptr, 10);
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-nblocks") == 0) {
			
 
				+			char *argptr;
			
 
				+			nblocks = strtol(argv[++i], &argptr, 10);
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-check") == 0) {
			
 
				+			check = 1;
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-p") == 0) {
			
 
				+			char *argptr;
			
 
				+			p = strtol(argv[++i], &argptr, 10);
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-q") == 0) {
			
 
				+			char *argptr;
			
 
				+			q = strtol(argv[++i], &argptr, 10);
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void fill_block_with_random(TYPE *blockptr, unsigned size, unsigned nblocks)
			
 
				+{
			
 
				+	const unsigned block_size = (size/nblocks);
			
 
				+
			
 
				+	unsigned i, j;
			
 
				+	for (j = 0; j < block_size; j++)
			
 
				+	for (i = 0; i < block_size; i++)
			
 
				+	{
			
 
				+		blockptr[i+j*block_size] = (TYPE)drand48();
			
 
				+//		blockptr[i+j*block_size] = (i == j)?2.0:0.0;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+starpu_data_handle STARPU_PLU(get_block_handle)(unsigned j, unsigned i)
			
 
				+{
			
 
				+	return dataA_handles[i+j*nblocks];
			
 
				+}
			
 
				+
			
 
				+starpu_data_handle STARPU_PLU(get_tmp_11_block_handle)(void)
			
 
				+{
			
 
				+	return tmp_11_block_handle;
			
 
				+}
			
 
				+
			
 
				+starpu_data_handle STARPU_PLU(get_tmp_12_block_handle)(unsigned j)
			
 
				+{
			
 
				+	return tmp_12_block_handles[j];
			
 
				+}
			
 
				+
			
 
				+starpu_data_handle STARPU_PLU(get_tmp_21_block_handle)(unsigned i)
			
 
				+{
			
 
				+	return tmp_21_block_handles[i];
			
 
				+}
			
 
				+
			
 
				+TYPE *STARPU_PLU(get_block)(unsigned j, unsigned i)
			
 
				+{
			
 
				+	return dataA[i+j*nblocks];
			
 
				+}
			
 
				+
			
 
				+static void init_matrix(int rank)
			
 
				+{
			
 
				+	/* Allocate a grid of data handles, not all of them have to be allocated later on */
			
 
				+	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle));
			
 
				+	dataA = calloc(nblocks*nblocks, sizeof(TYPE *));
			
 
				+
			
 
				+	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
			
 
				+
			
 
				+	/* Allocate all the blocks that belong to this mpi node */
			
 
				+	unsigned long i,j;
			
 
				+	for (j = 0; j < nblocks; j++)
			
 
				+	{
			
 
				+		for (i = 0; i < nblocks; i++)
			
 
				+		{
			
 
				+			if (get_block_rank(i, j) == rank)
			
 
				+			{
			
 
				+				/* This blocks should be treated by the current MPI process */
			
 
				+				/* Allocate and fill it */
			
 
				+				starpu_malloc_pinned_if_possible((void **)&dataA[i+j*nblocks], blocksize);
			
 
				+
			
 
				+				fill_block_with_random(STARPU_PLU(get_block)(j, i), size, nblocks);
			
 
				+
			
 
				+				/* Register it to StarPU */
			
 
				+				starpu_register_blas_data(&dataA_handles[i+nblocks*j], 0,
			
 
				+					(uintptr_t)dataA[i+nblocks*j], size/nblocks,
			
 
				+					size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				+			} 
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	/* Allocate the temporary buffers required for the distributed algorithm */
			
 
				+
			
 
				+	/* tmp buffer 11 */
			
 
				+	starpu_malloc_pinned_if_possible((void **)&tmp_11_block, blocksize);
			
 
				+	starpu_register_blas_data(&tmp_11_block_handle, 0, (uintptr_t)tmp_11_block,
			
 
				+			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				+
			
 
				+	/* tmp buffers 12 and 21 */
			
 
				+	tmp_12_block_handles = malloc(nblocks*sizeof(starpu_data_handle));
			
 
				+	tmp_21_block_handles = malloc(nblocks*sizeof(starpu_data_handle));
			
 
				+	tmp_12_block = malloc(nblocks*sizeof(TYPE *));
			
 
				+	tmp_21_block = malloc(nblocks*sizeof(TYPE *));
			
 
				+	
			
 
				+	unsigned k;
			
 
				+	for (k = 0; k < nblocks; k++)
			
 
				+	{
			
 
				+		starpu_malloc_pinned_if_possible((void **)&tmp_12_block[k], blocksize);
			
 
				+		starpu_register_blas_data(&tmp_12_block_handles[k], 0,
			
 
				+			(uintptr_t)tmp_12_block[k],
			
 
				+			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				+
			
 
				+		starpu_malloc_pinned_if_possible((void **)&tmp_21_block[k], blocksize);
			
 
				+		starpu_register_blas_data(&tmp_21_block_handles[k], 0,
			
 
				+			(uintptr_t)tmp_21_block[k],
			
 
				+			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+int get_block_rank(unsigned i, unsigned j)
			
 
				+{
			
 
				+	/* Take a 2D block cyclic distribution */
			
 
				+	/* NB: p (resp. q) is for "direction" i (resp. j) */
			
 
				+	return (j % q) * p + (i % p);
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	int rank;
			
 
				+	int world_size;
			
 
				+
			
 
				+	/*
			
 
				+	 *	Initialization
			
 
				+	 */
			
 
				+	MPI_Init(&argc, &argv);
			
 
				+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
			
 
				+
			
 
				+	srand48((long int)time(NULL));
			
 
				+
			
 
				+	parse_args(argc, argv);
			
 
				+
			
 
				+	STARPU_ASSERT(p*q == world_size);
			
 
				+
			
 
				+	starpu_init(NULL);
			
 
				+	starpu_mpi_initialize();
			
 
				+	starpu_helper_init_cublas();
			
 
				+
			
 
				+	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
			
 
				+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
			
 
				+
			
 
				+	/*
			
 
				+	 * 	Problem Init
			
 
				+	 */
			
 
				+
			
 
				+	init_matrix(rank);
			
 
				+
			
 
				+	TYPE *x, *y;
			
 
				+
			
 
				+	if (check)
			
 
				+	{
			
 
				+		if (rank == 0)
			
 
				+			fprintf(stderr, "Compute AX = B\n");
			
 
				+
			
 
				+		x = calloc(size, sizeof(TYPE));
			
 
				+		STARPU_ASSERT(x);
			
 
				+
			
 
				+		y = calloc(size, sizeof(TYPE));
			
 
				+		STARPU_ASSERT(y);
			
 
				+		
			
 
				+		unsigned ind;
			
 
				+		for (ind = 0; ind < size; ind++)
			
 
				+		{
			
 
				+			//x[ind] = (TYPE)1.0;
			
 
				+			x[ind] = (TYPE)drand48();
			
 
				+			y[ind] = (TYPE)0.0;
			
 
				+		}
			
 
				+
			
 
				+		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
			
 
				+
			
 
				+		if (rank == 0)
			
 
				+		for (ind = 0; ind < 10; ind++)
			
 
				+		{
			
 
				+			fprintf(stderr, "y[%d] = %f\n", ind, (float)y[ind]);
			
 
				+		}
			
 
				+		
			
 
				+	}
			
 
				+
			
 
				+	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
			
 
				+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
			
 
				+	fprintf(stderr, "Rank %d PID %d\n", rank, getpid());
			
 
				+	sleep(10);
			
 
				+
			
 
				+	STARPU_PLU(plu_main)(nblocks, rank, world_size);
			
 
				+
			
 
				+	/*
			
 
				+	 * 	Termination
			
 
				+	 */
			
 
				+	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
			
 
				+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
			
 
				+
			
 
				+	starpu_helper_shutdown_cublas();
			
 
				+	starpu_mpi_shutdown();
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	MPI_Finalize();
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
--- a/mpi/examples/mpi_lu/plu_example_double.c
+++ b/mpi/examples/mpi_lu/plu_example_double.c
@@ -0,0 +1,18 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "double.h"
			
 
				+#include "plu_example.c"
			
--- a/mpi/examples/mpi_lu/plu_example_float.c
+++ b/mpi/examples/mpi_lu/plu_example_float.c
@@ -0,0 +1,18 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "float.h"
			
 
				+#include "plu_example.c"
			
--- a/mpi/examples/mpi_lu/plu_solve.c
+++ b/mpi/examples/mpi_lu/plu_solve.c
@@ -0,0 +1,55 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include "pxlu.h"
			
 
				+
			
 
				+static STARPU_PLU(compute_ax_block)(unsigned size, unsigned nblocks,
			
 
				+				 TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
			
 
				+{
			
 
				+	CPU_GEMV("N", size/nblocks, size/nblocks, 1.0, block_data, size/nblocks, sub_x, 1, 1.0, sub_y, 1);
			
 
				+}
			
 
				+
			
 
				+/* y is only valid on node 0 */
			
 
				+void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
			
 
				+{
			
 
				+	/* Create temporary buffers where all MPI processes are going to
			
 
				+	 * compute Ai x = yi where Ai is the matrix containing the blocks of A
			
 
				+	 * affected to process i, and 0 everywhere else. We then have y as the
			
 
				+	 * sum of all yi. */
			
 
				+	TYPE *yi = calloc(size, sizeof(TYPE));
			
 
				+
			
 
				+	/* Compute Aix = yi */
			
 
				+	unsigned long i,j;
			
 
				+	for (j = 0; j < nblocks; j++)
			
 
				+	{
			
 
				+		for (i = 0; i < nblocks; i++)
			
 
				+		{
			
 
				+			if (get_block_rank(i, j) == rank)
			
 
				+			{
			
 
				+				/* That block belongs to the current MPI process */
			
 
				+				TYPE *block_data = STARPU_PLU(get_block)(j, i);
			
 
				+				TYPE *sub_x = &x[i*(size/nblocks)];
			
 
				+				TYPE *sub_yi = &yi[j*(size/nblocks)];
			
 
				+
			
 
				+				STARPU_PLU(compute_ax_block)(size, nblocks, block_data, sub_x, sub_yi);
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	/* Compute the Sum of all yi = y */
			
 
				+	MPI_Reduce(yi, y, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
			
 
				+}
			
--- a/mpi/examples/mpi_lu/plu_solve_double.c
+++ b/mpi/examples/mpi_lu/plu_solve_double.c
@@ -0,0 +1,18 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "double.h"
			
 
				+#include "plu_solve.c"
			
--- a/mpi/examples/mpi_lu/plu_solve_float.c
+++ b/mpi/examples/mpi_lu/plu_solve_float.c
@@ -0,0 +1,18 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "float.h"
			
 
				+#include "plu_solve.c"
			
--- a/mpi/examples/mpi_lu/pslu.c
+++ b/mpi/examples/mpi_lu/pslu.c
@@ -0,0 +1,18 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "float.h"
			
 
				+#include "pxlu.c"
			
--- a/mpi/examples/mpi_lu/pslu_kernels.c
+++ b/mpi/examples/mpi_lu/pslu_kernels.c
@@ -0,0 +1,18 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "float.h"
			
 
				+#include "pxlu_kernels.c"
			
--- a/mpi/examples/mpi_lu/pxlu.c
+++ b/mpi/examples/mpi_lu/pxlu.c
@@ -0,0 +1,635 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "pxlu.h"
			
 
				+#include "pxlu_kernels.h"
			
 
				+
			
 
				+#define MPI_TAG11(k)	((1U << 30) | (k))
			
 
				+#define MPI_TAG12(k, j)	((2U << 30) | (j)*nblocks | (k))
			
 
				+#define MPI_TAG21(k, i)	((3U << 30) | (i)*nblocks | (k))
			
 
				+
			
 
				+#define TAG11(k)	((starpu_tag_t)( (1ULL<<50) | (unsigned long long)(k)))
			
 
				+#define TAG12(k,i)	((starpu_tag_t)(((2ULL<<50) | (((unsigned long long)(k))<<32)	\
			
 
				+					| (unsigned long long)(i))))
			
 
				+#define TAG21(k,j)	((starpu_tag_t)(((3ULL<<50) | (((unsigned long long)(k))<<32)	\
			
 
				+					| (unsigned long long)(j))))
			
 
				+#define TAG22(k,i,j)	((starpu_tag_t)(((4ULL<<50) | ((unsigned long long)(k)<<32) 	\
			
 
				+					| ((unsigned long long)(i)<<16)	\
			
 
				+					| (unsigned long long)(j))))
			
 
				+#define TAG11_SAVE(k)	((starpu_tag_t)( (5ULL<<50) | (unsigned long long)(k)))
			
 
				+#define TAG12_SAVE(k,i)	((starpu_tag_t)(((6ULL<<50) | (((unsigned long long)(k))<<32)	\
			
 
				+					| (unsigned long long)(i))))
			
 
				+#define TAG21_SAVE(k,j)	((starpu_tag_t)(((7ULL<<50) | (((unsigned long long)(k))<<32)	\
			
 
				+					| (unsigned long long)(j))))
			
 
				+
			
 
				+#define TAG11_SAVE_PARTIAL(k)	((starpu_tag_t)( (8ULL<<50) | (unsigned long long)(k)))
			
 
				+#define TAG12_SAVE_PARTIAL(k,i)	((starpu_tag_t)(((9ULL<<50) | (((unsigned long long)(k))<<32)	\
			
 
				+					| (unsigned long long)(i))))
			
 
				+#define TAG21_SAVE_PARTIAL(k,j)	((starpu_tag_t)(((10ULL<<50) | (((unsigned long long)(k))<<32)	\
			
 
				+					| (unsigned long long)(j))))
			
 
				+
			
 
				+#define STARPU_TAG_INIT	((starpu_tag_t)(11ULL<<50))
			
 
				+
			
 
				+
			
 
				+static unsigned no_prio = 0;
			
 
				+
			
 
				+static unsigned nblocks = 0;
			
 
				+static int rank = -1;
			
 
				+static int world_size = -1;
			
 
				+
			
 
				+struct callback_arg {
			
 
				+	unsigned i, j, k;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ *	Various
			
 
				+ */
			
 
				+
			
 
				+static struct starpu_task *create_task(starpu_tag_t id)
			
 
				+{
			
 
				+	struct starpu_task *task = starpu_task_create();
			
 
				+		task->cl_arg = NULL;
			
 
				+
			
 
				+	task->use_tag = 1;
			
 
				+	task->tag_id = id;
			
 
				+
			
 
				+	return task;
			
 
				+}
			
 
				+
			
 
				+/* Send handle to every node appearing in the mask, and unlock tag once the
			
 
				+ * transfers are done. */
			
 
				+static void send_data_to_mask(starpu_data_handle handle, int *rank_mask, int mpi_tag, starpu_tag_t tag)
			
 
				+{
			
 
				+	unsigned cnt = 0;
			
 
				+
			
 
				+	int rank_array[world_size];
			
 
				+	int comm_array[world_size];
			
 
				+	int mpi_tag_array[world_size];
			
 
				+	starpu_data_handle handle_array[world_size];
			
 
				+
			
 
				+	unsigned r;
			
 
				+	for (r = 0; r < world_size; r++)
			
 
				+	{
			
 
				+		if (rank_mask[r])
			
 
				+			rank_array[cnt++] = r;
			
 
				+
			
 
				+		comm_array[r] = MPI_COMM_WORLD;
			
 
				+		mpi_tag_array[r] = mpi_tag;
			
 
				+		handle_array[r] = handle;
			
 
				+	}
			
 
				+
			
 
				+	if (cnt == 0)
			
 
				+	{
			
 
				+		/* In case there is no message to send, we release the tag at
			
 
				+		 * once */
			
 
				+		starpu_tag_notify_from_apps(tag);
			
 
				+	}
			
 
				+	else {
			
 
				+		starpu_mpi_isend_array_detached_unlock_tag(cnt, handle_array,
			
 
				+				rank_array, mpi_tag_array, comm_array, tag);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/* Initiate a receive request once all dependencies are fulfilled and unlock
			
 
				+ * tag 'unlocked_tag' once it's done. */
			
 
				+
			
 
				+struct recv_when_done_callback_arg {
			
 
				+	int source;
			
 
				+	int mpi_tag;
			
 
				+	starpu_data_handle handle;
			
 
				+	starpu_tag_t unlocked_tag;
			
 
				+};
			
 
				+
			
 
				+static void callback_receive_when_done(void *_arg)
			
 
				+{
			
 
				+	struct recv_when_done_callback_arg *arg = _arg;
			
 
				+
			
 
				+	starpu_mpi_irecv_detached_unlock_tag(arg->handle, arg->source,
			
 
				+			arg->mpi_tag, MPI_COMM_WORLD, arg->unlocked_tag);
			
 
				+
			
 
				+	free(arg);
			
 
				+}
			
 
				+
			
 
				+static void receive_when_deps_are_done(unsigned ndeps, starpu_tag_t *deps_tags,
			
 
				+				int source, int mpi_tag,
			
 
				+				starpu_data_handle handle,
			
 
				+				starpu_tag_t partial_tag,
			
 
				+				starpu_tag_t unlocked_tag)
			
 
				+{
			
 
				+	if (ndeps == 0)
			
 
				+	{
			
 
				+		starpu_tag_notify_from_apps(unlocked_tag);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	struct recv_when_done_callback_arg *arg =
			
 
				+		malloc(sizeof(struct recv_when_done_callback_arg));
			
 
				+	
			
 
				+	arg->source = source;
			
 
				+	arg->mpi_tag = mpi_tag;
			
 
				+	arg->handle = handle;
			
 
				+	arg->unlocked_tag = unlocked_tag;
			
 
				+
			
 
				+	starpu_create_sync_task(partial_tag, ndeps, deps_tags,
			
 
				+					callback_receive_when_done, arg);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ *	Task 11 (diagonal factorization)
			
 
				+ */
			
 
				+
			
 
				+static void create_task_11_recv(unsigned k)
			
 
				+{
			
 
				+	unsigned i, j;
			
 
				+
			
 
				+	/* The current node is not computing that task, so we receive the block
			
 
				+	 * with MPI */
			
 
				+
			
 
				+	/* We don't issue a MPI receive request until everyone using the
			
 
				+	 * temporary buffer is done : 11_(k-1) can be used by 12_(k-1)j and
			
 
				+	 * 21(k-1)i with i,j >= k */
			
 
				+	unsigned ndeps = 0;
			
 
				+	starpu_tag_t tag_array[2*nblocks];
			
 
				+	
			
 
				+	if (k > 0)
			
 
				+	for (i = k; i < nblocks; i++)
			
 
				+	{
			
 
				+		if (rank == get_block_rank(i, k))
			
 
				+			tag_array[ndeps++] = TAG21(k-1, i);
			
 
				+	}
			
 
				+
			
 
				+	if (k > 0)
			
 
				+	for (j = k; j < nblocks; j++)
			
 
				+	{
			
 
				+		if (rank == get_block_rank(k, j))
			
 
				+			tag_array[ndeps++] = TAG12(k-1, j);
			
 
				+	}
			
 
				+	
			
 
				+	
			
 
				+	int source = get_block_rank(k, j);
			
 
				+	starpu_data_handle block_handle = STARPU_PLU(get_tmp_11_block_handle)();
			
 
				+	int mpi_tag = MPI_TAG11(k);
			
 
				+	starpu_tag_t partial_tag = TAG11_SAVE_PARTIAL(k);
			
 
				+	starpu_tag_t unlocked_tag = TAG11_SAVE(k);
			
 
				+
			
 
				+	receive_when_deps_are_done(ndeps, tag_array, source, mpi_tag, block_handle, partial_tag, unlocked_tag);
			
 
				+}
			
 
				+
			
 
				+static void find_nodes_using_11(unsigned k, int *rank_mask)
			
 
				+{
			
 
				+	memset(rank_mask, 0, world_size*sizeof(int));
			
 
				+
			
 
				+	/* Block 11_k is used to compute 12_kj + 12ki with i,j > k */
			
 
				+	unsigned i;
			
 
				+	for (i = k; i < nblocks; i++)
			
 
				+	{
			
 
				+		int r = get_block_rank(i, k);
			
 
				+		rank_mask[r] = 1;
			
 
				+	}
			
 
				+
			
 
				+	unsigned j;
			
 
				+	for (j = k; j < nblocks; j++)
			
 
				+	{
			
 
				+		int r = get_block_rank(k, j);
			
 
				+		rank_mask[r] = 1;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void callback_task_11_real(void *_arg)
			
 
				+{
			
 
				+	struct callback_arg *arg = _arg;
			
 
				+
			
 
				+	unsigned k = arg->k;
			
 
				+
			
 
				+	/* Find all the nodes potentially requiring this block */
			
 
				+	int rank_mask[world_size];
			
 
				+	find_nodes_using_11(k, rank_mask);
			
 
				+	rank_mask[rank] = 0;
			
 
				+
			
 
				+	/* Send the block to those nodes */
			
 
				+	starpu_data_handle block_handle = STARPU_PLU(get_block_handle)(k, k);
			
 
				+	starpu_tag_t tag = TAG11_SAVE(k);
			
 
				+	int mpi_tag = MPI_TAG11(k);
			
 
				+	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
			
 
				+	
			
 
				+	free(arg);
			
 
				+}
			
 
				+
			
 
				+static void create_task_11_real(unsigned k)
			
 
				+{
			
 
				+	struct starpu_task *task = create_task(TAG11(k));
			
 
				+
			
 
				+	task->cl = &STARPU_PLU(cl11);
			
 
				+
			
 
				+	/* which sub-data is manipulated ? */
			
 
				+	task->buffers[0].handle = STARPU_PLU(get_block_handle)(k, k);
			
 
				+	task->buffers[0].mode = STARPU_RW;
			
 
				+
			
 
				+	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
			
 
				+		arg->k = k;
			
 
				+
			
 
				+	task->callback_func = callback_task_11_real;
			
 
				+	task->callback_arg = arg;
			
 
				+
			
 
				+	/* this is an important task */
			
 
				+	if (!no_prio)
			
 
				+		task->priority = MAX_PRIO;
			
 
				+
			
 
				+	/* enforce dependencies ... */
			
 
				+	if (k > 0) {
			
 
				+		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
			
 
				+	}
			
 
				+	else {
			
 
				+		starpu_tag_declare_deps(TAG11(k), 1, STARPU_TAG_INIT);
			
 
				+	}
			
 
				+
			
 
				+	starpu_submit_task(task);
			
 
				+}
			
 
				+
			
 
				+static void create_task_11(unsigned k)
			
 
				+{
			
 
				+	if (get_block_rank(k, k) == rank)
			
 
				+	{
			
 
				+		create_task_11_real(k);
			
 
				+	}
			
 
				+	else {
			
 
				+		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
			
 
				+		int rank_mask[world_size];
			
 
				+		find_nodes_using_11(k, rank_mask);
			
 
				+		
			
 
				+		if (rank_mask[rank])
			
 
				+			create_task_11_recv(k);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+ *	Task 12 (Update lower left (TRSM))
			
 
				+ */
			
 
				+
			
 
				+static void create_task_12_recv(unsigned k, unsigned j)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+
			
 
				+	/* The current node is not computing that task, so we receive the block
			
 
				+	 * with MPI */
			
 
				+
			
 
				+	/* We don't issue a MPI receive request until everyone using the
			
 
				+	 * temporary buffer is done : 12_(k-1)j can be used by 22_(k-1)ij with
			
 
				+	 * i >= k */
			
 
				+	unsigned ndeps = 0;
			
 
				+	starpu_tag_t tag_array[nblocks];
			
 
				+	
			
 
				+	if (k > 0)
			
 
				+	for (i = k; i < nblocks; i++)
			
 
				+	{
			
 
				+		if (rank == get_block_rank(i, j))
			
 
				+			tag_array[ndeps++] = TAG22(k-1, i, j);
			
 
				+	}
			
 
				+	
			
 
				+	int source = get_block_rank(k, j);
			
 
				+	starpu_data_handle block_handle = STARPU_PLU(get_tmp_12_block_handle)(j);
			
 
				+	int mpi_tag = MPI_TAG21(j, k);
			
 
				+	starpu_tag_t partial_tag = TAG21_SAVE_PARTIAL(j, k);
			
 
				+	starpu_tag_t unlocked_tag = TAG21_SAVE(j, k);
			
 
				+
			
 
				+	receive_when_deps_are_done(ndeps, tag_array, source, mpi_tag, block_handle, partial_tag, unlocked_tag);
			
 
				+}
			
 
				+
			
 
				+static void find_nodes_using_12(unsigned k, unsigned j, int *rank_mask)
			
 
				+{
			
 
				+	memset(rank_mask, 0, world_size*sizeof(int));
			
 
				+
			
 
				+	/* Block 12_kj is used to compute 22_kij with i > k */
			
 
				+	unsigned i;
			
 
				+	for (i = k; i < nblocks; i++)
			
 
				+	{
			
 
				+		int r = get_block_rank(i, j);
			
 
				+		rank_mask[r] = 1;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void callback_task_12_real(void *_arg)
			
 
				+{
			
 
				+	struct callback_arg *arg = _arg;
			
 
				+
			
 
				+	unsigned k = arg->k;
			
 
				+	unsigned j = arg->j;
			
 
				+
			
 
				+	/* Find all the nodes potentially requiring this block */
			
 
				+	int rank_mask[world_size];
			
 
				+	find_nodes_using_12(k, j, rank_mask);
			
 
				+	rank_mask[rank] = 0;
			
 
				+
			
 
				+	/* Send the block to those nodes */
			
 
				+	starpu_data_handle block_handle = STARPU_PLU(get_block_handle)(j, k);
			
 
				+	starpu_tag_t tag = TAG12_SAVE(k, j);
			
 
				+	int mpi_tag = MPI_TAG12(k, j);
			
 
				+	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
			
 
				+	
			
 
				+	free(arg);
			
 
				+}
			
 
				+
			
 
				+static void create_task_12_real(unsigned k, unsigned j)
			
 
				+{
			
 
				+	struct starpu_task *task = create_task(TAG12(k, j));
			
 
				+	
			
 
				+	task->cl = &STARPU_PLU(cl12);
			
 
				+
			
 
				+	/* which sub-data is manipulated ? */
			
 
				+	starpu_data_handle diag_block;
			
 
				+	if (get_block_rank(k, k) == rank)
			
 
				+		diag_block = STARPU_PLU(get_block_handle)(k, k);
			
 
				+	else 
			
 
				+		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
			
 
				+
			
 
				+	task->buffers[0].handle = diag_block; 
			
 
				+	task->buffers[0].mode = STARPU_R;
			
 
				+	task->buffers[1].handle = STARPU_PLU(get_block_handle)(j, k); 
			
 
				+	task->buffers[1].mode = STARPU_RW;
			
 
				+
			
 
				+	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
			
 
				+		arg->j = j;
			
 
				+		arg->k = k;
			
 
				+
			
 
				+	task->callback_func = callback_task_12_real;
			
 
				+	task->callback_arg = arg;
			
 
				+
			
 
				+	if (!no_prio && (j == k+1)) {
			
 
				+		task->priority = MAX_PRIO;
			
 
				+	}
			
 
				+
			
 
				+	/* enforce dependencies ... */
			
 
				+	if (k > 0) {
			
 
				+		starpu_tag_declare_deps(TAG12(k, j), 2, TAG11_SAVE(k), TAG22(k-1, k, j));
			
 
				+	}
			
 
				+	else {
			
 
				+		starpu_tag_declare_deps(TAG12(k, j), 1, TAG11_SAVE(k));
			
 
				+	}
			
 
				+
			
 
				+	starpu_submit_task(task);
			
 
				+}
			
 
				+
			
 
				+static void create_task_12(unsigned k, unsigned j)
			
 
				+{
			
 
				+	if (get_block_rank(k, j) == rank)
			
 
				+	{
			
 
				+		create_task_12_real(k, j);
			
 
				+	}
			
 
				+	else {
			
 
				+		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
			
 
				+		int rank_mask[world_size];
			
 
				+		find_nodes_using_12(k, j, rank_mask);
			
 
				+		
			
 
				+		if (rank_mask[rank])
			
 
				+			create_task_12_recv(k, j);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ *	Task 21 (Update upper right (TRSM))
			
 
				+ */
			
 
				+
			
 
				+static void create_task_21_recv(unsigned k, unsigned i)
			
 
				+{
			
 
				+	unsigned j;
			
 
				+
			
 
				+	/* The current node is not computing that task, so we receive the block
			
 
				+	 * with MPI */
			
 
				+
			
 
				+	/* We don't issue a MPI receive request until everyone using the
			
 
				+	 * temporary buffer is done : 21_(k-1)i can be used by 22_(k-1)ij with
			
 
				+	 * j >= k */
			
 
				+	unsigned ndeps = 0;
			
 
				+	starpu_tag_t tag_array[nblocks];
			
 
				+	
			
 
				+	if (k > 0)
			
 
				+	for (j = k; j < nblocks; j++)
			
 
				+	{
			
 
				+		if (rank == get_block_rank(i, j))
			
 
				+			tag_array[ndeps++] = TAG22(k-1, i, j);
			
 
				+	}
			
 
				+	
			
 
				+	int source = get_block_rank(i, k);
			
 
				+	starpu_data_handle block_handle = STARPU_PLU(get_tmp_21_block_handle)(i);
			
 
				+	int mpi_tag = MPI_TAG21(k, i);
			
 
				+	starpu_tag_t partial_tag = TAG21_SAVE_PARTIAL(k, i);
			
 
				+	starpu_tag_t unlocked_tag = TAG21_SAVE(k, i);
			
 
				+
			
 
				+	receive_when_deps_are_done(ndeps, tag_array, source, mpi_tag, block_handle, partial_tag, unlocked_tag);
			
 
				+}
			
 
				+
			
 
				+static void find_nodes_using_21(unsigned k, unsigned i, int *rank_mask)
			
 
				+{
			
 
				+	memset(rank_mask, 0, world_size*sizeof(int));
			
 
				+
			
 
				+	/* Block 21_ki is used to compute 22_kij with j > k */
			
 
				+	unsigned j;
			
 
				+	for (j = k; j < nblocks; j++)
			
 
				+	{
			
 
				+		int r = get_block_rank(i, j);
			
 
				+		rank_mask[r] = 1;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void callback_task_21_real(void *_arg)
			
 
				+{
			
 
				+	struct callback_arg *arg = _arg;
			
 
				+
			
 
				+	unsigned k = arg->k;
			
 
				+	unsigned i = arg->i;
			
 
				+
			
 
				+	/* Find all the nodes potentially requiring this block */
			
 
				+	int rank_mask[world_size];
			
 
				+	find_nodes_using_21(k, i, rank_mask);
			
 
				+	rank_mask[rank] = 0;
			
 
				+
			
 
				+	/* Send the block to those nodes */
			
 
				+	starpu_data_handle block_handle = STARPU_PLU(get_block_handle)(k, i);
			
 
				+	starpu_tag_t tag = TAG21_SAVE(k, i);
			
 
				+	int mpi_tag = MPI_TAG21(k, i);
			
 
				+	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
			
 
				+	
			
 
				+	free(arg);
			
 
				+}
			
 
				+
			
 
				+static void create_task_21_real(unsigned k, unsigned i)
			
 
				+{
			
 
				+	struct starpu_task *task = create_task(TAG21(k, i));
			
 
				+
			
 
				+	task->cl = &STARPU_PLU(cl21);
			
 
				+	
			
 
				+	/* which sub-data is manipulated ? */
			
 
				+	starpu_data_handle diag_block;
			
 
				+	if (get_block_rank(k, k) == rank)
			
 
				+		diag_block = STARPU_PLU(get_block_handle)(k, k);
			
 
				+	else 
			
 
				+		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
			
 
				+
			
 
				+	task->buffers[0].handle = diag_block; 
			
 
				+	task->buffers[0].mode = STARPU_R;
			
 
				+	task->buffers[1].handle = STARPU_PLU(get_block_handle)(k, i);
			
 
				+	task->buffers[1].mode = STARPU_RW;
			
 
				+
			
 
				+	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
			
 
				+		arg->i = i;
			
 
				+		arg->k = k;
			
 
				+
			
 
				+	task->callback_func = callback_task_21_real;
			
 
				+	task->callback_arg = arg;
			
 
				+
			
 
				+	if (!no_prio && (i == k+1)) {
			
 
				+		task->priority = MAX_PRIO;
			
 
				+	}
			
 
				+
			
 
				+	/* enforce dependencies ... */
			
 
				+	if (k > 0) {
			
 
				+		starpu_tag_declare_deps(TAG21(k, i), 2, TAG11_SAVE(k), TAG22(k-1, i, k));
			
 
				+	}
			
 
				+	else {
			
 
				+		starpu_tag_declare_deps(TAG21(k, i), 1, TAG11_SAVE(k));
			
 
				+	}
			
 
				+
			
 
				+	starpu_submit_task(task);
			
 
				+}
			
 
				+
			
 
				+static void create_task_21(unsigned k, unsigned i)
			
 
				+{
			
 
				+	if (get_block_rank(i, k) == rank)
			
 
				+	{
			
 
				+		create_task_21_real(k, i);
			
 
				+	}
			
 
				+	else {
			
 
				+		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
			
 
				+		int rank_mask[world_size];
			
 
				+		find_nodes_using_21(k, i, rank_mask);
			
 
				+		
			
 
				+		if (rank_mask[rank])
			
 
				+			create_task_21_recv(k, i);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ *	Task 22 (GEMM)
			
 
				+ */
			
 
				+
			
 
				+static void create_task_22(unsigned k, unsigned i, unsigned j)
			
 
				+{
			
 
				+//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
			
 
				+
			
 
				+	struct starpu_task *task = create_task(TAG22(k, i, j));
			
 
				+
			
 
				+	task->cl = &STARPU_PLU(cl22);
			
 
				+
			
 
				+	/* which sub-data is manipulated ? */
			
 
				+
			
 
				+	/* produced by TAG21_SAVE(k, i) */ 
			
 
				+	starpu_data_handle block21;
			
 
				+	if (get_block_rank(i, k) == rank)
			
 
				+		block21 = STARPU_PLU(get_block_handle)(k, i);
			
 
				+	else 
			
 
				+		block21 = STARPU_PLU(get_tmp_21_block_handle)(i);
			
 
				+
			
 
				+	task->buffers[0].handle = block21;
			
 
				+	task->buffers[0].mode = STARPU_R;
			
 
				+
			
 
				+	/* produced by TAG12_SAVE(k, j) */
			
 
				+	starpu_data_handle block12;
			
 
				+	if (get_block_rank(k, j) == rank)
			
 
				+		block12 = STARPU_PLU(get_block_handle)(j, k);
			
 
				+	else 
			
 
				+		block12 = STARPU_PLU(get_tmp_12_block_handle)(j);
			
 
				+
			
 
				+	task->buffers[1].handle = block12;
			
 
				+	task->buffers[1].mode = STARPU_R;
			
 
				+
			
 
				+	/* produced by TAG22(k-1, i, j) */
			
 
				+	task->buffers[2].handle = STARPU_PLU(get_block_handle)(j, i);
			
 
				+	task->buffers[2].mode = STARPU_RW;
			
 
				+
			
 
				+	if (!no_prio &&  (i == k + 1) && (j == k +1) ) {
			
 
				+		task->priority = MAX_PRIO;
			
 
				+	}
			
 
				+
			
 
				+	/* enforce dependencies ... */
			
 
				+	if (k > 0) {
			
 
				+		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG12_SAVE(k, j), TAG21_SAVE(k, i));
			
 
				+	}
			
 
				+	else {
			
 
				+		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG12_SAVE(k, j), TAG21_SAVE(k, i));
			
 
				+	}
			
 
				+
			
 
				+	starpu_submit_task(task);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ *	code to bootstrap the factorization 
			
 
				+ */
			
 
				+
			
 
				+void STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
			
 
				+{
			
 
				+	struct timeval start;
			
 
				+	struct timeval end;
			
 
				+
			
 
				+	nblocks = _nblocks;
			
 
				+	rank = _rank;
			
 
				+	world_size = _world_size;
			
 
				+
			
 
				+	struct starpu_task *entry_task = NULL;
			
 
				+
			
 
				+	/* create all the DAG nodes */
			
 
				+	unsigned i,j,k;
			
 
				+
			
 
				+	for (k = 0; k < nblocks; k++)
			
 
				+	{
			
 
				+		create_task_11(k);
			
 
				+
			
 
				+		for (i = k+1; i<nblocks; i++)
			
 
				+		{
			
 
				+			create_task_12(k, i);
			
 
				+			create_task_21(k, i);
			
 
				+		}
			
 
				+
			
 
				+		for (i = k+1; i<nblocks; i++)
			
 
				+		{
			
 
				+			for (j = k+1; j<nblocks; j++)
			
 
				+			{
			
 
				+				create_task_22(k, i, j);
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	/* schedule the codelet */
			
 
				+	gettimeofday(&start, NULL);
			
 
				+
			
 
				+	fprintf(stderr, "Rank %d GO\n", rank);
			
 
				+	starpu_tag_notify_from_apps(STARPU_TAG_INIT);
			
 
				+
			
 
				+	/* stall the application until the end of computations : note that it
			
 
				+	 * may be liberated explicitely by MPI */
			
 
				+	starpu_tag_wait(TAG11(nblocks-1));
			
 
				+
			
 
				+	gettimeofday(&end, NULL);
			
 
				+
			
 
				+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				+	fprintf(stderr, "Computation took (in ms)\n");
			
 
				+	printf("%2.2f\n", timing/1000);
			
 
				+
			
 
				+//	unsigned n = size;
			
 
				+//	double flop = (2.0f*n*n*n)/3.0f;
			
 
				+//	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				+}
			
--- a/mpi/examples/mpi_lu/pxlu.h
+++ b/mpi/examples/mpi_lu/pxlu.h
@@ -0,0 +1,37 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __PXLU_H__
			
 
				+#define __PXLU_H__
			
 
				+
			
 
				+/* for USE_CUDA */
			
 
				+#include <starpu_config.h>
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+#include <common/blas.h>
			
 
				+
			
 
				+#include <starpu_mpi.h>
			
 
				+
			
 
				+#define BLAS3_FLOP(n1,n2,n3)    \
			
 
				+        (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
			
 
				+
			
 
				+starpu_data_handle STARPU_PLU(get_block_handle)(unsigned j, unsigned i);
			
 
				+TYPE *STARPU_PLU(get_block)(unsigned j, unsigned i);
			
 
				+starpu_data_handle STARPU_PLU(get_tmp_11_block_handle)(void);
			
 
				+starpu_data_handle STARPU_PLU(get_tmp_12_block_handle)(unsigned j);
			
 
				+starpu_data_handle STARPU_PLU(get_tmp_21_block_handle)(unsigned i);
			
 
				+
			
 
				+#endif // __PXLU_H__
			
--- a/mpi/examples/mpi_lu/pxlu_kernels.c
+++ b/mpi/examples/mpi_lu/pxlu_kernels.c
@@ -0,0 +1,383 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "pxlu.h"
			
 
				+#include "pxlu_kernels.h"
			
 
				+#include <math.h>
			
 
				+
			
 
				+/*
			
 
				+ *   U22 
			
 
				+ */
			
 
				+
			
 
				+static inline void STARPU_PLU(common_u22)(void *descr[],
			
 
				+				int s, __attribute__((unused)) void *_args)
			
 
				+{
			
 
				+	TYPE *right 	= (TYPE *)GET_BLAS_PTR(descr[0]);
			
 
				+	TYPE *left 	= (TYPE *)GET_BLAS_PTR(descr[1]);
			
 
				+	TYPE *center 	= (TYPE *)GET_BLAS_PTR(descr[2]);
			
 
				+
			
 
				+	unsigned dx = GET_BLAS_NX(descr[2]);
			
 
				+	unsigned dy = GET_BLAS_NY(descr[2]);
			
 
				+	unsigned dz = GET_BLAS_NY(descr[0]);
			
 
				+
			
 
				+	unsigned ld12 = GET_BLAS_LD(descr[0]);
			
 
				+	unsigned ld21 = GET_BLAS_LD(descr[1]);
			
 
				+	unsigned ld22 = GET_BLAS_LD(descr[2]);
			
 
				+
			
 
				+	int rank;
			
 
				+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	fprintf(stderr, "KERNEL 22 %d\n", rank);
			
 
				+
			
 
				+#ifdef USE_CUDA
			
 
				+	cublasStatus status;
			
 
				+	cudaError_t cures;
			
 
				+#endif
			
 
				+
			
 
				+	switch (s) {
			
 
				+		case 0:
			
 
				+			CPU_GEMM("N", "N", dy, dx, dz, 
			
 
				+				(TYPE)-1.0, right, ld21, left, ld12,
			
 
				+				(TYPE)1.0, center, ld22);
			
 
				+			break;
			
 
				+
			
 
				+#ifdef USE_CUDA
			
 
				+		case 1:
			
 
				+			CUBLAS_GEMM('n', 'n', dx, dy, dz,
			
 
				+				(TYPE)-1.0, right, ld21, left, ld12,
			
 
				+				(TYPE)1.0f, center, ld22);
			
 
				+
			
 
				+			status = cublasGetError();
			
 
				+			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
			
 
				+				STARPU_ABORT();
			
 
				+
			
 
				+			if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
			
 
				+				CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+			break;
			
 
				+#endif
			
 
				+		default:
			
 
				+			STARPU_ABORT();
			
 
				+			break;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void STARPU_PLU(cpu_u22)(void *descr[], void *_args)
			
 
				+{
			
 
				+	STARPU_PLU(common_u22)(descr, 0, _args);
			
 
				+}
			
 
				+
			
 
				+#ifdef USE_CUDA
			
 
				+static void STARPU_PLU(cublas_u22)(void *descr[], void *_args)
			
 
				+{
			
 
				+	STARPU_PLU(common_u22)(descr, 1, _args);
			
 
				+}
			
 
				+#endif// USE_CUDA
			
 
				+
			
 
				+static struct starpu_perfmodel_t STARPU_PLU(model_22) = {
			
 
				+	.type = HISTORY_BASED,
			
 
				+#ifdef ATLAS
			
 
				+	.symbol = STARPU_PLU_STR(lu_model_22_atlas)
			
 
				+#elif defined(GOTO)
			
 
				+	.symbol = STARPU_PLU_STR(lu_model_22_goto)
			
 
				+#else
			
 
				+	.symbol = STARPU_PLU_STR(lu_model_22)
			
 
				+#endif
			
 
				+};
			
 
				+
			
 
				+starpu_codelet STARPU_PLU(cl22) = {
			
 
				+	.where = CORE|CUDA,
			
 
				+	.core_func = STARPU_PLU(cpu_u22),
			
 
				+#ifdef USE_CUDA
			
 
				+	.cuda_func = STARPU_PLU(cublas_u22),
			
 
				+#endif
			
 
				+	.nbuffers = 3,
			
 
				+	.model = &STARPU_PLU(model_22)
			
 
				+};
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+ * U12
			
 
				+ */
			
 
				+
			
 
				+static inline void STARPU_PLU(common_u12)(void *descr[],
			
 
				+				int s, __attribute__((unused)) void *_args)
			
 
				+{
			
 
				+	TYPE *sub11;
			
 
				+	TYPE *sub12;
			
 
				+
			
 
				+	sub11 = (TYPE *)GET_BLAS_PTR(descr[0]);	
			
 
				+	sub12 = (TYPE *)GET_BLAS_PTR(descr[1]);
			
 
				+
			
 
				+	unsigned ld11 = GET_BLAS_LD(descr[0]);
			
 
				+	unsigned ld12 = GET_BLAS_LD(descr[1]);
			
 
				+
			
 
				+	unsigned nx12 = GET_BLAS_NX(descr[1]);
			
 
				+	unsigned ny12 = GET_BLAS_NY(descr[1]);
			
 
				+
			
 
				+	int rank;
			
 
				+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	fprintf(stderr, "KERNEL 12 %d\n", rank);
			
 
				+
			
 
				+#ifdef USE_CUDA
			
 
				+	cublasStatus status;
			
 
				+	cudaError_t cures;
			
 
				+#endif
			
 
				+
			
 
				+	/* solve L11 U12 = A12 (find U12) */
			
 
				+	switch (s) {
			
 
				+		case 0:
			
 
				+			CPU_TRSM("L", "L", "N", "N", nx12, ny12,
			
 
				+					(TYPE)1.0, sub11, ld11, sub12, ld12);
			
 
				+			break;
			
 
				+#ifdef USE_CUDA
			
 
				+		case 1:
			
 
				+			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
			
 
				+					(TYPE)1.0, sub11, ld11, sub12, ld12);
			
 
				+
			
 
				+			status = cublasGetError();
			
 
				+			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
			
 
				+				STARPU_ABORT();
			
 
				+
			
 
				+			if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
			
 
				+				CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+			break;
			
 
				+#endif
			
 
				+		default:
			
 
				+			STARPU_ABORT();
			
 
				+			break;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void STARPU_PLU(cpu_u12)(void *descr[], void *_args)
			
 
				+{
			
 
				+	STARPU_PLU(common_u12)(descr, 0, _args);
			
 
				+}
			
 
				+
			
 
				+#ifdef USE_CUDA
			
 
				+static void STARPU_PLU(cublas_u12)(void *descr[], void *_args)
			
 
				+{
			
 
				+	STARPU_PLU(common_u12)(descr, 1, _args);
			
 
				+}
			
 
				+#endif // USE_CUDA
			
 
				+
			
 
				+static struct starpu_perfmodel_t STARPU_PLU(model_12) = {
			
 
				+	.type = HISTORY_BASED,
			
 
				+#ifdef ATLAS
			
 
				+	.symbol = STARPU_PLU_STR(lu_model_12_atlas)
			
 
				+#elif defined(GOTO)
			
 
				+	.symbol = STARPU_PLU_STR(lu_model_12_goto)
			
 
				+#else
			
 
				+	.symbol = STARPU_PLU_STR(lu_model_12)
			
 
				+#endif
			
 
				+};
			
 
				+
			
 
				+starpu_codelet STARPU_PLU(cl12) = {
			
 
				+	.where = CORE|CUDA,
			
 
				+	.core_func = STARPU_PLU(cpu_u12),
			
 
				+#ifdef USE_CUDA
			
 
				+	.cuda_func = STARPU_PLU(cublas_u12),
			
 
				+#endif
			
 
				+	.nbuffers = 2,
			
 
				+	.model = &STARPU_PLU(model_12)
			
 
				+};
			
 
				+
			
 
				+
			
 
				+/* 
			
 
				+ * U21
			
 
				+ */
			
 
				+
			
 
				+static inline void STARPU_PLU(common_u21)(void *descr[],
			
 
				+				int s, __attribute__((unused)) void *_args)
			
 
				+{
			
 
				+	TYPE *sub11;
			
 
				+	TYPE *sub21;
			
 
				+
			
 
				+	sub11 = (TYPE *)GET_BLAS_PTR(descr[0]);
			
 
				+	sub21 = (TYPE *)GET_BLAS_PTR(descr[1]);
			
 
				+
			
 
				+	unsigned ld11 = GET_BLAS_LD(descr[0]);
			
 
				+	unsigned ld21 = GET_BLAS_LD(descr[1]);
			
 
				+
			
 
				+	unsigned nx21 = GET_BLAS_NX(descr[1]);
			
 
				+	unsigned ny21 = GET_BLAS_NY(descr[1]);
			
 
				+	
			
 
				+	int rank;
			
 
				+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	fprintf(stderr, "KERNEL 21 %d\n", rank);
			
 
				+
			
 
				+
			
 
				+#ifdef USE_CUDA
			
 
				+	cublasStatus status;
			
 
				+	cudaError_t cures;
			
 
				+#endif
			
 
				+
			
 
				+	switch (s) {
			
 
				+		case 0:
			
 
				+			CPU_TRSM("R", "U", "N", "U", nx21, ny21,
			
 
				+					(TYPE)1.0, sub11, ld11, sub21, ld21);
			
 
				+			break;
			
 
				+#ifdef USE_CUDA
			
 
				+		case 1:
			
 
				+			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
			
 
				+					(TYPE)1.0, sub11, ld11, sub21, ld21);
			
 
				+
			
 
				+			status = cublasGetError();
			
 
				+			if (status != CUBLAS_STATUS_SUCCESS)
			
 
				+				STARPU_ABORT();
			
 
				+
			
 
				+			cudaThreadSynchronize();
			
 
				+
			
 
				+			break;
			
 
				+#endif
			
 
				+		default:
			
 
				+			STARPU_ABORT();
			
 
				+			break;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void STARPU_PLU(cpu_u21)(void *descr[], void *_args)
			
 
				+{
			
 
				+	STARPU_PLU(common_u21)(descr, 0, _args);
			
 
				+}
			
 
				+
			
 
				+#ifdef USE_CUDA
			
 
				+static void STARPU_PLU(cublas_u21)(void *descr[], void *_args)
			
 
				+{
			
 
				+	STARPU_PLU(common_u21)(descr, 1, _args);
			
 
				+}
			
 
				+#endif 
			
 
				+
			
 
				+static struct starpu_perfmodel_t STARPU_PLU(model_21) = {
			
 
				+	.type = HISTORY_BASED,
			
 
				+#ifdef ATLAS
			
 
				+	.symbol = STARPU_PLU_STR(lu_model_21_atlas)
			
 
				+#elif defined(GOTO)
			
 
				+	.symbol = STARPU_PLU_STR(lu_model_21_goto)
			
 
				+#else
			
 
				+	.symbol = STARPU_PLU_STR(lu_model_21)
			
 
				+#endif
			
 
				+};
			
 
				+
			
 
				+starpu_codelet STARPU_PLU(cl21) = {
			
 
				+	.where = CORE|CUDA,
			
 
				+	.core_func = STARPU_PLU(cpu_u21),
			
 
				+#ifdef USE_CUDA
			
 
				+	.cuda_func = STARPU_PLU(cublas_u21),
			
 
				+#endif
			
 
				+	.nbuffers = 2,
			
 
				+	.model = &STARPU_PLU(model_21)
			
 
				+};
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+ *	U11
			
 
				+ */
			
 
				+
			
 
				+static inline void STARPU_PLU(common_u11)(void *descr[],
			
 
				+				int s, __attribute__((unused)) void *_args)
			
 
				+{
			
 
				+	TYPE *sub11;
			
 
				+
			
 
				+	sub11 = (TYPE *)GET_BLAS_PTR(descr[0]); 
			
 
				+
			
 
				+	unsigned long nx = GET_BLAS_NX(descr[0]);
			
 
				+	unsigned long ld = GET_BLAS_LD(descr[0]);
			
 
				+
			
 
				+	unsigned long z;
			
 
				+
			
 
				+	int rank;
			
 
				+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	fprintf(stderr, "KERNEL 11 %d\n", rank);
			
 
				+
			
 
				+	switch (s) {
			
 
				+		case 0:
			
 
				+			for (z = 0; z < nx; z++)
			
 
				+			{
			
 
				+				TYPE pivot;
			
 
				+				pivot = sub11[z+z*ld];
			
 
				+				STARPU_ASSERT(pivot != 0.0);
			
 
				+		
			
 
				+				CPU_SCAL(nx - z - 1, (1.0/pivot), &sub11[z+(z+1)*ld], ld);
			
 
				+		
			
 
				+				CPU_GER(nx - z - 1, nx - z - 1, -1.0,
			
 
				+						&sub11[(z+1)+z*ld], 1,
			
 
				+						&sub11[z+(z+1)*ld], ld,
			
 
				+						&sub11[(z+1) + (z+1)*ld],ld);
			
 
				+			}
			
 
				+			break;
			
 
				+#ifdef USE_CUDA
			
 
				+		case 1:
			
 
				+			for (z = 0; z < nx; z++)
			
 
				+			{
			
 
				+				TYPE pivot;
			
 
				+				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
			
 
				+				cudaStreamSynchronize(0);
			
 
				+
			
 
				+				STARPU_ASSERT(pivot != 0.0);
			
 
				+				
			
 
				+				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
			
 
				+				
			
 
				+				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
			
 
				+						&sub11[(z+1)+z*ld], 1,
			
 
				+						&sub11[z+(z+1)*ld], ld,
			
 
				+						&sub11[(z+1) + (z+1)*ld],ld);
			
 
				+			}
			
 
				+			
			
 
				+			cudaThreadSynchronize();
			
 
				+
			
 
				+			break;
			
 
				+#endif
			
 
				+		default:
			
 
				+			STARPU_ABORT();
			
 
				+			break;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void STARPU_PLU(cpu_u11)(void *descr[], void *_args)
			
 
				+{
			
 
				+	STARPU_PLU(common_u11)(descr, 0, _args);
			
 
				+}
			
 
				+
			
 
				+#ifdef USE_CUDA
			
 
				+static void STARPU_PLU(cublas_u11)(void *descr[], void *_args)
			
 
				+{
			
 
				+	STARPU_PLU(common_u11)(descr, 1, _args);
			
 
				+}
			
 
				+#endif// USE_CUDA
			
 
				+
			
 
				+static struct starpu_perfmodel_t STARPU_PLU(model_11) = {
			
 
				+	.type = HISTORY_BASED,
			
 
				+#ifdef ATLAS
			
 
				+	.symbol = STARPU_PLU_STR(lu_model_11_atlas)
			
 
				+#elif defined(GOTO)
			
 
				+	.symbol = STARPU_PLU_STR(lu_model_11_goto)
			
 
				+#else
			
 
				+	.symbol = STARPU_PLU_STR(lu_model_11)
			
 
				+#endif
			
 
				+};
			
 
				+
			
 
				+starpu_codelet STARPU_PLU(cl11) = {
			
 
				+	.where = CORE|CUDA,
			
 
				+	.core_func = STARPU_PLU(cpu_u11),
			
 
				+#ifdef USE_CUDA
			
 
				+	.cuda_func = STARPU_PLU(cublas_u11),
			
 
				+#endif
			
 
				+	.nbuffers = 1,
			
 
				+	.model = &STARPU_PLU(model_11)
			
 
				+};
			
 
				+
			
 
				+
			
--- a/mpi/examples/mpi_lu/pxlu_kernels.h
+++ b/mpi/examples/mpi_lu/pxlu_kernels.h
@@ -0,0 +1,31 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __PXLU_KERNELS_H__
			
 
				+#define __PXLU_KERNELS_H__
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+#define str(s) #s
			
 
				+#define xstr(s)        str(s)
			
 
				+#define STARPU_PLU_STR(name)  xstr(STARPU_PLU(name))
			
 
				+
			
 
				+starpu_codelet STARPU_PLU(cl11);
			
 
				+starpu_codelet STARPU_PLU(cl12);
			
 
				+starpu_codelet STARPU_PLU(cl21);
			
 
				+starpu_codelet STARPU_PLU(cl22);
			
 
				+
			
 
				+#endif // __PXLU_KERNELS_H__
			
--- a/mpi/examples/mpi_lu/slu_kernels.c
+++ b/mpi/examples/mpi_lu/slu_kernels.c
@@ -0,0 +1,18 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "float.h"
			
 
				+#include "xlu_kernels.c"