Browse Source

Add out of core LU decomposition

Samuel Thibault 12 years ago
parent
commit
596cecabe0

+ 4 - 2
doc/doxygen/chapters/mpi_support.doxy

@@ -388,6 +388,8 @@ MPI (complex numbers),
 <c>starpu_mpi_insert_task</c>. The non-distributed version can check for
 <c>starpu_mpi_insert_task</c>. The non-distributed version can check for
 <algorithm correctness in 1-node configuration, the distributed version uses
 <algorithm correctness in 1-node configuration, the distributed version uses
 exactly the same source code, to be used over MPI,
 exactly the same source code, to be used over MPI,
-<li><c>mpi_lu</c> is an LU decomposition example, provided in both explicit and
-implicit versions.
+<li><c>mpi_lu</c> is an LU decomposition example, provided in three versions:
+<c>plu_example</c> uses explicit MPI data transfers, <c>plu_implicit_example</c>
+uses implicit MPI data transfers, <c>plu_outofcore_example</c> uses implicit MPI
+data transfers and supports data matrices which do not fit in memory (out-of-core).
 </ul>
 </ul>

+ 29 - 2
mpi/examples/Makefile.am

@@ -49,6 +49,7 @@ EXTRA_DIST = 				\
 	mpi_lu/mpi_lu-double.h		\
 	mpi_lu/mpi_lu-double.h		\
 	mpi_lu/plu_example.c		\
 	mpi_lu/plu_example.c		\
 	mpi_lu/plu_implicit_example.c	\
 	mpi_lu/plu_implicit_example.c	\
+	mpi_lu/plu_outofcore_example.c	\
 	mpi_lu/plu_solve.c		\
 	mpi_lu/plu_solve.c		\
 	mpi_lu/pxlu.h			\
 	mpi_lu/pxlu.h			\
 	mpi_lu/pxlu.c			\
 	mpi_lu/pxlu.c			\
@@ -105,7 +106,9 @@ examplebin_PROGRAMS += 			\
 	mpi_lu/plu_example_float	\
 	mpi_lu/plu_example_float	\
 	mpi_lu/plu_example_double	\
 	mpi_lu/plu_example_double	\
 	mpi_lu/plu_implicit_example_float	\
 	mpi_lu/plu_implicit_example_float	\
-	mpi_lu/plu_implicit_example_double
+	mpi_lu/plu_implicit_example_double	\
+	mpi_lu/plu_outofcore_example_float	\
+	mpi_lu/plu_outofcore_example_double
 
 
 mpi_lu_plu_example_float_LDADD =	\
 mpi_lu_plu_example_float_LDADD =	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
@@ -149,7 +152,31 @@ mpi_lu_plu_implicit_example_double_LDADD =	\
 	$(STARPU_BLAS_LDFLAGS) -lm
 	$(STARPU_BLAS_LDFLAGS) -lm
 
 
 mpi_lu_plu_implicit_example_double_SOURCES =	\
 mpi_lu_plu_implicit_example_double_SOURCES =	\
-	mpi_lu/plu_implicit_example_double.c	\
+	mpi_lu/plu_outofcore_example_double.c	\
+	mpi_lu/plu_solve_double.c		\
+	mpi_lu/pdlu_kernels.c			\
+	mpi_lu/pdlu_implicit.c			\
+	$(top_srcdir)/examples/common/blas.c
+
+mpi_lu_plu_outofcore_example_float_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+mpi_lu_plu_outofcore_example_float_SOURCES =	\
+	mpi_lu/plu_outofcore_example_float.c	\
+	mpi_lu/plu_solve_float.c		\
+	mpi_lu/pslu_kernels.c			\
+	mpi_lu/pslu_implicit.c			\
+	$(top_srcdir)/examples/common/blas.c
+
+mpi_lu_plu_outofcore_example_double_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+mpi_lu_plu_outofcore_example_double_SOURCES =	\
+	mpi_lu/plu_outofcore_example_double.c	\
 	mpi_lu/plu_solve_double.c		\
 	mpi_lu/plu_solve_double.c		\
 	mpi_lu/pdlu_kernels.c			\
 	mpi_lu/pdlu_kernels.c			\
 	mpi_lu/pdlu_implicit.c			\
 	mpi_lu/pdlu_implicit.c			\

+ 380 - 0
mpi/examples/mpi_lu/plu_outofcore_example.c

@@ -0,0 +1,380 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include <starpu.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+
+#include "pxlu.h"
+//#include "pxlu_kernels.h"
+
+#ifdef STARPU_HAVE_LIBNUMA
+#include <numaif.h>
+#endif
+
+static unsigned long size = 4096;
+static unsigned nblocks = 16;
+static unsigned check = 0;
+static int p = 1;
+static int q = 1;
+static unsigned display = 0;
+static char *path = "/tmp/starpu-mpi_LU";
+
+#ifdef STARPU_HAVE_LIBNUMA
+static unsigned numa = 0;
+#endif
+
+static size_t allocated_memory = 0;
+
+static starpu_data_handle_t *dataA_handles;
+
+int get_block_rank(unsigned i, unsigned j);
+
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-size") == 0) {
+			char *argptr;
+			size = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks") == 0) {
+			char *argptr;
+			nblocks = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-check") == 0) {
+			check = 1;
+		}
+
+		if (strcmp(argv[i], "-display") == 0) {
+			display = 1;
+		}
+
+		if (strcmp(argv[i], "-numa") == 0) {
+#ifdef STARPU_HAVE_LIBNUMA
+			numa = 1;
+#else
+			if (rank == 0)
+				fprintf(stderr, "Warning: libnuma is not available\n");
+#endif
+		}
+
+		if (strcmp(argv[i], "-p") == 0) {
+			char *argptr;
+			p = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-q") == 0) {
+			char *argptr;
+			q = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-path") == 0)
+			path = argv[++i];
+
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0) {
+			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q] [-path PATH]\n", argv[0]);
+			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
+			exit(0);
+		}
+	}
+}
+
+unsigned STARPU_PLU(display_flag)(void)
+{
+	return display;
+}
+
+static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnblocks)
+{
+	const unsigned block_size = (psize/pnblocks);
+
+	unsigned i, j;
+	for (i = 0; i < block_size; i++)
+	     for (j = 0; j < block_size; j++)
+	     {
+		  blockptr[j+i*block_size] = (TYPE)starpu_drand48();
+	     }
+}
+
+static void create_matrix()
+{
+	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
+	TYPE *blockptr = malloc(blocksize);
+	int fd;
+	char *filename;
+	unsigned filename_length = strlen(path) + 1 + sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1;
+
+	filename = malloc(filename_length);
+
+	allocated_memory += nblocks*nblocks*blocksize*sizeof(TYPE *);
+
+	/* Create the whole matrix on the disk */
+	unsigned i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			fill_block_with_random(blockptr, size, nblocks);
+			if (i == j)
+			{
+				unsigned tmp;
+				for (tmp = 0; tmp < size/nblocks; tmp++)
+				{
+					blockptr[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks;
+				}
+			}
+			snprintf(filename, filename_length, "%s/%u,%u", path, i, j);
+			fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0777);
+			if (fd < 0) {
+				perror("open");
+				exit(1);
+			}
+			if (write(fd, blockptr, blocksize) != (ssize_t) blocksize) {
+				fprintf(stderr,"short write");
+				exit(1);
+			}
+			if (close(fd) < 0) {
+				perror("close");
+				exit(1);
+			}
+		}
+	}
+
+	free(blockptr);
+	free(filename);
+}
+
+static void init_matrix(int rank)
+{
+	/* Allocate a grid of data handles, not all of them have to be allocated later on */
+	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
+
+	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
+
+	int disk_node = starpu_disk_register(&starpu_disk_unistd_ops, path, 1024*1024);
+
+	char filename[sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1];
+
+	/* Allocate all the blocks that belong to this mpi node */
+	unsigned i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			int block_rank = get_block_rank(i, j);
+//			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
+			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
+
+			if (block_rank == rank)
+			{
+				void *disk_obj;
+				snprintf(filename, sizeof(filename), "%u,%u", i, j);
+				/* Register it to StarPU */
+				disk_obj = starpu_disk_open(disk_node, filename, blocksize);
+				if (!disk_obj) {
+					fprintf(stderr,"could not open %s\n", filename);
+					exit(1);
+				}
+				starpu_matrix_data_register(handleptr, disk_node,
+					(uintptr_t) disk_obj, size/nblocks,
+					size/nblocks, size/nblocks, sizeof(TYPE));
+			}
+			else {
+				starpu_matrix_data_register(handleptr, -1,
+					0, size/nblocks,
+					size/nblocks, size/nblocks, sizeof(TYPE));
+			}
+			starpu_data_set_rank(*handleptr, block_rank);
+			starpu_data_set_tag(*handleptr, j+i*nblocks);
+		}
+	}
+
+	//display_all_blocks(nblocks, size/nblocks);
+}
+
+TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j)
+{
+	/* This does not really make sense in out of core */
+	assert(0);
+}
+
+int get_block_rank(unsigned i, unsigned j)
+{
+	/* Take a 2D block cyclic distribution */
+	/* NB: p (resp. q) is for "direction" i (resp. j) */
+	return (j % q) * p + (i % p);
+}
+
+starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
+{
+	return dataA_handles[j+i*nblocks];
+}
+
+int main(int argc, char **argv)
+{
+	int rank;
+	int world_size;
+	int ret;
+	unsigned i, j;
+
+	starpu_srand48((long int)time(NULL));
+
+	parse_args(argc, argv);
+
+	ret = mkdir(path, 0777);
+	if (ret != 0 && errno != EEXIST) {
+		fprintf(stderr,"%s does not exist\n", path);
+		exit(1);
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+
+	STARPU_ASSERT(p*q == world_size);
+
+	starpu_cublas_init();
+
+	/*
+	 * 	Problem Init
+	 */
+
+	if (rank == 0)
+		create_matrix();
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	init_matrix(rank);
+
+	fprintf(stderr, "%dMB on disk\n", (int)(allocated_memory/(1024*1024)));
+
+	TYPE *a_r = NULL;
+//	STARPU_PLU(display_data_content)(a_r, size);
+
+	TYPE *x, *y;
+
+	if (check)
+	{
+		x = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(x);
+
+		y = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+
+		if (rank == 0)
+		{
+			unsigned ind;
+			for (ind = 0; ind < size; ind++)
+				x[ind] = (TYPE)starpu_drand48();
+		}
+
+		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+
+		if (rank == 0)
+			STARPU_PLU(display_data_content)(a_r, size);
+
+//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
+	}
+
+	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
+
+	/*
+	 * 	Report performance
+	 */
+
+	if (rank == 0)
+	{
+		fprintf(stderr, "Computation took: %f ms\n", timing/1000);
+
+		unsigned n = size;
+		double flop = (2.0f*n*n*n)/3.0f;
+		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	}
+
+	/*
+	 *	Test Result Correctness
+	 */
+
+	if (check)
+	{
+		/*
+		 *	Compute || A - LU ||
+		 */
+
+		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);
+
+#if 0
+		/*
+		 *	Compute || Ax - LUx ||
+		 */
+
+		unsigned ind;
+
+		y2 = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+
+		if (rank == 0)
+		{
+			for (ind = 0; ind < size; ind++)
+			{
+				y2[ind] = (TYPE)0.0;
+			}
+		}
+
+		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
+
+		/* Compute y2 = y2 - y */
+		CPU_AXPY(size, -1.0, y, 1, y2, 1);
+
+		TYPE err = CPU_ASUM(size, y2, 1);
+		int max = CPU_IAMAX(size, y2, 1);
+
+		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
+		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
+#endif
+	}
+
+	/*
+	 * 	Termination
+	 */
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			starpu_data_unregister(dataA_handles[j+nblocks*i]);
+		}
+	}
+
+	starpu_cublas_shutdown();
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}

+ 19 - 0
mpi/examples/mpi_lu/plu_outofcore_example_double.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "plu_outofcore_example.c"

+ 19 - 0
mpi/examples/mpi_lu/plu_outofcore_example_float.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "plu_outofcore_example.c"