@@ -0,0 +1,360 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2016 Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example illustrates how to distribute a pre-existing data structure to
+ * a set of computing nodes using StarPU-MPI routines.
+ */
+
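+/* A possible invocation, assuming an MPI launcher such as mpirun and a build of this
+ * example named 'mpi_distribute' (the binary name is only illustrative):
+ *
+ *     mpirun -np 4 ./mpi_distribute 1024 64
+ *
+ * where the optional arguments are the matrix size N and the block size BS
+ * (N must be a multiple of BS). */
+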
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <math.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+
+#define VERBOSE 0
+
+static int N = 16; /* Matrix size */
+static int BS = 4; /* Block size */
+
+#define NB ((N)/(BS)) /* Number of blocks (N must be a multiple of BS) */
+
+/* Matrices. Will be allocated as regular, linearized C arrays */
+static double *A = NULL; /* A will be partitioned as BS rows x N cols blocks */
+static double *B = NULL; /* B will be partitioned as N rows x BS cols blocks */
+static double *C = NULL; /* C will be partitioned as BS rows x BS cols blocks */
+
+/* Arrays of data handles for managing matrix blocks */
+static starpu_data_handle_t *A_h;
+static starpu_data_handle_t *B_h;
+static starpu_data_handle_t *C_h;
+
+static int comm_rank; /* MPI rank of this process */
+static int comm_size; /* number of processes in the MPI session */
+
+static void alloc_matrices(void)
+{
+	/* A regular 'malloc' could be used instead; however, starpu_malloc makes sure that
+	 * the area is allocated in suitably pinned memory to speed up data transfers,
+	 * especially with CUDA */
+	starpu_malloc((void **)&A, N*N*sizeof(double));
+	starpu_malloc((void **)&B, N*N*sizeof(double));
+	starpu_malloc((void **)&C, N*N*sizeof(double));
+}
+
+static void free_matrices(void)
+{
+	starpu_free(A);
+	starpu_free(B);
+	starpu_free(C);
+}
+
+static void init_matrices(void)
+{
+	int row,col;
+	for (row = 0; row < N; row++)
+	{
+		for (col = 0; col < N; col++)
+		{
+			A[row*N+col] = (row==col)?2:0;
+			B[row*N+col] = row*N+col;
+			C[row*N+col] = 0;
+		}
+	}
+}
+
+#if VERBOSE
+static void disp_matrix(double *m)
+{
+	int row,col;
+	for (row = 0; row < N; row++)
+	{
+		for (col = 0; col < N; col++)
+		{
+			printf("\t%.2lf", m[row*N+col]);
+		}
+		printf("\n");
+	}
+}
+#endif
+
+static void check_result(void)
+{
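+	/* A is initialized to twice the identity matrix and B[row][col] to row*N+col, so each
+	 * entry of C = A*B should equal 2*(row*N+col). The tolerance of 1.0 also leaves room
+	 * for the small rank-identifying fraction added to each entry in VERBOSE builds. */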
+	int row,col;
+	for (row = 0; row < N; row++)
+	{
+		for (col = 0; col < N; col++)
+		{
+			if (fabs(C[row*N+col] - 2*(row*N+col)) > 1.0)
+			{
+				fprintf(stderr, "check failed\n");
+				exit(1);
+			}
+		}
+	}
+#if VERBOSE
+	printf("success\n");
+#endif
+}
+
+
+/* Register the matrix blocks to StarPU and to StarPU-MPI */
+static void register_matrices(void)
+{
+	A_h = calloc(NB, sizeof(starpu_data_handle_t));
+	B_h = calloc(NB, sizeof(starpu_data_handle_t));
+	C_h = calloc(NB*NB, sizeof(starpu_data_handle_t));
+
+	/* Memory region where the data being registered initially resides.
+	 * In this example, all blocks are allocated by node 0, thus:
+	 * - node 0 specifies STARPU_MAIN_RAM to indicate that it owns the block in its main memory;
+	 * - the other nodes specify -1 to indicate that they do not initially hold a copy of the block.
+	 */
+	int mr = (comm_rank == 0) ? STARPU_MAIN_RAM : -1;
+
+	/* MPI tag associated with each block; every handle gets a distinct tag */
+	int tag = 0;
+
+	int b_row,b_col;
+
+	for (b_row = 0; b_row < NB; b_row++) {
+		/* Register a block to StarPU */
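+		/* Each A block spans BS consecutive rows of N columns; since it is a view into
+		 * the full linearized matrix, its leading dimension (row stride) is N. */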
+		starpu_matrix_data_register(&A_h[b_row],
+			mr,
+			(comm_rank == 0)?(uintptr_t)(A+b_row*BS*N):0, N, N, BS,
+			sizeof(double));
+
+		/* Register a block to StarPU-MPI, specifying the MPI tag to use for transferring
+		 * the block and the rank of the owner node.
+		 *
+		 * Note: StarPU-MPI is an autonomous layer built on top of StarPU, hence the two
+		 * separate registration steps.
+		 */
+		starpu_mpi_data_register(A_h[b_row], tag++, 0);
+	}
+
+	for (b_col = 0; b_col < NB; b_col++) {
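+		/* Each B block spans all N rows but only BS columns, again with leading dimension N. */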
+		starpu_matrix_data_register(&B_h[b_col],
+			mr,
+			(comm_rank == 0)?(uintptr_t)(B+b_col*BS):0, N, BS, N,
+			sizeof(double));
+		starpu_mpi_data_register(B_h[b_col], tag++, 0);
+	}
+
+	for (b_row = 0; b_row < NB; b_row++) {
+		for (b_col = 0; b_col < NB; b_col++) {
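+			/* Each C block is BS x BS, with leading dimension N. */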
+			starpu_matrix_data_register(&C_h[b_row*NB+b_col],
+				mr,
+				(comm_rank == 0)?(uintptr_t)(C+b_row*BS*N+b_col*BS):0, N, BS, BS,
+				sizeof(double));
+			starpu_mpi_data_register(C_h[b_row*NB+b_col], tag++, 0);
+		}
+	}
+}
+
+/* Transfer ownership of the C matrix blocks following some user-defined distribution over the nodes.
+ * Note: since C is accessed in write mode, this implicitly defines which node performs the task
+ * associated with a given block. */
+static void distribute_matrix_C(void)
+{
+	int b_row,b_col;
+	for (b_row = 0; b_row < NB; b_row++)
+	{
+		for (b_col = 0; b_col < NB; b_col++)
+		{
+			starpu_data_handle_t h = C_h[b_row*NB+b_col];
+
+			/* Select the node where the block should be computed. */
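+			/* Here the blocks are simply spread cyclically along the anti-diagonals;
+			 * any application-defined mapping could be used instead. */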
+			int target_rank = (b_row+b_col)%comm_size;
+
+			/* Move the block onto its new owner. */
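+			/* starpu_mpi_get_data_on_node() brings a valid copy of the block to the
+			 * target rank, and starpu_mpi_data_set_rank() records that rank as the
+			 * block's new owner, which later drives task mapping. */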
+			starpu_mpi_get_data_on_node(MPI_COMM_WORLD, h, target_rank);
+			starpu_mpi_data_set_rank(h, target_rank);
+		}
+	}
+}
+
+/* Transfer ownership of the C matrix blocks back to node 0, for display purposes. This step is not mandatory. */
+static void undistribute_matrix_C(void)
+{
+	int b_row,b_col;
+	for (b_row = 0; b_row < NB; b_row++) {
+		for (b_col = 0; b_col < NB; b_col++) {
+			starpu_data_handle_t h = C_h[b_row*NB+b_col];
+			starpu_mpi_get_data_on_node(MPI_COMM_WORLD, h, 0);
+			starpu_mpi_data_set_rank(h, 0);
+		}
+	}
+}
+
+/* Unregister the matrix blocks from StarPU. */
+static void unregister_matrices(void)
+{
+	int b_row,b_col;
+
+	for (b_row = 0; b_row < NB; b_row++) {
+		starpu_data_unregister(A_h[b_row]);
+	}
+
+	for (b_col = 0; b_col < NB; b_col++) {
+		starpu_data_unregister(B_h[b_col]);
+	}
+
+	for (b_row = 0; b_row < NB; b_row++) {
+		for (b_col = 0; b_col < NB; b_col++) {
+			starpu_data_unregister(C_h[b_row*NB+b_col]);
+		}
+	}
+
+	free(A_h);
+	free(B_h);
+	free(C_h);
+}
+
+/* Perform the actual computation. In a real-life case, this would call a BLAS 'gemm' routine
+ * instead. */
+static void cpu_mult(void *handles[], STARPU_ATTRIBUTE_UNUSED void *arg)
+{
+	double *block_A = (double *)STARPU_MATRIX_GET_PTR(handles[0]);
+	double *block_B = (double *)STARPU_MATRIX_GET_PTR(handles[1]);
+	double *block_C = (double *)STARPU_MATRIX_GET_PTR(handles[2]);
+
+	unsigned n_col_A = STARPU_MATRIX_GET_NX(handles[0]);
+	unsigned n_col_B = STARPU_MATRIX_GET_NX(handles[1]);
+	unsigned n_col_C = STARPU_MATRIX_GET_NX(handles[2]);
+
+	unsigned n_row_A = STARPU_MATRIX_GET_NY(handles[0]);
+	unsigned n_row_B = STARPU_MATRIX_GET_NY(handles[1]);
+	unsigned n_row_C = STARPU_MATRIX_GET_NY(handles[2]);
+
+	unsigned ld_A = STARPU_MATRIX_GET_LD(handles[0]);
+	unsigned ld_B = STARPU_MATRIX_GET_LD(handles[1]);
+	unsigned ld_C = STARPU_MATRIX_GET_LD(handles[2]);
+
+	/* Sanity checks, not needed in a real-life case */
+	assert(n_col_C == n_col_B);
+	assert(n_row_C == n_row_A);
+	assert(n_col_A == n_row_B);
+
+	unsigned i,j,k;
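+	/* Plain triple loop over the block; rows are strided by the leading dimension 'ld'
+	 * since each block is a view into the full linearized matrix. */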
+	for (k = 0; k < n_row_C; k++) {
+		for (j = 0; j < n_col_C; j++) {
+			for (i = 0; i < n_col_A; i++) {
+				block_C[k*ld_C+j] += block_A[k*ld_A+i] * block_B[i*ld_B+j];
+			}
+
+#if VERBOSE
+			/* For illustration purposes, show which node computed
+			 * the block in the decimal part of the cell */
+			block_C[k*ld_C+j] += comm_rank / 100.0;
+#endif
+		}
+	}
+}
+
+/* Define a StarPU 'codelet' structure for the matrix multiply kernel above.
+ * This structure enables specifying multiple implementations for the kernel (such as CUDA or
+ * OpenCL versions).
+ */
+static struct starpu_codelet gemm_cl =
+{
+	.cpu_funcs = {cpu_mult}, /* CPU implementation(s) of the routine */
+	.nbuffers = 3, /* number of data handles referenced by this routine */
+	.modes = {STARPU_R, STARPU_R, STARPU_RW} /* access modes for each data handle */
+};
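+/* Note: only a CPU implementation is filled in above; CUDA or OpenCL variants of the kernel
+ * could be added through the codelet's .cuda_funcs / .opencl_funcs fields. */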
+
+int main(int argc, char *argv[])
+{
+	/* Initializes the StarPU core */
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* Initializes the StarPU-MPI layer */
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	/* Parse the optional matrix size and block size arguments */
+	if (argc > 1) {
+		N = atoi(argv[1]);
+		if (N < 1) {
+			fprintf(stderr, "invalid matrix size\n");
+			exit(1);
+		}
+		if (argc > 2) {
+			BS = atoi(argv[2]);
+		}
+		if (BS < 1 || N % BS != 0) {
+			fprintf(stderr, "invalid block size\n");
+			exit(1);
+		}
+	}
+
+	/* Get the process rank and session size */
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &comm_rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &comm_size);
+
+	if (comm_rank == 0)
+	{
+#if VERBOSE
+		printf("N = %d\n", N);
+		printf("BS = %d\n", BS);
+		printf("NB = %d\n", NB);
+		printf("comm_size = %d\n", comm_size);
+#endif
+		/* In this example, node 0 performs all the memory allocations and initializations,
+		 * and the blocks are later distributed to the other nodes.
+		 * This is not mandatory, however: blocks could be allocated on other nodes right
+		 * from the beginning, depending on the application's needs (in particular when the
+		 * session-wide data footprint is larger than the memory available on a single node). */
+		alloc_matrices();
+		init_matrices();
+	}
+
+	/* Register matrices to StarPU and StarPU-MPI */
+	register_matrices();
+	/* Distribute the C blocks */
+	distribute_matrix_C();
+
+	int b_row,b_col;
+
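+	/* Submit one multiplication task per block of C. For each task, starpu_mpi_task_insert()
+	 * selects the executing rank from the owner of the data accessed in write mode (the C
+	 * block) and automatically posts the MPI transfers needed for the A and B blocks. */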
+	for (b_row = 0; b_row < NB; b_row++)
+	{
+		for (b_col = 0; b_col < NB; b_col++)
+		{
+			starpu_mpi_task_insert(MPI_COMM_WORLD, &gemm_cl,
+				STARPU_R, A_h[b_row],
+				STARPU_R, B_h[b_col],
+				STARPU_RW, C_h[b_row*NB+b_col],
+				0);
+		}
+	}
+
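+	/* Wait for the completion of all tasks submitted by this process before collecting the
+	 * result blocks back on node 0. */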
+	starpu_task_wait_for_all();
+
+	undistribute_matrix_C();
+	unregister_matrices();
+
+	if (comm_rank == 0) {
+#if VERBOSE
+		disp_matrix(C);
+#endif
+		check_result();
+		free_matrices();
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}
+