9 lat temu · 8f654dead2
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,6 @@
 
				 # StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				 #
			
 
				-# Copyright (C) 2009-2015  Université de Bordeaux
			
 
				+# Copyright (C) 2009-2016  Université de Bordeaux
			
 
				 # Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
			
 
				 # Copyright (C) 2014 INRIA
			
 
				 #
			
@@ -22,6 +22,7 @@ New features:
 
				   * Enable anticipative writeback by default.
			
 
				   * New scheduler with heterogeneous priorities
			
 
				   * Support priorities for data transfers.
			
 
				+  * Add STARPU_MALLOC_SIMULATION_FOLDED flag to save memory when simulating.
			
 
				 
			
 
				 Changes:
			
 
				   * Vastly improve simgrid simulation time.
			
--- a/doc/doxygen/chapters/21simgrid.doxy
+++ b/doc/doxygen/chapters/21simgrid.doxy
@@ -133,7 +133,7 @@ simulation and a very simple simulation (which is thus close to scheduling
 
				 theory results), see the \ref STARPU_SIMGRID_CUDA_MALLOC_COST and \ref
			
 
				 STARPU_SIMGRID_CUDA_QUEUE_COST environment variables.
			
 
				 
			
 
				-\section MPI applications
			
 
				+\section SimulationMPIApplications MPI applications
			
 
				 
			
 
				 StarPU-MPI applications can also be run in simgrid mode. It needs to be compiled
			
 
				 with smpicc, and run using the starpu_smpirun script, for instance:
			
@@ -147,7 +147,7 @@ list of MPI nodes to be used. StarPU currently only supports homogeneous MPI
 
				 clusters: for each MPI node it will just replicate the architecture referred by
			
 
				 \ref STARPU_HOSTNAME.
			
 
				 
			
 
				-\section Debugging applications
			
 
				+\section SimulationDebuggingApplications Debugging applications
			
 
				 
			
 
				 By default, simgrid uses its own implementation of threads, which prevents gdb
			
 
				 from being able to inspect stacks of all threads.  To be able to fully debug an
			
@@ -157,5 +157,30 @@ able to manipulate as usual.
 
				 
			
 
				 \snippet simgrid.c To be included. You should update doxygen if you see this text.
			
 
				 
			
 
				+\section SimulationMemoryUsage Memory usage
			
 
				+
			
 
				+Since kernels are not actually run and data transfers are not actually
			
 
				+performed, the data memory does not actually need to be allocated.  This allows
			
 
				+for instance to simulate the execution of applications processing very big data
			
 
				+on a small laptop.
			
 
				+
			
 
				+The application can for instance pass <c>1</c> (or whatever bogus pointer)
			
 
				+to starpu data registration functions, instead of allocating data. This will
			
 
				+however require the application to take care of not trying to access the data,
			
 
				+and will not work in MPI mode, which performs transfers.
			
 
				+
			
 
				+Another way is to pass the STARPU_MALLOC_SIMULATION_FOLDED flag to the
			
 
				+starpu_malloc_flags() function. This will make it allocate a memory area which
			
 
				+one can read/write, but optimized so that this does not actually consume
			
 
				+memory. Of course, the values read from such area will be bogus, but this allows
			
 
				+the application to keep e.g. data load, store, initialization as it is, and also
			
 
				+work in MPI mode.
			
 
				+
			
 
				+Note however that notably Linux kernels refuse obvious memory overcommitting by
			
 
				+default, so a single allocation can typically not be bigger than the amount of
			
 
				+physical memory, see https://www.kernel.org/doc/Documentation/vm/overcommit-accounting
			
 
				+This prevents for instance from allocating a single huge matrix. Allocating a
			
 
				+huge matrix in several tiles is not a problem, however. <c>sysctl
			
 
				+vm.overcommit_memory=1</c> can also be used to allow such overcommit.
			
 
				 
			
 
				 */
			
--- a/doc/doxygen/chapters/api/standard_memory_library.doxy
+++ b/doc/doxygen/chapters/api/standard_memory_library.doxy
@@ -45,6 +45,15 @@ according to the STARPU_MINIMUM_AVAILABLE_MEM and STARPU_TARGET_AVAILABLE_MEM
 
				 environment variables. If STARPU_MEMORY_WAIT is set, no overflowing will happen,
			
 
				 starpu_malloc_flags() will wait for other eviction mechanisms to release enough memory.
			
 
				 
			
 
				+\def STARPU_MALLOC_SIMULATION_FOLDED
			
 
				+\ingroup API_Standard_Memory_Library
			
 
				+Value passed to the function starpu_malloc_flags() to indicate that when
			
 
				+StarPU is using simgrid, the allocation can be "folded", i.e. a memory area is
			
 
				+allocated, but its content is actually a replicate of the same memory area, to
			
 
				+avoid having to actually allocate that much memory . This thus allows to have a
			
 
				+memory area that does not actually consumes memory, to which one can read from
			
 
				+and write to normally, but get bogus values.
			
 
				+
			
 
				 \fn int starpu_malloc_flags(void **A, size_t dim, int flags)
			
 
				 \ingroup API_Standard_Memory_Library
			
 
				 Performs a memory allocation based on the constraints defined
			
--- a/examples/cholesky/cholesky_grain_tag.c
+++ b/examples/cholesky/cholesky_grain_tag.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2015  Université de Bordeaux
			
 
				+ * Copyright (C) 2009-2016  Université de Bordeaux
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				  * Copyright (C) 2010, 2011, 2012  CNRS
			
 
				  *
			
@@ -267,6 +267,7 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
				 static void initialize_system(float **A, unsigned dim, unsigned pinned)
			
 
				 {
			
 
				 	int ret;
			
 
				+	int flags = STARPU_MALLOC_SIMULATION_FOLDED;
			
 
				 
			
 
				 #ifdef STARPU_HAVE_MAGMA
			
 
				 	magma_init();
			
@@ -289,16 +290,9 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 
				 
			
 
				 	starpu_cublas_init();
			
 
				 
			
 
				-#ifndef STARPU_SIMGRID
			
 
				 	if (pinned)
			
 
				-	{
			
 
				-		starpu_malloc((void **)A, dim*dim*sizeof(float));
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		*A = malloc(dim*dim*sizeof(float));
			
 
				-	}
			
 
				-#endif
			
 
				+		flags |= STARPU_MALLOC_PINNED;
			
 
				+	starpu_malloc_flags((void **)A, dim*dim*sizeof(float), flags);
			
 
				 }
			
 
				 
			
 
				 int cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned pinned)
			
@@ -323,16 +317,13 @@ int cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, un
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static void shutdown_system(float **matA, unsigned pinned)
			
 
				+static void shutdown_system(float **matA, unsigned dim, unsigned pinned)
			
 
				 {
			
 
				+	int flags = STARPU_MALLOC_SIMULATION_FOLDED;
			
 
				 	if (pinned)
			
 
				-	{
			
 
				-	     starpu_free(*matA);
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-	     free(*matA);
			
 
				-	}
			
 
				+		flags |= STARPU_MALLOC_PINNED;
			
 
				+
			
 
				+	starpu_free_flags(*matA, dim*dim*sizeof(float), flags);
			
 
				 
			
 
				 	starpu_cublas_shutdown();
			
 
				 	starpu_shutdown();
			
@@ -433,6 +424,6 @@ int main(int argc, char **argv)
 
				 	free(test_mat);
			
 
				 #endif
			
 
				 
			
 
				-	shutdown_system(&mat, pinned);
			
 
				+	shutdown_system(&mat, size, pinned);
			
 
				 	return ret;
			
 
				 }
			
--- a/examples/cholesky/cholesky_implicit.c
+++ b/examples/cholesky/cholesky_implicit.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2015  Université de Bordeaux
			
 
				+ * Copyright (C) 2009-2016  Université de Bordeaux
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013  CNRS
			
 
				  *
			
@@ -192,8 +192,8 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 
				 	float *mat = NULL;
			
 
				 	unsigned i,j;
			
 
				 
			
 
				+	starpu_malloc_flags((void **)&mat, (size_t)size*size*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
			
 
				 #ifndef STARPU_SIMGRID
			
 
				-	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
			
 
				 	for (i = 0; i < size; i++)
			
 
				 	{
			
 
				 		for (j = 0; j < size; j++)
			
@@ -303,7 +303,7 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 
				 	        }
			
 
				 		free(test_mat);
			
 
				 	}
			
 
				-	starpu_free(mat);
			
 
				+	starpu_free_flags(mat, (size_t)size*size*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
			
 
				 }
			
 
				 
			
 
				 int main(int argc, char **argv)
			
--- a/examples/cholesky/cholesky_tag.c
+++ b/examples/cholesky/cholesky_tag.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2015  Université de Bordeaux
			
 
				+ * Copyright (C) 2009-2016  Université de Bordeaux
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013  CNRS
			
 
				  *
			
@@ -230,6 +230,7 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
				 static int initialize_system(float **A, unsigned dim, unsigned pinned)
			
 
				 {
			
 
				 	int ret;
			
 
				+	int flags = STARPU_MALLOC_SIMULATION_FOLDED;
			
 
				 
			
 
				 #ifdef STARPU_HAVE_MAGMA
			
 
				 	magma_init();
			
@@ -252,16 +253,10 @@ static int initialize_system(float **A, unsigned dim, unsigned pinned)
 
				 
			
 
				 	starpu_cublas_init();
			
 
				 
			
 
				-#ifndef STARPU_SIMGRID
			
 
				 	if (pinned)
			
 
				-	{
			
 
				-		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		*A = malloc(dim*dim*sizeof(float));
			
 
				-	}
			
 
				-#endif
			
 
				+		flags |= STARPU_MALLOC_PINNED;
			
 
				+	starpu_malloc_flags((void **)A, dim*dim*sizeof(float), flags);
			
 
				+
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
@@ -294,16 +289,13 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
				 	starpu_data_unregister(dataA);
			
 
				 }
			
 
				 
			
 
				-static void shutdown_system(float **matA, unsigned pinned)
			
 
				+static void shutdown_system(float **matA, unsigned dim, unsigned pinned)
			
 
				 {
			
 
				+	int flags = STARPU_MALLOC_SIMULATION_FOLDED;
			
 
				 	if (pinned)
			
 
				-	{
			
 
				-		starpu_free(*matA);
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		free(*matA);
			
 
				-	}
			
 
				+		flags |= STARPU_MALLOC_PINNED;
			
 
				+
			
 
				+	starpu_free_flags(*matA, dim*dim*sizeof(float), flags);
			
 
				 
			
 
				 	starpu_cublas_shutdown();
			
 
				 	starpu_shutdown();
			
@@ -404,6 +396,6 @@ int main(int argc, char **argv)
 
				 	free(test_mat);
			
 
				 #endif
			
 
				 
			
 
				-	shutdown_system(&mat, pinned);
			
 
				+	shutdown_system(&mat, size, pinned);
			
 
				 	return 0;
			
 
				 }
			
--- a/examples/cholesky/cholesky_tile_tag.c
+++ b/examples/cholesky/cholesky_tile_tag.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2015  Université de Bordeaux
			
 
				+ * Copyright (C) 2009-2016  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -249,21 +249,17 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_cublas_init();
			
 
				 
			
 
				-#ifndef STARPU_SIMGRID
			
 
				 	for (y = 0; y < nblocks; y++)
			
 
				 	for (x = 0; x < nblocks; x++)
			
 
				 	{
			
 
				 		if (x <= y)
			
 
				 		{
			
 
				-#ifdef STARPU_HAVE_POSIX_MEMALIGN
			
 
				-			posix_memalign((void **)&A[y][x], 128, BLOCKSIZE*BLOCKSIZE*sizeof(float));
			
 
				-#else
			
 
				-			A[y][x] = malloc(BLOCKSIZE*BLOCKSIZE*sizeof(float));
			
 
				-#endif
			
 
				+			starpu_malloc_flags((void **)&A[y][x], BLOCKSIZE*BLOCKSIZE*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
			
 
				 			assert(A[y][x]);
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 	/* create a simple definite positive symetric matrix example
			
 
				 	 *
			
 
				 	 *	Hilbert matrix : h(i,j) = 1/(i+j+1) ( + n In to make is stable ) 
			
@@ -304,7 +300,7 @@ int main(int argc, char **argv)
 
				 		if (x <= y)
			
 
				 		{
			
 
				 			starpu_data_unregister(A_state[y][x]);
			
 
				-			free(A[y][x]);
			
 
				+			starpu_free_flags(A[y][x], BLOCKSIZE*BLOCKSIZE*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
			
 
				 		}
			
 
				 	}
			
 
				 
			
--- a/examples/lu/lu_example.c
+++ b/examples/lu/lu_example.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2015  Université de Bordeaux
			
 
				+ * Copyright (C) 2009-2016  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2015  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -175,12 +175,13 @@ void copy_matrix_into_blocks(void)
 
				 static void init_matrix(void)
			
 
				 {
			
 
				 	/* allocate matrix */
			
 
				-	starpu_malloc((void **)&A, (size_t)size*size*sizeof(TYPE));
			
 
				+	starpu_malloc_flags((void **)&A, (size_t)size*size*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
			
 
				 	STARPU_ASSERT(A);
			
 
				 
			
 
				 	starpu_srand48((long int)time(NULL));
			
 
				 	/* starpu_srand48(0); */
			
 
				 
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 	/* initialize matrix content */
			
 
				 	unsigned long i,j;
			
 
				 	for (j = 0; j < size; j++)
			
@@ -196,6 +197,7 @@ static void init_matrix(void)
 
				 				A[i + j*size] *= 100;
			
 
				 		}
			
 
				 	}
			
 
				+#endif
			
 
				 
			
 
				 }
			
 
				 
			
@@ -325,9 +327,9 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_cublas_init();
			
 
				 
			
 
				-#ifndef STARPU_SIMGRID
			
 
				 	init_matrix();
			
 
				 
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 	unsigned *ipiv = NULL;
			
 
				 	if (check)
			
 
				 		save_matrix();
			
@@ -418,7 +420,7 @@ int main(int argc, char **argv)
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				-	starpu_free(A);
			
 
				+	starpu_free_flags(A, (size_t)size*size*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
			
 
				 
			
 
				 	starpu_cublas_shutdown();
			
 
				 
			
--- a/examples/mult/xgemm.c
+++ b/examples/mult/xgemm.c
@@ -86,11 +86,11 @@ static void init_problem_data(void)
 
				 {
			
 
				 	unsigned i,j;
			
 
				 
			
 
				-#ifndef STARPU_SIMGRID
			
 
				-	starpu_malloc((void **)&A, zdim*ydim*sizeof(TYPE));
			
 
				-	starpu_malloc((void **)&B, xdim*zdim*sizeof(TYPE));
			
 
				-	starpu_malloc((void **)&C, xdim*ydim*sizeof(TYPE));
			
 
				+	starpu_malloc_flags((void **)&A, zdim*ydim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
			
 
				+	starpu_malloc_flags((void **)&B, xdim*zdim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
			
 
				+	starpu_malloc_flags((void **)&C, xdim*ydim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
			
 
				 
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 	/* fill the A and B matrices */
			
 
				 	for (j=0; j < ydim; j++)
			
 
				 	{
			
@@ -397,9 +397,9 @@ enodev:
 
				 	if (check)
			
 
				 		check_output();
			
 
				 
			
 
				-	starpu_free(A);
			
 
				-	starpu_free(B);
			
 
				-	starpu_free(C);
			
 
				+	starpu_free_flags(A, zdim*ydim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
			
 
				+	starpu_free_flags(B, xdim*zdim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
			
 
				+	starpu_free_flags(C, xdim*ydim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
			
 
				 
			
 
				 	starpu_cublas_shutdown();
			
 
				 	starpu_shutdown();
			
--- a/examples/stencil/stencil-blocks.c
+++ b/examples/stencil/stencil-blocks.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010, 2013-2015  Université de Bordeaux
			
 
				+ * Copyright (C) 2010, 2013-2016  Université de Bordeaux
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -254,9 +254,8 @@ static void allocate_block_on_node(starpu_data_handle_t *handleptr, TYPE **ptr,
 
				 	size_t block_size = nx*ny*nz*sizeof(TYPE);
			
 
				 
			
 
				 	/* Allocate memory */
			
 
				-#ifndef STARPU_SIMGRID
			
 
				 #if 1
			
 
				-	ret = starpu_malloc((void **)ptr, block_size);
			
 
				+	ret = starpu_malloc_flags((void **)ptr, block_size, STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
			
 
				 	STARPU_ASSERT(ret == 0);
			
 
				 #else
			
 
				 	*ptr = malloc(block_size);
			
@@ -265,6 +264,7 @@ static void allocate_block_on_node(starpu_data_handle_t *handleptr, TYPE **ptr,
 
				 
			
 
				 	allocated += block_size;
			
 
				 
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 	/* Fill the blocks with 0 */
			
 
				 	memset(*ptr, 0, block_size);
			
 
				 #endif
			
@@ -273,11 +273,12 @@ static void allocate_block_on_node(starpu_data_handle_t *handleptr, TYPE **ptr,
 
				 	starpu_block_data_register(handleptr, STARPU_MAIN_RAM, (uintptr_t)*ptr, nx, nx*ny, nx, ny, nz, sizeof(TYPE));
			
 
				 }
			
 
				 
			
 
				-static void free_block_on_node(starpu_data_handle_t handleptr)
			
 
				+static void free_block_on_node(starpu_data_handle_t handleptr, unsigned nx, unsigned ny, unsigned nz)
			
 
				 {
			
 
				 	void *ptr = (void *) starpu_block_get_local_ptr(handleptr);
			
 
				+	size_t block_size = nx*ny*nz*sizeof(TYPE);
			
 
				 	starpu_data_unregister(handleptr);
			
 
				-	starpu_free(ptr);
			
 
				+	starpu_free_flags(ptr, block_size, STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
			
 
				 }
			
 
				 
			
 
				 void display_memory_consumption(int rank)
			
@@ -351,24 +352,24 @@ void free_memory_on_node(int rank)
 
				 		/* Main blocks */
			
 
				 		if (node == rank)
			
 
				 		{
			
 
				-			free_block_on_node(block->layers_handle[0]);
			
 
				-			free_block_on_node(block->layers_handle[1]);
			
 
				+			free_block_on_node(block->layers_handle[0], (sizex + 2*K), (sizey + 2*K), K);
			
 
				+			free_block_on_node(block->layers_handle[1], (sizex + 2*K), (sizey + 2*K), K);
			
 
				 		}
			
 
				 
			
 
				 		/* Boundary blocks : Top */
			
 
				 		int top_node = block->boundary_blocks[T]->mpi_node;
			
 
				 		if ((node == rank) || (top_node == rank))
			
 
				 		{
			
 
				-			free_block_on_node(block->boundaries_handle[T][0]);
			
 
				-			free_block_on_node(block->boundaries_handle[T][1]);
			
 
				+			free_block_on_node(block->boundaries_handle[T][0], (sizex + 2*K), (sizey + 2*K), K);
			
 
				+			free_block_on_node(block->boundaries_handle[T][1], (sizex + 2*K), (sizey + 2*K), K);
			
 
				 		}
			
 
				 
			
 
				 		/* Boundary blocks : Bottom */
			
 
				 		int bottom_node = block->boundary_blocks[B]->mpi_node;
			
 
				 		if ((node == rank) || (bottom_node == rank))
			
 
				 		{
			
 
				-			free_block_on_node(block->boundaries_handle[B][0]);
			
 
				-			free_block_on_node(block->boundaries_handle[B][1]);
			
 
				+			free_block_on_node(block->boundaries_handle[B][0], (sizex + 2*K), (sizey + 2*K), K);
			
 
				+			free_block_on_node(block->boundaries_handle[B][1], (sizex + 2*K), (sizey + 2*K), K);
			
 
				 		}
			
 
				 	}
			
 
				 }
			
--- a/include/starpu_stdlib.h
+++ b/include/starpu_stdlib.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2015  Université de Bordeaux
			
 
				+ * Copyright (C) 2010-2016  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -32,6 +32,8 @@ extern "C"
 
				 #define STARPU_MEMORY_WAIT	((1ULL)<<4)
			
 
				 #define STARPU_MEMORY_OVERFLOW	((1ULL)<<5)
			
 
				 
			
 
				+#define STARPU_MALLOC_SIMULATION_FOLDED	((1ULL)<<6)
			
 
				+
			
 
				 void starpu_malloc_set_align(size_t align);
			
 
				 
			
 
				 int starpu_malloc(void **A, size_t dim);
			
--- a/src/datawizard/malloc.c
+++ b/src/datawizard/malloc.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2010, 2012-2015  Université de Bordeaux
			
 
				+ * Copyright (C) 2009-2010, 2012-2016  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -28,10 +28,26 @@
 
				 #include <datawizard/malloc.h>
			
 
				 #include <core/simgrid.h>
			
 
				 
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+#include <sys/mman.h>
			
 
				+#include <fcntl.h>
			
 
				+#endif
			
 
				+
			
 
				+#ifndef O_BINARY
			
 
				+#define O_BINARY 0
			
 
				+#endif
			
 
				+
			
 
				 static size_t _malloc_align = sizeof(void*);
			
 
				 static int disable_pinning;
			
 
				 static int malloc_on_node_default_flags[STARPU_MAXNODES];
			
 
				 
			
 
				+/* This file is used for implementing "folded" allocation */
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+static int bogusfile = -1;
			
 
				+/* Reasonably "costless" */
			
 
				+#define BOGUSFILE_SIZE (16UL<<20)
			
 
				+#endif
			
 
				+
			
 
				 void starpu_malloc_set_align(size_t align)
			
 
				 {
			
 
				 	STARPU_ASSERT_MSG(!(align & (align - 1)), "Alignment given to starpu_malloc_set_align (%lu) must be a power of two", (unsigned long) align);
			
@@ -201,6 +217,60 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 
				 #endif /* STARPU_SIMGRID */
			
 
				 	}
			
 
				 
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+	if (flags & STARPU_MALLOC_SIMULATION_FOLDED)
			
 
				+	{
			
 
				+		/* Use "folded" allocation: the same file is mapped several
			
 
				+		 * times contiguously, to get a memory area one can read/write,
			
 
				+		 * without consuming memory */
			
 
				+
			
 
				+		/* First reserve memory area */
			
 
				+		void *buf = mmap (NULL, dim, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
			
 
				+		unsigned i;
			
 
				+		if (buf == MAP_FAILED)
			
 
				+		{
			
 
				+			_STARPU_DISP("Warning: could not allocate %luMiB of memory, you need to run \"sysctl vm.overcommit_memory=1\" as root to allow so big allocations\n", (unsigned long) (dim >> 20));
			
 
				+			ret = -ENOMEM;
			
 
				+			*A = NULL;
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			if (bogusfile == -1)
			
 
				+			{
			
 
				+				char *path = starpu_getenv("TMPDIR");
			
 
				+				if (!path)
			
 
				+					path = "/tmp";
			
 
				+				/* Create bogus file if not done already */
			
 
				+				char *name = _starpu_mktemp(path, O_RDWR | O_BINARY, &bogusfile);
			
 
				+				if (!name)
			
 
				+				{
			
 
				+					ret = errno;
			
 
				+					munmap(buf, dim);
			
 
				+					*A = NULL;
			
 
				+				}
			
 
				+				unlink(name);
			
 
				+				free(name);
			
 
				+				_starpu_ftruncate(bogusfile, BOGUSFILE_SIZE);
			
 
				+			}
			
 
				+			/* Map the bogus file in place of the anonymous memory */
			
 
				+			for (i = 0; i < dim / BOGUSFILE_SIZE; i++)
			
 
				+			{
			
 
				+				void *pos = (void*) ((unsigned long) buf + i * BOGUSFILE_SIZE);
			
 
				+				void *res = mmap(pos, BOGUSFILE_SIZE, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED, bogusfile, 0);
			
 
				+				STARPU_ASSERT(res == pos);
			
 
				+			}
			
 
				+
			
 
				+			if (dim % BOGUSFILE_SIZE)
			
 
				+			{
			
 
				+				void *pos = (void*) ((unsigned long) buf + i * BOGUSFILE_SIZE);
			
 
				+				void *res = mmap(pos, dim % BOGUSFILE_SIZE, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED, bogusfile, 0);
			
 
				+				STARPU_ASSERT(res == pos);
			
 
				+			}
			
 
				+			*A = buf;
			
 
				+		}
			
 
				+	}
			
 
				+	else
			
 
				+#endif
			
 
				 	if (_starpu_can_submit_scc_task())
			
 
				 	{
			
 
				 #ifdef STARPU_USE_SCC
			
@@ -359,6 +429,13 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 
				 	}
			
 
				 #endif /* STARPU_SIMGRID */
			
 
				 
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+	if (flags & STARPU_MALLOC_SIMULATION_FOLDED)
			
 
				+	{
			
 
				+		munmap(A, dim);
			
 
				+	}
			
 
				+	else
			
 
				+#endif
			
 
				 	if (_starpu_can_submit_scc_task())
			
 
				 	{
			
 
				 #ifdef STARPU_USE_SCC