
examples/stencil: add an implicit distributed flavour of the stencil example

Marc Sergent 8 years ago
parent commit c79e7c2d23

+ 27 - 2
examples/stencil/Makefile.am

@@ -93,12 +93,14 @@ endif
 ###################
 
 STARPU_EXAMPLES =				\
-	stencil
+	stencil						\
+	implicit_stencil
 
 examplebindir = $(libdir)/starpu/examples/stencil
 
 examplebin_PROGRAMS =				\
-	stencil
+	stencil							\
+	implicit_stencil
 
 stencil_SOURCES =				\
 	life.c					\
@@ -123,6 +125,29 @@ stencil_SOURCES +=				\
 	shadow_opencl.c
 endif
 
+implicit_stencil_SOURCES =				\
+	life.c					\
+	implicit-stencil-kernels.c			\
+	implicit-stencil-tasks.c				\
+	implicit-stencil-blocks.c			\
+	implicit-stencil.c
+
+noinst_HEADERS =				\
+	implicit-stencil.h				\
+	shadow.h
+
+if STARPU_USE_CUDA
+implicit_stencil_SOURCES +=				\
+	life_cuda.cu				\
+	shadow.cu
+endif
+
+if STARPU_USE_OPENCL
+implicit_stencil_SOURCES +=				\
+	life_opencl.c				\
+	shadow_opencl.c
+endif
+
 outs =						\
 	0.5.out					\
 	0.out					\

+ 4 - 0
examples/stencil/README

@@ -42,3 +42,7 @@ penalty ratio), run make pics or make view to get pictures.
 mpi.out: results on MPI.
 
 results: a few results
+
+You can also use the implicit distributed flavour of this application, called
+implicit_stencil, in which the communications between processes are
+automatically inferred by StarPU-MPI.
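+
+For instance, to run it on two MPI processes (any MPI launcher will do; mpirun
+is only shown as an example):
+
+  mpirun -np 2 ./implicit_stencil -nbz 64 -niter 32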

+ 445 - 0
examples/stencil/implicit-stencil-blocks.c

@@ -0,0 +1,445 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013-2016  Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "implicit-stencil.h"
+#include <math.h>
+
+/* Manage block and tags allocation */
+
+static struct block_description *blocks;
+static unsigned sizex, sizey, sizez;
+static unsigned nbz;
+static unsigned *block_sizes_z;
+
+/*
+ *	Tags for the various codelet completions
+ */
+
+/*
+ * common tag format:
+ */
+static starpu_tag_t tag_common(int z, int dir, int type)
+{
+	return (((((starpu_tag_t)type) << 4) | ((dir+1)/2)) << 32)|(starpu_tag_t)z;
+}
+
+/* Completion of last update tasks */
+starpu_tag_t TAG_FINISH(int z)
+{
+	z = (z + nbz)%nbz;
+
+	starpu_tag_t tag = tag_common(z, 0, 1);
+	return tag;
+}
+
+/* Completion of the save codelet for MPI send/recv */
+starpu_tag_t TAG_START(int z, int dir)
+{
+	z = (z + nbz)%nbz;
+
+	starpu_tag_t tag = tag_common(z, dir, 2);
+	return tag;
+}
+
+/*
+ * common MPI tag format:
+ */
+static int mpi_tag_common(int z, int dir, int layer_or_boundary, int buffer)
+{
+	return (z<<12) | (layer_or_boundary << 8) | ((((1+dir)/2))<<4) | buffer;
+}
+
+int MPI_TAG_LAYERS(int z, int buffer)
+{
+	z = (z + nbz)%nbz;
+
+    /* No direction for layers; the layer_or_boundary field is 0 */
+	int tag = mpi_tag_common(z, 0, 0, buffer);
+
+	return tag;
+}
+
+int MPI_TAG_BOUNDARIES(int z, int dir, int buffer)
+{
+	z = (z + nbz)%nbz;
+
+	int tag = mpi_tag_common(z, dir, 1, buffer);
+
+	return tag;
+}
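+
+/* Example: with nbz > 3, MPI_TAG_BOUNDARIES(3, +1, 1) gives
+ * (3<<12) | (1<<8) | (1<<4) | 1 = 0x3111. */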
+
+
+/*
+ *	Block descriptors
+ */
+
+/* Compute the size of the different blocks */
+static void compute_block_sizes(void)
+{
+	block_sizes_z = (unsigned *) malloc(nbz*sizeof(unsigned));
+	STARPU_ASSERT(block_sizes_z);
+
+	/* Perhaps the last chunk is smaller */
+	unsigned default_block_size = (sizez+nbz-1)/nbz;
+	unsigned remaining = sizez;
+
+	unsigned b;
+	for (b = 0; b < nbz; b++)
+	{
+		block_sizes_z[b] = MIN(default_block_size, remaining);
+		remaining -= block_sizes_z[b];
+	}
+
+	STARPU_ASSERT(remaining == 0);
+}
+
+unsigned get_block_size(int bz)
+{
+	return block_sizes_z[bz];
+}
+
+struct block_description *get_block_description(int z)
+{
+	z = (z + nbz)%nbz;
+
+	STARPU_ASSERT(&blocks[z]);
+
+	return &blocks[z];
+}
+
+int get_block_mpi_node(int z)
+{
+	z = (z + nbz)%nbz;
+	return blocks[z].mpi_node;
+}
+
+void create_blocks_array(unsigned _sizex, unsigned _sizey, unsigned _sizez, unsigned _nbz)
+{
+	/* Store the parameters */
+	nbz = _nbz;
+	sizex = _sizex;
+	sizey = _sizey;
+	sizez = _sizez;
+
+	/* Create a grid of block descriptors */
+	blocks = (struct block_description *) calloc(nbz, sizeof(struct block_description));
+	STARPU_ASSERT(blocks);
+
+	/* What is the size of the different blocks ? */
+	compute_block_sizes();
+
+	unsigned bz;
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description * block =
+				get_block_description(bz);
+
+		/* Which block is it ? */
+		block->bz = bz;
+
+		/* For simplicity, we store which blocks are the neighbours */
+		block->boundary_blocks[B] = get_block_description((bz-1+nbz)%nbz);
+		block->boundary_blocks[T] = get_block_description((bz+1)%nbz);
+	}
+}
+
+void free_blocks_array()
+{
+	free(blocks);
+	free(block_sizes_z);
+}
+
+/*
+ *	Initialization of the blocks
+ */
+
+void assign_blocks_to_workers(int rank)
+{
+	unsigned bz;
+
+	/* NB: perhaps we could count a GPU as multiple workers */
+
+	/* how many workers are there ? */
+	/*unsigned nworkers = starpu_worker_get_count();*/
+
+	/* how many blocks are on that MPI node ? */
+	unsigned nblocks = 0;
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block =
+				get_block_description(bz);
+
+		if (block->mpi_node == rank)
+			nblocks++;
+	}
+
+	/* how many blocks per worker ? */
+	/*unsigned nblocks_per_worker = (nblocks + nworkers - 1)/nworkers;*/
+
+	/* we now attribute up to nblocks_per_worker blocks per worker */
+	unsigned attributed = 0;
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block =
+				get_block_description(bz);
+
+		if (block->mpi_node == rank)
+		{
+			unsigned workerid;
+			/* Manage initial block distribution between CPU and GPU */
+		#if 0
+			#if 1
+			/* GPUs then CPUs */
+			if (attributed < 3*18)
+				workerid = attributed / 18;
+			else
+				workerid = 3+ (attributed - 3*18) / 2;
+			#else
+			/* GPUs interleaved with CPUs */
+			if ((attributed % 20) <= 1)
+				workerid = 3 + attributed / 20;
+			else if (attributed < 60)
+				workerid = attributed / 20;
+			else
+				workerid = (attributed - 60)/2 + 6;
+			#endif
+		#else
+			/* Only GPUs */
+			workerid = (attributed / 21) % 3;
+		#endif
+			/*= attributed/nblocks_per_worker;*/
+
+			block->preferred_worker = workerid;
+
+			attributed++;
+		}
+	}
+}
+
+
+
+void assign_blocks_to_mpi_nodes(int world_size)
+{
+	unsigned nzblocks_per_process = (nbz + world_size - 1) / world_size;
+
+	unsigned bz;
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block =
+				get_block_description(bz);
+
+		block->mpi_node = bz / nzblocks_per_process;
+	}
+}
+
+static size_t allocated = 0;
+
+static void allocate_block_on_node(starpu_data_handle_t *handleptr, TYPE **ptr, unsigned nx, unsigned ny, unsigned nz)
+{
+	int ret;
+	size_t block_size = nx*ny*nz*sizeof(TYPE);
+
+	/* Allocate memory */
+#if 1
+	ret = starpu_malloc_flags((void **)ptr, block_size, STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	STARPU_ASSERT(ret == 0);
+#else
+	*ptr = malloc(block_size);
+	STARPU_ASSERT(*ptr);
+#endif
+
+	allocated += block_size;
+
+//#ifndef STARPU_SIMGRID
+//	/* Fill the blocks with 0 */
+//	memset(*ptr, 0, block_size);
+//#endif
+
+	/* Register it to StarPU */
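+	/* (leading dimensions: ldy = nx and ldz = nx*ny, since the block is
+	 * allocated as a single contiguous buffer) */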
+	starpu_block_data_register(handleptr, STARPU_MAIN_RAM, (uintptr_t)*ptr, nx, nx*ny, nx, ny, nz, sizeof(TYPE));
+}
+
+static void free_block_on_node(starpu_data_handle_t handleptr, unsigned nx, unsigned ny, unsigned nz)
+{
+	void *ptr = (void *) starpu_block_get_local_ptr(handleptr);
+	size_t block_size = nx*ny*nz*sizeof(TYPE);
+	starpu_free_flags(ptr, block_size, STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_data_unregister(handleptr);
+}
+
+void display_memory_consumption(int rank, double time)
+{
+	FPRINTF(stderr, "%lu B of memory were allocated on node %d in %f ms\n", (unsigned long) allocated, rank, time/1000);
+}
+
+void allocate_memory_on_node(int rank)
+{
+	unsigned bz;
+
+    /* Allocate and register all the data handles with StarPU. */
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block = get_block_description(bz);
+		int node = block->mpi_node;
+		unsigned size_bz = block_sizes_z[bz];
+
+		if (node == rank)
+		{
+            /* Main blocks */
+			allocate_block_on_node(&block->layers_handle[0], &block->layers[0],
+						(sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
+			allocate_block_on_node(&block->layers_handle[1], &block->layers[1],
+						(sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
+
+            /* Boundary blocks : Top */
+			allocate_block_on_node(&block->boundaries_handle[T][0], &block->boundaries[T][0],
+						(sizex + 2*K), (sizey + 2*K), K);
+			allocate_block_on_node(&block->boundaries_handle[T][1], &block->boundaries[T][1],
+						(sizex + 2*K), (sizey + 2*K), K);
+
+            /* Boundary blocks : Bottom */
+			allocate_block_on_node(&block->boundaries_handle[B][0], &block->boundaries[B][0],
+						(sizex + 2*K), (sizey + 2*K), K);
+			allocate_block_on_node(&block->boundaries_handle[B][1], &block->boundaries[B][1],
+						(sizex + 2*K), (sizey + 2*K), K);
+		}
+
+        /* Register placeholder blocks (not allocated locally) to StarPU;
+         * StarPU-MPI will fetch their contents from the neighbour nodes when
+         * they are needed for the local computation. */
+        else
+        {
+            /* Main blocks */
+            starpu_block_data_register(&block->layers_handle[0], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), (size_bz + 2*K), sizeof(TYPE));
+            starpu_block_data_register(&block->layers_handle[1], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), (size_bz + 2*K), sizeof(TYPE));
+
+            /* Boundary blocks : Top */
+            starpu_block_data_register(&block->boundaries_handle[T][0], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), K, sizeof(TYPE));
+            starpu_block_data_register(&block->boundaries_handle[T][1], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), K, sizeof(TYPE));
+        
+            /* Boundary blocks : Bottom */
+            starpu_block_data_register(&block->boundaries_handle[B][0], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), K, sizeof(TYPE));
+            starpu_block_data_register(&block->boundaries_handle[B][1], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), K, sizeof(TYPE));
+        }
+
+
+        /* Register all data to StarPU-MPI, even the ones that are not
+         * allocated on the local node. */
+
+        /* Main blocks */
+        starpu_mpi_data_register(block->layers_handle[0], MPI_TAG_LAYERS(bz, 0), node);
+        starpu_mpi_data_register(block->layers_handle[1], MPI_TAG_LAYERS(bz, 1), node);
+
+        /* Boundary blocks : Top */
+        starpu_mpi_data_register(block->boundaries_handle[T][0], MPI_TAG_BOUNDARIES(bz, T, 0), node);
+        starpu_mpi_data_register(block->boundaries_handle[T][1], MPI_TAG_BOUNDARIES(bz, T, 1), node);
+
+        /* Boundary blocks : Bottom */
+        starpu_mpi_data_register(block->boundaries_handle[B][0], MPI_TAG_BOUNDARIES(bz, B, 0), node);
+        starpu_mpi_data_register(block->boundaries_handle[B][1], MPI_TAG_BOUNDARIES(bz, B, 1), node);
+	}
+
+    /* Initialize all the data in parallel */
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block = get_block_description(bz);
+		int node = block->mpi_node;
+		unsigned size_bz = block_sizes_z[bz];
+
+		if (node == rank)
+		{
+            /* Set all the data to 0 */
+            create_task_memset(sizex, sizey, bz);
+
+            /* Initialize the first layer with some random data */
+            create_task_initlayer(sizex, sizey, bz);
+		}
+	}
+    starpu_task_wait_for_all();
+}
+
+void free_memory_on_node(int rank)
+{
+	unsigned bz;
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block = get_block_description(bz);
+
+		int node = block->mpi_node;
+
+		/* Main blocks */
+		if (node == rank)
+		{
+			free_block_on_node(block->layers_handle[0], (sizex + 2*K), (sizey + 2*K), (get_block_size(bz) + 2*K));
+			free_block_on_node(block->layers_handle[1], (sizex + 2*K), (sizey + 2*K), (get_block_size(bz) + 2*K));
+		}
+        else
+        {
+            starpu_data_unregister(block->layers_handle[0]);
+            starpu_data_unregister(block->layers_handle[1]);
+        }
+
+		/* Boundary blocks : Top */
+		if (node == rank)
+		{
+			free_block_on_node(block->boundaries_handle[T][0], (sizex + 2*K), (sizey + 2*K), K);
+			free_block_on_node(block->boundaries_handle[T][1], (sizex + 2*K), (sizey + 2*K), K);
+		}
+        else
+        {
+            starpu_data_unregister(block->boundaries_handle[T][0]);
+            starpu_data_unregister(block->boundaries_handle[T][1]);
+        }
+
+		/* Boundary blocks : Bottom */
+		if (node == rank)
+		{
+			free_block_on_node(block->boundaries_handle[B][0], (sizex + 2*K), (sizey + 2*K), K);
+			free_block_on_node(block->boundaries_handle[B][1], (sizex + 2*K), (sizey + 2*K), K);
+		}
+        else
+        {
+            starpu_data_unregister(block->boundaries_handle[B][0]);
+            starpu_data_unregister(block->boundaries_handle[B][1]);
+        }
+	}
+}
+
+/* check how many cells are alive */
+void check(int rank)
+{
+	unsigned bz;
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block = get_block_description(bz);
+
+		int node = block->mpi_node;
+
+		/* Main blocks */
+		if (node == rank)
+		{
+			unsigned size_bz = block_sizes_z[bz];
+#ifdef LIFE
+			unsigned x, y, z;
+			unsigned sum = 0;
+			for (x = 0; x < sizex; x++)
+				for (y = 0; y < sizey; y++)
+					for (z = 0; z < size_bz; z++)
+						sum += block->layers[0][(K+x)+(K+y)*(sizex + 2*K)+(K+z)*(sizex+2*K)*(sizey+2*K)];
+			printf("block %u got %u/%u alive\n", bz, sum, sizex*sizey*size_bz);
+#endif
+		}
+	}
+}

+ 769 - 0
examples/stencil/implicit-stencil-kernels.c

@@ -0,0 +1,769 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2015  Université de Bordeaux
+ * Copyright (C) 2012, 2013, 2016  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "implicit-stencil.h"
+
+/* Computation Kernels */
+
+/*
+ * There are three codelets:
+ *
+ * - cl_update, which takes a block and the boundaries of its neighbours, loads
+ *   the boundaries into the block and performs some update loops:
+ *
+ *     comp. buffer      save. buffers        comp. buffer        save. buffers        comp. buffer
+ *   |     ...     |                                                                                      
+ *   |             | +------------------+ +------------------+                                            
+ *   |     #N+1    | | #N+1 bottom copy====>#N+1 bottom copy |                                            
+ *   +-------------+ +------------------+ +------------------+                                            
+ *   | #N top copy | |   #N top copy    | |                  |                                            
+ *   +-------------+ +------------------+ |                  |                                            
+ *                                        | #N               |                                            
+ *                                                 ...                                                    
+ *                                        |                  | +----------------+ +----------------------+
+ *                                        |                  | | #N bottom copy | | block #N bottom copy |
+ * ^                                      +------------------+ +----------------+ +----------------------+
+ * |                                      | #N-1 top copy   <====#N-1 top copy  | |  block #N-1          |
+ * |                                      +------------------+ +----------------+ |                      |
+ * Z                                                                                     ...
+ *
+ * - save_cl_top, which takes a block and its top boundary, and saves the top of
+ *   the block into the boundary (to be given as bottom of the neighbour above
+ *   this block).
+ *
+ *     comp. buffer      save. buffers        comp. buffer        save. buffers        comp. buffer
+ *   |     ...     |                                                                                      
+ *   |             | +------------------+ +------------------+                                            
+ *   |     #N+1    | | #N+1 bottom copy | | #N+1 bottom copy |                                            
+ *   +-------------+ +------------------+ +------------------+                                            
+ *   | #N top copy | |   #N top copy   <====                 |                                            
+ *   +-------------+ +------------------+ |..................|                                            
+ *                                        | #N               |                                            
+ *                                                 ...                                                    
+ *                                        |                  | +----------------+ +----------------------+
+ *                                        |                  | | #N bottom copy | | block #N bottom copy |
+ * ^                                      +------------------+ +----------------+ +----------------------+
+ * |                                      | #N-1 top copy    | | #N-1 top copy  | |  block #N-1          |
+ * |                                      +------------------+ +----------------+ |                      |
+ * Z                                                                                     ...
+ *
+ * - save_cl_bottom, which does the same for the bottom:
+ *     comp. buffer      save. buffers        comp. buffer        save. buffers        comp. buffer
+ *   |     ...     |                                                                                      
+ *   |             | +------------------+ +------------------+                                            
+ *   |     #N+1    | | #N+1 bottom copy | | #N+1 bottom copy |                                            
+ *   +-------------+ +------------------+ +------------------+                                            
+ *   | #N top copy | |   #N top copy    | |                  |                                            
+ *   +-------------+ +------------------+ |                  |                                            
+ *                                        | #N               |                                            
+ *                                                 ...                                                    
+ *                                        |..................| +----------------+ +----------------------+
+ *                                        |                 ====>#N bottom copy | | block #N bottom copy |
+ * ^                                      +------------------+ +----------------+ +----------------------+
+ * |                                      | #N-1 top copy    | | #N-1 top copy  | |  block #N-1          |
+ * |                                      +------------------+ +----------------+ |                      |
+ * Z                                                                                     ...
+ *
+ * The idea is that the computation buffers thus don't have to move: only their
+ * boundaries are copied to buffers that do move (be it between CPU and GPU,
+ * between GPUs, or over MPI).
+ *
+ * For each of the buffers above, there are two (0/1) buffers to make new/old switch costless.
+ */
+
+#if 0
+# define DEBUG(fmt, ...) fprintf(stderr,fmt,##__VA_ARGS__)
+#else
+# define DEBUG(fmt, ...) (void) 0
+#endif
+
+/* Record which GPU ran which block, for nice pictures */
+int who_runs_what_len;
+int *who_runs_what;
+int *who_runs_what_index;
+double *last_tick;
+
+/* Achieved iterations */
+static int achieved_iter;
+
+/* Record how many updates each worker performed */
+unsigned update_per_worker[STARPU_NMAXWORKERS];
+
+static void record_who_runs_what(struct block_description *block)
+{
+	double now, now2, diff, delta = get_ticks() * 1000;
+	int workerid = starpu_worker_get_id_check();
+
+	now = starpu_timing_now();
+	now2 = now - start;
+	diff = now2 - last_tick[block->bz];
+	while (diff >= delta)
+	{
+		last_tick[block->bz] += delta;
+		diff = now2 - last_tick[block->bz];
+		if (who_runs_what_index[block->bz] < who_runs_what_len)
+			who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = -1;
+	}
+
+	if (who_runs_what_index[block->bz] < who_runs_what_len)
+		who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid);
+}
+
+static void check_load(struct starpu_block_interface *block, struct starpu_block_interface *boundary)
+{
+	/* Sanity checks */
+	STARPU_ASSERT(block->nx == boundary->nx);
+	STARPU_ASSERT(block->ny == boundary->ny);
+	STARPU_ASSERT(boundary->nz == K);
+
+	/* NB: this is not fully guaranteed ... but it's *very* likely and that
+	 * makes our life much simpler */
+	STARPU_ASSERT(block->ldy == boundary->ldy);
+	STARPU_ASSERT(block->ldz == boundary->ldz);
+}
+
+/*
+ * Load a neighbour's boundary into block, CPU version
+ */
+static void load_subblock_from_buffer_cpu(void *_block,
+					void *_boundary,
+					unsigned firstz)
+{
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
+	check_load(block, boundary);
+
+	/* We do a contiguous memory transfer */
+	size_t boundary_size = K*block->ldz*block->elemsize;
+
+	unsigned offset = firstz*block->ldz;
+	TYPE *block_data = (TYPE *)block->ptr;
+	TYPE *boundary_data = (TYPE *)boundary->ptr;
+	memcpy(&block_data[offset], boundary_data, boundary_size);
+}
+
+/*
+ * Load a neighbour's boundary into block, CUDA version
+ */
+#ifdef STARPU_USE_CUDA
+static void load_subblock_from_buffer_cuda(void *_block,
+					void *_boundary,
+					unsigned firstz)
+{
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
+	check_load(block, boundary);
+
+	/* We do a contiguous memory transfer */
+	size_t boundary_size = K*block->ldz*block->elemsize;
+
+	unsigned offset = firstz*block->ldz;
+	TYPE *block_data = (TYPE *)block->ptr;
+	TYPE *boundary_data = (TYPE *)boundary->ptr;
+	cudaMemcpyAsync(&block_data[offset], boundary_data, boundary_size, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+}
+
+/*
+ * cl_update (CUDA version)
+ */
+static void update_func_cuda(void *descr[], void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+	if (block->bz == 0)
+		FPRINTF(stderr,"!!! DO update_func_cuda z %u CUDA%d !!!\n", block->bz, workerid);
+	else
+		DEBUG( "!!! DO update_func_cuda z %u CUDA%d !!!\n", block->bz, workerid);
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	int rank = 0;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	DEBUG( "!!!           RANK %d              !!!\n", rank);
+#endif
+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+
+	unsigned block_size_z = get_block_size(block->bz);
+	unsigned i;
+	update_per_worker[workerid]++;
+
+	record_who_runs_what(block);
+
+	/*
+	 *	Load neighbours' boundaries : TOP
+	 */
+
+	/* The offset along the z axis is (block_size_z + K) */
+	load_subblock_from_buffer_cuda(descr[0], descr[2], block_size_z+K);
+	load_subblock_from_buffer_cuda(descr[1], descr[3], block_size_z+K);
+
+	/*
+	 *	Load neighbours' boundaries : BOTTOM
+	 */
+	load_subblock_from_buffer_cuda(descr[0], descr[4], 0);
+	load_subblock_from_buffer_cuda(descr[1], descr[5], 0);
+
+	/*
+	 *	Stencils ... do the actual work here :) TODO
+	 */
+
+	for (i=1; i<=K; i++)
+	{
+		struct starpu_block_interface *oldb = descr[i%2], *newb = descr[(i+1)%2];
+		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
+
+		/* Shadow data */
+		cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+
+		/* And perform actual computation */
+#ifdef LIFE
+		cuda_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+#else
+		cudaMemcpyAsync(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+#endif /* LIFE */
+	}
+
+	if (block->bz == 0)
+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
+}
+#endif /* STARPU_USE_CUDA */
+
+/*
+ * Load a neighbour's boundary into block, OpenCL version
+ */
+#ifdef STARPU_USE_OPENCL
+static void load_subblock_from_buffer_opencl(struct starpu_block_interface *block,
+					struct starpu_block_interface *boundary,
+					unsigned firstz)
+{
+	check_load(block, boundary);
+
+	/* We do a contiguous memory transfer */
+	size_t boundary_size = K*block->ldz*block->elemsize;
+
+	unsigned offset = firstz*block->ldz;
+	cl_mem block_data = (cl_mem)block->dev_handle;
+	cl_mem boundary_data = (cl_mem)boundary->dev_handle;
+
+        cl_command_queue cq;
+        starpu_opencl_get_current_queue(&cq);
+        cl_int ret = clEnqueueCopyBuffer(cq, boundary_data, block_data, 0, offset, boundary_size, 0, NULL, NULL);
+	if (ret != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(ret);
+}
+
+/*
+ * cl_update (OpenCL version)
+ */
+static void update_func_opencl(void *descr[], void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+	if (block->bz == 0)
+		FPRINTF(stderr,"!!! DO update_func_opencl z %u OPENCL%d !!!\n", block->bz, workerid);
+	else
+		DEBUG( "!!! DO update_func_opencl z %u OPENCL%d !!!\n", block->bz, workerid);
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	int rank = 0;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	DEBUG( "!!!           RANK %d              !!!\n", rank);
+#endif
+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+
+	unsigned block_size_z = get_block_size(block->bz);
+	unsigned i;
+	update_per_worker[workerid]++;
+
+	record_who_runs_what(block);
+
+        cl_command_queue cq;
+        starpu_opencl_get_current_queue(&cq);
+
+	/*
+	 *	Load neighbours' boundaries : TOP
+	 */
+
+	/* The offset along the z axis is (block_size_z + K) */
+	load_subblock_from_buffer_opencl(descr[0], descr[2], block_size_z+K);
+	load_subblock_from_buffer_opencl(descr[1], descr[3], block_size_z+K);
+
+	/*
+	 *	Load neighbours' boundaries : BOTTOM
+	 */
+	load_subblock_from_buffer_opencl(descr[0], descr[4], 0);
+	load_subblock_from_buffer_opencl(descr[1], descr[5], 0);
+
+	/*
+	 *	Stencils ... do the actual work here :) TODO
+	 */
+
+	for (i=1; i<=K; i++)
+	{
+		struct starpu_block_interface *oldb = descr[i%2], *newb = descr[(i+1)%2];
+		TYPE *old = (void*) oldb->dev_handle, *newer = (void*) newb->dev_handle;
+
+		/* Shadow data */
+		opencl_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+
+		/* And perform actual computation */
+#ifdef LIFE
+		opencl_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+#else
+		cl_event event;
+                cl_int ret = clEnqueueCopyBuffer(cq, old, newer, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), 0, NULL, &event);
+		if (ret != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(ret);
+
+#endif /* LIFE */
+	}
+
+	if (block->bz == 0)
+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
+}
+#endif /* STARPU_USE_OPENCL */
+
+/*
+ * cl_update (CPU version)
+ */
+void update_func_cpu(void *descr[], void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+	if (block->bz == 0)
+		DEBUG("!!! DO update_func_cpu z %u CPU%d !!!\n", block->bz, workerid);
+	else
+		DEBUG("!!! DO update_func_cpu z %u CPU%d !!!\n", block->bz, workerid);
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	int rank = 0;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	DEBUG( "!!!           RANK %d            !!!\n", rank);
+#endif
+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+
+	unsigned block_size_z = get_block_size(block->bz);
+	unsigned i;
+	update_per_worker[workerid]++;
+
+	record_who_runs_what(block);
+
+	/*
+	 *	Load neighbours' boundaries : TOP
+	 */
+
+	/* The offset along the z axis is (block_size_z + K) */
+	load_subblock_from_buffer_cpu(descr[0], descr[2], block_size_z+K);
+	load_subblock_from_buffer_cpu(descr[1], descr[3], block_size_z+K);
+
+	/*
+	 *	Load neighbours' boundaries : BOTTOM
+	 */
+	load_subblock_from_buffer_cpu(descr[0], descr[4], 0);
+	load_subblock_from_buffer_cpu(descr[1], descr[5], 0);
+
+	/*
+	 *	Stencils ... do the actual work here :) TODO
+	 */
+
+	for (i=1; i<=K; i++)
+	{
+		struct starpu_block_interface *oldb = (struct starpu_block_interface *) descr[i%2], *newb = (struct starpu_block_interface *) descr[(i+1)%2];
+		TYPE *old = (TYPE*) oldb->ptr, *newer = (TYPE*) newb->ptr;
+
+		/* Shadow data */
+		unsigned ldy = oldb->ldy, ldz = oldb->ldz;
+		unsigned nx = oldb->nx, ny = oldb->ny, nz = oldb->nz;
+		unsigned x, y, z;
+		unsigned stepx = 1;
+		unsigned stepy = 1;
+		unsigned stepz = 1;
+		unsigned idx = 0;
+		unsigned idy = 0;
+		unsigned idz = 0;
+		TYPE *ptr = old;
+
+#		include "shadow.h"
+
+		/* And perform actual computation */
+#ifdef LIFE
+		life_update(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+#else
+		memcpy(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer));
+#endif /* LIFE */
+	}
+
+	if (block->bz == 0)
+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
+}
+
+/* Performance model and codelet structure */
+static struct starpu_perfmodel cl_update_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "cl_update" 
+};
+
+struct starpu_codelet cl_update =
+{
+	.cpu_funcs = {update_func_cpu},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {update_func_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {update_func_opencl},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
+#endif
+	.model = &cl_update_model,
+	.nbuffers = 6,
+	.modes = {STARPU_RW, STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
+};
+
+/*
+ * Save the block internal boundaries to give them to our neighbours.
+ */
+
+/* CPU version */
+static void load_subblock_into_buffer_cpu(void *_block,
+					void *_boundary,
+					unsigned firstz)
+{
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
+	check_load(block, boundary);
+
+	/* We do a contiguous memory transfer */
+	size_t boundary_size = K*block->ldz*block->elemsize;
+
+	unsigned offset = firstz*block->ldz;
+	TYPE *block_data = (TYPE *)block->ptr;
+	TYPE *boundary_data = (TYPE *)boundary->ptr;
+	memcpy(boundary_data, &block_data[offset], boundary_size);
+}
+
+/* CUDA version */
+#ifdef STARPU_USE_CUDA
+static void load_subblock_into_buffer_cuda(void *_block,
+					void *_boundary,
+					unsigned firstz)
+{
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
+	check_load(block, boundary);
+
+	/* We do a contiguous memory transfer */
+	size_t boundary_size = K*block->ldz*block->elemsize;
+
+	unsigned offset = firstz*block->ldz;
+	TYPE *block_data = (TYPE *)block->ptr;
+	TYPE *boundary_data = (TYPE *)boundary->ptr;
+	cudaMemcpyAsync(boundary_data, &block_data[offset], boundary_size, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+}
+#endif /* STARPU_USE_CUDA */
+
+/* OPENCL version */
+#ifdef STARPU_USE_OPENCL
+static void load_subblock_into_buffer_opencl(struct starpu_block_interface *block,
+					struct starpu_block_interface *boundary,
+					unsigned firstz)
+{
+	check_load(block, boundary);
+
+	/* We do a contiguous memory transfer */
+	size_t boundary_size = K*block->ldz*block->elemsize;
+
+	unsigned offset = firstz*block->ldz;
+	cl_mem block_data = (cl_mem)block->dev_handle;
+	cl_mem boundary_data = (cl_mem)boundary->dev_handle;
+
+        cl_command_queue cq;
+        starpu_opencl_get_current_queue(&cq);
+
+        cl_int ret = clEnqueueCopyBuffer(cq, block_data, boundary_data, offset, 0, boundary_size, 0, NULL, NULL);
+	if (ret != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(ret);
+}
+#endif /* STARPU_USE_OPENCL */
+
+/* Record how many top/bottom saves each worker performed */
+unsigned top_per_worker[STARPU_NMAXWORKERS];
+unsigned bottom_per_worker[STARPU_NMAXWORKERS];
+
+/* top save, CPU version */
+void dummy_func_top_cpu(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	top_per_worker[workerid]++;
+
+	DEBUG( "DO SAVE Bottom block %d\n", block->bz);
+
+	/* The offset along the z axis is (block_size_z + K)- K */
+	unsigned block_size_z = get_block_size(block->bz);
+
+	load_subblock_into_buffer_cpu(descr[0], descr[2], block_size_z);
+	load_subblock_into_buffer_cpu(descr[1], descr[3], block_size_z);
+}
+
+/* bottom save, CPU version */
+void dummy_func_bottom_cpu(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+    STARPU_ASSERT(block);
+
+	int workerid = starpu_worker_get_id_check();
+	bottom_per_worker[workerid]++;
+
+	DEBUG( "DO SAVE Top block %d\n", block->bz);
+
+	load_subblock_into_buffer_cpu(descr[0], descr[2], K);
+	load_subblock_into_buffer_cpu(descr[1], descr[3], K);
+}
+
+/* top save, CUDA version */
+#ifdef STARPU_USE_CUDA
+static void dummy_func_top_cuda(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	top_per_worker[workerid]++;
+
+	DEBUG( "DO SAVE Top block %d\n", block->bz);
+
+	/* The offset along the z axis is (block_size_z + K)- K */
+	unsigned block_size_z = get_block_size(block->bz);
+
+	load_subblock_into_buffer_cuda(descr[0], descr[2], block_size_z);
+	load_subblock_into_buffer_cuda(descr[1], descr[3], block_size_z);
+}
+
+/* bottom save, CUDA version */
+static void dummy_func_bottom_cuda(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	bottom_per_worker[workerid]++;
+
+	DEBUG( "DO SAVE Bottom block %d on CUDA\n", block->bz);
+
+	load_subblock_into_buffer_cuda(descr[0], descr[2], K);
+	load_subblock_into_buffer_cuda(descr[1], descr[3], K);
+}
+#endif /* STARPU_USE_CUDA */
+
+/* top save, OpenCL version */
+#ifdef STARPU_USE_OPENCL
+static void dummy_func_top_opencl(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	top_per_worker[workerid]++;
+
+	DEBUG( "DO SAVE Top block %d\n", block->bz);
+
+	/* The offset along the z axis is (block_size_z + K)- K */
+	unsigned block_size_z = get_block_size(block->bz);
+
+	load_subblock_into_buffer_opencl(descr[0], descr[2], block_size_z);
+	load_subblock_into_buffer_opencl(descr[1], descr[3], block_size_z);
+}
+
+/* bottom save, OPENCL version */
+static void dummy_func_bottom_opencl(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	bottom_per_worker[workerid]++;
+
+	DEBUG( "DO SAVE Bottom block %d on OPENCL\n", block->bz);
+
+	load_subblock_into_buffer_opencl(descr[0], descr[2], K);
+	load_subblock_into_buffer_opencl(descr[1], descr[3], K);
+}
+#endif /* STARPU_USE_OPENCL */
+
+/* Performance models and codelet for save */
+static struct starpu_perfmodel save_cl_bottom_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "save_cl_bottom" 
+};
+
+static struct starpu_perfmodel save_cl_top_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "save_cl_top" 
+};
+
+struct starpu_codelet save_cl_bottom =
+{
+	.cpu_funcs = {dummy_func_bottom_cpu},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {dummy_func_bottom_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {dummy_func_bottom_opencl},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
+#endif
+	.model = &save_cl_bottom_model,
+	.nbuffers = 4,
+	.modes = {STARPU_R, STARPU_R, STARPU_W, STARPU_W}
+};
+
+struct starpu_codelet save_cl_top =
+{
+	.cpu_funcs = {dummy_func_top_cpu},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {dummy_func_top_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {dummy_func_top_opencl},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
+#endif
+	.model = &save_cl_top_model,
+	.nbuffers = 4,
+	.modes = {STARPU_R, STARPU_R, STARPU_W, STARPU_W}
+};
+
+/* Memset a block's buffers */
+static void memset_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned sizex, sizey, bz;
+    starpu_codelet_unpack_args(arg, &sizex, &sizey, &bz);
+	struct block_description *block = get_block_description(bz);
+    unsigned size_bz = get_block_size(bz);
+
+    unsigned x,y,z;
+    for (x = 0; x < sizex + 2*K; x++)
+    {
+        for (y = 0; y < sizey + 2*K; y++)
+        {
+            /* Main blocks */
+            for (z = 0; z < size_bz + 2*K; z++)
+            {
+                block->layers[0][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0; 
+                block->layers[1][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0; 
+            }
+            for (z = 0; z < K; z++)
+            {
+                /* Boundary blocks : Top */
+                block->boundaries[T][0][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0; 
+                block->boundaries[T][1][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0; 
+
+                /* Boundary blocks : Bottom */
+                block->boundaries[B][0][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0; 
+                block->boundaries[B][1][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0; 
+            }
+
+        }
+    }
+    //memset(block->layers[0], 0, (sizex + 2*K)*(sizey + 2*K)*(size_bz + 2*K)*sizeof(block->layers[0]));
+    //memset(block->layers[1], 0, (sizex + 2*K)*(sizey + 2*K)*(size_bz + 2*K)*sizeof(block->layers[1]));
+
+    //memset(block->boundaries[T][0], 0, (sizex + 2*K)*(sizey + 2*K)*K*sizeof(block->boundaries[T][0]));
+    //memset(block->boundaries[T][1], 0, (sizex + 2*K)*(sizey + 2*K)*K*sizeof(block->boundaries[T][1]));
+
+    //memset(block->boundaries[B][0], 0, (sizex + 2*K)*(sizey + 2*K)*K*sizeof(block->boundaries[B][0]));
+    //memset(block->boundaries[B][1], 0, (sizex + 2*K)*(sizey + 2*K)*K*sizeof(block->boundaries[B][1]));
+}
+
+static double memset_cost_function(struct starpu_task *task, unsigned nimpl)
+{
+	(void) task;
+	(void) nimpl;
+	return 0.000001;
+}
+
+static struct starpu_perfmodel memset_model =
+{
+	.type = STARPU_COMMON,
+	.cost_function = memset_cost_function,
+	.symbol = "memset"
+};
+
+struct starpu_codelet cl_memset =
+{
+	.cpu_funcs = {memset_func},
+	.model = &memset_model,
+	.nbuffers = 6,
+	.modes = {STARPU_W, STARPU_W, STARPU_W, STARPU_W, STARPU_W, STARPU_W}
+};
+
+/* Initialize a block's layer */
+static void initlayer_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned sizex, sizey, bz;
+    starpu_codelet_unpack_args(arg, &sizex, &sizey, &bz);
+	struct block_description *block = get_block_description(bz);
+    unsigned size_bz = get_block_size(bz);
+
+    /* Initialize layer with some random data */
+    unsigned x, y, z;
+    unsigned sum = 0;
+    for (x = 0; x < sizex; x++)
+        for (y = 0; y < sizey; y++)
+            for (z = 0; z < size_bz; z++)
+                sum += block->layers[0][(K+x)+(K+y)*(sizex + 2*K)+(K+z)*(sizex+2*K)*(sizey+2*K)] = (int)((x/7.+y/13.+(bz*size_bz + z)/17.) * 10.) % 2;
+}
+
+static double initlayer_cost_function(struct starpu_task *task, unsigned nimpl)
+{
+	(void) task;
+	(void) nimpl;
+	return 0.000001;
+}
+
+static struct starpu_perfmodel initlayer_model =
+{
+	.type = STARPU_COMMON,
+	.cost_function = initlayer_cost_function,
+	.symbol = "initlayer"
+};
+
+struct starpu_codelet cl_initlayer =
+{
+	.cpu_funcs = {initlayer_func},
+	.model = &initlayer_model,
+	.nbuffers = 1,
+	.modes = {STARPU_W}
+};
+

+ 203 - 0
examples/stencil/implicit-stencil-tasks.c

@@ -0,0 +1,203 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013-2015  Université de Bordeaux
+ * Copyright (C) 2012, 2013, 2015  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "implicit-stencil.h"
+
+#define BIND_LAST 1
+
+/*
+ * Schedule tasks for updates and saves
+ */
+
+/*
+ * NB: iter = 0: initialization phase, TAG_U(z, 0) = TAG_INIT
+ *
+ * dir is -1 or +1.
+ */
+
+#if 0
+# define DEBUG(fmt, ...) fprintf(stderr,fmt,##__VA_ARGS__)
+#else
+# define DEBUG(fmt, ...)
+#endif
+ 
+#define starpu_insert_task(...) starpu_mpi_insert_task(MPI_COMM_WORLD, __VA_ARGS__)
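+
+/* starpu_mpi_insert_task() looks at which MPI node owns each piece of data
+ * accessed by a task, runs the task on the node that owns the data written
+ * to, and automatically posts the required send/receive communications, so
+ * no explicit MPI calls are needed in this implicit flavour. */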
+
+/* 
+ * Schedule initialization tasks 
+ */
+
+void create_task_memset(unsigned sizex, unsigned sizey, unsigned z)
+{
+	struct block_description *descr = get_block_description(z);
+    struct starpu_codelet *codelet = &cl_memset;
+
+    int ret = starpu_insert_task(
+            codelet,
+            STARPU_VALUE,   &sizex,  sizeof(unsigned),
+            STARPU_VALUE,   &sizey,  sizeof(unsigned),
+            STARPU_VALUE,   &z,  sizeof(unsigned),
+            STARPU_W,   descr->layers_handle[0],
+            STARPU_W,   descr->layers_handle[1],
+            STARPU_W,   descr->boundaries_handle[T][0],
+            STARPU_W,   descr->boundaries_handle[T][1],
+            STARPU_W,   descr->boundaries_handle[B][0],
+            STARPU_W,   descr->boundaries_handle[B][1],
+                0);
+
+    if (ret)
+    {
+        FPRINTF(stderr, "Could not submit task save: %d\n", ret);
+        if (ret == -ENODEV)
+            exit(77);
+        STARPU_ABORT();
+    }
+}
+
+void create_task_initlayer(unsigned sizex, unsigned sizey, unsigned z)
+{
+	struct block_description *descr = get_block_description(z);
+    struct starpu_codelet *codelet = &cl_initlayer;
+
+    int ret = starpu_insert_task(
+            codelet,
+            STARPU_VALUE,   &sizex,  sizeof(unsigned),
+            STARPU_VALUE,   &sizey,  sizeof(unsigned),
+            STARPU_VALUE,   &z,  sizeof(unsigned),
+            STARPU_W,   descr->layers_handle[0],
+                0);
+
+    if (ret)
+    {
+        FPRINTF(stderr, "Could not submit task save: %d\n", ret);
+        if (ret == -ENODEV)
+            exit(77);
+        STARPU_ABORT();
+    }
+}
+
+/*
+ * Schedule saving boundaries of blocks to communication buffers
+ */
+
+static void create_task_save_local(unsigned iter, unsigned z, int dir, int local_rank)
+{
+	struct block_description *descr = get_block_description(z);
+	int node_z = get_block_mpi_node(z);
+	int node_z_and_d = get_block_mpi_node(z+dir);
+    struct starpu_codelet *codelet;
+    int ret;
+
+    codelet = (dir == -1)?&save_cl_bottom:&save_cl_top;
+    ret = starpu_insert_task(
+            codelet,
+            STARPU_VALUE,   &z,  sizeof(unsigned),
+            STARPU_R,   descr->layers_handle[0],
+            STARPU_R,   descr->layers_handle[1],
+            STARPU_W,   descr->boundaries_handle[(1-dir)/2][0],
+            STARPU_W,   descr->boundaries_handle[(1-dir)/2][1],
+            STARPU_PRIORITY,    STARPU_MAX_PRIO,
+                0);
+
+    if (ret)
+    {
+        FPRINTF(stderr, "Could not submit task save: %d\n", ret);
+        if (ret == -ENODEV)
+            exit(77);
+        STARPU_ABORT();
+    }
+}
+
+/*
+ * Schedule update computation in computation buffer
+ */
+
+void create_task_update(unsigned iter, unsigned z, int local_rank)
+{
+	STARPU_ASSERT(iter != 0);
+
+	unsigned old_layer = (K*(iter-1)) % 2;
+	unsigned new_layer = (old_layer + 1) % 2;
+
+	struct block_description *descr = get_block_description(z);
+	struct block_description *bottom_neighbour = descr->boundary_blocks[B];
+	struct block_description *top_neighbour = descr->boundary_blocks[T];
+
+	struct starpu_codelet *codelet = &cl_update;
+
+    // Single-level prio
+    //int prio = ((bottom_neighbour->mpi_node != local_rank) || (top_neighbour->mpi_node != local_rank )) ? STARPU_MAX_PRIO : STARPU_DEFAULT_PRIO;
+
+    // Two-level prio
+    int prio = ((bottom_neighbour->mpi_node != local_rank) || (top_neighbour->mpi_node != local_rank )) ? STARPU_MAX_PRIO :
+               ((bottom_neighbour->boundary_blocks[B]->mpi_node != local_rank) || (top_neighbour->boundary_blocks[T]->mpi_node != local_rank )) ? STARPU_MAX_PRIO-1 : STARPU_DEFAULT_PRIO;
+
+    int ret = starpu_insert_task(
+            codelet,
+            STARPU_VALUE,   &z,  sizeof(unsigned),
+	        STARPU_RW,      descr->layers_handle[old_layer],
+	        STARPU_RW,      descr->layers_handle[new_layer],
+	        STARPU_R,       bottom_neighbour->boundaries_handle[T][old_layer],
+	        STARPU_R,       bottom_neighbour->boundaries_handle[T][new_layer],
+	        STARPU_R,       top_neighbour->boundaries_handle[B][old_layer],
+	        STARPU_R,       top_neighbour->boundaries_handle[B][new_layer],
+            STARPU_PRIORITY,    prio,
+                0);
+	if (ret)
+	{
+		FPRINTF(stderr, "Could not submit task update block: %d\n", ret);
+		if (ret == -ENODEV)
+			exit(77);
+		STARPU_ABORT();
+	}
+}
+
+/*
+ * Create all the tasks
+ */
+void create_tasks(int rank)
+{
+	int iter;
+	int bz;
+	int niter = get_niter();
+	int nbz = get_nbz();
+
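+	/* With starpu_mpi_insert_task(), every MPI process that owns a piece
+	 * of data accessed by a task has to submit that task, hence the checks
+	 * on the MPI node of each block and of its two neighbours below. */
+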
+	for (iter = 0; iter <= niter; iter++)
+	{
+	     for (bz = 0; bz < nbz; bz++)
+	     {
+		    if ((iter > 0) && ((get_block_mpi_node(bz) == rank)|| (get_block_mpi_node(bz+1) == rank)|| (get_block_mpi_node(bz-1) == rank)))
+			    create_task_update(iter, bz, rank);
+	     }
+
+	     for (bz = 0; bz < nbz; bz++)
+	     {
+		     if (iter != niter)
+		     {
+                int node_z = get_block_mpi_node(bz);
+                int node_z_and_b = get_block_mpi_node(bz-1);
+                int node_z_and_t = get_block_mpi_node(bz+1);
+
+                  if ((node_z == rank) || ((node_z != node_z_and_b) && (node_z_and_b == rank)))
+				     create_task_save_local(iter, bz, +1, rank);
+
+                  if ((node_z == rank) || ((node_z != node_z_and_t) && (node_z_and_t == rank)))
+				     create_task_save_local(iter, bz, -1, rank);
+		     }
+	     }
+	}
+}

+ 387 - 0
examples/stencil/implicit-stencil.c

@@ -0,0 +1,387 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
+ * Copyright (C) 2010-2012, 2014  Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "implicit-stencil.h"
+
+/* Main application */
+
+/* default parameter values */
+static unsigned  bind_tasks = 0;
+
+static unsigned ticks = 1000;
+
+#ifdef STARPU_QUICK_CHECK
+static unsigned niter = 4;
+#define SIZE 16
+#else
+static unsigned niter = 32;
+#define SIZE 128
+#endif
+
+/* Problem size */
+static unsigned sizex = SIZE;
+static unsigned sizey = SIZE;
+static unsigned sizez = 64*SIZE;
+
+/* Number of blocks (scattered over the different MPI processes) */
+unsigned nbz = 64;
+
+/* StarPU top variables */
+struct starpu_top_data* starpu_top_init_loop;
+struct starpu_top_data* starpu_top_achieved_loop;
+
+double start;
+double begin, end;
+double timing; 
+
+/*
+ *	Initialization
+ */
+
+unsigned get_bind_tasks(void)
+{
+	return bind_tasks;
+}
+
+unsigned get_nbz(void)
+{
+	return nbz;
+}
+
+unsigned get_niter(void)
+{
+	return niter;
+}
+
+unsigned get_ticks(void)
+{
+	return ticks;
+}
+
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-b") == 0)
+		{
+			bind_tasks = 1;
+		}
+
+		if (strcmp(argv[i], "-nbz") == 0)
+		{
+			nbz = atoi(argv[++i]);
+		}
+
+		if (strcmp(argv[i], "-sizex") == 0)
+		{
+			sizex = atoi(argv[++i]);
+		}
+
+		if (strcmp(argv[i], "-sizey") == 0)
+		{
+			sizey = atoi(argv[++i]);
+		}
+
+		if (strcmp(argv[i], "-sizez") == 0)
+		{
+			sizez = atoi(argv[++i]);
+		}
+
+		if (strcmp(argv[i], "-niter") == 0)
+		{
+			niter = atoi(argv[++i]);
+		}
+
+		if (strcmp(argv[i], "-ticks") == 0)
+		{
+			ticks = atoi(argv[++i]);
+		}
+
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
+		{
+			 fprintf(stderr, "Usage : %s [options...]\n", argv[0]);
+			 fprintf(stderr, "\n");
+			 fprintf(stderr, "Options:\n");
+			 fprintf(stderr, "-b			bind tasks on CPUs/GPUs\n");
+			 fprintf(stderr, "-nbz <n>		Number of blocks on Z axis (%u by default)\n", nbz);
+			 fprintf(stderr, "-size[xyz] <size>	Domain size on x/y/z axis (%ux%ux%u by default)\n", sizex, sizey, sizez);
+			 fprintf(stderr, "-niter <n>		Number of iterations (%u by default)\n", niter);
+			 fprintf(stderr, "-ticks <t>		How often to put ticks in the output (ms, %u by default)\n", ticks);
+			 exit(0);
+		}
+	}
+}
+
+static void init_problem(int argc, char **argv, int rank, int world_size)
+{
+	parse_args(argc, argv);
+
+	if (getenv("STARPU_TOP"))
+	{
+		starpu_top_init_loop = starpu_top_add_data_integer("Task creation iter", 0, niter, 1);
+		starpu_top_achieved_loop = starpu_top_add_data_integer("Task achieved iter", 0, niter, 1);
+		starpu_top_init_and_wait("stencil_top example");
+	}
+	create_blocks_array(sizex, sizey, sizez, nbz);
+
+	/* Select the MPI process which should compute the different blocks */
+	assign_blocks_to_mpi_nodes(world_size);
+
+	assign_blocks_to_workers(rank);
+
+	/* Allocate the different memory blocks, if used by the MPI process */
+	start = starpu_timing_now();
+
+	allocate_memory_on_node(rank);
+
+	end = starpu_timing_now();
+	timing = end - start;
+
+	display_memory_consumption(rank, timing);
+
+	who_runs_what_len = 2*niter;
+	who_runs_what = (int *) calloc(nbz * who_runs_what_len, sizeof(*who_runs_what));
+	who_runs_what_index = (int *) calloc(nbz, sizeof(*who_runs_what_index));
+	last_tick = (double *) calloc(nbz, sizeof(*last_tick));
+}
+
+static void free_problem(int rank)
+{
+    free_memory_on_node(rank);
+	free_blocks_array();
+	free(who_runs_what);
+	free(who_runs_what_index);
+	free(last_tick);
+}
+
+/*
+ *	Main body
+ */
+
+void f(unsigned task_per_worker[STARPU_NMAXWORKERS])
+{
+	unsigned total = 0;
+	int worker;
+
+	for (worker = 0; worker < STARPU_NMAXWORKERS; worker++)
+		total += task_per_worker[worker];
+	for (worker = 0; worker < STARPU_NMAXWORKERS; worker++)
+	{
+		if (task_per_worker[worker])
+		{
+			char name[32];
+			starpu_worker_get_name(worker, name, sizeof(name));
+			FPRINTF(stderr,"\t%s -> %u (%2.2f%%)\n", name, task_per_worker[worker], (100.0*task_per_worker[worker])/total);
+		}
+	}
+}
+
+unsigned global_workerid(unsigned local_workerid)
+{
+#ifdef STARPU_USE_MPI
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	unsigned workers_per_node = starpu_worker_get_count();
+
+	return (local_workerid + rank*workers_per_node);
+#else
+	return local_workerid;
+#endif
+}
+
+int main(int argc, char **argv)
+{
+	int rank;
+	int world_size;
+	int ret;
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	int thread_support;
+	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support))
+	{
+		FPRINTF(stderr, "MPI_Init_thread failed\n");
+	}
+	if (thread_support == MPI_THREAD_FUNNELED)
+		FPRINTF(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
+	if (thread_support < MPI_THREAD_FUNNELED)
+		FPRINTF(stderr,"Warning: MPI does not have thread support!\n");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+#else
+	rank = 0;
+	world_size = 1;
+#endif
+
+	if (rank == 0)
+	{
+		FPRINTF(stderr, "Running on %d nodes\n", world_size);
+		fflush(stderr);
+	}
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+#endif
+
+#ifdef STARPU_USE_OPENCL
+        opencl_life_init();
+        opencl_shadow_init();
+#endif /*STARPU_USE_OPENCL*/
+
+	init_problem(argc, argv, rank, world_size);
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+#endif
+	if (rank == 0)
+		FPRINTF(stderr, "GO !\n");
+
+	start = starpu_timing_now();
+
+	begin = starpu_timing_now();
+
+	create_tasks(rank);
+
+	//starpu_tag_notify_from_apps(TAG_INIT_TASK);
+
+	//wait_end_tasks(rank);
+
+    starpu_task_wait_for_all();
+
+	end = starpu_timing_now();
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+#endif
+
+#if 0
+	check(rank);
+#endif
+
+	/*display_debug(nbz, niter, rank);*/
+
+	/* timing in us */
+	timing = end - begin;
+
+	double min_timing = timing;
+	double max_timing = timing;
+	double sum_timing = timing;
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	int reduce_ret;
+
+	reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	reduce_ret = MPI_Reduce(&timing, &max_timing, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	reduce_ret = MPI_Reduce(&timing, &sum_timing, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	/* XXX we should do a gather instead, here we assume that non initialized values are still 0 */
+	int *who_runs_what_tmp = malloc(nbz * who_runs_what_len * sizeof(*who_runs_what));
+	reduce_ret = MPI_Reduce(who_runs_what, who_runs_what_tmp, nbz * who_runs_what_len, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	memcpy(who_runs_what, who_runs_what_tmp, nbz * who_runs_what_len * sizeof(*who_runs_what));
+	free(who_runs_what_tmp);
+
+	/* XXX we should do a gather instead, here we assume that non initialized values are still 0 */
+	int *who_runs_what_index_tmp = malloc(nbz * sizeof(*who_runs_what_index));
+	reduce_ret = MPI_Reduce(who_runs_what_index, who_runs_what_index_tmp, nbz, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	memcpy(who_runs_what_index, who_runs_what_index_tmp, nbz * sizeof(*who_runs_what_index));
+	free(who_runs_what_index_tmp);
+#endif
+
+	if (rank == 0)
+	{
+#if 1 
+		FPRINTF(stderr, "update:\n");
+		f(update_per_worker);
+		FPRINTF(stderr, "top:\n");
+		f(top_per_worker);
+		FPRINTF(stderr, "bottom:\n");
+		f(bottom_per_worker);
+#endif
+#if 1
+		unsigned nzblocks_per_process = (nbz + world_size - 1) / world_size;
+
+		int iter;
+		for (iter = 0; iter < who_runs_what_len; iter++)
+		{
+			unsigned last, bz;
+			last = 1;
+			for (bz = 0; bz < nbz; bz++)
+			{
+				if ((bz % nzblocks_per_process) == 0)
+					FPRINTF(stderr, "| ");
+
+				if (who_runs_what_index[bz] <= iter)
+					FPRINTF(stderr,"_ ");
+				else
+				{
+					last = 0;
+					if (who_runs_what[bz + iter * nbz] == -1)
+						FPRINTF(stderr,"* ");
+					else
+						FPRINTF(stderr, "%d ", who_runs_what[bz + iter * nbz]);
+				}
+			}
+			FPRINTF(stderr, "\n");
+
+			if (last)
+				break;
+		}
+#endif
+
+		fflush(stderr);
+
+		FPRINTF(stdout, "Computation took: %f ms on %d MPI processes\n", max_timing/1000, world_size);
+		FPRINTF(stdout, "\tMIN : %f ms\n", min_timing/1000);
+		FPRINTF(stdout, "\tMAX : %f ms\n", max_timing/1000);
+		FPRINTF(stdout, "\tAVG : %f ms\n", sum_timing/(world_size*1000));
+	}
+
+	free_problem(rank);
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	starpu_mpi_shutdown();
+#endif
+
+	starpu_shutdown();
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	MPI_Finalize();
+#endif
+
+#ifdef STARPU_USE_OPENCL
+        opencl_life_free();
+        opencl_shadow_free();
+#endif /*STARPU_USE_OPENCL*/
+
+	return 0;
+}

+ 157 - 0
examples/stencil/implicit-stencil.h

@@ -0,0 +1,157 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ * Copyright (C) 2010-2011, 2014  Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __IMPLICIT_STENCIL_H__
+#define __IMPLICIT_STENCIL_H__
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <starpu.h>
+
+#ifndef __CUDACC__
+#ifdef STARPU_USE_MPI
+#include <mpi.h>
+#include <starpu_mpi.h>
+#endif
+#endif
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
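+/* When LIFE is defined, each iteration computes a Game of Life update;
+ * otherwise the kernels simply copy the previous layer. */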
+#define LIFE
+
+#ifdef LIFE
+#define TYPE	unsigned char
+extern void life_update(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int nz, int ldy, int ldz, int iter);
+#else
+#define TYPE	float
+#endif
+
+#define K	1
+
+#define NDIRS 2
+extern struct starpu_top_data* starpu_top_init_loop;
+extern struct starpu_top_data* starpu_top_achieved_loop;
+
+
+/* Split only on the z axis to make things simple */
+typedef enum
+{
+	B = 0,
+	T = 1
+} direction;
+
+/* Description of a domain block */
+struct block_description
+{
+	/* Which MPI node should process that block ? */
+	int mpi_node;
+
+	unsigned preferred_worker;
+
+	unsigned bz;
+
+
+	/* For each of the following buffers, there are two (0/1) buffers to
+	 * make new/old switch costless. */
+
+	/* This is the computation buffer for this block, it includes
+	 * neighbours' border to make computation easier */
+	TYPE *layers[2];
+	starpu_data_handle_t layers_handle[2];
+
+	/* This is the "save" buffer, i.e. a copy of our neighbour's border.
+	 * This one is used for CPU/GPU or MPI communication (rather than the
+	 * whole domain block) */
+	TYPE *boundaries[NDIRS][2];
+	starpu_data_handle_t boundaries_handle[NDIRS][2];
+
+	/* Shortcut pointer to the neighbours */
+	struct block_description *boundary_blocks[NDIRS];
+};
+
+#define TAG_INIT_TASK			((starpu_tag_t)1)
+
+starpu_tag_t TAG_FINISH(int z);
+starpu_tag_t TAG_START(int z, int dir);
+int MPI_TAG_LAYERS(int z, int buffer);
+int MPI_TAG_BOUNDARIES(int z, int dir, int buffer);
+
+#define MIN(a,b)	((a)<(b)?(a):(b))
+
+void create_blocks_array(unsigned sizex, unsigned sizey, unsigned sizez, unsigned nbz);
+void free_blocks_array();
+struct block_description *get_block_description(int z);
+void assign_blocks_to_mpi_nodes(int world_size);
+void allocate_memory_on_node(int rank);
+void assign_blocks_to_workers(int rank);
+void create_tasks(int rank);
+void wait_end_tasks(int rank);
+void check(int rank);
+void free_memory_on_node(int rank);
+
+void display_memory_consumption(int rank, double time);
+
+int get_block_mpi_node(int z);
+unsigned get_block_size(int z);
+unsigned get_bind_tasks(void);
+
+unsigned get_nbz(void);
+unsigned get_niter(void);
+unsigned get_ticks(void);
+
+unsigned global_workerid(unsigned local_workerid);
+
+void create_task_memset(unsigned sizex, unsigned sizey, unsigned z);
+void create_task_initlayer(unsigned sizex, unsigned sizey, unsigned z);
+void create_task_update(unsigned iter, unsigned z, int local_rank);
+void create_task_save(unsigned iter, unsigned z, int dir, int local_rank);
+
+extern int starpu_mpi_initialize(void);
+extern int starpu_mpi_shutdown(void);
+
+/* kernels */
+extern struct starpu_codelet cl_update;
+extern struct starpu_codelet save_cl_bottom;
+extern struct starpu_codelet save_cl_top;
+extern struct starpu_codelet cl_memset;
+extern struct starpu_codelet cl_initlayer;
+
+extern unsigned update_per_worker[STARPU_NMAXWORKERS];
+extern unsigned top_per_worker[STARPU_NMAXWORKERS];
+extern unsigned bottom_per_worker[STARPU_NMAXWORKERS];
+
+extern double start;
+extern int who_runs_what_len;
+extern int *who_runs_what;
+extern int *who_runs_what_index;
+extern double *last_tick;
+
+#ifndef _externC
+#define _externC
+#endif
+
+_externC void cuda_life_update_host(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int nz, int ldy, int ldz, int iter);
+_externC void cuda_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, int i);
+
+_externC void opencl_shadow_init(void);
+_externC void opencl_shadow_free(void);
+_externC void opencl_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, int i);
+_externC void opencl_life_init(void);
+_externC void opencl_life_free(void);
+_externC void opencl_life_update_host(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int nz, int ldy, int ldz, int iter);
+
+#endif /* __IMPLICIT_STENCIL_H__ */