
examples/stencil: add an implicit distributed flavour of the stencil example

Marc Sergent 8 years ago
parent commit c79e7c2d23

+ 27 - 2
examples/stencil/Makefile.am

@@ -93,12 +93,14 @@ endif
 ###################
 
 STARPU_EXAMPLES =				\
-	stencil
+	stencil						\
+	implicit_stencil
 
 examplebindir = $(libdir)/starpu/examples/stencil
 
 examplebin_PROGRAMS =				\
-	stencil
+	stencil							\
+	implicit_stencil
 
 stencil_SOURCES =				\
 	life.c					\
@@ -123,6 +125,29 @@ stencil_SOURCES +=				\
 	shadow_opencl.c
 endif
 
+implicit_stencil_SOURCES =				\
+	life.c					\
+	implicit-stencil-kernels.c			\
+	implicit-stencil-tasks.c				\
+	implicit-stencil-blocks.c			\
+	implicit-stencil.c
+
+noinst_HEADERS =				\
+	implicit-stencil.h				\
+	shadow.h
+
+if STARPU_USE_CUDA
+implicit_stencil_SOURCES +=				\
+	life_cuda.cu				\
+	shadow.cu
+endif
+
+if STARPU_USE_OPENCL
+implicit_stencil_SOURCES +=				\
+	life_opencl.c				\
+	shadow_opencl.c
+endif
+
 outs =						\
 	0.5.out					\
 	0.out					\

+ 4 - 0
examples/stencil/README

@@ -42,3 +42,7 @@ penalty ratio), run make pics or make view to get pictures.
 mpi.out: results on MPI.
 
 results: a few results
+
+You can also use the implicit distributed flavour of this application, called
+implicit_stencil, in which the communications between processes are
+automatically inferred by StarPU-MPI.
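+
+For instance, to run it on two MPI processes (any MPI launcher will do; mpirun
+is only shown as an example):
+
+  mpirun -np 2 ./implicit_stencil -nbz 64 -niter 32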

+ 445 - 0
examples/stencil/implicit-stencil-blocks.c

@@ -0,0 +1,445 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013-2016  Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "implicit-stencil.h"
+#include <math.h>
+
+/* Manage block and tags allocation */
+
+static struct block_description *blocks;
+static unsigned sizex, sizey, sizez;
+static unsigned nbz;
+static unsigned *block_sizes_z;
+
+/*
+ *	Tags for the various codelet completions
+ */
+
+/*
+ * common tag format:
+ */
+static starpu_tag_t tag_common(int z, int dir, int type)
+{
+	return (((((starpu_tag_t)type) << 4) | ((dir+1)/2)) << 32)|(starpu_tag_t)z;
+}
+
+/* Completion of last update tasks */
+starpu_tag_t TAG_FINISH(int z)
+{
+	z = (z + nbz)%nbz;
+
+	starpu_tag_t tag = tag_common(z, 0, 1);
+	return tag;
+}
+
+/* Completion of the save codelet for MPI send/recv */
+starpu_tag_t TAG_START(int z, int dir)
+{
+	z = (z + nbz)%nbz;
+
+	starpu_tag_t tag = tag_common(z, dir, 2);
+	return tag;
+}
+
+/*
+ * common MPI tag format:
+ */
+static int mpi_tag_common(int z, int dir, int layer_or_boundary, int buffer)
+{
+	return (z<<12) | (layer_or_boundary << 8) | ((((1+dir)/2))<<4) | buffer;
+}
+
+int MPI_TAG_LAYERS(int z, int buffer)
+{
+	z = (z + nbz)%nbz;
+
+    /* No direction for layers; the layer_or_boundary field is 0 */
+	int tag = mpi_tag_common(z, 0, 0, buffer);
+
+	return tag;
+}
+
+int MPI_TAG_BOUNDARIES(int z, int dir, int buffer)
+{
+	z = (z + nbz)%nbz;
+
+	int tag = mpi_tag_common(z, dir, 1, buffer);
+
+	return tag;
+}
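+
+/* Example: with nbz > 3, MPI_TAG_BOUNDARIES(3, +1, 1) gives
+ * (3<<12) | (1<<8) | (1<<4) | 1 = 0x3111. */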
+
+
+/*
+ *	Block descriptors
+ */
+
+/* Compute the size of the different blocks */
+static void compute_block_sizes(void)
+{
+	block_sizes_z = (unsigned *) malloc(nbz*sizeof(unsigned));
+	STARPU_ASSERT(block_sizes_z);
+
+	/* Perhaps the last chunk is smaller */
+	unsigned default_block_size = (sizez+nbz-1)/nbz;
+	unsigned remaining = sizez;
+
+	unsigned b;
+	for (b = 0; b < nbz; b++)
+	{
+		block_sizes_z[b] = MIN(default_block_size, remaining);
+		remaining -= block_sizes_z[b];
+	}
+
+	STARPU_ASSERT(remaining == 0);
+}
+
+unsigned get_block_size(int bz)
+{
+	return block_sizes_z[bz];
+}
+
+struct block_description *get_block_description(int z)
+{
+	z = (z + nbz)%nbz;
+
+	STARPU_ASSERT(&blocks[z]);
+
+	return &blocks[z];
+}
+
+int get_block_mpi_node(int z)
+{
+	z = (z + nbz)%nbz;
+	return blocks[z].mpi_node;
+}
+
+void create_blocks_array(unsigned _sizex, unsigned _sizey, unsigned _sizez, unsigned _nbz)
+{
+	/* Store the parameters */
+	nbz = _nbz;
+	sizex = _sizex;
+	sizey = _sizey;
+	sizez = _sizez;
+
+	/* Create a grid of block descriptors */
+	blocks = (struct block_description *) calloc(nbz, sizeof(struct block_description));
+	STARPU_ASSERT(blocks);
+
+	/* What is the size of the different blocks ? */
+	compute_block_sizes();
+
+	unsigned bz;
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description * block =
+				get_block_description(bz);
+
+		/* Which block is it ? */
+		block->bz = bz;
+
+		/* For simplicity, we store which blocks are the neighbours */
+		block->boundary_blocks[B] = get_block_description((bz-1+nbz)%nbz);
+		block->boundary_blocks[T] = get_block_description((bz+1)%nbz);
+	}
+}
+
+void free_blocks_array()
+{
+	free(blocks);
+	free(block_sizes_z);
+}
+
+/*
+ *	Initialization of the blocks
+ */
+
+void assign_blocks_to_workers(int rank)
+{
+	unsigned bz;
+
+	/* NB: perhaps we could count a GPU as multiple workers */
+
+	/* how many workers are there ? */
+	/*unsigned nworkers = starpu_worker_get_count();*/
+
+	/* how many blocks are on that MPI node ? */
+	unsigned nblocks = 0;
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block =
+				get_block_description(bz);
+
+		if (block->mpi_node == rank)
+			nblocks++;
+	}
+
+	/* how many blocks per worker ? */
+	/*unsigned nblocks_per_worker = (nblocks + nworkers - 1)/nworkers;*/
+
+	/* we now attribute up to nblocks_per_worker blocks per worker */
+	unsigned attributed = 0;
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block =
+				get_block_description(bz);
+
+		if (block->mpi_node == rank)
+		{
+			unsigned workerid;
+			/* Manage initial block distribution between CPU and GPU */
+		#if 0
+			#if 1
+			/* GPUs then CPUs */
+			if (attributed < 3*18)
+				workerid = attributed / 18;
+			else
+				workerid = 3+ (attributed - 3*18) / 2;
+			#else
+			/* GPUs interleaved with CPUs */
+			if ((attributed % 20) <= 1)
+				workerid = 3 + attributed / 20;
+			else if (attributed < 60)
+				workerid = attributed / 20;
+			else
+				workerid = (attributed - 60)/2 + 6;
+			#endif
+		#else
+			/* Only GPUs */
+			workerid = (attributed / 21) % 3;
+		#endif
+			/*= attributed/nblocks_per_worker;*/
+
+			block->preferred_worker = workerid;
+
+			attributed++;
+		}
+	}
+}
+
+
+
+void assign_blocks_to_mpi_nodes(int world_size)
+{
+	unsigned nzblocks_per_process = (nbz + world_size - 1) / world_size;
+
+	unsigned bz;
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block =
+				get_block_description(bz);
+
+		block->mpi_node = bz / nzblocks_per_process;
+	}
+}
+
+static size_t allocated = 0;
+
+static void allocate_block_on_node(starpu_data_handle_t *handleptr, TYPE **ptr, unsigned nx, unsigned ny, unsigned nz)
+{
+	int ret;
+	size_t block_size = nx*ny*nz*sizeof(TYPE);
+
+	/* Allocate memory */
+#if 1
+	ret = starpu_malloc_flags((void **)ptr, block_size, STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	STARPU_ASSERT(ret == 0);
+#else
+	*ptr = malloc(block_size);
+	STARPU_ASSERT(*ptr);
+#endif
+
+	allocated += block_size;
+
+//#ifndef STARPU_SIMGRID
+//	/* Fill the blocks with 0 */
+//	memset(*ptr, 0, block_size);
+//#endif
+
+	/* Register it to StarPU */
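+	/* (leading dimensions: ldy = nx and ldz = nx*ny, since the block is
+	 * allocated as a single contiguous buffer) */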
+	starpu_block_data_register(handleptr, STARPU_MAIN_RAM, (uintptr_t)*ptr, nx, nx*ny, nx, ny, nz, sizeof(TYPE));
+}
+
+static void free_block_on_node(starpu_data_handle_t handleptr, unsigned nx, unsigned ny, unsigned nz)
+{
+	void *ptr = (void *) starpu_block_get_local_ptr(handleptr);
+	size_t block_size = nx*ny*nz*sizeof(TYPE);
+	starpu_free_flags(ptr, block_size, STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_data_unregister(handleptr);
+}
+
+void display_memory_consumption(int rank, double time)
+{
+	FPRINTF(stderr, "%lu B of memory were allocated on node %d in %f ms\n", (unsigned long) allocated, rank, time/1000);
+}
+
+void allocate_memory_on_node(int rank)
+{
+	unsigned bz;
+
+    /* Allocate and register all the data handles with StarPU. */
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block = get_block_description(bz);
+		int node = block->mpi_node;
+		unsigned size_bz = block_sizes_z[bz];
+
+		if (node == rank)
+		{
+            /* Main blocks */
+			allocate_block_on_node(&block->layers_handle[0], &block->layers[0],
+						(sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
+			allocate_block_on_node(&block->layers_handle[1], &block->layers[1],
+						(sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
+
+            /* Boundary blocks : Top */
+			allocate_block_on_node(&block->boundaries_handle[T][0], &block->boundaries[T][0],
+						(sizex + 2*K), (sizey + 2*K), K);
+			allocate_block_on_node(&block->boundaries_handle[T][1], &block->boundaries[T][1],
+						(sizex + 2*K), (sizey + 2*K), K);
+
+            /* Boundary blocks : Bottom */
+			allocate_block_on_node(&block->boundaries_handle[B][0], &block->boundaries[B][0],
+						(sizex + 2*K), (sizey + 2*K), K);
+			allocate_block_on_node(&block->boundaries_handle[B][1], &block->boundaries[B][1],
+						(sizex + 2*K), (sizey + 2*K), K);
+		}
+
+        /* Register placeholder blocks (not allocated locally) to StarPU;
+         * StarPU-MPI will fetch their contents from the neighbour nodes when
+         * they are needed for the local computation. */
+        else
+        {
+            /* Main blocks */
+            starpu_block_data_register(&block->layers_handle[0], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), (size_bz + 2*K), sizeof(TYPE));
+            starpu_block_data_register(&block->layers_handle[1], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), (size_bz + 2*K), sizeof(TYPE));
+
+            /* Boundary blocks : Top */
+            starpu_block_data_register(&block->boundaries_handle[T][0], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), K, sizeof(TYPE));
+            starpu_block_data_register(&block->boundaries_handle[T][1], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), K, sizeof(TYPE));
+        
+            /* Boundary blocks : Bottom */
+            starpu_block_data_register(&block->boundaries_handle[B][0], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), K, sizeof(TYPE));
+            starpu_block_data_register(&block->boundaries_handle[B][1], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), K, sizeof(TYPE));
+        }
+
+
+        /* Register all data to StarPU-MPI, even the ones that are not
+         * allocated on the local node. */
+
+        /* Main blocks */
+        starpu_mpi_data_register(block->layers_handle[0], MPI_TAG_LAYERS(bz, 0), node);
+        starpu_mpi_data_register(block->layers_handle[1], MPI_TAG_LAYERS(bz, 1), node);
+
+        /* Boundary blocks : Top */
+        starpu_mpi_data_register(block->boundaries_handle[T][0], MPI_TAG_BOUNDARIES(bz, T, 0), node);
+        starpu_mpi_data_register(block->boundaries_handle[T][1], MPI_TAG_BOUNDARIES(bz, T, 1), node);
+
+        /* Boundary blocks : Bottom */
+        starpu_mpi_data_register(block->boundaries_handle[B][0], MPI_TAG_BOUNDARIES(bz, B, 0), node);
+        starpu_mpi_data_register(block->boundaries_handle[B][1], MPI_TAG_BOUNDARIES(bz, B, 1), node);
+	}
+
+    /* Initialize all the data in parallel */
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block = get_block_description(bz);
+		int node = block->mpi_node;
+		unsigned size_bz = block_sizes_z[bz];
+
+		if (node == rank)
+		{
+            /* Set all the data to 0 */
+            create_task_memset(sizex, sizey, bz);
+
+            /* Initialize the first layer with some random data */
+            create_task_initlayer(sizex, sizey, bz);
+		}
+	}
+    starpu_task_wait_for_all();
+}
+
+void free_memory_on_node(int rank)
+{
+	unsigned bz;
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block = get_block_description(bz);
+
+		int node = block->mpi_node;
+
+		/* Main blocks */
+		if (node == rank)
+		{
+			free_block_on_node(block->layers_handle[0], (sizex + 2*K), (sizey + 2*K), (get_block_size(bz) + 2*K));
+			free_block_on_node(block->layers_handle[1], (sizex + 2*K), (sizey + 2*K), (get_block_size(bz) + 2*K));
+		}
+        else
+        {
+            starpu_data_unregister(block->layers_handle[0]);
+            starpu_data_unregister(block->layers_handle[1]);
+        }
+
+		/* Boundary blocks : Top */
+		if (node == rank)
+		{
+			free_block_on_node(block->boundaries_handle[T][0], (sizex + 2*K), (sizey + 2*K), K);
+			free_block_on_node(block->boundaries_handle[T][1], (sizex + 2*K), (sizey + 2*K), K);
+		}
+        else
+        {
+            starpu_data_unregister(block->boundaries_handle[T][0]);
+            starpu_data_unregister(block->boundaries_handle[T][1]);
+        }
+
+		/* Boundary blocks : Bottom */
+		if (node == rank)
+		{
+			free_block_on_node(block->boundaries_handle[B][0], (sizex + 2*K), (sizey + 2*K), K);
+			free_block_on_node(block->boundaries_handle[B][1], (sizex + 2*K), (sizey + 2*K), K);
+		}
+        else
+        {
+            starpu_data_unregister(block->boundaries_handle[B][0]);
+            starpu_data_unregister(block->boundaries_handle[B][1]);
+        }
+	}
+}
+
+/* check how many cells are alive */
+void check(int rank)
+{
+	unsigned bz;
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block = get_block_description(bz);
+
+		int node = block->mpi_node;
+
+		/* Main blocks */
+		if (node == rank)
+		{
+			unsigned size_bz = block_sizes_z[bz];
+#ifdef LIFE
+			unsigned x, y, z;
+			unsigned sum = 0;
+			for (x = 0; x < sizex; x++)
+				for (y = 0; y < sizey; y++)
+					for (z = 0; z < size_bz; z++)
+						sum += block->layers[0][(K+x)+(K+y)*(sizex + 2*K)+(K+z)*(sizex+2*K)*(sizey+2*K)];
+			printf("block %u got %u/%u alive\n", bz, sum, sizex*sizey*size_bz);
+#endif
+		}
+	}
+}

+ 769 - 0
examples/stencil/implicit-stencil-kernels.c

@@ -0,0 +1,769 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2015  Université de Bordeaux
+ * Copyright (C) 2012, 2013, 2016  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "implicit-stencil.h"
+
+/* Computation Kernels */
+
+/*
+ * There are three codelets:
+ *
+ * - cl_update, which takes a block and the boundaries of its neighbours, loads
+ *   the boundaries into the block and performs some update loops:
+ *
+ *     comp. buffer      save. buffers        comp. buffer        save. buffers        comp. buffer
+ *   |     ...     |                                                                                      
+ *   |             | +------------------+ +------------------+                                            
+ *   |     #N+1    | | #N+1 bottom copy====>#N+1 bottom copy |                                            
+ *   +-------------+ +------------------+ +------------------+                                            
+ *   | #N top copy | |   #N top copy    | |                  |                                            
+ *   +-------------+ +------------------+ |                  |                                            
+ *                                        | #N               |                                            
+ *                                                 ...                                                    
+ *                                        |                  | +----------------+ +----------------------+
+ *                                        |                  | | #N bottom copy | | block #N bottom copy |
+ * ^                                      +------------------+ +----------------+ +----------------------+
+ * |                                      | #N-1 top copy   <====#N-1 top copy  | |  block #N-1          |
+ * |                                      +------------------+ +----------------+ |                      |
+ * Z                                                                                     ...
+ *
+ * - save_cl_top, which takes a block and its top boundary, and saves the top of
+ *   the block into the boundary (to be given as bottom of the neighbour above
+ *   this block).
+ *
+ *     comp. buffer      save. buffers        comp. buffer        save. buffers        comp. buffer
+ *   |     ...     |                                                                                      
+ *   |             | +------------------+ +------------------+                                            
+ *   |     #N+1    | | #N+1 bottom copy | | #N+1 bottom copy |                                            
+ *   +-------------+ +------------------+ +------------------+                                            
+ *   | #N top copy | |   #N top copy   <====                 |                                            
+ *   +-------------+ +------------------+ |..................|                                            
+ *                                        | #N               |                                            
+ *                                                 ...                                                    
+ *                                        |                  | +----------------+ +----------------------+
+ *                                        |                  | | #N bottom copy | | block #N bottom copy |
+ * ^                                      +------------------+ +----------------+ +----------------------+
+ * |                                      | #N-1 top copy    | | #N-1 top copy  | |  block #N-1          |
+ * |                                      +------------------+ +----------------+ |                      |
+ * Z                                                                                     ...
+ *
+ * - save_cl_bottom, which does the same for the bottom:
+ *     comp. buffer      save. buffers        comp. buffer        save. buffers        comp. buffer
+ *   |     ...     |                                                                                      
+ *   |             | +------------------+ +------------------+                                            
+ *   |     #N+1    | | #N+1 bottom copy | | #N+1 bottom copy |                                            
+ *   +-------------+ +------------------+ +------------------+                                            
+ *   | #N top copy | |   #N top copy    | |                  |                                            
+ *   +-------------+ +------------------+ |                  |                                            
+ *                                        | #N               |                                            
+ *                                                 ...                                                    
+ *                                        |..................| +----------------+ +----------------------+
+ *                                        |                 ====>#N bottom copy | | block #N bottom copy |
+ * ^                                      +------------------+ +----------------+ +----------------------+
+ * |                                      | #N-1 top copy    | | #N-1 top copy  | |  block #N-1          |
+ * |                                      +------------------+ +----------------+ |                      |
+ * Z                                                                                     ...
+ *
+ * The idea is that the computation buffers thus don't have to move: only their
+ * boundaries are copied to buffers that do move (be it between CPU and GPU,
+ * between GPUs, or over MPI).
+ *
+ * For each of the buffers above, there are two (0/1) buffers to make new/old switch costless.
+ */
+
+#if 0
+# define DEBUG(fmt, ...) fprintf(stderr,fmt,##__VA_ARGS__)
+#else
+# define DEBUG(fmt, ...) (void) 0
+#endif
+
+/* Record which GPU ran which block, for nice pictures */
+int who_runs_what_len;
+int *who_runs_what;
+int *who_runs_what_index;
+double *last_tick;
+
+/* Achieved iterations */
+static int achieved_iter;
+
+/* Record how many updates each worker performed */
+unsigned update_per_worker[STARPU_NMAXWORKERS];
+
+static void record_who_runs_what(struct block_description *block)
+{
+	double now, now2, diff, delta = get_ticks() * 1000;
+	int workerid = starpu_worker_get_id_check();
+
+	now = starpu_timing_now();
+	now2 = now - start;
+	diff = now2 - last_tick[block->bz];
+	while (diff >= delta)
+	{
+		last_tick[block->bz] += delta;
+		diff = now2 - last_tick[block->bz];
+		if (who_runs_what_index[block->bz] < who_runs_what_len)
+			who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = -1;
+	}
+
+	if (who_runs_what_index[block->bz] < who_runs_what_len)
+		who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid);
+}
+
+static void check_load(struct starpu_block_interface *block, struct starpu_block_interface *boundary)
+{
+	/* Sanity checks */
+	STARPU_ASSERT(block->nx == boundary->nx);
+	STARPU_ASSERT(block->ny == boundary->ny);
+	STARPU_ASSERT(boundary->nz == K);
+
+	/* NB: this is not fully guaranteed ... but it's *very* likely and that
+	 * makes our life much simpler */
+	STARPU_ASSERT(block->ldy == boundary->ldy);
+	STARPU_ASSERT(block->ldz == boundary->ldz);
+}
+
+/*
+ * Load a neighbour's boundary into block, CPU version
+ */
+static void load_subblock_from_buffer_cpu(void *_block,
+					void *_boundary,
+					unsigned firstz)
+{
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
+	check_load(block, boundary);
+
+	/* We do a contiguous memory transfer */
+	size_t boundary_size = K*block->ldz*block->elemsize;
+
+	unsigned offset = firstz*block->ldz;
+	TYPE *block_data = (TYPE *)block->ptr;
+	TYPE *boundary_data = (TYPE *)boundary->ptr;
+	memcpy(&block_data[offset], boundary_data, boundary_size);
+}
+
+/*
+ * Load a neighbour's boundary into block, CUDA version
+ */
+#ifdef STARPU_USE_CUDA
+static void load_subblock_from_buffer_cuda(void *_block,
+					void *_boundary,
+					unsigned firstz)
+{
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
+	check_load(block, boundary);
+
+	/* We do a contiguous memory transfer */
+	size_t boundary_size = K*block->ldz*block->elemsize;
+
+	unsigned offset = firstz*block->ldz;
+	TYPE *block_data = (TYPE *)block->ptr;
+	TYPE *boundary_data = (TYPE *)boundary->ptr;
+	cudaMemcpyAsync(&block_data[offset], boundary_data, boundary_size, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+}
+
+/*
+ * cl_update (CUDA version)
+ */
+static void update_func_cuda(void *descr[], void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+	if (block->bz == 0)
+		FPRINTF(stderr,"!!! DO update_func_cuda z %u CUDA%d !!!\n", block->bz, workerid);
+	else
+		DEBUG( "!!! DO update_func_cuda z %u CUDA%d !!!\n", block->bz, workerid);
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	int rank = 0;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	DEBUG( "!!!           RANK %d              !!!\n", rank);
+#endif
+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+
+	unsigned block_size_z = get_block_size(block->bz);
+	unsigned i;
+	update_per_worker[workerid]++;
+
+	record_who_runs_what(block);
+
+	/*
+	 *	Load neighbours' boundaries : TOP
+	 */
+
+	/* The offset along the z axis is (block_size_z + K) */
+	load_subblock_from_buffer_cuda(descr[0], descr[2], block_size_z+K);
+	load_subblock_from_buffer_cuda(descr[1], descr[3], block_size_z+K);
+
+	/*
+	 *	Load neighbours' boundaries : BOTTOM
+	 */
+	load_subblock_from_buffer_cuda(descr[0], descr[4], 0);
+	load_subblock_from_buffer_cuda(descr[1], descr[5], 0);
+
+	/*
+	 *	Stencils ... do the actual work here :) TODO
+	 */
+
+	for (i=1; i<=K; i++)
+	{
+		struct starpu_block_interface *oldb = descr[i%2], *newb = descr[(i+1)%2];
+		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
+
+		/* Shadow data */
+		cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+
+		/* And perform actual computation */
+#ifdef LIFE
+		cuda_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+#else
+		cudaMemcpyAsync(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+#endif /* LIFE */
+	}
+
+	if (block->bz == 0)
+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
+}
+#endif /* STARPU_USE_CUDA */
+
+/*
+ * Load a neighbour's boundary into block, OpenCL version
+ */
+#ifdef STARPU_USE_OPENCL
+static void load_subblock_from_buffer_opencl(struct starpu_block_interface *block,
+					struct starpu_block_interface *boundary,
+					unsigned firstz)
+{
+	check_load(block, boundary);
+
+	/* We do a contiguous memory transfer */
+	size_t boundary_size = K*block->ldz*block->elemsize;
+
+	unsigned offset = firstz*block->ldz;
+	cl_mem block_data = (cl_mem)block->dev_handle;
+	cl_mem boundary_data = (cl_mem)boundary->dev_handle;
+
+        cl_command_queue cq;
+        starpu_opencl_get_current_queue(&cq);
+        cl_int ret = clEnqueueCopyBuffer(cq, boundary_data, block_data, 0, offset, boundary_size, 0, NULL, NULL);
+	if (ret != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(ret);
+}
+
+/*
+ * cl_update (OpenCL version)
+ */
+static void update_func_opencl(void *descr[], void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+	if (block->bz == 0)
+		FPRINTF(stderr,"!!! DO update_func_opencl z %u OPENCL%d !!!\n", block->bz, workerid);
+	else
+		DEBUG( "!!! DO update_func_opencl z %u OPENCL%d !!!\n", block->bz, workerid);
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	int rank = 0;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	DEBUG( "!!!           RANK %d              !!!\n", rank);
+#endif
+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+
+	unsigned block_size_z = get_block_size(block->bz);
+	unsigned i;
+	update_per_worker[workerid]++;
+
+	record_who_runs_what(block);
+
+        cl_command_queue cq;
+        starpu_opencl_get_current_queue(&cq);
+
+	/*
+	 *	Load neighbours' boundaries : TOP
+	 */
+
+	/* The offset along the z axis is (block_size_z + K) */
+	load_subblock_from_buffer_opencl(descr[0], descr[2], block_size_z+K);
+	load_subblock_from_buffer_opencl(descr[1], descr[3], block_size_z+K);
+
+	/*
+	 *	Load neighbours' boundaries : BOTTOM
+	 */
+	load_subblock_from_buffer_opencl(descr[0], descr[4], 0);
+	load_subblock_from_buffer_opencl(descr[1], descr[5], 0);
+
+	/*
+	 *	Stencils ... do the actual work here :) TODO
+	 */
+
+	for (i=1; i<=K; i++)
+	{
+		struct starpu_block_interface *oldb = descr[i%2], *newb = descr[(i+1)%2];
+		TYPE *old = (void*) oldb->dev_handle, *newer = (void*) newb->dev_handle;
+
+		/* Shadow data */
+		opencl_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+
+		/* And perform actual computation */
+#ifdef LIFE
+		opencl_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+#else
+		cl_event event;
+                cl_int ret = clEnqueueCopyBuffer(cq, old, newer, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), 0, NULL, &event);
+		if (ret != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(ret);
+
+#endif /* LIFE */
+	}
+
+	if (block->bz == 0)
+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
+}
+#endif /* STARPU_USE_OPENCL */
+
+/*
+ * cl_update (CPU version)
+ */
+void update_func_cpu(void *descr[], void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+	if (block->bz == 0)
+		DEBUG("!!! DO update_func_cpu z %u CPU%d !!!\n", block->bz, workerid);
+	else
+		DEBUG("!!! DO update_func_cpu z %u CPU%d !!!\n", block->bz, workerid);
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	int rank = 0;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	DEBUG( "!!!           RANK %d            !!!\n", rank);
+#endif
+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+
+	unsigned block_size_z = get_block_size(block->bz);
+	unsigned i;
+	update_per_worker[workerid]++;
+
+	record_who_runs_what(block);
+
+	/*
+	 *	Load neighbours' boundaries : TOP
+	 */
+
+	/* The offset along the z axis is (block_size_z + K) */
+	load_subblock_from_buffer_cpu(descr[0], descr[2], block_size_z+K);
+	load_subblock_from_buffer_cpu(descr[1], descr[3], block_size_z+K);
+
+	/*
+	 *	Load neighbours' boundaries : BOTTOM
+	 */
+	load_subblock_from_buffer_cpu(descr[0], descr[4], 0);
+	load_subblock_from_buffer_cpu(descr[1], descr[5], 0);
+
+	/*
+	 *	Stencils ... do the actual work here :) TODO
+	 */
+
+	for (i=1; i<=K; i++)
+	{
+		struct starpu_block_interface *oldb = (struct starpu_block_interface *) descr[i%2], *newb = (struct starpu_block_interface *) descr[(i+1)%2];
+		TYPE *old = (TYPE*) oldb->ptr, *newer = (TYPE*) newb->ptr;
+
+		/* Shadow data */
+		unsigned ldy = oldb->ldy, ldz = oldb->ldz;
+		unsigned nx = oldb->nx, ny = oldb->ny, nz = oldb->nz;
+		unsigned x, y, z;
+		unsigned stepx = 1;
+		unsigned stepy = 1;
+		unsigned stepz = 1;
+		unsigned idx = 0;
+		unsigned idy = 0;
+		unsigned idz = 0;
+		TYPE *ptr = old;
+
+#		include "shadow.h"
+
+		/* And perform actual computation */
+#ifdef LIFE
+		life_update(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+#else
+		memcpy(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer));
+#endif /* LIFE */
+	}
+
+	if (block->bz == 0)
+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
+}
+
+/* Performance model and codelet structure */
+static struct starpu_perfmodel cl_update_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "cl_update" 
+};
+
+struct starpu_codelet cl_update =
+{
+	.cpu_funcs = {update_func_cpu},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {update_func_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {update_func_opencl},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
+#endif
+	.model = &cl_update_model,
+	.nbuffers = 6,
+	.modes = {STARPU_RW, STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
+};
+
+/*
+ * Save the block internal boundaries to give them to our neighbours.
+ */
+
+/* CPU version */
+static void load_subblock_into_buffer_cpu(void *_block,
+					void *_boundary,
+					unsigned firstz)
+{
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
+	check_load(block, boundary);
+
+	/* We do a contiguous memory transfer */
+	size_t boundary_size = K*block->ldz*block->elemsize;
+
+	unsigned offset = firstz*block->ldz;
+	TYPE *block_data = (TYPE *)block->ptr;
+	TYPE *boundary_data = (TYPE *)boundary->ptr;
+	memcpy(boundary_data, &block_data[offset], boundary_size);
+}
+
+/* CUDA version */
+#ifdef STARPU_USE_CUDA
+static void load_subblock_into_buffer_cuda(void *_block,
+					void *_boundary,
+					unsigned firstz)
+{
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
+	check_load(block, boundary);
+
+	/* We do a contiguous memory transfer */
+	size_t boundary_size = K*block->ldz*block->elemsize;
+
+	unsigned offset = firstz*block->ldz;
+	TYPE *block_data = (TYPE *)block->ptr;
+	TYPE *boundary_data = (TYPE *)boundary->ptr;
+	cudaMemcpyAsync(boundary_data, &block_data[offset], boundary_size, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+}
+#endif /* STARPU_USE_CUDA */
+
+/* OPENCL version */
+#ifdef STARPU_USE_OPENCL
+static void load_subblock_into_buffer_opencl(struct starpu_block_interface *block,
+					struct starpu_block_interface *boundary,
+					unsigned firstz)
+{
+	check_load(block, boundary);
+
+	/* We do a contiguous memory transfer */
+	size_t boundary_size = K*block->ldz*block->elemsize;
+
+	unsigned offset = firstz*block->ldz;
+	cl_mem block_data = (cl_mem)block->dev_handle;
+	cl_mem boundary_data = (cl_mem)boundary->dev_handle;
+
+        cl_command_queue cq;
+        starpu_opencl_get_current_queue(&cq);
+
+        cl_int ret = clEnqueueCopyBuffer(cq, block_data, boundary_data, offset, 0, boundary_size, 0, NULL, NULL);
+	if (ret != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(ret);
+}
+#endif /* STARPU_USE_OPENCL */
+
+/* Record how many top/bottom saves each worker performed */
+unsigned top_per_worker[STARPU_NMAXWORKERS];
+unsigned bottom_per_worker[STARPU_NMAXWORKERS];
+
+/* top save, CPU version */
+void dummy_func_top_cpu(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	top_per_worker[workerid]++;
+
+	DEBUG( "DO SAVE Bottom block %d\n", block->bz);
+
+	/* The offset along the z axis is (block_size_z + K)- K */
+	unsigned block_size_z = get_block_size(block->bz);
+
+	load_subblock_into_buffer_cpu(descr[0], descr[2], block_size_z);
+	load_subblock_into_buffer_cpu(descr[1], descr[3], block_size_z);
+}
+
+/* bottom save, CPU version */
+void dummy_func_bottom_cpu(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+    STARPU_ASSERT(block);
+
+	int workerid = starpu_worker_get_id_check();
+	bottom_per_worker[workerid]++;
+
+	DEBUG( "DO SAVE Top block %d\n", block->bz);
+
+	load_subblock_into_buffer_cpu(descr[0], descr[2], K);
+	load_subblock_into_buffer_cpu(descr[1], descr[3], K);
+}
+
+/* top save, CUDA version */
+#ifdef STARPU_USE_CUDA
+static void dummy_func_top_cuda(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	top_per_worker[workerid]++;
+
+	DEBUG( "DO SAVE Top block %d\n", block->bz);
+
+	/* The offset along the z axis is (block_size_z + K)- K */
+	unsigned block_size_z = get_block_size(block->bz);
+
+	load_subblock_into_buffer_cuda(descr[0], descr[2], block_size_z);
+	load_subblock_into_buffer_cuda(descr[1], descr[3], block_size_z);
+}
+
+/* bottom save, CUDA version */
+static void dummy_func_bottom_cuda(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	bottom_per_worker[workerid]++;
+
+	DEBUG( "DO SAVE Bottom block %d on CUDA\n", block->bz);
+
+	load_subblock_into_buffer_cuda(descr[0], descr[2], K);
+	load_subblock_into_buffer_cuda(descr[1], descr[3], K);
+}
+#endif /* STARPU_USE_CUDA */
+
+/* top save, OpenCL version */
+#ifdef STARPU_USE_OPENCL
+static void dummy_func_top_opencl(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	top_per_worker[workerid]++;
+
+	DEBUG( "DO SAVE Top block %d\n", block->bz);
+
+	/* The offset along the z axis is (block_size_z + K)- K */
+	unsigned block_size_z = get_block_size(block->bz);
+
+	load_subblock_into_buffer_opencl(descr[0], descr[2], block_size_z);
+	load_subblock_into_buffer_opencl(descr[1], descr[3], block_size_z);
+}
+
+/* bottom save, OPENCL version */
+static void dummy_func_bottom_opencl(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	bottom_per_worker[workerid]++;
+
+	DEBUG( "DO SAVE Bottom block %d on OPENCL\n", block->bz);
+
+	load_subblock_into_buffer_opencl(descr[0], descr[2], K);
+	load_subblock_into_buffer_opencl(descr[1], descr[3], K);
+}
+#endif /* STARPU_USE_OPENCL */
+
+/* Performance models and codelet for save */
+static struct starpu_perfmodel save_cl_bottom_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "save_cl_bottom" 
+};
+
+static struct starpu_perfmodel save_cl_top_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "save_cl_top" 
+};
+
+struct starpu_codelet save_cl_bottom =
+{
+	.cpu_funcs = {dummy_func_bottom_cpu},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {dummy_func_bottom_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {dummy_func_bottom_opencl},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
+#endif
+	.model = &save_cl_bottom_model,
+	.nbuffers = 4,
+	.modes = {STARPU_R, STARPU_R, STARPU_W, STARPU_W}
+};
+
+struct starpu_codelet save_cl_top =
+{
+	.cpu_funcs = {dummy_func_top_cpu},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {dummy_func_top_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {dummy_func_top_opencl},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
+#endif
+	.model = &save_cl_top_model,
+	.nbuffers = 4,
+	.modes = {STARPU_R, STARPU_R, STARPU_W, STARPU_W}
+};
+
+/* Memset a block's buffers */
+static void memset_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned sizex, sizey, bz;
+    starpu_codelet_unpack_args(arg, &sizex, &sizey, &bz);
+	struct block_description *block = get_block_description(bz);
+    unsigned size_bz = get_block_size(bz);
+
+    unsigned x,y,z;
+    for (x = 0; x < sizex + 2*K; x++)
+    {
+        for (y = 0; y < sizey + 2*K; y++)
+        {
+            /* Main blocks */
+            for (z = 0; z < size_bz + 2*K; z++)
+            {
+                block->layers[0][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0; 
+                block->layers[1][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0; 
+            }
+            for (z = 0; z < K; z++)
+            {
+                /* Boundary blocks : Top */
+                block->boundaries[T][0][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0; 
+                block->boundaries[T][1][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0; 
+
+                /* Boundary blocks : Bottom */
+                block->boundaries[B][0][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0; 
+                block->boundaries[B][1][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0; 
+            }
+
+        }
+    }
+    //memset(block->layers[0], 0, (sizex + 2*K)*(sizey + 2*K)*(size_bz + 2*K)*sizeof(block->layers[0]));
+    //memset(block->layers[1], 0, (sizex + 2*K)*(sizey + 2*K)*(size_bz + 2*K)*sizeof(block->layers[1]));
+
+    //memset(block->boundaries[T][0], 0, (sizex + 2*K)*(sizey + 2*K)*K*sizeof(block->boundaries[T][0]));
+    //memset(block->boundaries[T][1], 0, (sizex + 2*K)*(sizey + 2*K)*K*sizeof(block->boundaries[T][1]));
+
+    //memset(block->boundaries[B][0], 0, (sizex + 2*K)*(sizey + 2*K)*K*sizeof(block->boundaries[B][0]));
+    //memset(block->boundaries[B][1], 0, (sizex + 2*K)*(sizey + 2*K)*K*sizeof(block->boundaries[B][1]));
+}
+
+static double memset_cost_function(struct starpu_task *task, unsigned nimpl)
+{
+	(void) task;
+	(void) nimpl;
+	return 0.000001;
+}
+
+static struct starpu_perfmodel memset_model =
+{
+	.type = STARPU_COMMON,
+	.cost_function = memset_cost_function,
+	.symbol = "memset"
+};
+
+struct starpu_codelet cl_memset =
+{
+	.cpu_funcs = {memset_func},
+	.model = &memset_model,
+	.nbuffers = 6,
+	.modes = {STARPU_W, STARPU_W, STARPU_W, STARPU_W, STARPU_W, STARPU_W}
+};
+
+/* Initialize a block's layer */
+static void initlayer_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned sizex, sizey, bz;
+    starpu_codelet_unpack_args(arg, &sizex, &sizey, &bz);
+	struct block_description *block = get_block_description(bz);
+    unsigned size_bz = get_block_size(bz);
+
+    /* Initialize layer with some random data */
+    unsigned x, y, z;
+    unsigned sum = 0;
+    for (x = 0; x < sizex; x++)
+        for (y = 0; y < sizey; y++)
+            for (z = 0; z < size_bz; z++)
+                sum += block->layers[0][(K+x)+(K+y)*(sizex + 2*K)+(K+z)*(sizex+2*K)*(sizey+2*K)] = (int)((x/7.+y/13.+(bz*size_bz + z)/17.) * 10.) % 2;
+}
+
+static double initlayer_cost_function(struct starpu_task *task, unsigned nimpl)
+{
+	(void) task;
+	(void) nimpl;
+	return 0.000001;
+}
+
+static struct starpu_perfmodel initlayer_model =
+{
+	.type = STARPU_COMMON,
+	.cost_function = initlayer_cost_function,
+	.symbol = "initlayer"
+};
+
+struct starpu_codelet cl_initlayer =
+{
+	.cpu_funcs = {initlayer_func},
+	.model = &initlayer_model,
+	.nbuffers = 1,
+	.modes = {STARPU_W}
+};
+

+ 203 - 0
examples/stencil/implicit-stencil-tasks.c

@@ -0,0 +1,203 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013-2015  Université de Bordeaux
+ * Copyright (C) 2012, 2013, 2015  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "implicit-stencil.h"
+
+#define BIND_LAST 1
+
+/*
+ * Schedule tasks for updates and saves
+ */
+
+/*
+ * NB: iter = 0: initialization phase, TAG_U(z, 0) = TAG_INIT
+ *
+ * dir is -1 or +1.
+ */
+
+#if 0
+# define DEBUG(fmt, ...) fprintf(stderr,fmt,##__VA_ARGS__)
+#else
+# define DEBUG(fmt, ...)
+#endif
+ 
+#define starpu_insert_task(...) starpu_mpi_insert_task(MPI_COMM_WORLD, __VA_ARGS__)
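+
+/* starpu_mpi_insert_task() looks at which MPI node owns each piece of data
+ * accessed by a task, runs the task on the node that owns the data written
+ * to, and automatically posts the required send/receive communications, so
+ * no explicit MPI calls are needed in this implicit flavour. */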
+
+/* 
+ * Schedule initialization tasks 
+ */
+
+void create_task_memset(unsigned sizex, unsigned sizey, unsigned z)
+{
+	struct block_description *descr = get_block_description(z);
+    struct starpu_codelet *codelet = &cl_memset;
+
+    int ret = starpu_insert_task(
+            codelet,
+            STARPU_VALUE,   &sizex,  sizeof(unsigned),
+            STARPU_VALUE,   &sizey,  sizeof(unsigned),
+            STARPU_VALUE,   &z,  sizeof(unsigned),
+            STARPU_W,   descr->layers_handle[0],
+            STARPU_W,   descr->layers_handle[1],
+            STARPU_W,   descr->boundaries_handle[T][0],
+            STARPU_W,   descr->boundaries_handle[T][1],
+            STARPU_W,   descr->boundaries_handle[B][0],
+            STARPU_W,   descr->boundaries_handle[B][1],
+                0);
+
+    if (ret)
+    {
+        FPRINTF(stderr, "Could not submit task save: %d\n", ret);
+        if (ret == -ENODEV)
+            exit(77);
+        STARPU_ABORT();
+    }
+}
+
+void create_task_initlayer(unsigned sizex, unsigned sizey, unsigned z)
+{
+	struct block_description *descr = get_block_description(z);
+    struct starpu_codelet *codelet = &cl_initlayer;
+
+    int ret = starpu_insert_task(
+            codelet,
+            STARPU_VALUE,   &sizex,  sizeof(unsigned),
+            STARPU_VALUE,   &sizey,  sizeof(unsigned),
+            STARPU_VALUE,   &z,  sizeof(unsigned),
+            STARPU_W,   descr->layers_handle[0],
+                0);
+
+    if (ret)
+    {
+        FPRINTF(stderr, "Could not submit task save: %d\n", ret);
+        if (ret == -ENODEV)
+            exit(77);
+        STARPU_ABORT();
+    }
+}
+
+/*
+ * Schedule saving boundaries of blocks to communication buffers
+ */
+
+static void create_task_save_local(unsigned iter, unsigned z, int dir, int local_rank)
+{
+	struct block_description *descr = get_block_description(z);
+	int node_z = get_block_mpi_node(z);
+	int node_z_and_d = get_block_mpi_node(z+dir);
+    struct starpu_codelet *codelet;
+    int ret;
+
+    codelet = (dir == -1)?&save_cl_bottom:&save_cl_top;
+    ret = starpu_insert_task(
+            codelet,
+            STARPU_VALUE,   &z,  sizeof(unsigned),
+            STARPU_R,   descr->layers_handle[0],
+            STARPU_R,   descr->layers_handle[1],
+            STARPU_W,   descr->boundaries_handle[(1-dir)/2][0],
+            STARPU_W,   descr->boundaries_handle[(1-dir)/2][1],
+            STARPU_PRIORITY,    STARPU_MAX_PRIO,
+                0);
+
+    if (ret)
+    {
+        FPRINTF(stderr, "Could not submit task save: %d\n", ret);
+        if (ret == -ENODEV)
+            exit(77);
+        STARPU_ABORT();
+    }
+}
+
+/*
+ * Schedule update computation in computation buffer
+ */
+
+void create_task_update(unsigned iter, unsigned z, int local_rank)
+{
+	STARPU_ASSERT(iter != 0);
+
+	unsigned old_layer = (K*(iter-1)) % 2;
+	unsigned new_layer = (old_layer + 1) % 2;
+
+	struct block_description *descr = get_block_description(z);
+	struct block_description *bottom_neighbour = descr->boundary_blocks[B];
+	struct block_description *top_neighbour = descr->boundary_blocks[T];
+
+	struct starpu_codelet *codelet = &cl_update;
+
+    // Single-level prio
+    //int prio = ((bottom_neighbour->mpi_node != local_rank) || (top_neighbour->mpi_node != local_rank )) ? STARPU_MAX_PRIO : STARPU_DEFAULT_PRIO;
+
+    // Two-level prio
+    int prio = ((bottom_neighbour->mpi_node != local_rank) || (top_neighbour->mpi_node != local_rank )) ? STARPU_MAX_PRIO :
+               ((bottom_neighbour->boundary_blocks[B]->mpi_node != local_rank) || (top_neighbour->boundary_blocks[T]->mpi_node != local_rank )) ? STARPU_MAX_PRIO-1 : STARPU_DEFAULT_PRIO;
+
+    int ret = starpu_insert_task(
+            codelet,
+            STARPU_VALUE,   &z,  sizeof(unsigned),
+	        STARPU_RW,      descr->layers_handle[old_layer],
+	        STARPU_RW,      descr->layers_handle[new_layer],
+	        STARPU_R,       bottom_neighbour->boundaries_handle[T][old_layer],
+	        STARPU_R,       bottom_neighbour->boundaries_handle[T][new_layer],
+	        STARPU_R,       top_neighbour->boundaries_handle[B][old_layer],
+	        STARPU_R,       top_neighbour->boundaries_handle[B][new_layer],
+            STARPU_PRIORITY,    prio,
+                0);
+	if (ret)
+	{
+		FPRINTF(stderr, "Could not submit task update block: %d\n", ret);
+		if (ret == -ENODEV)
+			exit(77);
+		STARPU_ABORT();
+	}
+}
+
+/*
+ * Create all the tasks
+ */
+void create_tasks(int rank)
+{
+	int iter;
+	int bz;
+	int niter = get_niter();
+	int nbz = get_nbz();
+
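+	/* With starpu_mpi_insert_task(), every MPI process that owns a piece
+	 * of data accessed by a task has to submit that task, hence the checks
+	 * on the MPI node of each block and of its two neighbours below. */
+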
+	for (iter = 0; iter <= niter; iter++)
+	{
+	     for (bz = 0; bz < nbz; bz++)
+	     {
+		    if ((iter > 0) && ((get_block_mpi_node(bz) == rank)|| (get_block_mpi_node(bz+1) == rank)|| (get_block_mpi_node(bz-1) == rank)))
+			    create_task_update(iter, bz, rank);
+	     }
+
+	     for (bz = 0; bz < nbz; bz++)
+	     {
+		     if (iter != niter)
+		     {
+                int node_z = get_block_mpi_node(bz);
+                int node_z_and_b = get_block_mpi_node(bz-1);
+                int node_z_and_t = get_block_mpi_node(bz+1);
+
+                  if ((node_z == rank) || ((node_z != node_z_and_b) && (node_z_and_b == rank)))
+				     create_task_save_local(iter, bz, +1, rank);
+
+                  if ((node_z == rank) || ((node_z != node_z_and_t) && (node_z_and_t == rank)))
+				     create_task_save_local(iter, bz, -1, rank);
+		     }
+	     }
+	}
+}

+ 387 - 0
examples/stencil/implicit-stencil.c

@@ -0,0 +1,387 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
+ * Copyright (C) 2010-2012, 2014  Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "implicit-stencil.h"
+
+/* Main application */
+
+/* default parameter values */
+static unsigned  bind_tasks = 0;
+
+static unsigned ticks = 1000;
+
+#ifdef STARPU_QUICK_CHECK
+static unsigned niter = 4;
+#define SIZE 16
+#else
+static unsigned niter = 32;
+#define SIZE 128
+#endif
+
+/* Problem size */
+static unsigned sizex = SIZE;
+static unsigned sizey = SIZE;
+static unsigned sizez = 64*SIZE;
+
+/* Number of blocks (scattered over the different MPI processes) */
+unsigned nbz = 64;
+
+/* StarPU top variables */
+struct starpu_top_data* starpu_top_init_loop;
+struct starpu_top_data* starpu_top_achieved_loop;
+
+double start;
+double begin, end;
+double timing; 
+
+/*
+ *	Initialization
+ */
+
+unsigned get_bind_tasks(void)
+{
+	return bind_tasks;
+}
+
+unsigned get_nbz(void)
+{
+	return nbz;
+}
+
+unsigned get_niter(void)
+{
+	return niter;
+}
+
+unsigned get_ticks(void)
+{
+	return ticks;
+}
+
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-b") == 0)
+		{
+			bind_tasks = 1;
+		}
+
+		if (strcmp(argv[i], "-nbz") == 0)
+		{
+			nbz = atoi(argv[++i]);
+		}
+
+		if (strcmp(argv[i], "-sizex") == 0)
+		{
+			sizex = atoi(argv[++i]);
+		}
+
+		if (strcmp(argv[i], "-sizey") == 0)
+		{
+			sizey = atoi(argv[++i]);
+		}
+
+		if (strcmp(argv[i], "-sizez") == 0)
+		{
+			sizez = atoi(argv[++i]);
+		}
+
+		if (strcmp(argv[i], "-niter") == 0)
+		{
+			niter = atoi(argv[++i]);
+		}
+
+		if (strcmp(argv[i], "-ticks") == 0)
+		{
+			ticks = atoi(argv[++i]);
+		}
+
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
+		{
+			 fprintf(stderr, "Usage : %s [options...]\n", argv[0]);
+			 fprintf(stderr, "\n");
+			 fprintf(stderr, "Options:\n");
+			 fprintf(stderr, "-b			bind tasks on CPUs/GPUs\n");
+			 fprintf(stderr, "-nbz <n>		Number of blocks on Z axis (%u by default)\n", nbz);
+			 fprintf(stderr, "-size[xyz] <size>	Domain size on x/y/z axis (%ux%ux%u by default)\n", sizex, sizey, sizez);
+			 fprintf(stderr, "-niter <n>		Number of iterations (%u by default)\n", niter);
+			 fprintf(stderr, "-ticks <t>		How often to put ticks in the output (ms, %u by default)\n", ticks);
+			 exit(0);
+		}
+	}
+}
+
+static void init_problem(int argc, char **argv, int rank, int world_size)
+{
+	parse_args(argc, argv);
+
+	if (getenv("STARPU_TOP"))
+	{
+		starpu_top_init_loop = starpu_top_add_data_integer("Task creation iter", 0, niter, 1);
+		starpu_top_achieved_loop = starpu_top_add_data_integer("Task achieved iter", 0, niter, 1);
+		starpu_top_init_and_wait("stencil_top example");
+	}
+	create_blocks_array(sizex, sizey, sizez, nbz);
+
+	/* Select the MPI process which should compute the different blocks */
+	assign_blocks_to_mpi_nodes(world_size);
+
+	assign_blocks_to_workers(rank);
+
+	/* Allocate the different memory blocks, if used by the MPI process */
+	start = starpu_timing_now();
+
+	allocate_memory_on_node(rank);
+
+	end = starpu_timing_now();
+	timing = end - start;
+
+	display_memory_consumption(rank, timing);
+
+	who_runs_what_len = 2*niter;
+	who_runs_what = (int *) calloc(nbz * who_runs_what_len, sizeof(*who_runs_what));
+	who_runs_what_index = (int *) calloc(nbz, sizeof(*who_runs_what_index));
+	last_tick = (double *) calloc(nbz, sizeof(*last_tick));
+}
+
+static void free_problem(int rank)
+{
+    free_memory_on_node(rank);
+	free_blocks_array();
+	free(who_runs_what);
+	free(who_runs_what_index);
+	free(last_tick);
+}
+
+/*
+ *	Main body
+ */
+
+void f(unsigned task_per_worker[STARPU_NMAXWORKERS])
+{
+	unsigned total = 0;
+	int worker;
+
+	for (worker = 0; worker < STARPU_NMAXWORKERS; worker++)
+		total += task_per_worker[worker];
+	for (worker = 0; worker < STARPU_NMAXWORKERS; worker++)
+	{
+		if (task_per_worker[worker])
+		{
+			char name[32];
+			starpu_worker_get_name(worker, name, sizeof(name));
+			FPRINTF(stderr,"\t%s -> %u (%2.2f%%)\n", name, task_per_worker[worker], (100.0*task_per_worker[worker])/total);
+		}
+	}
+}
+
+unsigned global_workerid(unsigned local_workerid)
+{
+#ifdef STARPU_USE_MPI
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	unsigned workers_per_node = starpu_worker_get_count();
+
+	return (local_workerid + rank*workers_per_node);
+#else
+	return local_workerid;
+#endif
+}
+
+int main(int argc, char **argv)
+{
+	int rank;
+	int world_size;
+	int ret;
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	int thread_support;
+	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support))
+	{
+		FPRINTF(stderr, "MPI_Init_thread failed\n");
+	}
+	if (thread_support == MPI_THREAD_FUNNELED)
+		FPRINTF(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
+	if (thread_support < MPI_THREAD_FUNNELED)
+		FPRINTF(stderr,"Warning: MPI does not have thread support!\n");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+#else
+	rank = 0;
+	world_size = 1;
+#endif
+
+	if (rank == 0)
+	{
+		FPRINTF(stderr, "Running on %d nodes\n", world_size);
+		fflush(stderr);
+	}
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+#endif
+
+#ifdef STARPU_USE_OPENCL
+        opencl_life_init();
+        opencl_shadow_init();
+#endif /*STARPU_USE_OPENCL*/
+
+	init_problem(argc, argv, rank, world_size);
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+#endif
+	if (rank == 0)
+		FPRINTF(stderr, "GO !\n");
+
+	start = starpu_timing_now();
+
+	begin = starpu_timing_now();
+
+	create_tasks(rank);
+
+	//starpu_tag_notify_from_apps(TAG_INIT_TASK);
+
+	//wait_end_tasks(rank);
+
+    starpu_task_wait_for_all();
+
+	end = starpu_timing_now();
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+#endif
+
+#if 0
+	check(rank);
+#endif
+
+	/*display_debug(nbz, niter, rank);*/
+
+	/* timing in us */
+	timing = end - begin;
+
+	double min_timing = timing;
+	double max_timing = timing;
+	double sum_timing = timing;
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	int reduce_ret;
+
+	reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	reduce_ret = MPI_Reduce(&timing, &max_timing, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	reduce_ret = MPI_Reduce(&timing, &sum_timing, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	/* XXX we should do a gather instead, here we assume that non initialized values are still 0 */
+	int *who_runs_what_tmp = malloc(nbz * who_runs_what_len * sizeof(*who_runs_what));
+	reduce_ret = MPI_Reduce(who_runs_what, who_runs_what_tmp, nbz * who_runs_what_len, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	memcpy(who_runs_what, who_runs_what_tmp, nbz * who_runs_what_len * sizeof(*who_runs_what));
+	free(who_runs_what_tmp);
+
+	/* XXX we should do a gather instead, here we assume that non initialized values are still 0 */
+	int *who_runs_what_index_tmp = malloc(nbz * sizeof(*who_runs_what_index));
+	reduce_ret = MPI_Reduce(who_runs_what_index, who_runs_what_index_tmp, nbz, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	memcpy(who_runs_what_index, who_runs_what_index_tmp, nbz * sizeof(*who_runs_what_index));
+	free(who_runs_what_index_tmp);
+#endif
+
+	if (rank == 0)
+	{
+#if 1 
+		FPRINTF(stderr, "update:\n");
+		f(update_per_worker);
+		FPRINTF(stderr, "top:\n");
+		f(top_per_worker);
+		FPRINTF(stderr, "bottom:\n");
+		f(bottom_per_worker);
+#endif
+#if 1
+		unsigned nzblocks_per_process = (nbz + world_size - 1) / world_size;
+
+		int iter;
+		for (iter = 0; iter < who_runs_what_len; iter++)
+		{
+			unsigned last, bz;
+			last = 1;
+			for (bz = 0; bz < nbz; bz++)
+			{
+				if ((bz % nzblocks_per_process) == 0)
+					FPRINTF(stderr, "| ");
+
+				if (who_runs_what_index[bz] <= iter)
+					FPRINTF(stderr,"_ ");
+				else
+				{
+					last = 0;
+					if (who_runs_what[bz + iter * nbz] == -1)
+						FPRINTF(stderr,"* ");
+					else
+						FPRINTF(stderr, "%d ", who_runs_what[bz + iter * nbz]);
+				}
+			}
+			FPRINTF(stderr, "\n");
+
+			if (last)
+				break;
+		}
+#endif
+
+		fflush(stderr);
+
+		FPRINTF(stdout, "Computation took: %f ms on %d MPI processes\n", max_timing/1000, world_size);
+		FPRINTF(stdout, "\tMIN : %f ms\n", min_timing/1000);
+		FPRINTF(stdout, "\tMAX : %f ms\n", max_timing/1000);
+		FPRINTF(stdout, "\tAVG : %f ms\n", sum_timing/(world_size*1000));
+	}
+
+	free_problem(rank);
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	starpu_mpi_shutdown();
+#endif
+
+	starpu_shutdown();
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	MPI_Finalize();
+#endif
+
+#ifdef STARPU_USE_OPENCL
+        opencl_life_free();
+        opencl_shadow_free();
+#endif /*STARPU_USE_OPENCL*/
+
+	return 0;
+}

+ 157 - 0
examples/stencil/implicit-stencil.h

@@ -0,0 +1,157 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ * Copyright (C) 2010-2011, 2014  Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __IMPLICIT_STENCIL_H__
+#define __IMPLICIT_STENCIL_H__
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <starpu.h>
+
+#ifndef __CUDACC__
+#ifdef STARPU_USE_MPI
+#include <mpi.h>
+#include <starpu_mpi.h>
+#endif
+#endif
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
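+/* When LIFE is defined, each iteration computes a Game of Life update;
+ * otherwise the kernels simply copy the previous layer. */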
+#define LIFE
+
+#ifdef LIFE
+#define TYPE	unsigned char
+extern void life_update(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int nz, int ldy, int ldz, int iter);
+#else
+#define TYPE	float
+#endif
+
+#define K	1
+
+#define NDIRS 2
+extern struct starpu_top_data* starpu_top_init_loop;
+extern struct starpu_top_data* starpu_top_achieved_loop;
+
+
+/* Split only on the z axis to make things simple */
+typedef enum
+{
+	B = 0,
+	T = 1
+} direction;
+
+/* Description of a domain block */
+struct block_description
+{
+	/* Which MPI node should process that block ? */
+	int mpi_node;
+
+	unsigned preferred_worker;
+
+	unsigned bz;
+
+
+	/* For each of the following buffers, there are two (0/1) buffers to
+	 * make new/old switch costless. */
+
+	/* This is the computation buffer for this block, it includes
+	 * neighbours' border to make computation easier */
+	TYPE *layers[2];
+	starpu_data_handle_t layers_handle[2];
+
+	/* This is the "save" buffer, i.e. a copy of our neighbour's border.
+	 * This one is used for CPU/GPU or MPI communication (rather than the
+	 * whole domain block) */
+	TYPE *boundaries[NDIRS][2];
+	starpu_data_handle_t boundaries_handle[NDIRS][2];
+
+	/* Shortcut pointer to the neighbours */
+	struct block_description *boundary_blocks[NDIRS];
+};
+
+#define TAG_INIT_TASK			((starpu_tag_t)1)
+
+starpu_tag_t TAG_FINISH(int z);
+starpu_tag_t TAG_START(int z, int dir);
+int MPI_TAG_LAYERS(int z, int buffer);
+int MPI_TAG_BOUNDARIES(int z, int dir, int buffer);
+
+#define MIN(a,b)	((a)<(b)?(a):(b))
+
+void create_blocks_array(unsigned sizex, unsigned sizey, unsigned sizez, unsigned nbz);
+void free_blocks_array();
+struct block_description *get_block_description(int z);
+void assign_blocks_to_mpi_nodes(int world_size);
+void allocate_memory_on_node(int rank);
+void assign_blocks_to_workers(int rank);
+void create_tasks(int rank);
+void wait_end_tasks(int rank);
+void check(int rank);
+void free_memory_on_node(int rank);
+
+void display_memory_consumption(int rank, double time);
+
+int get_block_mpi_node(int z);
+unsigned get_block_size(int z);
+unsigned get_bind_tasks(void);
+
+unsigned get_nbz(void);
+unsigned get_niter(void);
+unsigned get_ticks(void);
+
+unsigned global_workerid(unsigned local_workerid);
+
+void create_task_memset(unsigned sizex, unsigned sizey, unsigned z);
+void create_task_initlayer(unsigned sizex, unsigned sizey, unsigned z);
+void create_task_update(unsigned iter, unsigned z, int local_rank);
+void create_task_save(unsigned iter, unsigned z, int dir, int local_rank);
+
+extern int starpu_mpi_initialize(void);
+extern int starpu_mpi_shutdown(void);
+
+/* kernels */
+extern struct starpu_codelet cl_update;
+extern struct starpu_codelet save_cl_bottom;
+extern struct starpu_codelet save_cl_top;
+extern struct starpu_codelet cl_memset;
+extern struct starpu_codelet cl_initlayer;
+
+extern unsigned update_per_worker[STARPU_NMAXWORKERS];
+extern unsigned top_per_worker[STARPU_NMAXWORKERS];
+extern unsigned bottom_per_worker[STARPU_NMAXWORKERS];
+
+extern double start;
+extern int who_runs_what_len;
+extern int *who_runs_what;
+extern int *who_runs_what_index;
+extern double *last_tick;
+
+#ifndef _externC
+#define _externC
+#endif
+
+_externC void cuda_life_update_host(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int nz, int ldy, int ldz, int iter);
+_externC void cuda_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, int i);
+
+_externC void opencl_shadow_init(void);
+_externC void opencl_shadow_free(void);
+_externC void opencl_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, int i);
+_externC void opencl_life_init(void);
+_externC void opencl_life_free(void);
+_externC void opencl_life_update_host(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int nz, int ldy, int ldz, int iter);
+
+#endif /* __IMPLICIT_STENCIL_H__ */