
Add asynchronous partition planning. It only supports coherency through the main RAM for now.

Samuel Thibault, 9 years ago
parent
commit 68605a2f15

+ 2 - 0
ChangeLog

@@ -107,6 +107,8 @@ New features:
   * Add Fortran 90 module and example using it
   * New StarPU-MPI gdb debug functions
   * Generate animated html trace of modular schedulers.
+  * Add asynchronous partition planning. It only supports coherency through
+    the main RAM for now.
 
 Small features:
   * Tasks can now have a name (via the field const char *name of

+ 56 - 3
doc/doxygen/chapters/07data_management.doxy

@@ -191,10 +191,63 @@ StarPU provides various interfaces and filters for matrices, vectors, etc.,
 but applications can also write their own data interfaces and filters, see
 <c>examples/interface</c> and <c>examples/filters/custom_mf</c> for an example.
 
-\section MultipleView Multiple views
+\section AsynchronousPartitioning Asynchronous Partitioning
 
-Partitioning is synchronous, which can be a problem for dynamic applications.
-Another way is to register several views on the same piece of data.
+The partitioning functions described in the previous section are synchronous:
+starpu_data_partition and starpu_data_unpartition both wait for all the tasks
+currently working on the data.  This can be a bottleneck for the application.
+
+An asynchronous API also exists; it works only on handles which have sequential
+consistency enabled. The principle is to first plan the partitioning, which
+returns the data handles of the partition; these handles are not functional
+yet. Along with other task submissions, one can then submit the actual
+partitioning, after which the handles of the partition can be used. Before
+using the handle of the whole data again, one has to submit the
+unpartitioning. <c>fmultiple_submit</c> is a complete example using this
+technique.
+
+In short, we first register a matrix and plan the partitioning:
+
+\code{.c}
+starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0]));
+struct starpu_data_filter f_vert =
+{
+	.filter_func = starpu_matrix_filter_block,
+	.nchildren = PARTS
+};
+starpu_data_partition_plan(handle, &f_vert, vert_handle);
+\endcode
+
+starpu_data_partition_plan returns the handles for the partition in vert_handle.
+
+One can submit tasks working on the main handle, but not yet on the vert_handle
+handles. Now we submit the partitioning:
+
+\code{.c}
+starpu_data_partition_submit(handle, PARTS, vert_handle);
+\endcode
+
+And now we can submit tasks working on vert_handle handles (and not on the main
+handle any more). Eventually we want to work on the main handle again, so we
+submit the unpartitioning:
+
+\code{.c}
+starpu_data_unpartition_submit(handle, PARTS, vert_handle, -1);
+\endcode
+
+And now we can submit tasks working on the main handle again.
+
+All this code is asynchronous: it only records which tasks, partitioning and
+unpartitioning operations should be performed; they are actually executed at
+runtime.
+
+Planning several partitionings of the same data is also possible: one just has
+to submit the unpartitioning (to get back to the initial handle) before
+submitting another partitioning.
+
+It is also possible to activate several partitionings at the same time, in
+read-only mode.
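The read-only mode can be sketched as follows (a sketch, assuming `handle` has been registered and both `vert_handle` and `horiz_handle` have been planned with starpu_data_partition_plan, as in the examples of this commit):

```c
/* Activate two planned partitionings of the same data at once, in
 * read-only mode: tasks may then read concurrently from the vertical
 * slices, the horizontal slices, and even the main handle. */
starpu_data_partition_readonly_submit(handle, PARTS, vert_handle);
starpu_data_partition_readonly_submit(handle, PARTS, horiz_handle);

/* ... submit STARPU_R tasks on vert_handle[i] and horiz_handle[i] ... */

/* Both views must be unpartitioned before writing to handle again;
 * -1 lets the runtime choose the gathering memory node. */
starpu_data_unpartition_submit(handle, PARTS, vert_handle, -1);
starpu_data_unpartition_submit(handle, PARTS, horiz_handle, -1);
```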
+
+\section ManualPartitioning Manual Partitioning
+
+One can also handle partitioning by hand, by registering several views on the
+same piece of data. The idea is then to manage the coherency of the various
+views through the common buffer in the main memory.
 <c>fmultiple_manual</c> is a complete example using this technique.
 
 In short, we first register the same matrix several times:

+ 95 - 3
doc/doxygen/chapters/api/data_partition.doxy

@@ -43,9 +43,7 @@ Here an example of how to use the function.
 \code{.c}
 struct starpu_data_filter f = {
         .filter_func = starpu_matrix_filter_block,
-        .nchildren = nslicesx,
-        .get_nchildren = NULL,
-        .get_child_ops = NULL
+        .nchildren = nslicesx
 };
 starpu_data_partition(A_handle, &f);
 \endcode
@@ -103,6 +101,100 @@ Applies \p nfilters filters to the handle designated by
 \p root_handle recursively. It uses a va_list of pointers to variables of
 the type starpu_data_filter.
 
+@name Asynchronous API
+\ingroup API_Data_Partition
+
+\fn void starpu_data_partition_plan(starpu_data_handle_t initial_handle, struct starpu_data_filter *f, starpu_data_handle_t *children)
+\ingroup API_Data_Partition
+This plans for partitioning one StarPU data handle \p initial_handle into
+several pieces of subdata, according to the filter \p f. The resulting handles
+are returned in the \p children array, which has to have the same size as the
+number of parts described in \p f. These handles are not immediately usable;
+starpu_data_partition_submit has to be called to submit the actual
+partitioning.
+
+Here is an example of how to use the function:
+
+\code{.c}
+starpu_data_handle_t children[nslicesx];
+struct starpu_data_filter f = {
+        .filter_func = starpu_matrix_filter_block,
+        .nchildren = nslicesx
+};
+starpu_data_partition_plan(A_handle, &f, children);
+\endcode
+
+\fn void starpu_data_partition_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children)
+\ingroup API_Data_Partition
+
+This submits the actual partitioning of \p initial_handle into the \p nparts
+\p children handles. This call is asynchronous: it only submits that the
+partitioning should be done. The \p children handles can then be used to
+submit tasks, while \p initial_handle cannot be used to submit tasks any more
+(to guarantee coherency).
+
+For instance,
+
+\code{.c}
+starpu_data_partition_submit(A_handle, nslicesx, children);
+\endcode
+
+\fn void starpu_data_partition_readonly_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children)
+\ingroup API_Data_Partition
+
+This is the same as starpu_data_partition_submit, but does not invalidate \p
+initial_handle. This makes it possible to keep using it, but the application
+has to be careful only to read from \p initial_handle and the \p children
+handles, never to write to them, since coherency is otherwise not guaranteed.
+This thus makes it possible to submit tasks which concurrently read from
+different partitions of the data.
+
+When the application wants to write to \p initial_handle again, it should call
+starpu_data_unpartition_submit, which will properly add dependencies between the
+reads on the \p children and the writes to be submitted.
+
+If instead the application wants to write to \p children handles, it should
+call starpu_data_partition_readwrite_upgrade_submit, which will properly add
+dependencies between the reads on the \p initial_handle and the writes to be
+submitted.
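For instance (a sketch, reusing the `A_handle` and `children` variables from the starpu_data_partition_plan example above):

```c
/* The children become readable while A_handle stays readable too. */
starpu_data_partition_readonly_submit(A_handle, nslicesx, children);

/* ... submit STARPU_R tasks on children[i] and/or A_handle ... */

/* Gather before writing to A_handle again; -1 lets the runtime
 * choose the gathering memory node. */
starpu_data_unpartition_submit(A_handle, nslicesx, children, -1);
```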
+
+\fn void starpu_data_partition_readwrite_upgrade_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children)
+\ingroup API_Data_Partition
+
+This assumes that a partitioning of \p initial_handle has already been
+submitted in readonly mode through starpu_data_partition_readonly_submit, and
+upgrades that partitioning into read-write mode for the \p children, by
+invalidating \p initial_handle and adding the necessary dependencies.
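For instance (a sketch, with the same assumed `A_handle` and `children` as above):

```c
starpu_data_partition_readonly_submit(A_handle, nslicesx, children);

/* ... submit STARPU_R tasks on children[i] and A_handle ... */

/* Upgrade to read-write: A_handle is invalidated and the children
 * can from now on be written to. */
starpu_data_partition_readwrite_upgrade_submit(A_handle, nslicesx, children);

/* ... submit STARPU_RW tasks on children[i] ... */
```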
+
+\fn void starpu_data_unpartition_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children, int gathering_node)
+\ingroup API_Data_Partition
+
+This assumes that \p initial_handle is partitioned into \p children, and
+submits an unpartitioning of it, i.e. it submits a gathering of the pieces on
+the requested \p gathering_node memory node, and an invalidation of the
+children.
+
+\p gathering_node can be set to -1 to let the runtime decide which memory node
+should be used to gather the pieces.
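For instance (a sketch, with the same assumed `A_handle` and `children` as above):

```c
/* Gather the pieces back into A_handle in main memory;
 * passing -1 instead would let the runtime decide the node. */
starpu_data_unpartition_submit(A_handle, nslicesx, children, STARPU_MAIN_RAM);
```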
+
+\fn void starpu_data_unpartition_readonly_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children, int gathering_node)
+\ingroup API_Data_Partition
+
+This assumes that \p initial_handle is partitioned into \p children, and
+submits just a read-only unpartitioning of it, i.e. it submits a gathering of
+the pieces on the requested \p gathering_node memory node. It does not
+invalidate the children. This brings \p initial_handle and the \p children
+handles to the same state as obtained with
+starpu_data_partition_readonly_submit.
+
+\p gathering_node can be set to -1 to let the runtime decide which memory node
+should be used to gather the pieces.
+
+\fn void starpu_data_partition_clean(starpu_data_handle_t root_data, unsigned nparts, starpu_data_handle_t *children)
+\ingroup API_Data_Partition
+
+This should be used to clear the partition planning established between \p
+root_data and \p children with starpu_data_partition_plan. This will notably
+submit the unregistration of all the \p children handles, which thus cannot be
+used any more afterwards.
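A typical teardown sequence looks like this (a sketch, assuming the data is currently partitioned into `children`):

```c
/* Get back to the whole view before tearing the plan down. */
starpu_data_unpartition_submit(A_handle, nslicesx, children, -1);

/* Clean the plan: the children get unregistered and must not be
 * used afterwards; the root handle remains valid. */
starpu_data_partition_clean(A_handle, nslicesx, children);
starpu_data_unregister(A_handle);
```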
+
 @name Predefined Vector Filter Functions
 \ingroup API_Data_Partition
 

+ 18 - 0
examples/Makefile.am

@@ -183,6 +183,8 @@ STARPU_EXAMPLES =				\
 	filters/fblock				\
 	filters/fmatrix				\
 	filters/fmultiple_manual		\
+	filters/fmultiple_submit		\
+	filters/fmultiple_submit_readonly	\
 	tag_example/tag_example			\
 	tag_example/tag_example2		\
 	tag_example/tag_example3		\
@@ -430,6 +432,22 @@ filters_fmultiple_manual_SOURCES +=		\
 	filters/fmultiple_cuda.cu
 endif
 
+filters_fmultiple_submit_SOURCES =		\
+	filters/fmultiple_submit.c
+
+if STARPU_USE_CUDA
+filters_fmultiple_submit_SOURCES +=		\
+	filters/fmultiple_cuda.cu
+endif
+
+filters_fmultiple_submit_readonly_SOURCES =	\
+	filters/fmultiple_submit_readonly.c
+
+if STARPU_USE_CUDA
+filters_fmultiple_submit_readonly_SOURCES +=	\
+	filters/fmultiple_cuda.cu
+endif
+
 examplebin_PROGRAMS +=				\
 	filters/shadow				\
 	filters/shadow2d			\

+ 28 - 1
examples/filters/fmultiple_cuda.cu

@@ -18,7 +18,7 @@
 
 #include <starpu.h>
 
-static __global__ void _fmultiple_check_cuda(int *val, int nx, int ny, unsigned ld, int start, int factor)
+static __global__ void _fmultiple_check_scale_cuda(int *val, int nx, int ny, unsigned ld, int start, int factor)
 {
         int i, j;
 	for(j=0; j<ny ; j++)
@@ -32,6 +32,33 @@ static __global__ void _fmultiple_check_cuda(int *val, int nx, int ny, unsigned
         }
 }
 
+extern "C" void fmultiple_check_scale_cuda(void *buffers[], void *cl_arg)
+{
+	int start, factor;
+	int nx = (int)STARPU_MATRIX_GET_NX(buffers[0]);
+	int ny = (int)STARPU_MATRIX_GET_NY(buffers[0]);
+        unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+	int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	starpu_codelet_unpack_args(cl_arg, &start, &factor);
+
+        /* TODO: use more vals and threads in vals */
+	_fmultiple_check_scale_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
+}
+
+static __global__ void _fmultiple_check_cuda(int *val, int nx, int ny, unsigned ld, int start, int factor)
+{
+        int i, j;
+	for(j=0; j<ny ; j++)
+	{
+		for(i=0; i<nx ; i++)
+		{
+			if (val[(j*ld)+i] != start + factor*(i+100*j))
+				asm("trap;");
+		}
+        }
+}
+
 extern "C" void fmultiple_check_cuda(void *buffers[], void *cl_arg)
 {
 	int start, factor;

+ 12 - 11
examples/filters/fmultiple_manual.c

@@ -16,9 +16,10 @@
 
 /*
 * This exemplifies how to access the same matrix with different partitioned
- * views.
+ * views, handling the coherency by hand.
  * We first run a kernel on the whole matrix to fill it, then run a kernel on
- * each vertical slice, then run a kernel on each horizontal slice.
+ * each vertical slice to check the value and multiply it by two, then run a
+ * kernel on each horizontal slice to do the same.
  */
 
 #include <starpu.h>
@@ -55,7 +56,7 @@ struct starpu_codelet cl_fill =
 	.name = "matrix_fill"
 };
 
-void fmultiple_check(void *buffers[], void *cl_arg)
+void fmultiple_check_scale(void *buffers[], void *cl_arg)
 {
 	int start, factor;
 	unsigned i, j;
@@ -79,21 +80,21 @@ void fmultiple_check(void *buffers[], void *cl_arg)
 }
 
 #ifdef STARPU_USE_CUDA
-extern void fmultiple_check_cuda(void *buffers[], void *cl_arg);
+extern void fmultiple_check_scale_cuda(void *buffers[], void *cl_arg);
 #endif
-struct starpu_codelet cl_check =
+struct starpu_codelet cl_check_scale =
 {
 #ifdef STARPU_USE_CUDA
-	.cuda_funcs = {fmultiple_check_cuda},
+	.cuda_funcs = {fmultiple_check_scale_cuda},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #else
 	/* Only enable it on CPUs if we don't have a CUDA device, to force remote execution on the CUDA device */
-	.cpu_funcs = {fmultiple_check},
-	.cpu_funcs_name = {"fmultiple_check"},
+	.cpu_funcs = {fmultiple_check_scale},
+	.cpu_funcs_name = {"fmultiple_check_scale"},
 #endif
 	.nbuffers = 1,
 	.modes = {STARPU_RW},
-	.name = "fmultiple_check"
+	.name = "fmultiple_check_scale"
 };
 
 void empty(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *cl_arg STARPU_ATTRIBUTE_UNUSED)
@@ -169,7 +170,7 @@ int main(int argc, char **argv)
 	{
 		int factor = 1;
 		int start = i*(NX/PARTS);
-		ret = starpu_task_insert(&cl_check,
+		ret = starpu_task_insert(&cl_check_scale,
 				STARPU_RW, vert_handle[i],
 				STARPU_VALUE, &start, sizeof(start),
 				STARPU_VALUE, &factor, sizeof(factor),
@@ -206,7 +207,7 @@ int main(int argc, char **argv)
 	{
 		int factor = 2;
 		int start = factor*100*i*(NY/PARTS);
-		ret = starpu_task_insert(&cl_check,
+		ret = starpu_task_insert(&cl_check_scale,
 				STARPU_RW, horiz_handle[i],
 				STARPU_VALUE, &start, sizeof(start),
 				STARPU_VALUE, &factor, sizeof(factor),

+ 207 - 0
examples/filters/fmultiple_submit.c

@@ -0,0 +1,207 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Université Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This exemplifies how to access the same matrix with different partitioned
+ * views, handling the coherency through partition planning.
+ * We first run a kernel on the whole matrix to fill it, then run a kernel on
+ * each vertical slice to check the value and multiply it by two, then run a
+ * kernel on each horizontal slice to do the same.
+ */
+
+#include <starpu.h>
+
+#define NX    6
+#define NY    6
+#define PARTS 2
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+void matrix_fill(void *buffers[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
+{
+	unsigned i, j;
+
+	/* length of the matrix */
+	unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(buffers[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+	int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	for(j=0; j<ny ; j++)
+	{
+		for(i=0; i<nx ; i++)
+			val[(j*ld)+i] = i+100*j;
+	}
+}
+
+struct starpu_codelet cl_fill =
+{
+	.cpu_funcs = {matrix_fill},
+	.cpu_funcs_name = {"matrix_fill"},
+	.nbuffers = 1,
+	.modes = {STARPU_W},
+	.name = "matrix_fill"
+};
+
+void fmultiple_check_scale(void *buffers[], void *cl_arg)
+{
+	int start, factor;
+	unsigned i, j;
+
+	/* length of the matrix */
+	unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(buffers[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+	int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	starpu_codelet_unpack_args(cl_arg, &start, &factor);
+
+	for(j=0; j<ny ; j++)
+	{
+		for(i=0; i<nx ; i++)
+		{
+			STARPU_ASSERT(val[(j*ld)+i] == start + factor*((int)(i+100*j)));
+			val[(j*ld)+i] *= 2;
+		}
+	}
+}
+
+#ifdef STARPU_USE_CUDA
+extern void fmultiple_check_scale_cuda(void *buffers[], void *cl_arg);
+#endif
+struct starpu_codelet cl_check_scale =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {fmultiple_check_scale_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#else
+	/* Only enable it on CPUs if we don't have a CUDA device, to force remote execution on the CUDA device */
+	.cpu_funcs = {fmultiple_check_scale},
+	.cpu_funcs_name = {"fmultiple_check_scale"},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.name = "fmultiple_check_scale"
+};
+
+int main(int argc, char **argv)
+{
+	unsigned j, n=1;
+	int matrix[NX][NY];
+	int ret, i;
+
+	/* We haven't taken care otherwise */
+	STARPU_ASSERT((NX%PARTS) == 0);
+	STARPU_ASSERT((NY%PARTS) == 0);
+
+	starpu_data_handle_t handle;
+	starpu_data_handle_t vert_handle[PARTS];
+	starpu_data_handle_t horiz_handle[PARTS];
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* Declare the whole matrix to StarPU */
+	starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0][0]));
+
+	/* Partition the matrix in PARTS vertical slices */
+	struct starpu_data_filter f_vert =
+	{
+		.filter_func = starpu_matrix_filter_block,
+		.nchildren = PARTS
+	};
+	starpu_data_partition_plan(handle, &f_vert, vert_handle);
+
+	/* Partition the matrix in PARTS horizontal slices */
+	struct starpu_data_filter f_horiz =
+	{
+		.filter_func = starpu_matrix_filter_vertical_block,
+		.nchildren = PARTS
+	};
+	starpu_data_partition_plan(handle, &f_horiz, horiz_handle);
+
+	/* Fill the matrix */
+	ret = starpu_task_insert(&cl_fill, STARPU_W, handle, 0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* Now switch to vertical view of the matrix */
+	starpu_data_partition_submit(handle, PARTS, vert_handle);
+
+	/* Check the values of the vertical slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		int factor = 1;
+		int start = i*(NX/PARTS);
+		ret = starpu_task_insert(&cl_check_scale,
+				STARPU_RW, vert_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	/* Now switch back to total view of the matrix */
+	starpu_data_unpartition_submit(handle, PARTS, vert_handle, -1);
+
+	/* And switch to horizontal view of the matrix */
+	starpu_data_partition_submit(handle, PARTS, horiz_handle);
+
+	/* Check the values of the horizontal slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		int factor = 2;
+		int start = factor*100*i*(NY/PARTS);
+		ret = starpu_task_insert(&cl_check_scale,
+				STARPU_RW, horiz_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	/* Now switch back to total view of the matrix */
+	starpu_data_unpartition_submit(handle, PARTS, horiz_handle, -1);
+
+	/* And check the values of the whole matrix */
+	int factor = 4;
+	int start = 0;
+	ret = starpu_task_insert(&cl_check_scale,
+			STARPU_RW, handle,
+			STARPU_VALUE, &start, sizeof(start),
+			STARPU_VALUE, &factor, sizeof(factor),
+			0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/*
+	 * Unregister data from StarPU and shutdown.
+	 */
+	starpu_data_partition_clean(handle, PARTS, vert_handle);
+	starpu_data_partition_clean(handle, PARTS, horiz_handle);
+	starpu_data_unregister(handle);
+	starpu_shutdown();
+
+	return ret;
+
+enodev:
+	starpu_shutdown();
+	return 77;
+}

+ 367 - 0
examples/filters/fmultiple_submit_readonly.c

@@ -0,0 +1,367 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Université Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This exemplifies how to access the same matrix with different partitioned
+ * views, handling the coherency through partition planning.
+ * We first run a kernel on the whole matrix to fill it, then run a kernel on
+ * each vertical slice to check the value and multiply it by two, then run a
+ * kernel on each horizontal slice to do the same.
+ */
+
+#include <starpu.h>
+
+#define NX    6
+#define NY    6
+#define PARTS 2
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+void matrix_fill(void *buffers[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
+{
+	unsigned i, j;
+
+	/* length of the matrix */
+	unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(buffers[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+	int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	for(j=0; j<ny ; j++)
+	{
+		for(i=0; i<nx ; i++)
+			val[(j*ld)+i] = i+100*j;
+	}
+}
+
+struct starpu_codelet cl_fill =
+{
+	.cpu_funcs = {matrix_fill},
+	.cpu_funcs_name = {"matrix_fill"},
+	.nbuffers = 1,
+	.modes = {STARPU_W},
+	.name = "matrix_fill"
+};
+
+void fmultiple_check_scale(void *buffers[], void *cl_arg)
+{
+	int start, factor;
+	unsigned i, j;
+
+	/* length of the matrix */
+	unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(buffers[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+	int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	starpu_codelet_unpack_args(cl_arg, &start, &factor);
+
+	for(j=0; j<ny ; j++)
+	{
+		for(i=0; i<nx ; i++)
+		{
+			STARPU_ASSERT(val[(j*ld)+i] == start + factor*((int)(i+100*j)));
+			val[(j*ld)+i] *= 2;
+		}
+	}
+}
+
+#ifdef STARPU_USE_CUDA
+extern void fmultiple_check_scale_cuda(void *buffers[], void *cl_arg);
+#endif
+struct starpu_codelet cl_check_scale =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {fmultiple_check_scale_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#else
+	/* Only enable it on CPUs if we don't have a CUDA device, to force remote execution on the CUDA device */
+	.cpu_funcs = {fmultiple_check_scale},
+	.cpu_funcs_name = {"fmultiple_check_scale"},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.name = "fmultiple_check_scale"
+};
+
+void fmultiple_check(void *buffers[], void *cl_arg)
+{
+	int start, factor;
+	unsigned i, j;
+
+	/* length of the matrix */
+	unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(buffers[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+	int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	starpu_codelet_unpack_args(cl_arg, &start, &factor);
+
+	for(j=0; j<ny ; j++)
+	{
+		for(i=0; i<nx ; i++)
+		{
+			STARPU_ASSERT(val[(j*ld)+i] == start + factor*((int)(i+100*j)));
+		}
+	}
+}
+
+#ifdef STARPU_USE_CUDA
+extern void fmultiple_check_cuda(void *buffers[], void *cl_arg);
+#endif
+struct starpu_codelet cl_check =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {fmultiple_check_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#else
+	/* Only enable it on CPUs if we don't have a CUDA device, to force remote execution on the CUDA device */
+	.cpu_funcs = {fmultiple_check},
+	.cpu_funcs_name = {"fmultiple_check"},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_R},
+	.name = "fmultiple_check"
+};
+
+int main(int argc, char **argv)
+{
+	int start, factor;
+	unsigned j, n=1;
+	int matrix[NX][NY];
+	int ret, i;
+
+	/* We haven't taken care otherwise */
+	STARPU_ASSERT((NX%PARTS) == 0);
+	STARPU_ASSERT((NY%PARTS) == 0);
+
+	starpu_data_handle_t handle;
+	starpu_data_handle_t vert_handle[PARTS];
+	starpu_data_handle_t horiz_handle[PARTS];
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* Declare the whole matrix to StarPU */
+	starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0][0]));
+
+	/* Partition the matrix in PARTS vertical slices */
+	struct starpu_data_filter f_vert =
+	{
+		.filter_func = starpu_matrix_filter_block,
+		.nchildren = PARTS
+	};
+	starpu_data_partition_plan(handle, &f_vert, vert_handle);
+
+	/* Partition the matrix in PARTS horizontal slices */
+	struct starpu_data_filter f_horiz =
+	{
+		.filter_func = starpu_matrix_filter_vertical_block,
+		.nchildren = PARTS
+	};
+	starpu_data_partition_plan(handle, &f_horiz, horiz_handle);
+
+	/* Fill the matrix */
+	ret = starpu_task_insert(&cl_fill, STARPU_W, handle, 0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	factor = 1;
+
+	/* Now switch to vertical view of the matrix, but readonly */
+	starpu_data_partition_readonly_submit(handle, PARTS, vert_handle);
+	/* as well as horizontal view of the matrix, but readonly */
+	starpu_data_partition_readonly_submit(handle, PARTS, horiz_handle);
+
+	/* Check the values of the vertical slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = i*(NX/PARTS);
+		ret = starpu_task_insert(&cl_check,
+				STARPU_R, vert_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	/* Check the values of the horizontal slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = factor*100*i*(NY/PARTS);
+		ret = starpu_task_insert(&cl_check,
+				STARPU_R, horiz_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	/* And of the main matrix */
+	start = 0;
+	ret = starpu_task_insert(&cl_check,
+			STARPU_R, handle,
+			STARPU_VALUE, &start, sizeof(start),
+			STARPU_VALUE, &factor, sizeof(factor),
+			0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* Now switch back to total view of the matrix */
+	starpu_data_unpartition_submit(handle, PARTS, horiz_handle, -1);
+	starpu_data_unpartition_submit(handle, PARTS, vert_handle, -1);
+
+	/* Check and scale it */
+	start = 0;
+	ret = starpu_task_insert(&cl_check_scale,
+			STARPU_RW, handle,
+			STARPU_VALUE, &start, sizeof(start),
+			STARPU_VALUE, &factor, sizeof(factor),
+			0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	factor = 2;
+
+	/* Now switch to vertical view of the matrix, but readonly */
+	starpu_data_partition_readonly_submit(handle, PARTS, vert_handle);
+	/* as well as horizontal view of the matrix, but readonly */
+	starpu_data_partition_readonly_submit(handle, PARTS, horiz_handle);
+
+	/* Check the values of the vertical slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = factor*i*(NX/PARTS);
+		ret = starpu_task_insert(&cl_check,
+				STARPU_R, vert_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	/* Check the values of the horizontal slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = factor*100*i*(NY/PARTS);
+		ret = starpu_task_insert(&cl_check,
+				STARPU_R, horiz_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	/* And of the main matrix */
+	start = 0;
+	ret = starpu_task_insert(&cl_check,
+			STARPU_R, handle,
+			STARPU_VALUE, &start, sizeof(start),
+			STARPU_VALUE, &factor, sizeof(factor),
+			0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* Disable the read-only vertical view of the matrix */
+	starpu_data_unpartition_submit(handle, PARTS, vert_handle, -1);
+	/* And switch to read-write horizontal view of the matrix */
+	starpu_data_partition_readwrite_upgrade_submit(handle, PARTS, horiz_handle);
+
+	/* Check and scale the values of the horizontal slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = factor*100*i*(NY/PARTS);
+		ret = starpu_task_insert(&cl_check_scale,
+				STARPU_RW, horiz_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	factor = 4;
+
+	/* Now switch back to total view of the matrix, but readonly */
+	starpu_data_unpartition_readonly_submit(handle, PARTS, horiz_handle, -1);
+	/* And also enable a vertical view of the matrix, but readonly */
+	starpu_data_partition_readonly_submit(handle, PARTS, vert_handle);
+
+	/* Check the values of the vertical slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = factor*i*(NX/PARTS);
+		ret = starpu_task_insert(&cl_check,
+				STARPU_R, vert_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	/* Check the values of the horizontal slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = factor*100*i*(NY/PARTS);
+		ret = starpu_task_insert(&cl_check,
+				STARPU_R, horiz_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	/* And of the main matrix */
+	start = 0;
+	ret = starpu_task_insert(&cl_check,
+			STARPU_R, handle,
+			STARPU_VALUE, &start, sizeof(start),
+			STARPU_VALUE, &factor, sizeof(factor),
+			0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* Disable the read-only vertical view of the matrix */
+	starpu_data_unpartition_submit(handle, PARTS, vert_handle, -1);
+	/* And the read-only horizontal view of the matrix */
+	starpu_data_unpartition_submit(handle, PARTS, horiz_handle, -1);
+	/* Thus getting back to a read-write total view of the matrix */
+
+	/* And check and scale the values of the whole matrix */
+	start = 0;
+	ret = starpu_task_insert(&cl_check_scale,
+			STARPU_RW, handle,
+			STARPU_VALUE, &start, sizeof(start),
+			STARPU_VALUE, &factor, sizeof(factor),
+			0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	factor = 8;
+
+	/*
+	 * Unregister data from StarPU and shutdown.
+	 */
+	starpu_data_partition_clean(handle, PARTS, vert_handle);
+	starpu_data_partition_clean(handle, PARTS, horiz_handle);
+	starpu_data_unregister(handle);
+	starpu_shutdown();
+
+	return ret;
+
+enodev:
+	starpu_shutdown();
+	return 77;
+}

+ 9 - 1
include/starpu_data_filters.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux
+ * Copyright (C) 2010-2012, 2015  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  CNRS
  *
@@ -42,6 +42,14 @@ struct starpu_data_filter
 void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_data_filter *f);
 void starpu_data_unpartition(starpu_data_handle_t root_data, unsigned gathering_node);
 
+void starpu_data_partition_plan(starpu_data_handle_t initial_handle, struct starpu_data_filter *f, starpu_data_handle_t *children);
+void starpu_data_partition_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children);
+void starpu_data_partition_readonly_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children);
+void starpu_data_partition_readwrite_upgrade_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children);
+void starpu_data_unpartition_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children, int gathering_node);
+void starpu_data_unpartition_readonly_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children, int gathering_node);
+void starpu_data_partition_clean(starpu_data_handle_t root_data, unsigned nparts, starpu_data_handle_t *children);
+
 int starpu_data_get_nb_children(starpu_data_handle_t handle);
 starpu_data_handle_t starpu_data_get_child(starpu_data_handle_t handle, unsigned i);
 

+ 11 - 0
src/datawizard/coherency.h

@@ -145,6 +145,17 @@ struct _starpu_data_state
 
 	starpu_data_handle_t children;
 	unsigned nchildren;
+	/* How many partition plans this handle has */
+	unsigned nplans;
+	/* Whether a partition plan is currently submitted and the
+	 * corresponding unpartition has not been submitted yet,
+	 *
+	 * or the number of partition plans currently submitted in
+	 * read-only mode.
+	 */
+	unsigned partitioned;
+	/* Whether a partition plan is currently submitted in readonly mode */
+	unsigned readonly;
 
 	/* describe the state of the data in term of coherency */
 	struct _starpu_data_replicate per_node[STARPU_MAXNODES];

+ 204 - 34
src/datawizard/filters.c

@@ -22,8 +22,6 @@
 #include <datawizard/interfaces/data_interface.h>
 #include <core/task.h>
 
-static void starpu_data_create_children(starpu_data_handle_t handle, unsigned nchildren, struct starpu_data_filter *f);
-
 /*
  * This function applies a data filter on all the elements of a partition
  */
@@ -114,27 +112,37 @@ starpu_data_handle_t starpu_data_vget_sub_data(starpu_data_handle_t root_handle,
 	return current_handle;
 }
 
-void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_data_filter *f)
+static unsigned _starpu_data_partition_nparts(starpu_data_handle_t initial_handle, struct starpu_data_filter *f)
+{
+	/* how many parts ? */
+	if (f->get_nchildren)
+	  return f->get_nchildren(f, initial_handle);
+	else
+	  return f->nchildren;
+
+}
+
+static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_data_handle_t *childrenp, unsigned nparts, struct starpu_data_filter *f, int inherit_state)
 {
-	unsigned nparts;
 	unsigned i;
 	unsigned node;
 
 	/* first take care to properly lock the data header */
 	_starpu_spin_lock(&initial_handle->header_lock);
 
-	STARPU_ASSERT_MSG(initial_handle->nchildren == 0, "there should not be mutiple filters applied on the same data %p, futher filtering has to be done on children", initial_handle);
-
-	/* how many parts ? */
-	if (f->get_nchildren)
-	  nparts = f->get_nchildren(f, initial_handle);
-	else
-	  nparts = f->nchildren;
+	initial_handle->nplans++;
 
 	STARPU_ASSERT_MSG(nparts > 0, "Partitioning data %p in 0 piece does not make sense", initial_handle);
 
 	/* allocate the children */
-	starpu_data_create_children(initial_handle, nparts, f);
+	if (inherit_state)
+	{
+		initial_handle->children = (struct _starpu_data_state *) calloc(nparts, sizeof(struct _starpu_data_state));
+		STARPU_ASSERT(initial_handle->children);
+
+		/* this handle now has children */
+		initial_handle->nchildren = nparts;
+	}
 
 	unsigned nworkers = starpu_worker_get_count();
 
@@ -147,6 +155,7 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 	{
 		/* This is lazy allocation, allocate it now in main RAM, so as
 		 * to have somewhere to gather pieces later */
+		/* FIXME: mark as unevictable! */
 		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[0], 0);
 #ifdef STARPU_DEVEL
 #warning we should reclaim memory if allocation failed
@@ -156,12 +165,29 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 
 	for (i = 0; i < nparts; i++)
 	{
-		starpu_data_handle_t child =
-			starpu_data_get_child(initial_handle, i);
+		starpu_data_handle_t child;
 
+		if (inherit_state)
+			child = &initial_handle->children[i];
+		else
+			child = childrenp[i];
 		STARPU_ASSERT(child);
 
+		struct starpu_data_interface_ops *ops;
+
+		/* each child may have his own interface type */
+		/* what's this child's interface ? */
+		if (f->get_child_ops)
+			ops = f->get_child_ops(f, i);
+		else
+			ops = initial_handle->ops;
+
+		_starpu_data_handle_init(child, ops, initial_handle->mf_node);
+
 		child->nchildren = 0;
+		child->nplans = 0;
+		child->partitioned = 0;
+		child->readonly = 0;
                 child->mpi_data = initial_handle->mpi_data;
 		child->root_handle = initial_handle->root_handle;
 		child->father_handle = initial_handle;
@@ -224,19 +250,29 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 			initial_replicate = &initial_handle->per_node[node];
 			child_replicate = &child->per_node[node];
 
-			child_replicate->state = initial_replicate->state;
-			child_replicate->allocated = initial_replicate->allocated;
+			if (inherit_state)
+				child_replicate->state = initial_replicate->state;
+			else
+				child_replicate->state = STARPU_INVALID;
+			if (inherit_state || !initial_replicate->automatically_allocated)
+				child_replicate->allocated = initial_replicate->allocated;
+			else
+				child_replicate->allocated = 0;
 			/* Do not allow memory reclaiming within the child for parent bits */
 			child_replicate->automatically_allocated = 0;
 			child_replicate->refcnt = 0;
 			child_replicate->memory_node = node;
 			child_replicate->relaxed_coherency = 0;
-			child_replicate->initialized = initial_replicate->initialized;
+			if (inherit_state)
+				child_replicate->initialized = initial_replicate->initialized;
+			else
+				child_replicate->initialized = 0;
 
 			/* update the interface */
 			void *initial_interface = starpu_data_get_interface_on_node(initial_handle, node);
 			void *child_interface = starpu_data_get_interface_on_node(child, node);
 
+			STARPU_ASSERT_MSG(!(!inherit_state && child_replicate->automatically_allocated && child_replicate->allocated), "partition planning is currently not supported when handle has some automatically allocated buffers");
 			f->filter_func(initial_interface, child_interface, f, i, nparts);
 		}
 
@@ -459,6 +495,7 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 	starpu_data_handle_t children = root_handle->children;
 	root_handle->children = NULL;
 	root_handle->nchildren = 0;
+	root_handle->nplans--;
 
 	/* now the parent may be used again so we release the lock */
 	_starpu_spin_unlock(&root_handle->header_lock);
@@ -468,31 +505,164 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 	_STARPU_TRACE_END_UNPARTITION(root_handle, gathering_node);
 }
 
-/* each child may have his own interface type */
-static void starpu_data_create_children(starpu_data_handle_t handle, unsigned nchildren, struct starpu_data_filter *f)
+void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_data_filter *f)
 {
-	handle->children = (struct _starpu_data_state *) calloc(nchildren, sizeof(struct _starpu_data_state));
-	STARPU_ASSERT(handle->children);
+	unsigned nparts = _starpu_data_partition_nparts(initial_handle, f);
+	STARPU_ASSERT_MSG(initial_handle->nchildren == 0, "there should not be mutiple filters applied on the same data %p, futher filtering has to be done on children", initial_handle);
+	STARPU_ASSERT_MSG(initial_handle->nplans == 0, "partition planning and synchronous partitioning is not supported");
 
-	unsigned child;
+	initial_handle->children = NULL;
+	_starpu_data_partition(initial_handle, NULL, nparts, f, 1);
+}
+
+void starpu_data_partition_plan(starpu_data_handle_t initial_handle, struct starpu_data_filter *f, starpu_data_handle_t *childrenp)
+{
+	unsigned i;
+	unsigned nparts = _starpu_data_partition_nparts(initial_handle, f);
+	STARPU_ASSERT_MSG(initial_handle->nchildren == 0, "partition planning and synchronous partitioning is not supported");
+	STARPU_ASSERT_MSG(initial_handle->home_node == STARPU_MAIN_RAM, "partition planning is currently only supported from main RAM");
+	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
+
+	for (i = 0; i < nparts; i++)
+		childrenp[i] = calloc(1, sizeof(struct _starpu_data_state));
+	_starpu_data_partition(initial_handle, childrenp, nparts, f, 0);
+}
+
+void starpu_data_partition_clean(starpu_data_handle_t root_handle, unsigned nparts, starpu_data_handle_t *children)
+{
+	unsigned i;
+
+	for (i = 0; i < nparts; i++)
+		starpu_data_unregister_submit(children[i]);
+
+	_starpu_spin_lock(&root_handle->header_lock);
+	root_handle->nplans--;
+	_starpu_spin_unlock(&root_handle->header_lock);
+}
+
+static void empty(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *cl_arg STARPU_ATTRIBUTE_UNUSED)
+{
+	/* This doesn't need to do anything: it is only used to enforce
+	 * coherency between the two views by running on the home node of
+	 * the data, thus gathering all the data pieces back there.  */
+}
+
+static struct starpu_codelet cl_switch =
+{
+	.cpu_funcs = {empty},
+	.nbuffers = STARPU_VARIABLE_NBUFFERS,
+	.name = "data_partition_switch"
+};
 
-	for (child = 0; child < nchildren; child++)
+void starpu_data_partition_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children)
+{
+	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
+	_starpu_spin_lock(&initial_handle->header_lock);
+	STARPU_ASSERT_MSG(initial_handle->partitioned == 0, "One can't submit several partition plannings at the same time");
+	STARPU_ASSERT_MSG(initial_handle->readonly == 0, "One can't submit a partition planning while a readonly partitioning is active");
+	initial_handle->partitioned++;
+	_starpu_spin_unlock(&initial_handle->header_lock);
+
+	unsigned i;
+	struct starpu_data_descr descr[nparts];
+	for (i = 0; i < nparts; i++)
 	{
-		starpu_data_handle_t handle_child;
-		struct starpu_data_interface_ops *ops;
+		STARPU_ASSERT_MSG(children[i]->father_handle == initial_handle, "children parameter of starpu_data_partition_submit must be the children of the parent parameter");
+		descr[i].handle = children[i];
+		descr[i].mode = STARPU_W;
+	}
+	/* TODO: assert nparts too */
+	starpu_task_insert(&cl_switch, STARPU_RW, initial_handle, STARPU_DATA_MODE_ARRAY, descr, nparts, 0);
+	starpu_data_invalidate_submit(initial_handle);
+}
 
-		/* what's this child's interface ? */
-		if (f->get_child_ops)
-		  ops = f->get_child_ops(f, child);
-		else
-		  ops = handle->ops;
+void starpu_data_partition_readonly_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children)
+{
+	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
+	_starpu_spin_lock(&initial_handle->header_lock);
+	STARPU_ASSERT_MSG(initial_handle->partitioned == 0 || initial_handle->readonly, "One can't submit a readonly partition planning at the same time as a readwrite partition planning");
+	initial_handle->partitioned++;
+	initial_handle->readonly = 1;
+	_starpu_spin_unlock(&initial_handle->header_lock);
+
+	unsigned i;
+	struct starpu_data_descr descr[nparts];
+	for (i = 0; i < nparts; i++)
+	{
+		STARPU_ASSERT_MSG(children[i]->father_handle == initial_handle, "children parameter of starpu_data_partition_readonly_submit must be the children of the parent parameter");
+		descr[i].handle = children[i];
+		descr[i].mode = STARPU_W;
+	}
+	/* TODO: assert nparts too */
+	starpu_task_insert(&cl_switch, STARPU_R, initial_handle, STARPU_DATA_MODE_ARRAY, descr, nparts, 0);
+}
+
+void starpu_data_partition_readwrite_upgrade_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children)
+{
+	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
+	_starpu_spin_lock(&initial_handle->header_lock);
+	STARPU_ASSERT_MSG(initial_handle->partitioned == 1, "One can't upgrade a readonly partition planning to readwrite while other readonly partition plannings are active");
+	STARPU_ASSERT_MSG(initial_handle->readonly == 1, "One can only upgrade a readonly partition planning");
+	initial_handle->readonly = 0;
+	_starpu_spin_unlock(&initial_handle->header_lock);
+
+	unsigned i;
+	struct starpu_data_descr descr[nparts];
+	for (i = 0; i < nparts; i++)
+	{
+		STARPU_ASSERT_MSG(children[i]->father_handle == initial_handle, "children parameter of starpu_data_partition_readwrite_upgrade_submit must be the children of the parent parameter");
+		descr[i].handle = children[i];
+		descr[i].mode = STARPU_W;
+	}
+	/* TODO: assert nparts too */
+	starpu_task_insert(&cl_switch, STARPU_RW, initial_handle, STARPU_DATA_MODE_ARRAY, descr, nparts, 0);
+	starpu_data_invalidate_submit(initial_handle);
+}
+
+void starpu_data_unpartition_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children, int gather_node)
+{
+	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
+	STARPU_ASSERT_MSG(gather_node == STARPU_MAIN_RAM || gather_node == -1, "gathering node different from main RAM is currently not supported");
+	_starpu_spin_lock(&initial_handle->header_lock);
+	STARPU_ASSERT_MSG(initial_handle->partitioned >= 1, "No partition planning is active for this handle");
+	initial_handle->partitioned--;
+	if (!initial_handle->partitioned)
+		initial_handle->readonly = 0;
+	_starpu_spin_unlock(&initial_handle->header_lock);
 
-		handle_child = &handle->children[child];
-		_starpu_data_handle_init(handle_child, ops, handle->mf_node);
+	unsigned i;
+	struct starpu_data_descr descr[nparts];
+	for (i = 0; i < nparts; i++)
+	{
+		STARPU_ASSERT_MSG(children[i]->father_handle == initial_handle, "children parameter of starpu_data_unpartition_submit must be the children of the parent parameter");
+		descr[i].handle = children[i];
+		descr[i].mode = STARPU_RW;
 	}
+	/* TODO: assert nparts too */
+	starpu_task_insert(&cl_switch, STARPU_W, initial_handle, STARPU_DATA_MODE_ARRAY, descr, nparts, 0);
+	for (i = 0; i < nparts; i++)
+		starpu_data_invalidate_submit(children[i]);
+}
+
+void starpu_data_unpartition_readonly_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children, int gather_node)
+{
+	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
+	STARPU_ASSERT_MSG(gather_node == STARPU_MAIN_RAM || gather_node == -1, "gathering node different from main RAM is currently not supported");
+	_starpu_spin_lock(&initial_handle->header_lock);
+	STARPU_ASSERT_MSG(initial_handle->partitioned >= 1, "No partition planning is active for this handle");
+	initial_handle->readonly = 1;
+	_starpu_spin_unlock(&initial_handle->header_lock);
 
-	/* this handle now has children */
-	handle->nchildren = nchildren;
+	unsigned i;
+	struct starpu_data_descr descr[nparts];
+	for (i = 0; i < nparts; i++)
+	{
+		STARPU_ASSERT_MSG(children[i]->father_handle == initial_handle, "children parameter of starpu_data_unpartition_readonly_submit must be the children of the parent parameter");
+		descr[i].handle = children[i];
+		descr[i].mode = STARPU_R;
+	}
+	/* TODO: assert nparts too */
+	starpu_task_insert(&cl_switch, STARPU_W, initial_handle, STARPU_DATA_MODE_ARRAY, descr, nparts, 0);
 }
 
 /*

+ 6 - 0
src/datawizard/interfaces/data_interface.c

@@ -252,6 +252,9 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 
 	/* there is no hierarchy yet */
 	handle->nchildren = 0;
+	handle->nplans = 0;
+	handle->partitioned = 0;
+	handle->readonly = 0;
 	handle->root_handle = handle;
 	handle->father_handle = NULL;
 	handle->sibling_index = 0; /* could be anything for the root */
@@ -641,6 +644,9 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 {
 	STARPU_ASSERT(handle);
 	STARPU_ASSERT_MSG(handle->nchildren == 0, "data %p needs to be unpartitioned before unregistration", handle);
+	STARPU_ASSERT_MSG(handle->nplans == 0, "data %p needs its partition plans to be cleaned before unregistration", handle);
+	STARPU_ASSERT_MSG(handle->partitioned == 0, "data %p needs its submitted partition plans to be unpartitioned before unregistration", handle);
+	/* TODO: also check that it has the latest coherency */
 	STARPU_ASSERT(!(nowait && handle->busy_count != 0));
 
 	int sequential_consistency = handle->sequential_consistency;