Add asynchronous partition planning. It only supports coherency through the main RAM for now.

Samuel Thibault 9 years ago
parent
commit
68605a2f15

+ 2 - 0
ChangeLog

@@ -107,6 +107,8 @@ New features:
   * Add Fortran 90 module and example using it
   * New StarPU-MPI gdb debug functions
   * Generate animated html trace of modular schedulers.
+  * Add asynchronous partition planning. It only supports coherency through
+    the main RAM for now.
 
 Small features:
   * Tasks can now have a name (via the field const char *name of

+ 56 - 3
doc/doxygen/chapters/07data_management.doxy

@@ -191,10 +191,63 @@ StarPU provides various interfaces and filters for matrices, vectors, etc.,
 but applications can also write their own data interfaces and filters, see
 <c>examples/interface</c> and <c>examples/filters/custom_mf</c> for an example.
 
-\section MultipleView Multiple views
+\section AsynchronousPartitioning Asynchronous Partitioning
 
-Partitioning is synchronous, which can be a problem for dynamic applications.
-Another way is to register several views on the same piece of data.
+The partitioning functions described in the previous section are synchronous:
+starpu_data_partition and starpu_data_unpartition both wait for all the tasks
+currently working on the data to complete. This can be a bottleneck for the
+application.
+
+An asynchronous API also exists; it only works on handles with sequential
+consistency enabled. The principle is to first plan the partitioning, which
+returns data handles for the pieces; these handles are not usable yet. Along
+with other task submissions, one can then submit the actual partitioning and
+use the handles of the pieces. Before using the handle of the whole data again,
+one has to submit the unpartitioning. <c>fmultiple_submit</c> is a complete
+example using this technique.
+
+In short, we first register a matrix and plan the partitioning:
+
+\code{.c}
+starpu_data_handle_t vert_handle[PARTS];
+starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0]));
+struct starpu_data_filter f_vert =
+{
+	.filter_func = starpu_matrix_filter_block,
+	.nchildren = PARTS
+};
+starpu_data_partition_plan(handle, &f_vert, vert_handle);
+\endcode
+
+starpu_data_partition_plan returns the handles for the partition in vert_handle.
+
+One can submit tasks working on the main handle, but not yet on the vert_handle
+handles. Now we submit the partitioning:
+
+\code{.c}
+starpu_data_partition_submit(handle, PARTS, vert_handle);
+\endcode
+
+And now we can submit tasks working on vert_handle handles (and not on the main
+handle any more). Eventually we want to work on the main handle again, so we
+submit the unpartitioning:
+
+\code{.c}
+starpu_data_unpartition_submit(handle, PARTS, vert_handle, -1);
+\endcode
+
+And now we can submit tasks working on the main handle again.
+
+All this code is asynchronous: it only submits which tasks, partitioning, and
+unpartitioning operations should be performed at runtime.
+
+Planning several partitionings of the same data is also possible; one just has
+to submit the unpartitioning (to get back to the initial handle) before
+submitting another partitioning.
+
+It is also possible to activate several partitionings at the same time, in
+read-only mode.
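+
+For instance, following the <c>fmultiple_submit</c> example, one can alternate
+between two partitioning plans of the same matrix (this sketch assumes a second
+plan <c>horiz_handle</c> was also created with starpu_data_partition_plan):
+
+\code{.c}
+starpu_data_partition_submit(handle, PARTS, vert_handle);
+/* ... submit tasks working on the vert_handle handles ... */
+starpu_data_unpartition_submit(handle, PARTS, vert_handle, -1);
+
+starpu_data_partition_submit(handle, PARTS, horiz_handle);
+/* ... submit tasks working on the horiz_handle handles ... */
+starpu_data_unpartition_submit(handle, PARTS, horiz_handle, -1);
+\endcode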
+
+\section ManualPartitioning Manual Partitioning
+
+One can also handle partitioning by hand, by registering several views on the
+same piece of data. The idea is then to manage the coherency of the various
+views through the common buffer in the main memory.
 <c>fmultiple_manual</c> is a complete example using this technique.
 
 In short, we first register the same matrix several times:

+ 95 - 3
doc/doxygen/chapters/api/data_partition.doxy

@@ -43,9 +43,7 @@ Here an example of how to use the function.
 \code{.c}
 struct starpu_data_filter f = {
         .filter_func = starpu_matrix_filter_block,
-        .nchildren = nslicesx,
-        .get_nchildren = NULL,
-        .get_child_ops = NULL
+        .nchildren = nslicesx
 };
 starpu_data_partition(A_handle, &f);
 \endcode
@@ -103,6 +101,100 @@ Applies \p nfilters filters to the handle designated by
 \p root_handle recursively. It uses a va_list of pointers to variables of
 the type starpu_data_filter.
 
+@name Asynchronous API
+\ingroup API_Data_Partition
+
+\fn void starpu_data_partition_plan(starpu_data_handle_t initial_handle, struct starpu_data_filter *f, starpu_data_handle_t *children)
+\ingroup API_Data_Partition
+This plans the partitioning of the StarPU data handle \p initial_handle into
+several pieces of subdata according to the filter \p f. The handles are
+returned in the \p children array, which has to be the same size as the number
+of parts described in \p f. These handles are not immediately usable;
+starpu_data_partition_submit has to be called to submit the actual partitioning.
+
+Here is an example of how to use the function:
+
+\code{.c}
+starpu_data_handle_t children[nslicesx];
+struct starpu_data_filter f = {
+        .filter_func = starpu_matrix_filter_block,
+        .nchildren = nslicesx
+};
+starpu_data_partition_plan(A_handle, &f, children);
+\endcode
+
+\fn void starpu_data_partition_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children)
+\ingroup API_Data_Partition
+
+This submits the actual partitioning of \p initial_handle into the \p nparts
+\p children handles. This call is asynchronous: it only submits that the
+partitioning should be done. After this call, the \p children handles can be
+used to submit tasks, while \p initial_handle can not be used to submit tasks
+any more (to guarantee coherency).
+
+For instance,
+
+\code{.c}
+starpu_data_partition_submit(A_handle, nslicesx, children);
+\endcode
+
+\fn void starpu_data_partition_readonly_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children)
+\ingroup API_Data_Partition
+
+This is the same as starpu_data_partition_submit, but does not invalidate \p
+initial_handle. This allows the application to keep using it, provided it is
+careful not to write to \p initial_handle or to the \p children handles, and
+only reads from them, since coherency is otherwise not guaranteed. This thus
+makes it possible to submit tasks which concurrently read from various
+partitionings of the data.
+
+When the application wants to write to \p initial_handle again, it should call
+starpu_data_unpartition_submit, which will properly add dependencies between the
+reads on the \p children and the writes to be submitted.
+
+If instead the application wants to write to \p children handles, it should
+call starpu_data_partition_readwrite_upgrade_submit, which will properly add
+dependencies between the reads on the \p initial_handle and the writes to be
+submitted.
+
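+For instance, reusing the \p children handles planned in the
+starpu_data_partition_plan example above:
+
+\code{.c}
+starpu_data_partition_readonly_submit(A_handle, nslicesx, children);
+\endcode
+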
+\fn void starpu_data_partition_readwrite_upgrade_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children)
+\ingroup API_Data_Partition
+
+This assumes that a partitioning of \p initial_handle has already been submitted
+in read-only mode through starpu_data_partition_readonly_submit, and upgrades
+that partitioning to read-write mode for the \p children, by invalidating \p
+initial_handle and adding the necessary dependencies.
+
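+For instance, upgrading the read-only partitioning submitted above:
+
+\code{.c}
+starpu_data_partition_readwrite_upgrade_submit(A_handle, nslicesx, children);
+\endcode
+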
+\fn void starpu_data_unpartition_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children, int gathering_node)
+\ingroup API_Data_Partition
+
+This assumes that \p initial_handle is partitioned into \p children, and submits
+an unpartitioning of it, i.e. it submits a gathering of the pieces on the
+requested \p gathering_node memory node, and an invalidation of the
+\p children.
+
+\p gathering_node can be set to -1 to let the runtime decide which memory node
+should be used to gather the pieces.
+
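+For instance, letting StarPU pick the gathering node:
+
+\code{.c}
+starpu_data_unpartition_submit(A_handle, nslicesx, children, -1);
+\endcode
+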
+\fn void starpu_data_unpartition_readonly_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children, int gathering_node)
+\ingroup API_Data_Partition
+
+This assumes that \p initial_handle is partitioned into \p children, and submits
+just a read-only unpartitioning of it, i.e. it submits a gathering of the pieces
+on the requested \p gathering_node memory node, without invalidating the
+\p children. This brings \p initial_handle and the \p children handles to the
+same state as obtained with starpu_data_partition_readonly_submit.
+
+\p gathering_node can be set to -1 to let the runtime decide which memory node
+should be used to gather the pieces.
+
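+For instance:
+
+\code{.c}
+starpu_data_unpartition_readonly_submit(A_handle, nslicesx, children, -1);
+\endcode
+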
+\fn void starpu_data_partition_clean(starpu_data_handle_t root_data, unsigned nparts, starpu_data_handle_t *children)
+\ingroup API_Data_Partition
+
+This should be used to clear the partition planning established between \p
+root_data and \p children with starpu_data_partition_plan. This will notably
+submit an unregistration of all the \p children, which can thus not be used
+any more afterwards.
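+
+For instance, once the application is done with the plan:
+
+\code{.c}
+starpu_data_partition_clean(A_handle, nslicesx, children);
+\endcode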
+
 @name Predefined Vector Filter Functions
 \ingroup API_Data_Partition
 

+ 18 - 0
examples/Makefile.am

@@ -183,6 +183,8 @@ STARPU_EXAMPLES =				\
 	filters/fblock				\
 	filters/fmatrix				\
 	filters/fmultiple_manual		\
+	filters/fmultiple_submit		\
+	filters/fmultiple_submit_readonly	\
 	tag_example/tag_example			\
 	tag_example/tag_example2		\
 	tag_example/tag_example3		\
@@ -430,6 +432,22 @@ filters_fmultiple_manual_SOURCES +=		\
 	filters/fmultiple_cuda.cu
 endif
 
+filters_fmultiple_submit_SOURCES =		\
+	filters/fmultiple_submit.c
+
+if STARPU_USE_CUDA
+filters_fmultiple_submit_SOURCES +=		\
+	filters/fmultiple_cuda.cu
+endif
+
+filters_fmultiple_submit_readonly_SOURCES =	\
+	filters/fmultiple_submit_readonly.c
+
+if STARPU_USE_CUDA
+filters_fmultiple_submit_readonly_SOURCES +=	\
+	filters/fmultiple_cuda.cu
+endif
+
 examplebin_PROGRAMS +=				\
 	filters/shadow				\
 	filters/shadow2d			\

+ 28 - 1
examples/filters/fmultiple_cuda.cu

@@ -18,7 +18,7 @@
 
 #include <starpu.h>
 
-static __global__ void _fmultiple_check_cuda(int *val, int nx, int ny, unsigned ld, int start, int factor)
+static __global__ void _fmultiple_check_scale_cuda(int *val, int nx, int ny, unsigned ld, int start, int factor)
 {
         int i, j;
 	for(j=0; j<ny ; j++)
@@ -32,6 +32,33 @@ static __global__ void _fmultiple_check_cuda(int *val, int nx, int ny, unsigned
         }
 }
 
+extern "C" void fmultiple_check_scale_cuda(void *buffers[], void *cl_arg)
+{
+	int start, factor;
+	int nx = (int)STARPU_MATRIX_GET_NX(buffers[0]);
+	int ny = (int)STARPU_MATRIX_GET_NY(buffers[0]);
+        unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+	int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	starpu_codelet_unpack_args(cl_arg, &start, &factor);
+
+        /* TODO: use more vals and threads in vals */
+	_fmultiple_check_scale_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
+}
+
+static __global__ void _fmultiple_check_cuda(int *val, int nx, int ny, unsigned ld, int start, int factor)
+{
+        int i, j;
+	for(j=0; j<ny ; j++)
+	{
+		for(i=0; i<nx ; i++)
+		{
+			if (val[(j*ld)+i] != start + factor*(i+100*j))
+				asm("trap;");
+		}
+        }
+}
+
 extern "C" void fmultiple_check_cuda(void *buffers[], void *cl_arg)
 {
 	int start, factor;

+ 12 - 11
examples/filters/fmultiple_manual.c

@@ -16,9 +16,10 @@
 
 /*
  * This examplifies how to access the same matrix with different partitioned
- * views.
+ * views, doing the coherency by hand.
  * We first run a kernel on the whole matrix to fill it, then run a kernel on
- * each vertical slice, then run a kernel on each horizontal slice.
+ * each vertical slice to check the value and multiply it by two, then run a
+ * kernel on each horizontal slice to do the same.
  */
 
 #include <starpu.h>
@@ -55,7 +56,7 @@ struct starpu_codelet cl_fill =
 	.name = "matrix_fill"
 };
 
-void fmultiple_check(void *buffers[], void *cl_arg)
+void fmultiple_check_scale(void *buffers[], void *cl_arg)
 {
 	int start, factor;
 	unsigned i, j;
@@ -79,21 +80,21 @@ void fmultiple_check(void *buffers[], void *cl_arg)
 }
 
 #ifdef STARPU_USE_CUDA
-extern void fmultiple_check_cuda(void *buffers[], void *cl_arg);
+extern void fmultiple_check_scale_cuda(void *buffers[], void *cl_arg);
 #endif
-struct starpu_codelet cl_check =
+struct starpu_codelet cl_check_scale =
 {
 #ifdef STARPU_USE_CUDA
-	.cuda_funcs = {fmultiple_check_cuda},
+	.cuda_funcs = {fmultiple_check_scale_cuda},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #else
 	/* Only enable it on CPUs if we don't have a CUDA device, to force remote execution on the CUDA device */
-	.cpu_funcs = {fmultiple_check},
-	.cpu_funcs_name = {"fmultiple_check"},
+	.cpu_funcs = {fmultiple_check_scale},
+	.cpu_funcs_name = {"fmultiple_check_scale"},
 #endif
 	.nbuffers = 1,
 	.modes = {STARPU_RW},
-	.name = "fmultiple_check"
+	.name = "fmultiple_check_scale"
 };
 
 void empty(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *cl_arg STARPU_ATTRIBUTE_UNUSED)
@@ -169,7 +170,7 @@ int main(int argc, char **argv)
 	{
 		int factor = 1;
 		int start = i*(NX/PARTS);
-		ret = starpu_task_insert(&cl_check,
+		ret = starpu_task_insert(&cl_check_scale,
 				STARPU_RW, vert_handle[i],
 				STARPU_VALUE, &start, sizeof(start),
 				STARPU_VALUE, &factor, sizeof(factor),
@@ -206,7 +207,7 @@ int main(int argc, char **argv)
 	{
 		int factor = 2;
 		int start = factor*100*i*(NY/PARTS);
-		ret = starpu_task_insert(&cl_check,
+		ret = starpu_task_insert(&cl_check_scale,
 				STARPU_RW, horiz_handle[i],
 				STARPU_VALUE, &start, sizeof(start),
 				STARPU_VALUE, &factor, sizeof(factor),

+ 207 - 0
examples/filters/fmultiple_submit.c

@@ -0,0 +1,207 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Université Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This exemplifies how to access the same matrix with different partitioned
+ * views, doing the coherency through partition planning.
+ * We first run a kernel on the whole matrix to fill it, then run a kernel on
+ * each vertical slice to check the value and multiply it by two, then run a
+ * kernel on each horizontal slice to do the same.
+ */
+
+#include <starpu.h>
+
+#define NX    6
+#define NY    6
+#define PARTS 2
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+void matrix_fill(void *buffers[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
+{
+	unsigned i, j;
+
+	/* length of the matrix */
+	unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(buffers[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+	int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	for(j=0; j<ny ; j++)
+	{
+		for(i=0; i<nx ; i++)
+			val[(j*ld)+i] = i+100*j;
+	}
+}
+
+struct starpu_codelet cl_fill =
+{
+	.cpu_funcs = {matrix_fill},
+	.cpu_funcs_name = {"matrix_fill"},
+	.nbuffers = 1,
+	.modes = {STARPU_W},
+	.name = "matrix_fill"
+};
+
+void fmultiple_check_scale(void *buffers[], void *cl_arg)
+{
+	int start, factor;
+	unsigned i, j;
+
+	/* length of the matrix */
+	unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(buffers[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+	int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	starpu_codelet_unpack_args(cl_arg, &start, &factor);
+
+	for(j=0; j<ny ; j++)
+	{
+		for(i=0; i<nx ; i++)
+		{
+			STARPU_ASSERT(val[(j*ld)+i] == start + factor*((int)(i+100*j)));
+			val[(j*ld)+i] *= 2;
+		}
+	}
+}
+
+#ifdef STARPU_USE_CUDA
+extern void fmultiple_check_scale_cuda(void *buffers[], void *cl_arg);
+#endif
+struct starpu_codelet cl_check_scale =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {fmultiple_check_scale_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#else
+	/* Only enable it on CPUs if we don't have a CUDA device, to force remote execution on the CUDA device */
+	.cpu_funcs = {fmultiple_check_scale},
+	.cpu_funcs_name = {"fmultiple_check_scale"},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.name = "fmultiple_check_scale"
+};
+
+int main(int argc, char **argv)
+{
+	unsigned j, n=1;
+	int matrix[NX][NY];
+	int ret, i;
+
+	/* We haven't taken care otherwise */
+	STARPU_ASSERT((NX%PARTS) == 0);
+	STARPU_ASSERT((NY%PARTS) == 0);
+
+	starpu_data_handle_t handle;
+	starpu_data_handle_t vert_handle[PARTS];
+	starpu_data_handle_t horiz_handle[PARTS];
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* Declare the whole matrix to StarPU */
+	starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0][0]));
+
+	/* Partition the matrix in PARTS vertical slices */
+	struct starpu_data_filter f_vert =
+	{
+		.filter_func = starpu_matrix_filter_block,
+		.nchildren = PARTS
+	};
+	starpu_data_partition_plan(handle, &f_vert, vert_handle);
+
+	/* Partition the matrix in PARTS horizontal slices */
+	struct starpu_data_filter f_horiz =
+	{
+		.filter_func = starpu_matrix_filter_vertical_block,
+		.nchildren = PARTS
+	};
+	starpu_data_partition_plan(handle, &f_horiz, horiz_handle);
+
+	/* Fill the matrix */
+	ret = starpu_task_insert(&cl_fill, STARPU_W, handle, 0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* Now switch to vertical view of the matrix */
+	starpu_data_partition_submit(handle, PARTS, vert_handle);
+
+	/* Check the values of the vertical slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		int factor = 1;
+		int start = i*(NX/PARTS);
+		ret = starpu_task_insert(&cl_check_scale,
+				STARPU_RW, vert_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	/* Now switch back to total view of the matrix */
+	starpu_data_unpartition_submit(handle, PARTS, vert_handle, -1);
+
+	/* And switch to horizontal view of the matrix */
+	starpu_data_partition_submit(handle, PARTS, horiz_handle);
+
+	/* Check the values of the horizontal slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		int factor = 2;
+		int start = factor*100*i*(NY/PARTS);
+		ret = starpu_task_insert(&cl_check_scale,
+				STARPU_RW, horiz_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	/* Now switch back to total view of the matrix */
+	starpu_data_unpartition_submit(handle, PARTS, horiz_handle, -1);
+
+	/* And check the values of the whole matrix */
+	int factor = 4;
+	int start = 0;
+	ret = starpu_task_insert(&cl_check_scale,
+			STARPU_RW, handle,
+			STARPU_VALUE, &start, sizeof(start),
+			STARPU_VALUE, &factor, sizeof(factor),
+			0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/*
+	 * Unregister data from StarPU and shutdown.
+	 */
+	starpu_data_partition_clean(handle, PARTS, vert_handle);
+	starpu_data_partition_clean(handle, PARTS, horiz_handle);
+	starpu_data_unregister(handle);
+	starpu_shutdown();
+
+	return ret;
+
+enodev:
+	starpu_shutdown();
+	return 77;
+}

+ 367 - 0
examples/filters/fmultiple_submit_readonly.c

@@ -0,0 +1,367 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Université Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This exemplifies how to access the same matrix with different partitioned
+ * views, doing the coherency through partition planning.
+ * We first run a kernel on the whole matrix to fill it, then run a kernel on
+ * each vertical slice to check the value and multiply it by two, then run a
+ * kernel on each horizontal slice to do the same.
+ */
+
+#include <starpu.h>
+
+#define NX    6
+#define NY    6
+#define PARTS 2
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+void matrix_fill(void *buffers[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
+{
+	unsigned i, j;
+
+	/* length of the matrix */
+	unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(buffers[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+	int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	for(j=0; j<ny ; j++)
+	{
+		for(i=0; i<nx ; i++)
+			val[(j*ld)+i] = i+100*j;
+	}
+}
+
+struct starpu_codelet cl_fill =
+{
+	.cpu_funcs = {matrix_fill},
+	.cpu_funcs_name = {"matrix_fill"},
+	.nbuffers = 1,
+	.modes = {STARPU_W},
+	.name = "matrix_fill"
+};
+
+void fmultiple_check_scale(void *buffers[], void *cl_arg)
+{
+	int start, factor;
+	unsigned i, j;
+
+	/* length of the matrix */
+	unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(buffers[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+	int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	starpu_codelet_unpack_args(cl_arg, &start, &factor);
+
+	for(j=0; j<ny ; j++)
+	{
+		for(i=0; i<nx ; i++)
+		{
+			STARPU_ASSERT(val[(j*ld)+i] == start + factor*((int)(i+100*j)));
+			val[(j*ld)+i] *= 2;
+		}
+	}
+}
+
+#ifdef STARPU_USE_CUDA
+extern void fmultiple_check_scale_cuda(void *buffers[], void *cl_arg);
+#endif
+struct starpu_codelet cl_check_scale =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {fmultiple_check_scale_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#else
+	/* Only enable it on CPUs if we don't have a CUDA device, to force remote execution on the CUDA device */
+	.cpu_funcs = {fmultiple_check_scale},
+	.cpu_funcs_name = {"fmultiple_check_scale"},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.name = "fmultiple_check_scale"
+};
+
+void fmultiple_check(void *buffers[], void *cl_arg)
+{
+	int start, factor;
+	unsigned i, j;
+
+	/* length of the matrix */
+	unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(buffers[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+	int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	starpu_codelet_unpack_args(cl_arg, &start, &factor);
+
+	for(j=0; j<ny ; j++)
+	{
+		for(i=0; i<nx ; i++)
+		{
+			STARPU_ASSERT(val[(j*ld)+i] == start + factor*((int)(i+100*j)));
+		}
+	}
+}
+
+#ifdef STARPU_USE_CUDA
+extern void fmultiple_check_cuda(void *buffers[], void *cl_arg);
+#endif
+struct starpu_codelet cl_check =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {fmultiple_check_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#else
+	/* Only enable it on CPUs if we don't have a CUDA device, to force remote execution on the CUDA device */
+	.cpu_funcs = {fmultiple_check},
+	.cpu_funcs_name = {"fmultiple_check"},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_R},
+	.name = "fmultiple_check"
+};
+
+int main(int argc, char **argv)
+{
+	int start, factor;
+	unsigned j, n=1;
+	int matrix[NX][NY];
+	int ret, i;
+
+	/* We haven't taken care otherwise */
+	STARPU_ASSERT((NX%PARTS) == 0);
+	STARPU_ASSERT((NY%PARTS) == 0);
+
+	starpu_data_handle_t handle;
+	starpu_data_handle_t vert_handle[PARTS];
+	starpu_data_handle_t horiz_handle[PARTS];
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* Declare the whole matrix to StarPU */
+	starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0][0]));
+
+	/* Partition the matrix in PARTS vertical slices */
+	struct starpu_data_filter f_vert =
+	{
+		.filter_func = starpu_matrix_filter_block,
+		.nchildren = PARTS
+	};
+	starpu_data_partition_plan(handle, &f_vert, vert_handle);
+
+	/* Partition the matrix in PARTS horizontal slices */
+	struct starpu_data_filter f_horiz =
+	{
+		.filter_func = starpu_matrix_filter_vertical_block,
+		.nchildren = PARTS
+	};
+	starpu_data_partition_plan(handle, &f_horiz, horiz_handle);
+
+	/* Fill the matrix */
+	ret = starpu_task_insert(&cl_fill, STARPU_W, handle, 0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	factor = 1;
+
+	/* Now switch to vertical view of the matrix, but readonly */
+	starpu_data_partition_readonly_submit(handle, PARTS, vert_handle);
+	/* as well as horizontal view of the matrix, but readonly */
+	starpu_data_partition_readonly_submit(handle, PARTS, horiz_handle);
+
+	/* Check the values of the vertical slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = i*(NX/PARTS);
+		ret = starpu_task_insert(&cl_check,
+				STARPU_R, vert_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	/* Check the values of the horizontal slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = factor*100*i*(NY/PARTS);
+		ret = starpu_task_insert(&cl_check,
+				STARPU_R, horiz_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	/* And of the main matrix */
+	start = 0;
+	ret = starpu_task_insert(&cl_check,
+			STARPU_R, handle,
+			STARPU_VALUE, &start, sizeof(start),
+			STARPU_VALUE, &factor, sizeof(factor),
+			0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* Now switch back to total view of the matrix */
+	starpu_data_unpartition_submit(handle, PARTS, horiz_handle, -1);
+	starpu_data_unpartition_submit(handle, PARTS, vert_handle, -1);
+
+	/* Check and scale it */
+	start = 0;
+	ret = starpu_task_insert(&cl_check_scale,
+			STARPU_RW, handle,
+			STARPU_VALUE, &start, sizeof(start),
+			STARPU_VALUE, &factor, sizeof(factor),
+			0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	factor = 2;
+
+	/* Now switch to vertical view of the matrix, but readonly */
+	starpu_data_partition_readonly_submit(handle, PARTS, vert_handle);
+	/* as well as horizontal view of the matrix, but readonly */
+	starpu_data_partition_readonly_submit(handle, PARTS, horiz_handle);
+
+	/* Check the values of the vertical slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = factor*i*(NX/PARTS);
+		ret = starpu_task_insert(&cl_check,
+				STARPU_R, vert_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	/* Check the values of the horizontal slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = factor*100*i*(NY/PARTS);
+		ret = starpu_task_insert(&cl_check,
+				STARPU_R, horiz_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	/* And of the main matrix */
+	start = 0;
+	ret = starpu_task_insert(&cl_check,
+			STARPU_R, handle,
+			STARPU_VALUE, &start, sizeof(start),
+			STARPU_VALUE, &factor, sizeof(factor),
+			0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* Disable the read-only vertical view of the matrix */
+	starpu_data_unpartition_submit(handle, PARTS, vert_handle, -1);
+	/* And switch to read-write horizontal view of the matrix */
+	starpu_data_partition_readwrite_upgrade_submit(handle, PARTS, horiz_handle);
+
+	/* Check and scale the values of the horizontal slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = factor*100*i*(NY/PARTS);
+		ret = starpu_task_insert(&cl_check_scale,
+				STARPU_RW, horiz_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	factor = 4;
+
+	/* Now switch back to total view of the matrix, but readonly */
+	starpu_data_unpartition_readonly_submit(handle, PARTS, horiz_handle, -1);
+	/* And also enable a vertical view of the matrix, but readonly */
+	starpu_data_partition_readonly_submit(handle, PARTS, vert_handle);
+
+	/* Check the values of the vertical slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = factor*i*(NX/PARTS);
+		ret = starpu_task_insert(&cl_check,
+				STARPU_R, vert_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	/* Check the values of the horizontal slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = factor*100*i*(NY/PARTS);
+		ret = starpu_task_insert(&cl_check,
+				STARPU_R, horiz_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	/* And of the main matrix */
+	start = 0;
+	ret = starpu_task_insert(&cl_check,
+			STARPU_R, handle,
+			STARPU_VALUE, &start, sizeof(start),
+			STARPU_VALUE, &factor, sizeof(factor),
+			0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* Disable the read-only vertical view of the matrix */
+	starpu_data_unpartition_submit(handle, PARTS, vert_handle, -1);
+	/* And the read-only horizontal view of the matrix */
+	starpu_data_unpartition_submit(handle, PARTS, horiz_handle, -1);
+	/* Thus getting back to a read-write total view of the matrix */
+
+	/* And check and scale the values of the whole matrix */
+	start = 0;
+	ret = starpu_task_insert(&cl_check_scale,
+			STARPU_RW, handle,
+			STARPU_VALUE, &start, sizeof(start),
+			STARPU_VALUE, &factor, sizeof(factor),
+			0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	factor = 8;
+
+	/*
+	 * Unregister data from StarPU and shutdown.
+	 */
+	starpu_data_partition_clean(handle, PARTS, vert_handle);
+	starpu_data_partition_clean(handle, PARTS, horiz_handle);
+	starpu_data_unregister(handle);
+	starpu_shutdown();
+
+	return ret;
+
+enodev:
+	starpu_shutdown();
+	return 77;
+}

+ 9 - 1
include/starpu_data_filters.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux
+ * Copyright (C) 2010-2012, 2015  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  CNRS
  *
@@ -42,6 +42,14 @@ struct starpu_data_filter
 void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_data_filter *f);
 void starpu_data_unpartition(starpu_data_handle_t root_data, unsigned gathering_node);
 
+void starpu_data_partition_plan(starpu_data_handle_t initial_handle, struct starpu_data_filter *f, starpu_data_handle_t *children);
+void starpu_data_partition_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children);
+void starpu_data_partition_readonly_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children);
+void starpu_data_partition_readwrite_upgrade_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children);
+void starpu_data_unpartition_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children, int gathering_node);
+void starpu_data_unpartition_readonly_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children, int gathering_node);
+void starpu_data_partition_clean(starpu_data_handle_t root_data, unsigned nparts, starpu_data_handle_t *children);
+
 int starpu_data_get_nb_children(starpu_data_handle_t handle);
 starpu_data_handle_t starpu_data_get_child(starpu_data_handle_t handle, unsigned i);
 

+ 11 - 0
src/datawizard/coherency.h

@@ -145,6 +145,17 @@ struct _starpu_data_state
 
 	starpu_data_handle_t children;
 	unsigned nchildren;
+	/* How many partition plans this handle has */
+	unsigned nplans;
+	/* Whether a partition plan is currently submitted and the
+	 * corresponding unpartition has not been submitted yet,
+	 * or the number of partition plans currently submitted in read-only
+	 * mode.
+	 */
+	unsigned partitioned;
+	/* Whether a partition plan is currently submitted in readonly mode */
+	unsigned readonly;
 
 	/* describe the state of the data in term of coherency */
 	struct _starpu_data_replicate per_node[STARPU_MAXNODES];

+ 204 - 34
src/datawizard/filters.c

@@ -22,8 +22,6 @@
 #include <datawizard/interfaces/data_interface.h>
 #include <core/task.h>
 
-static void starpu_data_create_children(starpu_data_handle_t handle, unsigned nchildren, struct starpu_data_filter *f);
-
 /*
  * This function applies a data filter on all the elements of a partition
  */
@@ -114,27 +112,37 @@ starpu_data_handle_t starpu_data_vget_sub_data(starpu_data_handle_t root_handle,
 	return current_handle;
 }
 
-void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_data_filter *f)
+static unsigned _starpu_data_partition_nparts(starpu_data_handle_t initial_handle, struct starpu_data_filter *f)
+{
+	/* how many parts ? */
+	if (f->get_nchildren)
+	  return f->get_nchildren(f, initial_handle);
+	else
+	  return f->nchildren;
+
+}
+
+static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_data_handle_t *childrenp, unsigned nparts, struct starpu_data_filter *f, int inherit_state)
 {
-	unsigned nparts;
 	unsigned i;
 	unsigned node;
 
 	/* first take care to properly lock the data header */
 	_starpu_spin_lock(&initial_handle->header_lock);
 
-	STARPU_ASSERT_MSG(initial_handle->nchildren == 0, "there should not be mutiple filters applied on the same data %p, futher filtering has to be done on children", initial_handle);
-
-	/* how many parts ? */
-	if (f->get_nchildren)
-	  nparts = f->get_nchildren(f, initial_handle);
-	else
-	  nparts = f->nchildren;
+	initial_handle->nplans++;
 
 	STARPU_ASSERT_MSG(nparts > 0, "Partitioning data %p in 0 piece does not make sense", initial_handle);
 
 	/* allocate the children */
-	starpu_data_create_children(initial_handle, nparts, f);
+	if (inherit_state)
+	{
+		initial_handle->children = (struct _starpu_data_state *) calloc(nparts, sizeof(struct _starpu_data_state));
+		STARPU_ASSERT(initial_handle->children);
+
+		/* this handle now has children */
+		initial_handle->nchildren = nparts;
+	}
 
 	unsigned nworkers = starpu_worker_get_count();
 
@@ -147,6 +155,7 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 	{
 		/* This is lazy allocation, allocate it now in main RAM, so as
 		 * to have somewhere to gather pieces later */
+		/* FIXME: mark as unevictable! */
 		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[0], 0);
 #ifdef STARPU_DEVEL
 #warning we should reclaim memory if allocation failed
@@ -156,12 +165,29 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 
 	for (i = 0; i < nparts; i++)
 	{
-		starpu_data_handle_t child =
-			starpu_data_get_child(initial_handle, i);
+		starpu_data_handle_t child;
 
+		if (inherit_state)
+			child = &initial_handle->children[i];
+		else
+			child = childrenp[i];
 		STARPU_ASSERT(child);
 
+		struct starpu_data_interface_ops *ops;
+
+		/* each child may have his own interface type */
+		/* what's this child's interface ? */
+		if (f->get_child_ops)
+			ops = f->get_child_ops(f, i);
+		else
+			ops = initial_handle->ops;
+
+		_starpu_data_handle_init(child, ops, initial_handle->mf_node);
+
 		child->nchildren = 0;
+		child->nplans = 0;
+		child->partitioned = 0;
+		child->readonly = 0;
                 child->mpi_data = initial_handle->mpi_data;
 		child->root_handle = initial_handle->root_handle;
 		child->father_handle = initial_handle;
@@ -224,19 +250,29 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 			initial_replicate = &initial_handle->per_node[node];
 			child_replicate = &child->per_node[node];
 
-			child_replicate->state = initial_replicate->state;
-			child_replicate->allocated = initial_replicate->allocated;
+			if (inherit_state)
+				child_replicate->state = initial_replicate->state;
+			else
+				child_replicate->state = STARPU_INVALID;
+			if (inherit_state || !initial_replicate->automatically_allocated)
+				child_replicate->allocated = initial_replicate->allocated;
+			else
+				child_replicate->allocated = 0;
 			/* Do not allow memory reclaiming within the child for parent bits */
 			child_replicate->automatically_allocated = 0;
 			child_replicate->refcnt = 0;
 			child_replicate->memory_node = node;
 			child_replicate->relaxed_coherency = 0;
-			child_replicate->initialized = initial_replicate->initialized;
+			if (inherit_state)
+				child_replicate->initialized = initial_replicate->initialized;
+			else
+				child_replicate->initialized = 0;
 
 			/* update the interface */
 			void *initial_interface = starpu_data_get_interface_on_node(initial_handle, node);
 			void *child_interface = starpu_data_get_interface_on_node(child, node);
 
+			STARPU_ASSERT_MSG(!(!inherit_state && child_replicate->automatically_allocated && child_replicate->allocated), "partition planning is currently not supported when handle has some automatically allocated buffers");
 			f->filter_func(initial_interface, child_interface, f, i, nparts);
 		}
 
@@ -459,6 +495,7 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 	starpu_data_handle_t children = root_handle->children;
 	root_handle->children = NULL;
 	root_handle->nchildren = 0;
+	root_handle->nplans--;
 
 	/* now the parent may be used again so we release the lock */
 	_starpu_spin_unlock(&root_handle->header_lock);
@@ -468,31 +505,164 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 	_STARPU_TRACE_END_UNPARTITION(root_handle, gathering_node);
 }
 
-/* each child may have his own interface type */
-static void starpu_data_create_children(starpu_data_handle_t handle, unsigned nchildren, struct starpu_data_filter *f)
+void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_data_filter *f)
 {
-	handle->children = (struct _starpu_data_state *) calloc(nchildren, sizeof(struct _starpu_data_state));
-	STARPU_ASSERT(handle->children);
+	unsigned nparts = _starpu_data_partition_nparts(initial_handle, f);
+	STARPU_ASSERT_MSG(initial_handle->nchildren == 0, "there should not be multiple filters applied on the same data %p, further filtering has to be done on children", initial_handle);
+	STARPU_ASSERT_MSG(initial_handle->nplans == 0, "partition planning and synchronous partitioning is not supported");
 
-	unsigned child;
+	initial_handle->children = NULL;
+	_starpu_data_partition(initial_handle, NULL, nparts, f, 1);
+}
+
+void starpu_data_partition_plan(starpu_data_handle_t initial_handle, struct starpu_data_filter *f, starpu_data_handle_t *childrenp)
+{
+	unsigned i;
+	unsigned nparts = _starpu_data_partition_nparts(initial_handle, f);
+	STARPU_ASSERT_MSG(initial_handle->nchildren == 0, "partition planning and synchronous partitioning is not supported");
+	STARPU_ASSERT_MSG(initial_handle->home_node == STARPU_MAIN_RAM, "partition planning is currently only supported from main RAM");
+	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
+
+	for (i = 0; i < nparts; i++)
+		childrenp[i] = calloc(1, sizeof(struct _starpu_data_state));
+	_starpu_data_partition(initial_handle, childrenp, nparts, f, 0);
+}
+
+void starpu_data_partition_clean(starpu_data_handle_t root_handle, unsigned nparts, starpu_data_handle_t *children)
+{
+	unsigned i;
+
+	for (i = 0; i < nparts; i++)
+		starpu_data_unregister_submit(children[i]);
+
+	_starpu_spin_lock(&root_handle->header_lock);
+	root_handle->nplans--;
+	_starpu_spin_unlock(&root_handle->header_lock);
+}
+
+static void empty(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *cl_arg STARPU_ATTRIBUTE_UNUSED)
+{
+	/* This doesn't need to do anything, it's simply used to make coherency
+	 * between the two views, by simply running on the home node of the
+	 * data, thus getting back all data pieces there.  */
+}
+
+static struct starpu_codelet cl_switch =
+{
+	.cpu_funcs = {empty},
+	.nbuffers = STARPU_VARIABLE_NBUFFERS,
+	.name = "data_partition_switch"
+};
 
-	for (child = 0; child < nchildren; child++)
+void starpu_data_partition_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children)
+{
+	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
+	_starpu_spin_lock(&initial_handle->header_lock);
+	STARPU_ASSERT_MSG(initial_handle->partitioned == 0, "One can't submit several partition plannings at the same time");
+	STARPU_ASSERT_MSG(initial_handle->readonly == 0, "One can't submit a partition planning while a readonly partitioning is active");
+	initial_handle->partitioned++;
+	_starpu_spin_unlock(&initial_handle->header_lock);
+
+	unsigned i;
+	struct starpu_data_descr descr[nparts];
+	for (i = 0; i < nparts; i++)
 	{
-		starpu_data_handle_t handle_child;
-		struct starpu_data_interface_ops *ops;
+		STARPU_ASSERT_MSG(children[i]->father_handle == initial_handle, "children parameter of starpu_data_partition_submit must be the children of the parent parameter");
+		descr[i].handle = children[i];
+		descr[i].mode = STARPU_W;
+	}
+	/* TODO: assert nparts too */
+	starpu_task_insert(&cl_switch, STARPU_RW, initial_handle, STARPU_DATA_MODE_ARRAY, descr, nparts, 0);
+	starpu_data_invalidate_submit(initial_handle);
+}
 
-		/* what's this child's interface ? */
-		if (f->get_child_ops)
-		  ops = f->get_child_ops(f, child);
-		else
-		  ops = handle->ops;
+void starpu_data_partition_readonly_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children)
+{
+	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
+	_starpu_spin_lock(&initial_handle->header_lock);
+	STARPU_ASSERT_MSG(initial_handle->partitioned == 0 || initial_handle->readonly, "One can't submit a readonly partition planning at the same time as a readwrite partition planning");
+	initial_handle->partitioned++;
+	initial_handle->readonly = 1;
+	_starpu_spin_unlock(&initial_handle->header_lock);
+
+	unsigned i;
+	struct starpu_data_descr descr[nparts];
+	for (i = 0; i < nparts; i++)
+	{
+		STARPU_ASSERT_MSG(children[i]->father_handle == initial_handle, "children parameter of starpu_data_partition_submit must be the children of the parent parameter");
+		descr[i].handle = children[i];
+		descr[i].mode = STARPU_W;
+	}
+	/* TODO: assert nparts too */
+	starpu_task_insert(&cl_switch, STARPU_R, initial_handle, STARPU_DATA_MODE_ARRAY, descr, nparts, 0);
+}
+
+void starpu_data_partition_readwrite_upgrade_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children)
+{
+	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
+	_starpu_spin_lock(&initial_handle->header_lock);
+	STARPU_ASSERT_MSG(initial_handle->partitioned == 1, "One can't upgrade a readonly partition planning to readwrite while other readonly partition plannings are active");
+	STARPU_ASSERT_MSG(initial_handle->readonly == 1, "One can only upgrade a readonly partition planning");
+	initial_handle->readonly = 0;
+	_starpu_spin_unlock(&initial_handle->header_lock);
+
+	unsigned i;
+	struct starpu_data_descr descr[nparts];
+	for (i = 0; i < nparts; i++)
+	{
+		STARPU_ASSERT_MSG(children[i]->father_handle == initial_handle, "children parameter of starpu_data_partition_submit must be the children of the parent parameter");
+		descr[i].handle = children[i];
+		descr[i].mode = STARPU_W;
+	}
+	/* TODO: assert nparts too */
+	starpu_task_insert(&cl_switch, STARPU_RW, initial_handle, STARPU_DATA_MODE_ARRAY, descr, nparts, 0);
+	starpu_data_invalidate_submit(initial_handle);
+}
+
+void starpu_data_unpartition_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children, int gather_node)
+{
+	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
+	STARPU_ASSERT_MSG(gather_node == STARPU_MAIN_RAM || gather_node == -1, "gathering node different from main RAM is currently not supported");
+	_starpu_spin_lock(&initial_handle->header_lock);
+	STARPU_ASSERT_MSG(initial_handle->partitioned >= 1, "No partition planning is active for this handle");
+	initial_handle->partitioned--;
+	if (!initial_handle->partitioned)
+		initial_handle->readonly = 0;
+	_starpu_spin_unlock(&initial_handle->header_lock);
 
-		handle_child = &handle->children[child];
-		_starpu_data_handle_init(handle_child, ops, handle->mf_node);
+	unsigned i;
+	struct starpu_data_descr descr[nparts];
+	for (i = 0; i < nparts; i++)
+	{
+		STARPU_ASSERT_MSG(children[i]->father_handle == initial_handle, "children parameter of starpu_data_partition_submit must be the children of the parent parameter");
+		descr[i].handle = children[i];
+		descr[i].mode = STARPU_RW;
 	}
+	/* TODO: assert nparts too */
+	starpu_task_insert(&cl_switch, STARPU_W, initial_handle, STARPU_DATA_MODE_ARRAY, descr, nparts, 0);
+	for (i = 0; i < nparts; i++)
+		starpu_data_invalidate_submit(children[i]);
+}
+
+void starpu_data_unpartition_readonly_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children, int gather_node)
+{
+	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
+	STARPU_ASSERT_MSG(gather_node == STARPU_MAIN_RAM || gather_node == -1, "gathering node different from main RAM is currently not supported");
+	_starpu_spin_lock(&initial_handle->header_lock);
+	STARPU_ASSERT_MSG(initial_handle->partitioned >= 1, "No partition planning is active for this handle");
+	initial_handle->readonly = 1;
+	_starpu_spin_unlock(&initial_handle->header_lock);
 
-	/* this handle now has children */
-	handle->nchildren = nchildren;
+	unsigned i;
+	struct starpu_data_descr descr[nparts];
+	for (i = 0; i < nparts; i++)
+	{
+		STARPU_ASSERT_MSG(children[i]->father_handle == initial_handle, "children parameter of starpu_data_partition_submit must be the children of the parent parameter");
+		descr[i].handle = children[i];
+		descr[i].mode = STARPU_R;
+	}
+	/* TODO: assert nparts too */
+	starpu_task_insert(&cl_switch, STARPU_W, initial_handle, STARPU_DATA_MODE_ARRAY, descr, nparts, 0);
 }
 
 /*

+ 6 - 0
src/datawizard/interfaces/data_interface.c

@@ -252,6 +252,9 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 
 	/* there is no hierarchy yet */
 	handle->nchildren = 0;
+	handle->nplans = 0;
+	handle->partitioned = 0;
+	handle->readonly = 0;
 	handle->root_handle = handle;
 	handle->father_handle = NULL;
 	handle->sibling_index = 0; /* could be anything for the root */
@@ -641,6 +644,9 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 {
 	STARPU_ASSERT(handle);
 	STARPU_ASSERT_MSG(handle->nchildren == 0, "data %p needs to be unpartitioned before unregistration", handle);
+	STARPU_ASSERT_MSG(handle->nplans == 0, "data %p needs its partition plans to be cleaned before unregistration", handle);
+	STARPU_ASSERT_MSG(handle->partitioned == 0, "data %p needs its partitioned plans to be unpartitioned before unregistration", handle);
+	/* TODO: also check that it has the latest coherency */
 	STARPU_ASSERT(!(nowait && handle->busy_count != 0));
 
 	int sequential_consistency = handle->sequential_consistency;