
Add implicit support for asynchronous partition planning. This means one does not need to call starpu_data_partition_submit etc. explicitly any more; StarPU will make the appropriate calls as needed.

Samuel Thibault 7 years ago
parent
commit
abfe581427
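
As a rough sketch of what this means for application code (the codelet, data and sizes below are made up for illustration; only the StarPU calls themselves are actual API): after starpu_data_partition_plan(), tasks can be submitted directly on the sub-data handles and on the whole handle, and StarPU submits the partitioning/unpartitioning itself.

\code{.c}
#include <starpu.h>

#define NX    1024
#define PARTS 2

/* Hypothetical kernel, just so that there is something to submit */
void scal_cpu(void *buffers[], void *cl_arg)
{
	(void)cl_arg;
	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
	float *v = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
	unsigned i;
	for (i = 0; i < n; i++)
		v[i] *= 2;
}

struct starpu_codelet cl =
{
	.cpu_funcs = {scal_cpu},
	.nbuffers = 1,
	.modes = {STARPU_RW},
};

int main(void)
{
	float vector[NX];
	starpu_data_handle_t handle, sub_handle[PARTS];
	struct starpu_data_filter f =
	{
		.filter_func = starpu_vector_filter_block,
		.nchildren = PARTS,
	};
	unsigned i;
	int ret = starpu_init(NULL);
	if (ret == -ENODEV) return 77;

	for (i = 0; i < NX; i++)
		vector[i] = i;
	starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));

	/* Only the planning is needed, no starpu_data_partition_submit() any more */
	starpu_data_partition_plan(handle, &f, sub_handle);

	/* StarPU implicitly submits the partitioning before these tasks */
	starpu_task_insert(&cl, STARPU_RW, sub_handle[0], 0);
	starpu_task_insert(&cl, STARPU_RW, sub_handle[1], 0);
	/* ... and the unpartitioning before this one */
	starpu_task_insert(&cl, STARPU_RW, handle, 0);

	starpu_data_partition_clean(handle, PARTS, sub_handle);
	starpu_data_unregister(handle);
	starpu_shutdown();
	return 0;
}
\endcode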

+ 3 - 0
ChangeLog

@@ -32,6 +32,9 @@ New features:
   * Add a new implementation of StarPU-MPI on top of NewMadeleine
   * Add optional callbacks to notify an external resource manager
     about workers going to sleep and waking up
+  * Add implicit support for asynchronous partition planning. This means one
+    does not need to call starpu_data_partition_submit etc. explicitly any
+    more; StarPU will make the appropriate calls as needed.
 
 Small features:
   * Scheduling contexts may now be associated a user data pointer at creation

+ 21 - 30
doc/doxygen/chapters/310_data_management.doxy

@@ -341,11 +341,15 @@ currently working on the data.  This can be a bottleneck for the application.
 
 An asynchronous API also exists, it works only on handles with sequential
 consistency. The principle is to first plan the partitioning, which returns
-data handles of the partition, which are not functional yet. Along other task
-submission, one can submit the actual partitioning, and then use the handles
-of the partition. Before using the handle of the whole data, one has to submit
-the unpartitioning. <c>fmultiple_submit</c> is a complete example using this
-technique.
+data handles of the partition, which are not functional yet. When submitting
+tasks, one can mix using the handles of the partition and the handle of the
+whole data. One can even partition recursively and mix using handles at
+different levels of the recursion. Of course, StarPU will have to introduce
+coherency synchronization.
+
+<c>fmultiple_submit_implicit</c> is a complete example using this technique.
+One can also look at <c>fmultiple_submit_readonly</c>, which contains the
+explicit coherency synchronization that StarPU automatically introduces for
+<c>fmultiple_submit_implicit</c>.
 
 In short, we first register a matrix and plan the partitioning:
 
@@ -361,33 +365,20 @@ starpu_data_partition_plan(handle, &f_vert, vert_handle);
 
 starpu_data_partition_plan() returns the handles for the partition in <c>vert_handle</c>.
 
-One can submit tasks working on the main handle, but not yet on the <c>vert_handle</c>
-handles. Now we submit the partitioning:
-
-\code{.c}
-starpu_data_partition_submit(handle, PARTS, vert_handle);
-\endcode
-
-And now we can submit tasks working on <c>vert_handle</c> handles (and not on the main
-handle any more). Eventually we want to work on the main handle again, so we
-submit the unpartitioning:
-
-\code{.c}
-starpu_data_unpartition_submit(handle, PARTS, vert_handle, -1);
-\endcode
-
-And now we can submit tasks working on the main handle again.
+One can then submit tasks working on the main handle as well as tasks working
+on the <c>vert_handle</c> handles. When switching between the main handle and
+the <c>vert_handle</c> handles, StarPU will automatically call
+starpu_data_partition_submit() and starpu_data_unpartition_submit() as needed.
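
For instance, the following sequence is now valid as such (a sketch, with some codelet <c>cl</c> able to work both on the whole matrix and on a vertical slice, and assuming <c>PARTS</c> is at least 2):

\code{.c}
/* Task on the whole matrix */
starpu_task_insert(&cl, STARPU_RW, handle, 0);
/* StarPU implicitly submits the partitioning here */
starpu_task_insert(&cl, STARPU_RW, vert_handle[0], 0);
starpu_task_insert(&cl, STARPU_RW, vert_handle[1], 0);
/* StarPU implicitly submits the unpartitioning here */
starpu_task_insert(&cl, STARPU_RW, handle, 0);
\endcode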
 
 All this code is asynchronous, just submitting which tasks, partitioning and
-unpartitioning should be done at runtime.
-
-Planning several partitioning of the same data is also possible, one just has
-to submit unpartitioning (to get back to the initial handle) before submitting
-another partitioning.
-
-It is also possible to activate several partitioning at the same time, in
-read-only mode, by using starpu_data_partition_readonly_submit(). A complete
-example is available in <c>examples/filters/fmultiple_submit_readonly.c</c>.
+unpartitioning will be done at runtime.
+
+Planning several partitionings of the same data is also possible; StarPU will
+unpartition and repartition as needed when mixing accesses through different
+partitionings. As long as the data is accessed in read-only mode, StarPU allows
+the different partitionings to coexist. As soon as the data is accessed in
+read-write mode, StarPU will automatically unpartition everything and activate
+only the partitioning leading to the data being written to.
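
As a sketch of this behaviour, assuming a second plan <c>horiz_handle</c> was created with starpu_data_partition_plan() and hypothetical codelets <c>cl_read</c> and <c>cl_write</c>:

\code{.c}
/* Read-only accesses: both partitionings may be active at the same time */
starpu_task_insert(&cl_read, STARPU_R, vert_handle[0], 0);
starpu_task_insert(&cl_read, STARPU_R, horiz_handle[0], 0);
/* Read-write access: StarPU first unpartitions everything, then activates
 * only the horizontal partitioning */
starpu_task_insert(&cl_write, STARPU_RW, horiz_handle[1], 0);
\endcode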
 
 \section ManualPartitioning Manual Partitioning
 

+ 5 - 0
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -150,6 +150,11 @@ Value to be set in starpu_codelet::flags to execute the codelet functions even i
 Value to be set in starpu_codelet::flags to execute the codelet functions even in simgrid mode,
 and later inject the measured timing inside the simulation.
 
+\def STARPU_CODELET_NOPLANS
+\ingroup API_Codelet_And_Tasks
+Value to be set in starpu_codelet::flags to prevent starpu_task_submit() from
+automatically submitting asynchronous partitioning/unpartitioning for the
+task's data.
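
A sketch of how the flag is set (the CPU function <c>my_func</c> is hypothetical):

\code{.c}
struct starpu_codelet cl_noplans =
{
	.cpu_funcs = {my_func},
	.nbuffers = 1,
	.modes = {STARPU_RW},
	/* Do not let starpu_task_submit() insert automatic asynchronous
	 * partitioning/unpartitioning for this codelet's data */
	.flags = STARPU_CODELET_NOPLANS,
};
\endcode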
+
 \typedef starpu_cpu_func_t
 \ingroup API_Codelet_And_Tasks
 CPU implementation of a codelet.

+ 9 - 0
examples/Makefile.am

@@ -224,6 +224,7 @@ STARPU_EXAMPLES +=				\
 	filters/fmultiple_manual		\
 	filters/fmultiple_submit		\
 	filters/fmultiple_submit_readonly	\
+	filters/fmultiple_submit_implicit	\
 	tag_example/tag_example			\
 	tag_example/tag_example2		\
 	tag_example/tag_example3		\
@@ -569,6 +570,14 @@ filters_fmultiple_submit_readonly_SOURCES +=	\
 	filters/fmultiple_cuda.cu
 endif
 
+filters_fmultiple_submit_implicit_SOURCES =		\
+	filters/fmultiple_submit_implicit.c
+
+if STARPU_USE_CUDA
+filters_fmultiple_submit_implicit_SOURCES +=		\
+	filters/fmultiple_cuda.cu
+endif
+
 examplebin_PROGRAMS +=				\
 	filters/shadow				\
 	filters/shadow2d			\

+ 9 - 5
examples/filters/fmultiple_submit.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2017                                     CNRS
- * Copyright (C) 2015                                     Université de Bordeaux
+ * Copyright (C) 2015,2017                                Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -89,11 +89,9 @@ struct starpu_codelet cl_check_scale =
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {fmultiple_check_scale_cuda},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
-#else
-	/* Only enable it on CPUs if we don't have a CUDA device, to force remote execution on the CUDA device */
+#endif
 	.cpu_funcs = {fmultiple_check_scale},
 	.cpu_funcs_name = {"fmultiple_check_scale"},
-#endif
 	.nbuffers = 1,
 	.modes = {STARPU_RW},
 	.name = "fmultiple_check_scale"
@@ -118,6 +116,12 @@ int main(void)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+	/* Disable codelet on CPUs if we have a CUDA device, to force remote execution on the CUDA device */
+	if (starpu_cuda_worker_get_count()) {
+		cl_check_scale.cpu_funcs[0] = NULL;
+		cl_check_scale.cpu_funcs_name[0] = NULL;
+	}
+
 	/* Declare the whole matrix to StarPU */
 	starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0][0]));
 
@@ -182,7 +186,7 @@ int main(void)
 	/* Now switch back to total view of the matrix */
 	starpu_data_unpartition_submit(handle, PARTS, horiz_handle, -1);
 
-	/* And check the values of the whole matrix */
+	/* And check and scale the values of the whole matrix */
 	int factor = 4;
 	int start = 0;
 	ret = starpu_task_insert(&cl_check_scale,

+ 362 - 0
examples/filters/fmultiple_submit_implicit.c

@@ -0,0 +1,362 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2017                                     CNRS
+ * Copyright (C) 2015,2017                                Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This exemplifies how to access the same matrix with different partitioned
+ * views, doing the coherency through partition planning, but without having
+ * to explicitly submit the partitioning/unpartitioning.
+ *
+ * We first run a kernel on the whole matrix to fill it, then check the value
+ * in parallel from the whole handle, from the horizontal slices, and from the
+ * vertical slices. Then we switch back to the whole matrix to check and scale
+ * it. Then we check the result again from the whole handle, the horizontal
+ * slices, and the vertical slices. Then we switch to read-write on the
+ * horizontal slices to check and scale them. Then we check again from the
+ * whole handle, the horizontal slices, and the vertical slices. Eventually we
+ * switch back to the whole matrix to check and scale it.
+ *
+ * Please keep this in sync with fmultiple_submit_readonly.c
+ */
+
+#include <starpu.h>
+
+#define NX    6
+#define NY    6
+#define PARTS 2
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+void matrix_fill(void *buffers[], void *cl_arg)
+{
+	unsigned i, j;
+	(void)cl_arg;
+
+	/* length of the matrix */
+	unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(buffers[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+	int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	for(j=0; j<ny ; j++)
+	{
+		for(i=0; i<nx ; i++)
+			val[(j*ld)+i] = i+100*j;
+	}
+}
+
+struct starpu_codelet cl_fill =
+{
+	.cpu_funcs = {matrix_fill},
+	.cpu_funcs_name = {"matrix_fill"},
+	.nbuffers = 1,
+	.modes = {STARPU_W},
+	.name = "matrix_fill"
+};
+
+void fmultiple_check_scale(void *buffers[], void *cl_arg)
+{
+	int start, factor;
+	unsigned i, j;
+
+	/* length of the matrix */
+	unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(buffers[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+	int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	starpu_codelet_unpack_args(cl_arg, &start, &factor);
+
+	for(j=0; j<ny ; j++)
+	{
+		for(i=0; i<nx ; i++)
+		{
+			STARPU_ASSERT(val[(j*ld)+i] == start + factor*((int)(i+100*j)));
+			val[(j*ld)+i] *= 2;
+		}
+	}
+}
+
+#ifdef STARPU_USE_CUDA
+extern void fmultiple_check_scale_cuda(void *buffers[], void *cl_arg);
+#endif
+struct starpu_codelet cl_check_scale =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {fmultiple_check_scale_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+	.cpu_funcs = {fmultiple_check_scale},
+	.cpu_funcs_name = {"fmultiple_check_scale"},
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.name = "fmultiple_check_scale"
+};
+
+void fmultiple_check(void *buffers[], void *cl_arg)
+{
+	int start, factor;
+	unsigned i, j;
+
+	/* length of the matrix */
+	unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(buffers[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+	int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	starpu_codelet_unpack_args(cl_arg, &start, &factor);
+
+	for(j=0; j<ny ; j++)
+	{
+		for(i=0; i<nx ; i++)
+		{
+			STARPU_ASSERT(val[(j*ld)+i] == start + factor*((int)(i+100*j)));
+		}
+	}
+}
+
+#ifdef STARPU_USE_CUDA
+extern void fmultiple_check_cuda(void *buffers[], void *cl_arg);
+#endif
+struct starpu_codelet cl_check =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {fmultiple_check_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+	.cpu_funcs = {fmultiple_check},
+	.cpu_funcs_name = {"fmultiple_check"},
+	.nbuffers = 1,
+	.modes = {STARPU_R},
+	.name = "fmultiple_check"
+};
+
+int main(void)
+{
+	int start, factor;
+	unsigned j, n=1;
+	int matrix[NX][NY];
+	int ret, i;
+
+	/* We haven't taken care otherwise */
+	STARPU_ASSERT((NX%PARTS) == 0);
+	STARPU_ASSERT((NY%PARTS) == 0);
+
+	starpu_data_handle_t handle;
+	starpu_data_handle_t vert_handle[PARTS];
+	starpu_data_handle_t horiz_handle[PARTS];
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* Disable codelet on CPUs if we have a CUDA device, to force remote execution on the CUDA device */
+	if (starpu_cuda_worker_get_count()) {
+		cl_check_scale.cpu_funcs[0] = NULL;
+		cl_check_scale.cpu_funcs_name[0] = NULL;
+		cl_check.cpu_funcs[0] = NULL;
+		cl_check.cpu_funcs_name[0] = NULL;
+	}
+
+	/* Declare the whole matrix to StarPU */
+	starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0][0]));
+
+	/* Partition the matrix in PARTS vertical slices */
+	struct starpu_data_filter f_vert =
+	{
+		.filter_func = starpu_matrix_filter_block,
+		.nchildren = PARTS
+	};
+	starpu_data_partition_plan(handle, &f_vert, vert_handle);
+
+	/* Partition the matrix in PARTS horizontal slices */
+	struct starpu_data_filter f_horiz =
+	{
+		.filter_func = starpu_matrix_filter_vertical_block,
+		.nchildren = PARTS
+	};
+	starpu_data_partition_plan(handle, &f_horiz, horiz_handle);
+
+	/* Fill the matrix */
+	ret = starpu_task_insert(&cl_fill, STARPU_W, handle, 0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	factor = 1;
+
+	/* Look at readonly vertical and horizontal view of the matrix */
+
+	/* Check the values of the vertical slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = i*(NX/PARTS);
+		ret = starpu_task_insert(&cl_check,
+				STARPU_R, vert_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	/* Check the values of the horizontal slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = factor*100*i*(NY/PARTS);
+		ret = starpu_task_insert(&cl_check,
+				STARPU_R, horiz_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	/* And of the main matrix */
+	start = 0;
+	ret = starpu_task_insert(&cl_check,
+			STARPU_R, handle,
+			STARPU_VALUE, &start, sizeof(start),
+			STARPU_VALUE, &factor, sizeof(factor),
+			0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* Now look at the total view of the matrix */
+
+	/* Check and scale it */
+	start = 0;
+	ret = starpu_task_insert(&cl_check_scale,
+			STARPU_RW, handle,
+			STARPU_VALUE, &start, sizeof(start),
+			STARPU_VALUE, &factor, sizeof(factor),
+			0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	factor = 2;
+
+	/* Look again readonly vertical and horizontal slices */
+
+	/* Check the values of the vertical slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = factor*i*(NX/PARTS);
+		ret = starpu_task_insert(&cl_check,
+				STARPU_R, vert_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	/* Check the values of the horizontal slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = factor*100*i*(NY/PARTS);
+		ret = starpu_task_insert(&cl_check,
+				STARPU_R, horiz_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	/* And of the main matrix */
+	start = 0;
+	ret = starpu_task_insert(&cl_check,
+			STARPU_R, handle,
+			STARPU_VALUE, &start, sizeof(start),
+			STARPU_VALUE, &factor, sizeof(factor),
+			0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* Now try to touch horizontal slices */
+
+	/* Check and scale the values of the horizontal slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = factor*100*i*(NY/PARTS);
+		ret = starpu_task_insert(&cl_check_scale,
+				STARPU_RW, horiz_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	factor = 4;
+
+	/* And come back to read-only */
+
+	/* Check the values of the vertical slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = factor*i*(NX/PARTS);
+		ret = starpu_task_insert(&cl_check,
+				STARPU_R, vert_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	/* Check the values of the horizontal slices */
+	for (i = 0; i < PARTS; i++)
+	{
+		start = factor*100*i*(NY/PARTS);
+		ret = starpu_task_insert(&cl_check,
+				STARPU_R, horiz_handle[i],
+				STARPU_VALUE, &start, sizeof(start),
+				STARPU_VALUE, &factor, sizeof(factor),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	/* And of the main matrix */
+	start = 0;
+	ret = starpu_task_insert(&cl_check,
+			STARPU_R, handle,
+			STARPU_VALUE, &start, sizeof(start),
+			STARPU_VALUE, &factor, sizeof(factor),
+			0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* And access the whole matrix again */
+
+	/* And check and scale the values of the whole matrix */
+	start = 0;
+	ret = starpu_task_insert(&cl_check_scale,
+			STARPU_RW, handle,
+			STARPU_VALUE, &start, sizeof(start),
+			STARPU_VALUE, &factor, sizeof(factor),
+			0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	factor = 8;
+
+	/*
+	 * Unregister data from StarPU and shutdown.
+	 */
+	starpu_data_partition_clean(handle, PARTS, vert_handle);
+	starpu_data_partition_clean(handle, PARTS, horiz_handle);
+	starpu_data_unregister(handle);
+	starpu_shutdown();
+
+	return ret;
+
+enodev:
+	starpu_shutdown();
+	return 77;
+}

+ 15 - 8
examples/filters/fmultiple_submit_readonly.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2017                                     CNRS
- * Copyright (C) 2015                                     Université de Bordeaux
+ * Copyright (C) 2015,2017                                Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,7 +17,8 @@
 
 /*
  * This examplifies how to access the same matrix with different partitioned
- * views, doing the coherency through partition planning.
+ * views, doing the coherency through partition planning, with explicit
+ * submission of the partitioning/unpartitioning.
  *
  * We first run a kernel on the whole matrix to fill it, then check the value
  * in parallel from the whole handle, from the horizontal slices, and from the
@@ -27,6 +28,8 @@
  * horizontal slices to check and scale them. Then we check again from the
  * whole handle, the horizontal slices, and the vertical slices. Eventually we
  * switch back to the whole matrix to check and scale it.
+ *
+ * Please keep this in sync with fmultiple_submit_implicit.c
  */
 
 #include <starpu.h>
@@ -95,11 +98,9 @@ struct starpu_codelet cl_check_scale =
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {fmultiple_check_scale_cuda},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
-#else
-	/* Only enable it on CPUs if we don't have a CUDA device, to force remote execution on the CUDA device */
+#endif
 	.cpu_funcs = {fmultiple_check_scale},
 	.cpu_funcs_name = {"fmultiple_check_scale"},
-#endif
 	.nbuffers = 1,
 	.modes = {STARPU_RW},
 	.name = "fmultiple_check_scale"
@@ -135,11 +136,9 @@ struct starpu_codelet cl_check =
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {fmultiple_check_cuda},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
-#else
-	/* Only enable it on CPUs if we don't have a CUDA device, to force remote execution on the CUDA device */
+#endif
 	.cpu_funcs = {fmultiple_check},
 	.cpu_funcs_name = {"fmultiple_check"},
-#endif
 	.nbuffers = 1,
 	.modes = {STARPU_R},
 	.name = "fmultiple_check"
@@ -165,6 +164,14 @@ int main(void)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+	/* Disable codelet on CPUs if we have a CUDA device, to force remote execution on the CUDA device */
+	if (starpu_cuda_worker_get_count()) {
+		cl_check_scale.cpu_funcs[0] = NULL;
+		cl_check_scale.cpu_funcs_name[0] = NULL;
+		cl_check.cpu_funcs[0] = NULL;
+		cl_check.cpu_funcs_name[0] = NULL;
+	}
+
 	/* Declare the whole matrix to StarPU */
 	starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0][0]));
 

+ 1 - 0
include/starpu_data_filters.h

@@ -49,6 +49,7 @@ void starpu_data_partition_submit(starpu_data_handle_t initial_handle, unsigned
 void starpu_data_partition_readonly_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children);
 void starpu_data_partition_readwrite_upgrade_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children);
 void starpu_data_unpartition_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children, int gathering_node);
+void starpu_data_unpartition_submit_r(starpu_data_handle_t initial_handle, int gathering_node);
 void starpu_data_unpartition_readonly_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children, int gathering_node);
 void starpu_data_partition_clean(starpu_data_handle_t root_data, unsigned nparts, starpu_data_handle_t *children);
 

+ 1 - 0
include/starpu_task.h

@@ -47,6 +47,7 @@ extern "C"
 
 #define STARPU_CODELET_SIMGRID_EXECUTE	(1<<0)
 #define STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT	(1<<1)
+#define STARPU_CODELET_NOPLANS	(1<<2)
 #define STARPU_CUDA_ASYNC	(1<<0)
 #define STARPU_OPENCL_ASYNC	(1<<0)
 

+ 9 - 0
src/core/task.c

@@ -584,6 +584,7 @@ static int _starpu_task_submit_head(struct starpu_task *task)
 		for (i = 0; i < nbuffers; i++)
 		{
 			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
+			enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, i);
 			/* Make sure handles are valid */
 			STARPU_ASSERT_MSG(handle->magic == _STARPU_TASK_MAGIC, "data %p is invalid (was it already unregistered?)", handle);
 			/* Make sure handles are not partitioned */
@@ -592,6 +593,14 @@ static int _starpu_task_submit_head(struct starpu_task *task)
 			 * for can_execute hooks */
 			if (handle->home_node != -1)
 				_STARPU_TASK_SET_INTERFACE(task, starpu_data_get_interface_on_node(handle, handle->home_node), i);
+			if (!(task->cl->flags & STARPU_CODELET_NOPLANS) &&
+			    ((handle->nplans && !handle->nchildren) || handle->siblings))
+				/* This handle is involved with asynchronous
+				 * partitioning as a parent or a child, make
+				 * sure the right plan is active, submit
+			 * appropriate partitioning / unpartitioning if
+				 * not */
+				_starpu_data_partition_access_submit(handle, (mode & STARPU_W) != 0);
 		}
 
 		/* Check the type of worker(s) required by the task exist */

+ 12 - 1
src/datawizard/coherency.h

@@ -144,9 +144,16 @@ struct _starpu_data_state
 	/* In case we use filters, the handle may describe a sub-data */
 	struct _starpu_data_state *root_handle; /* root of the tree */
 	struct _starpu_data_state *father_handle; /* father of the node, NULL if the current node is the root */
+	starpu_data_handle_t *active_children; /* The currently active set of read-write children */
+	starpu_data_handle_t **active_readonly_children; /* The currently active set of read-only children */
+	unsigned nactive_readonly_children; /* Size of active_readonly_children array */
+	/* Our siblings in the father partitioning */
+	unsigned nsiblings; /* How many siblings */
+	starpu_data_handle_t *siblings;
 	unsigned sibling_index; /* indicate which child this node is from the father's perspective (if any) */
 	unsigned depth; /* what's the depth of the tree ? */
 
+	/* Synchronous partitioning */
 	starpu_data_handle_t children;
 	unsigned nchildren;
 	/* How many partition plans this handle has */
@@ -163,7 +170,11 @@ struct _starpu_data_state
 	 */
 	unsigned partitioned;
 	/* Whether a partition plan is currently submitted in readonly mode */
-	unsigned readonly;
+	unsigned readonly:1;
+
+	/* Whether our father is currently partitioned into ourself */
+	unsigned active:1;
+	unsigned active_ro:1;
 
 	/* describe the state of the data in term of coherency */
 	struct _starpu_data_replicate per_node[STARPU_MAXNODES];

+ 181 - 5
src/datawizard/filters.c

@@ -19,6 +19,8 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
 #include <datawizard/filters.h>
 #include <datawizard/footprint.h>
 #include <datawizard/interfaces/data_interface.h>
@@ -234,9 +236,19 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		child->switch_cl = NULL;
 		child->partitioned = 0;
 		child->readonly = 0;
+		child->active = inherit_state;
+		child->active_ro = 0;
                 child->mpi_data = initial_handle->mpi_data;
 		child->root_handle = initial_handle->root_handle;
 		child->father_handle = initial_handle;
+		child->active_children = NULL;
+		child->active_readonly_children = NULL;
+		child->nactive_readonly_children = 0;
+		child->nsiblings = nparts;
+		if (inherit_state)
+			child->siblings = NULL;
+		else
+			child->siblings = childrenp;
 		child->sibling_index = i;
 		child->depth = initial_handle->depth + 1;
 
@@ -586,6 +598,7 @@ void starpu_data_partition_plan(starpu_data_handle_t initial_handle, struct star
 	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
 	struct starpu_codelet *cl = initial_handle->switch_cl;
 	int home_node = initial_handle->home_node;
+	starpu_data_handle_t *children;
 	if (home_node == -1)
 		/* Nothing better for now */
 		/* TODO: pass -1, and make _starpu_fetch_nowhere_task_input
@@ -594,11 +607,13 @@ void starpu_data_partition_plan(starpu_data_handle_t initial_handle, struct star
 		 */
 		home_node = STARPU_MAIN_RAM;
 
+	_STARPU_MALLOC(children, nparts * sizeof(*children));
 	for (i = 0; i < nparts; i++)
 	{
-		_STARPU_CALLOC(childrenp[i], 1, sizeof(struct _starpu_data_state));
+		_STARPU_CALLOC(children[i], 1, sizeof(struct _starpu_data_state));
+		childrenp[i] = children[i];
 	}
-	_starpu_data_partition(initial_handle, childrenp, nparts, f, 0);
+	_starpu_data_partition(initial_handle, children, nparts, f, 0);
 
 	if (!cl)
 	{
@@ -607,6 +622,7 @@ void starpu_data_partition_plan(starpu_data_handle_t initial_handle, struct star
 		cl = initial_handle->switch_cl;
 		cl->where = STARPU_NOWHERE;
 		cl->nbuffers = STARPU_VARIABLE_NBUFFERS;
+		cl->flags = STARPU_CODELET_NOPLANS;
 		cl->name = "data_partition_switch";
 		cl->specific_nodes = 1;
 	}
@@ -624,6 +640,15 @@ void starpu_data_partition_clean(starpu_data_handle_t root_handle, unsigned npar
 {
 	unsigned i;
 
+	if (children[0]->active) {
+#ifdef STARPU_DEVEL
+#warning FIXME: better choose gathering node
+#endif
+		starpu_data_unpartition_submit(root_handle, nparts, children, STARPU_MAIN_RAM);
+	}
+
+	free(children[0]->siblings);
+
 	for (i = 0; i < nparts; i++)
 		starpu_data_unregister_submit(children[i]);
 
@@ -634,17 +659,26 @@ void starpu_data_partition_clean(starpu_data_handle_t root_handle, unsigned npar
 
 void starpu_data_partition_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children)
 {
+	unsigned i;
 	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
 	_starpu_spin_lock(&initial_handle->header_lock);
 	STARPU_ASSERT_MSG(initial_handle->partitioned == 0, "One can't submit several partition plannings at the same time");
 	STARPU_ASSERT_MSG(initial_handle->readonly == 0, "One can't submit a partition planning while a readonly partitioning is active");
 	initial_handle->partitioned++;
+	initial_handle->active_children = children[0]->siblings;
 	_starpu_spin_unlock(&initial_handle->header_lock);
 
+	for (i = 0; i < nparts; i++)
+	{
+		_starpu_spin_lock(&children[i]->header_lock);
+		children[i]->active = 1;
+		_starpu_spin_unlock(&children[i]->header_lock);
+	}
+
 	if (!initial_handle->initialized)
 		/* No need for coherency, it is not initialized */
 		return;
-	unsigned i;
+
 	struct starpu_data_descr descr[nparts];
 	for (i = 0; i < nparts; i++)
 	{
@@ -659,15 +693,27 @@ void starpu_data_partition_submit(starpu_data_handle_t initial_handle, unsigned
 
 void starpu_data_partition_readonly_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children)
 {
+	unsigned i;
 	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
 	_starpu_spin_lock(&initial_handle->header_lock);
 	STARPU_ASSERT_MSG(initial_handle->partitioned == 0 || initial_handle->readonly, "One can't submit a readonly partition planning at the same time as a readwrite partition planning");
 	initial_handle->partitioned++;
 	initial_handle->readonly = 1;
+	if (initial_handle->nactive_readonly_children < initial_handle->partitioned) {
+		_STARPU_REALLOC(initial_handle->active_readonly_children, initial_handle->partitioned * sizeof(initial_handle->active_readonly_children));
+		initial_handle->nactive_readonly_children = initial_handle->partitioned;
+	}
+	initial_handle->active_readonly_children[initial_handle->partitioned-1] = children[0]->siblings;
 	_starpu_spin_unlock(&initial_handle->header_lock);
 
+	for (i = 0; i < nparts; i++)
+	{
+		_starpu_spin_lock(&children[i]->header_lock);
+		children[i]->active = 1;
+		_starpu_spin_unlock(&children[i]->header_lock);
+	}
+
 	STARPU_ASSERT_MSG(initial_handle->initialized, "It is odd to read-only-partition a data which does not have a value yet");
-	unsigned i;
 	struct starpu_data_descr descr[nparts];
 	for (i = 0; i < nparts; i++)
 	{
@@ -686,6 +732,8 @@ void starpu_data_partition_readwrite_upgrade_submit(starpu_data_handle_t initial
 	STARPU_ASSERT_MSG(initial_handle->partitioned == 1, "One can't upgrade a readonly partition planning to readwrite while other readonly partition plannings are active");
 	STARPU_ASSERT_MSG(initial_handle->readonly == 1, "One can only upgrade a readonly partition planning");
 	initial_handle->readonly = 0;
+	initial_handle->active_children = initial_handle->active_readonly_children[0];
+	initial_handle->active_readonly_children[0] = NULL;
 	_starpu_spin_unlock(&initial_handle->header_lock);
 
 	unsigned i;
@@ -703,16 +751,37 @@ void starpu_data_partition_readwrite_upgrade_submit(starpu_data_handle_t initial
 
 void starpu_data_unpartition_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children, int gather_node)
 {
+	unsigned i;
 	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
 	STARPU_ASSERT_MSG(gather_node == initial_handle->home_node || gather_node == -1, "gathering node different from home node is currently not supported");
 	_starpu_spin_lock(&initial_handle->header_lock);
 	STARPU_ASSERT_MSG(initial_handle->partitioned >= 1, "No partition planning is active for this handle");
+	if (initial_handle->readonly) {
+		/* Replace this children set with the last set in the list of readonly children sets */
+		for (i = 0; i < initial_handle->partitioned-1; i++) {
+			if (initial_handle->active_readonly_children[i] == children[0]->siblings) {
+				initial_handle->active_readonly_children[i] = initial_handle->active_readonly_children[initial_handle->partitioned-1];
+				initial_handle->active_readonly_children[initial_handle->partitioned-1] = NULL;
+				break;
+			}
+		}
+	} else {
+		initial_handle->active_children = NULL;
+	}
 	initial_handle->partitioned--;
 	if (!initial_handle->partitioned)
 		initial_handle->readonly = 0;
+	initial_handle->active_children = NULL;
 	_starpu_spin_unlock(&initial_handle->header_lock);
 
-	unsigned i, n;
+	for (i = 0; i < nparts; i++)
+	{
+		_starpu_spin_lock(&children[i]->header_lock);
+		children[i]->active = 0;
+		_starpu_spin_unlock(&children[i]->header_lock);
+	}
+
+	unsigned n;
 	struct starpu_data_descr descr[nparts];
 	for (i = 0, n = 0; i < nparts; i++)
 	{
@@ -755,6 +824,113 @@ void starpu_data_unpartition_readonly_submit(starpu_data_handle_t initial_handle
 	starpu_task_insert(initial_handle->switch_cl, STARPU_W, initial_handle, STARPU_DATA_MODE_ARRAY, descr, n, 0);
 }
 
+/* Unpartition everything below ancestor */
+void starpu_data_unpartition_submit_r(starpu_data_handle_t ancestor, int gathering_node)
+{
+	unsigned i, j, nsiblings;
+	if (!ancestor->partitioned)
+		/* It's already unpartitioned */
+		return;
+	_STARPU_DEBUG("ancestor %p needs unpartitioning\n", ancestor);
+	if (ancestor->readonly)
+	{
+		/* Uh, has to go through all read-only partitions */
+		/* Note: starpu_data_unpartition_submit() decrements ->partitioned
+		 * and moves the last set into the freed slot, so always take slot 0 */
+		while (ancestor->partitioned) {
+			starpu_data_handle_t *children = ancestor->active_readonly_children[0];
+			_STARPU_DEBUG("unpartition readonly children %p\n", children);
+			nsiblings = children[0]->nsiblings;
+			for (j = 0; j < nsiblings; j++) {
+				/* Make sure our children are unpartitioned */
+				starpu_data_unpartition_submit_r(children[j], gathering_node);
+			}
+			/* And unpartition them */
+			starpu_data_unpartition_submit(ancestor, nsiblings, children, gathering_node);
+		}
+	}
+	else
+	{
+		_STARPU_DEBUG("unpartition children %p\n", ancestor->active_children);
+		/* Only one partition */
+		nsiblings = ancestor->active_children[0]->nsiblings;
+		for (i = 0; i < nsiblings; i++)
+			starpu_data_unpartition_submit_r(ancestor->active_children[i], gathering_node);
+		/* And unpartition ourself */
+		starpu_data_unpartition_submit(ancestor, nsiblings, ancestor->active_children, gathering_node);
+	}
+}
+
+/* Make ancestor partition itself properly for target */
+static void _starpu_data_partition_access_look_up(starpu_data_handle_t ancestor, starpu_data_handle_t target, int write)
+{
+	/* First make sure ancestor has proper state, if not, ask father */
+	if (!ancestor->active || (write && ancestor->active_ro))
+	{
+		/* (The root is always active-rw) */
+		STARPU_ASSERT(ancestor->father_handle);
+		_STARPU_DEBUG("ancestor %p is not ready: %s, asking father %p\n", ancestor, ancestor->active ? ancestor->active_ro ? "RO" : "RW" : "NONE", ancestor->father_handle);
+		_starpu_data_partition_access_look_up(ancestor->father_handle, ancestor, write);
+		_STARPU_DEBUG("ancestor %p is now ready\n", ancestor);
+	}
+	else
+		_STARPU_DEBUG("ancestor %p was ready\n", ancestor);
+
+	/* We shouldn't be called for nothing */
+	STARPU_ASSERT(!ancestor->partitioned || !target || ancestor->active_children != target->siblings || (ancestor->readonly && write));
+
+	/* Then unpartition ancestor if needed */
+	if (ancestor->partitioned &&
+			/* Not the right children, unpartition ourself */
+			((target && ancestor->active_children != target->siblings) ||
+			/* We are partitioned and we want to write or some child
+			 * is writing and we want to read, unpartition ourself*/
+			(!target && (write || !ancestor->readonly))))
+	{
+#ifdef STARPU_DEVEL
+#warning FIXME: better choose gathering node
+#endif
+		starpu_data_unpartition_submit_r(ancestor, STARPU_MAIN_RAM);
+	}
+
+	if (!target)
+	{
+		_STARPU_DEBUG("ancestor %p is done\n", ancestor);
+		/* No child target, nothing more to do actually.  */
+		return;
+	}
+
+	/* Then partition ancestor towards target */
+	if (ancestor->partitioned)
+	{
+		_STARPU_DEBUG("ancestor %p is already partitioned RO, turn RW\n", ancestor);
+		/* Already partitioned, normally it's already for the target */
+		STARPU_ASSERT(ancestor->active_children == target->siblings);
+		/* And we are here just because we haven't partitioned rw */
+		STARPU_ASSERT(ancestor->readonly && write);
+		/* So we just need to upgrade ro to rw */
+		starpu_data_partition_readwrite_upgrade_submit(ancestor, target->nsiblings, target->siblings);
+	}
+	else
+	{
+		/* Just need to partition properly for the child */
+		if (write)
+		{
+			_STARPU_DEBUG("partition ancestor %p RW\n", ancestor);
+			starpu_data_partition_submit(ancestor, target->nsiblings, target->siblings);
+		}
+		else
+		{
+			_STARPU_DEBUG("partition ancestor %p RO\n", ancestor);
+			starpu_data_partition_readonly_submit(ancestor, target->nsiblings, target->siblings);
+		}
+	}
+}
+
+void _starpu_data_partition_access_submit(starpu_data_handle_t target, int write)
+{
+	_STARPU_DEBUG("accessing %p %s\n", target, write ? "RW" : "RO");
+	_starpu_data_partition_access_look_up(target, NULL, write);
+}
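
The user-visible effect of this look-up is that plans can be nested and mixed freely; a sketch, with hypothetical filter <c>f</c>, codelet <c>cl</c> and sizes that divide evenly:

\code{.c}
starpu_data_handle_t sub[PARTS], subsub[PARTS];
starpu_data_partition_plan(handle, &f, sub);
starpu_data_partition_plan(sub[0], &f, subsub);

/* Accessing a grandchild: the look-up walks up through father_handle and
 * submits the partitioning of handle and then of sub[0] as needed */
starpu_task_insert(&cl, STARPU_RW, subsub[0], 0);
/* Accessing the root again: everything below it is unpartitioned first */
starpu_task_insert(&cl, STARPU_RW, handle, 0);
\endcode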
+
 /*
  * Given an integer N, NPARTS the number of parts it must be divided in, ID the
  * part currently considered, determines the CHUNK_SIZE and the OFFSET, taking

+ 5 - 1
src/datawizard/filters.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012                                     Inria
- * Copyright (C) 2008-2011,2014                           Université de Bordeaux
+ * Copyright (C) 2008-2011,2014,2017                      Université de Bordeaux
  * Copyright (C) 2010,2015                                CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -31,4 +31,8 @@ _starpu_filter_nparts_compute_chunk_size_and_offset(unsigned n, unsigned nparts,
 					     size_t elemsize, unsigned id,
 					     unsigned ld, unsigned *chunk_size,
 					     size_t *offset);
+
+
+/* submit asynchronous unpartitioning / partitioning to make target active read-only or read-write */
+void _starpu_data_partition_access_submit(starpu_data_handle_t target, int write);
 #endif

+ 7 - 0
src/datawizard/interfaces/data_interface.c

@@ -280,8 +280,15 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 	handle->switch_cl = NULL;
 	handle->partitioned = 0;
 	handle->readonly = 0;
+	handle->active = 1;
+	handle->active_ro = 0;
 	handle->root_handle = handle;
 	handle->father_handle = NULL;
+	handle->active_children = NULL;
+	handle->active_readonly_children = NULL;
+	handle->nactive_readonly_children = 0;
+	handle->nsiblings = 0;
+	handle->siblings = NULL;
 	handle->sibling_index = 0; /* could be anything for the root */
 	handle->depth = 1; /* the tree is just a node yet */
         handle->mpi_data = NULL; /* invalid until set */

+ 1 - 0
tests/Makefile.am

@@ -321,6 +321,7 @@ myPROGRAMS +=				\
 	datawizard/test_arbiter			\
 	datawizard/invalidate_pending_requests	\
 	datawizard/temporary_partition		\
+	datawizard/temporary_partition_implicit	\
 	datawizard/redux_acquire		\
 	disk/disk_copy				\
 	disk/disk_copy_unpack			\

+ 2 - 2
tests/datawizard/temporary_partition.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2012-2013                                Inria
  * Copyright (C) 2010-2013,2015,2017                      CNRS
- * Copyright (C) 2010,2013-2014,2016                      Université de Bordeaux
+ * Copyright (C) 2010,2013-2014,2016-2017                 Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -82,7 +82,7 @@ int main(void)
 	/* Invalidate one random piece we don't care coherency about */
 	starpu_data_invalidate_submit(handles[NPARTS/2]);
 
-	/* Join */
+	/* Clean */
 	starpu_data_unpartition_submit(handle, NPARTS, handles, -1);
 	starpu_data_partition_clean(handle, NPARTS, handles);
 

+ 107 - 0
tests/datawizard/temporary_partition_implicit.c

@@ -0,0 +1,107 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012-2013                                Inria
+ * Copyright (C) 2010-2013,2015,2017                      CNRS
+ * Copyright (C) 2010,2013-2014,2016-2017                 Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+
+#define SIZE (1<<20)
+#define NPARTS 16
+
+/*
+ * Test asynchronous partitioning on a temporary data without submitting
+ * explicit partitioning/unpartitioning.
+ */
+
+static void codelet(void *descr[], void *_args)
+{
+	(void)descr;
+	(void)_args;
+}
+
+static struct starpu_codelet clw =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {codelet},
+	.nbuffers = 1,
+	.modes = {STARPU_W}
+};
+
+static struct starpu_codelet clr =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {codelet},
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
+int main(void)
+{
+	int ret;
+	starpu_data_handle_t handle, handles[NPARTS];
+	int i;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_vector_data_register(&handle, -1, 0, SIZE, sizeof(char));
+
+	/* Fork */
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_vector_filter_block,
+		.nchildren = NPARTS
+	};
+	starpu_data_partition_plan(handle, &f, handles);
+
+	/* Process in parallel */
+	for (i = 0; i < NPARTS; i++)
+	{
+		ret = starpu_task_insert(&clw,
+					 STARPU_W, handles[i],
+					 0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+	}
+
+	/* Invalidate one random piece we don't care coherency about */
+	starpu_data_invalidate_submit(handles[NPARTS/2]);
+
+	/* Clean */
+	starpu_data_unpartition_submit(handle, NPARTS, handles, -1);
+	starpu_data_partition_clean(handle, NPARTS, handles);
+
+	/* Read result */
+	starpu_task_insert(&clr, STARPU_R, handle, 0);
+
+	starpu_data_unregister(handle);
+
+	starpu_shutdown();
+
+	return 0;
+
+enodev:
+	starpu_data_unpartition_submit(handle, NPARTS, handles, -1);
+	starpu_data_partition_clean(handle, NPARTS, handles);
+	starpu_data_unregister(handle);
+	starpu_shutdown();
+	/* yes, we do not perform the computation but we did detect that no one
+	 * could perform the kernel, so this is not an error from StarPU */
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	return STARPU_TEST_SKIPPED;
+}