Browse source

Add per-node MPI data

Samuel Thibault 7 years ago
parent
commit
5e3dbcf95a

+ 2 - 0
ChangeLog

@@ -78,6 +78,7 @@ New features:
     STARPU_TASK_BREAK_ON_EXEC environment variables, with the job_id
     of a task. StarPU will raise SIGTRAP when the task is being
     scheduled, pushed, or popped by the scheduler.
+  * Add per-node MPI data.
 
 Small features:
   * New function starpu_worker_get_job_id(struct starpu_task *task)
@@ -89,6 +90,7 @@ Small features:
   * Add starpu_vector_filter_list_long filter.
   * Add starpu_perfmodel_arch_comb_fetch function.
   * Add STARPU_WATCHDOG_DELAY environment variable.
+  * Add starpu_mpi_get_data_on_all_nodes_detached function.
 
 Small changes:
   * Output generated through STARPU_MPI_COMM has been modified to

+ 46 - 0
doc/doxygen/chapters/410_mpi_support.doxy

@@ -613,6 +613,52 @@ starpu_mpi_data_register(data1, 43, 1);
 starpu_mpi_task_insert(MPI_COMM_WORLD, &cl, STARPU_W, data, STARPU_R, data0, STARPU_R, data1, 0); /* Executes on node 0 */
 \endcode
 
+\section MPIPerNodeData Per-node Data
+
+Beyond temporary data on a single node, one may want per-node data, e.g. to
+replicate some computation on every node when that is less expensive than
+communicating the value over MPI:
+
+\code{.c}
+starpu_data_handle_t pernode, data0, data1;
+starpu_variable_data_register(&pernode, -1, 0, sizeof(val));
+starpu_mpi_data_register(pernode, -1, STARPU_MPI_PER_NODE);
+
+/* Normal data: one on node0, one on node1 */
+if (rank == 0) {
+	starpu_variable_data_register(&data0, STARPU_MAIN_RAM, (uintptr_t) &val0, sizeof(val0));
+	starpu_variable_data_register(&data1, -1, 0, sizeof(val1));
+} else if (rank == 1) {
+	starpu_variable_data_register(&data0, -1, 0, sizeof(val0));
+	starpu_variable_data_register(&data1, STARPU_MAIN_RAM, (uintptr_t) &val1, sizeof(val1));
+}
+starpu_mpi_data_register(data0, 42, 0);
+starpu_mpi_data_register(data1, 43, 1);
+
+starpu_mpi_task_insert(MPI_COMM_WORLD, &cl, STARPU_W, pernode, 0); /* Will be replicated on all nodes */
+
+starpu_mpi_task_insert(MPI_COMM_WORLD, &cl2, STARPU_RW, data0, STARPU_R, pernode, 0); /* Will execute on node 0, using its own pernode */
+starpu_mpi_task_insert(MPI_COMM_WORLD, &cl2, STARPU_RW, data1, STARPU_R, pernode, 0); /* Will execute on node 1, using its own pernode */
+\endcode
+
+One can turn normal data into per-node data by first broadcasting it to all nodes:
+
+\code{.c}
+starpu_data_handle_t data;
+starpu_variable_data_register(&data, -1, 0, sizeof(val));
+starpu_mpi_data_register(data, 42, 0);
+
+/* Compute some value */
+starpu_mpi_task_insert(MPI_COMM_WORLD, &cl, STARPU_W, data, 0); /* Node 0 computes it */
+
+/* Get it on all nodes */
+starpu_mpi_get_data_on_all_nodes_detached(MPI_COMM_WORLD, data);
+/* And turn it per-node */
+starpu_mpi_data_set_rank(data, STARPU_MPI_PER_NODE);
+\endcode
+
+The data can then be used just like the per-node data above.
+
 \section MPIPriorities Priorities
 
 All send functions have a <c>_prio</c> variant which takes an additional

+ 13 - 0
doc/doxygen/chapters/api/mpi.doxy

@@ -339,6 +339,14 @@ Return the tag of the given data.
 Return the tag of the given data.
 Symbol kept for backward compatibility. Calling function starpu_mpi_data_get_tag()
 
+\def STARPU_MPI_PER_NODE
+\ingroup API_MPI_Support
+Can be used as rank when calling starpu_mpi_data_register() and the like, to
+specify that the data is per-node: each node will have its own value. Tasks
+writing to such data will be replicated on all nodes (and all parameters then
+have to be per-node). Tasks not writing to such data will just take the
+node-local value without any MPI communication.
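+
+For instance, a minimal sketch (assuming an <c>int val</c> visible on each node):
+\code{.c}
+starpu_data_handle_t pernode;
+starpu_variable_data_register(&pernode, -1, 0, sizeof(val));
+/* no tag is needed: this data never travels over MPI */
+starpu_mpi_data_register(pernode, -1, STARPU_MPI_PER_NODE);
+\endcode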
+
 \fn void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t handle, int new_rank)
 \ingroup API_MPI_Support
 Migrate the data onto the \p new_rank MPI node. This means both transferring
@@ -440,6 +448,11 @@ owner if needed. At least the target node and the owner have to call
 the function. On reception, the \p callback function is called with
 the argument \p arg.
 
+\fn void starpu_mpi_get_data_on_all_nodes_detached(MPI_Comm comm, starpu_data_handle_t data_handle)
+\ingroup API_MPI_Support
+Transfer data \p data_handle to all MPI nodes, sending it from its
+owner if needed. All nodes have to call the function.
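+
+A typical use, sketched in the MPI support chapter, is to broadcast a value
+computed on its owner node and then mark it per-node:
+\code{.c}
+starpu_mpi_get_data_on_all_nodes_detached(MPI_COMM_WORLD, data);
+starpu_mpi_data_set_rank(data, STARPU_MPI_PER_NODE);
+\endcode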
+
 @name Node Selection Policy
 \anchor MPINodeSelectionPolicy
 \ingroup API_MPI_Support

+ 3 - 0
mpi/include/starpu_mpi.h

@@ -65,6 +65,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...);
 
 void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node);
 void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg);
+void starpu_mpi_get_data_on_all_nodes_detached(MPI_Comm comm, starpu_data_handle_t data_handle);
 void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle);
 void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, int prio);
 
@@ -98,6 +99,8 @@ void starpu_mpi_set_communication_tag(int tag);
 void starpu_mpi_data_register_comm(starpu_data_handle_t data_handle, int tag, int rank, MPI_Comm comm);
 #define starpu_mpi_data_register(data_handle, tag, rank) starpu_mpi_data_register_comm(data_handle, tag, rank, MPI_COMM_WORLD)
 
+#define STARPU_MPI_PER_NODE -2
+
 void starpu_mpi_data_set_rank_comm(starpu_data_handle_t handle, int rank, MPI_Comm comm);
 #define starpu_mpi_data_set_rank(handle, rank) starpu_mpi_data_set_rank_comm(handle, rank, MPI_COMM_WORLD)
 void starpu_mpi_data_set_tag(starpu_data_handle_t handle, int tag);

+ 11 - 0
mpi/src/starpu_mpi.c

@@ -1931,6 +1931,17 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 	}
 }
 
+void starpu_mpi_get_data_on_all_nodes_detached(MPI_Comm comm, starpu_data_handle_t data_handle)
+{
+	int size, i;
+	starpu_mpi_comm_size(comm, &size);
+#ifdef STARPU_DEVEL
+#warning TODO: use binary communication tree to optimize broadcast
+#endif
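+	/* Plain linear broadcast for now: fetch the data onto each node in turn */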
+	for (i = 0; i < size; i++)
+		starpu_mpi_get_data_on_node_detached(comm, data_handle, i, NULL, NULL);
+}
+
 void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t data, int new_rank)
 {
 	int old_rank = starpu_mpi_data_get_rank(data);

+ 15 - 6
mpi/src/starpu_mpi_task_insert.c

@@ -82,11 +82,11 @@ int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_a
 			// No node has been selected yet
 			*xrank = mpi_rank;
 			_STARPU_MPI_DEBUG(100, "Codelet is going to be executed by node %d\n", *xrank);
-			*do_execute = (mpi_rank == me);
+			*do_execute = mpi_rank == STARPU_MPI_PER_NODE || (mpi_rank == me);
 		}
 		else if (mpi_rank != *xrank)
 		{
-			_STARPU_MPI_DEBUG(100, "Another node %d had already been selected to execute the codelet\n", *xrank);
+			_STARPU_MPI_DEBUG(100, "Another node %d had already been selected to execute the codelet, can't now set %d\n", *xrank, mpi_rank);
 			*inconsistent_execute = 1;
 		}
 	}
@@ -105,7 +105,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 			_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
 		}
 
-		if (do_execute && mpi_rank != me)
+		if (do_execute && mpi_rank != STARPU_MPI_PER_NODE && mpi_rank != me)
 		{
 			/* The node is going to execute the codelet, but it does not own the data, it needs to receive the data from the owner node */
 			int already_received = _starpu_mpi_cache_received_data_set(data);
@@ -146,9 +146,13 @@ void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum s
 		{
 			_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
 		}
+		if (mpi_rank == STARPU_MPI_PER_NODE)
+		{
+			mpi_rank = me;
+		}
 		if (mpi_rank == me)
 		{
-			if (xrank != -1 && me != xrank)
+			if (xrank != -1 && (xrank != STARPU_MPI_PER_NODE && me != xrank))
 			{
 				_STARPU_MPI_DEBUG(1, "Receive data %p back from the task %d which executed the codelet ...\n", data, xrank);
 				if(data_tag == -1)
@@ -184,6 +188,10 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 		if ((mode & STARPU_R) && do_execute)
 		{
 			int mpi_rank = starpu_mpi_data_get_rank(data);
+			if (mpi_rank == STARPU_MPI_PER_NODE)
+			{
+				mpi_rank = me;
+			}
 			if (mpi_rank != me && mpi_rank != -1)
 			{
 				starpu_data_invalidate_submit(data);
@@ -195,6 +203,7 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 static
 int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nodes, int *xrank, int *do_execute, struct starpu_data_descr **descrs_p, int *nb_data_p, int *prio_p, va_list varg_list)
 {
+	/* XXX: _fstarpu_mpi_task_decode_v needs to be updated at the same time */
 	va_list varg_list_copy;
 	int inconsistent_execute = 0;
 	int arg_type;
@@ -436,12 +445,12 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 		// We need to find out which node is going to execute the codelet.
 		_STARPU_MPI_DEBUG(100, "Different nodes are owning W data. The node to execute the codelet is going to be selected with the current selection node policy. See starpu_mpi_node_selection_set_current_policy() to change the policy, or use STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA to specify the node\n");
 		*xrank = _starpu_mpi_select_node(me, nb_nodes, descrs, nb_data, select_node_policy);
-		*do_execute = (me == *xrank);
+		*do_execute = *xrank == STARPU_MPI_PER_NODE || (me == *xrank);
 	}
 	else
 	{
 		_STARPU_MPI_DEBUG(100, "Inconsistent=%d - xrank=%d\n", inconsistent_execute, *xrank);
-		*do_execute = (me == *xrank);
+		*do_execute = *xrank == STARPU_MPI_PER_NODE || (me == *xrank);
 	}
 	_STARPU_MPI_DEBUG(100, "do_execute=%d\n", *do_execute);
 

+ 2 - 2
mpi/src/starpu_mpi_task_insert_fortran.c

@@ -291,12 +291,12 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 		// We need to find out which node is going to execute the codelet.
 		_STARPU_MPI_DISP("Different nodes are owning W data. The node to execute the codelet is going to be selected with the current selection node policy. See starpu_mpi_node_selection_set_current_policy() to change the policy, or use STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA to specify the node\n");
 		*xrank = _starpu_mpi_select_node(me, nb_nodes, descrs, nb_data, select_node_policy);
-		*do_execute = (me == *xrank);
+		*do_execute = *xrank == STARPU_MPI_PER_NODE || (me == *xrank);
 	}
 	else
 	{
 		_STARPU_MPI_DEBUG(100, "Inconsistent=%d - xrank=%d\n", inconsistent_execute, *xrank);
-		*do_execute = (me == *xrank);
+		*do_execute = *xrank == STARPU_MPI_PER_NODE || (me == *xrank);
 	}
 	_STARPU_MPI_DEBUG(100, "do_execute=%d\n", *do_execute);
 

+ 3 - 1
mpi/tests/Makefile.am

@@ -126,7 +126,8 @@ starpu_mpi_TESTS +=				\
 	policy_register_many			\
 	policy_selection			\
 	policy_selection2			\
-	ring_async_implicit
+	ring_async_implicit			\
+	temporary
 
 if !STARPU_SIMGRID
 starpu_mpi_TESTS +=				\
@@ -182,6 +183,7 @@ noinst_PROGRAMS =				\
 	ring_sync_detached			\
 	ring_async				\
 	ring_async_implicit			\
+	temporary				\
 	block_interface				\
 	block_interface_pinned			\
 	cache					\

+ 135 - 0
mpi/tests/temporary.c

@@ -0,0 +1,135 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015, 2016, 2017  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* This tests that one can register temporary data on each MPI node, which can mix with common data */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+static void func_add(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int *a = (void*) STARPU_VARIABLE_GET_PTR(descr[0]);
+	const int *b = (void*) STARPU_VARIABLE_GET_PTR(descr[1]);
+	const int *c = (void*) STARPU_VARIABLE_GET_PTR(descr[2]);
+
+	*a = *b + *c;
+	FPRINTF_MPI(stderr, "%d + %d = %d\n", *b, *c, *a);
+}
+
+/* Dummy cost function for simgrid */
+static double cost_function(struct starpu_task *task STARPU_ATTRIBUTE_UNUSED, unsigned nimpl STARPU_ATTRIBUTE_UNUSED)
+{
+	return 0.000001;
+}
+static struct starpu_perfmodel dumb_model =
+{
+	.type          = STARPU_COMMON,
+	.cost_function = cost_function
+};
+
+static struct starpu_codelet codelet_add =
+{
+	.cpu_funcs = {func_add},
+	.nbuffers = 3,
+	.modes = {STARPU_W, STARPU_R, STARPU_R},
+	.model = &dumb_model
+};
+
+int main(int argc, char **argv)
+{
+	int rank, size, n;
+	int ret;
+	int val0, val1;
+	starpu_data_handle_t data0, data1, tmp0, tmp, tmp2;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
+
+	if (starpu_mpi_cache_is_enabled() == 0) goto skip;
+
+	if (rank == 0)
+	{
+		val0 = 1;
+		starpu_variable_data_register(&data0, STARPU_MAIN_RAM, (uintptr_t)&val0, sizeof(val0));
+		starpu_variable_data_register(&data1, -1, (uintptr_t)NULL, sizeof(val1));
+		starpu_variable_data_register(&tmp0, -1, (uintptr_t)NULL, sizeof(val0));
+		starpu_mpi_data_register(tmp0, -1, 0);
+	}
+	else if (rank == 1)
+	{
+		starpu_variable_data_register(&data0, -1, (uintptr_t)NULL, sizeof(val0));
+		starpu_variable_data_register(&data1, STARPU_MAIN_RAM, (uintptr_t)&val1, sizeof(val1));
+		tmp0 = NULL;
+	}
+	else
+	{
+		starpu_variable_data_register(&data0, -1, (uintptr_t)NULL, sizeof(val0));
+		starpu_variable_data_register(&data1, -1, (uintptr_t)NULL, sizeof(val1));
+		tmp0 = NULL;
+	}
+	starpu_variable_data_register(&tmp, -1, (uintptr_t)NULL, sizeof(val0));
+	starpu_variable_data_register(&tmp2, -1, (uintptr_t)NULL, sizeof(val0));
+
+	starpu_mpi_data_register(data0, 42, 0);
+	starpu_mpi_data_register(data1, 43, 1);
+	starpu_mpi_data_register(tmp, 44, 0);
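+	/* tmp2 is per-node from the start: each node keeps its own value, no tag needed */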
+	starpu_mpi_data_register(tmp2, -1, STARPU_MPI_PER_NODE);
+
+	/* Test temporary data tmp0 on node 0 only */
+	starpu_mpi_task_insert(MPI_COMM_WORLD, &codelet_add, STARPU_W, tmp0, STARPU_R, data0, STARPU_R, data0, 0);
+
+	starpu_mpi_task_insert(MPI_COMM_WORLD, &codelet_add, STARPU_W, data0, STARPU_R, tmp0, STARPU_R, tmp0, 0);
+
+	starpu_mpi_task_insert(MPI_COMM_WORLD, &codelet_add, STARPU_W, tmp, STARPU_R, data0, STARPU_R, data0, 0);
+
+	/* Now make some tmp per-node, so that each node replicates the computation */
+	for (n = 0; n < size; n++)
+		if (n != 0)
+			/* Get the value on all nodes */
+			starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, tmp, n, NULL, NULL);
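+	/* (equivalent to starpu_mpi_get_data_on_all_nodes_detached(MPI_COMM_WORLD, tmp)) */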
+	starpu_mpi_data_set_rank(tmp, STARPU_MPI_PER_NODE);
+
+	/* This task writes to a per-node data, so will be executed by all nodes */
+	starpu_mpi_task_insert(MPI_COMM_WORLD, &codelet_add, STARPU_W, tmp2, STARPU_R, tmp, STARPU_R, tmp, 0);
+
+	/* All MPI nodes have computed the value (no MPI communication here!) */
+	starpu_data_acquire_on_node(tmp2, STARPU_MAIN_RAM, STARPU_R);
+	STARPU_ASSERT(*(int*)starpu_data_handle_to_pointer(tmp2, STARPU_MAIN_RAM) == 16);
+	starpu_data_release_on_node(tmp2, STARPU_MAIN_RAM);
+
+	/* And nodes 0 and 1 do something with it */
+	starpu_mpi_task_insert(MPI_COMM_WORLD, &codelet_add, STARPU_W, data0, STARPU_R, tmp, STARPU_R, tmp2, 0);
+	starpu_mpi_task_insert(MPI_COMM_WORLD, &codelet_add, STARPU_W, data1, STARPU_R, tmp, STARPU_R, tmp2, 0);
+
+	starpu_task_wait_for_all();
+
+	starpu_data_unregister(data0);
+	starpu_data_unregister(data1);
+
+skip:
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	if (rank == 0)
+		STARPU_ASSERT_MSG(val0 == 24, "%d should be 24\n", val0);
+	if (rank == 1)
+		STARPU_ASSERT_MSG(val1 == 24, "%d should be 24\n", val1);
+	return 0;
+}