Browse Source

Merge remote-tracking branch 'gitlab/master' into nathalie/master

Nathalie Furmento 5 years ago
parent
commit
d5bfd51fcd

+ 3 - 0
ChangeLog

@@ -47,6 +47,9 @@ New features:
   * Add an experimental python interface (not actually parallel yet)
   * Add task submission file+line in traces.
   * Add papi- and nvml-based energy measurement.
+  * Add starpu_mpi_datatype_node_register and
+    starpu_mpi_interface_datatype_node_register which will be needed for
+    MPI/NUMA/GPUDirect.
 
 Small changes:
   * Add a synthetic energy efficiency testcase.

+ 14 - 1
include/starpu_data_interfaces.h

@@ -622,16 +622,29 @@ enum starpu_data_interface_id starpu_data_get_interface_id(starpu_data_handle_t
 /**
    Execute the packing operation of the interface of the data
    registered at \p handle (see starpu_data_interface_ops). This
-   packing operation must allocate a buffer large enough at \p ptr and copy
+   packing operation must allocate a buffer large enough at \p ptr on node \p node and copy
    into the newly allocated buffer the data associated to \p handle. \p count
    will be set to the size of the allocated buffer. If \p ptr is <c>NULL</c>, the
    function should not copy the data in the buffer but just set \p count to
    the size of the buffer which would have been allocated. The special
    value -1 indicates the size is yet unknown.
 */
+int starpu_data_pack_node(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count);
+
+/**
+   Like starpu_data_pack_node(), but for the local memory node.
+*/
 int starpu_data_pack(starpu_data_handle_t handle, void **ptr, starpu_ssize_t *count);
 
 /**
+   Unpack in handle the data located at \p ptr of size \p count allocated
+   on node \p node as described by the interface of the data. The interface
+   registered at \p handle must define an unpacking operation (see
+   starpu_data_interface_ops).
+*/
+int starpu_data_unpack_node(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
+
+/**
    Unpack in handle the data located at \p ptr of size \p count as
    described by the interface of the data. The interface registered at
    \p handle must define a unpacking operation (see

+ 2 - 0
mpi/GNUmakefile.in

@@ -19,6 +19,8 @@ SUBDIRS=
 @STARPU_BUILD_EXAMPLES_TRUE@SUBDIRS += examples
 @STARPU_BUILD_TESTS_TRUE@SUBDIRS += tests
 
+all:
+
 check: check-recursive
 
 # divide by 4 the number of jobs to run in parallel, since mpirun will start 4

+ 2 - 0
mpi/examples/Makefile.am

@@ -159,6 +159,8 @@ examplebin_PROGRAMS += 			\
 
 if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES	+=	\
+	mpi_lu/plu_example_float	\
+	mpi_lu/plu_example_double	\
 	mpi_lu/plu_implicit_example_float	\
 	mpi_lu/plu_implicit_example_double	\
 	mpi_lu/plu_outofcore_example_float	\

+ 3 - 3
mpi/examples/mpi_lu/plu_example.c

@@ -610,7 +610,9 @@ int main(int argc, char **argv)
 	{
 		for (i = 0; i < nblocks; i++)
 		{
-			starpu_data_unregister(dataA_handles[j+nblocks*i]);
+			starpu_data_handle_t handle = dataA_handles[j+nblocks*i];
+			if (handle != STARPU_POISON_PTR)
+				starpu_data_unregister(handle);
 			TYPE *blockptr = dataA[j+i*nblocks];
 			if (blockptr != STARPU_POISON_PTR)
 				starpu_free(blockptr);
@@ -625,9 +627,7 @@ int main(int argc, char **argv)
 	starpu_cublas_shutdown();
 	starpu_mpi_shutdown();
 
-#if 0
 	MPI_Finalize();
-#endif
 
 	return 0;
 }

+ 1 - 2
mpi/examples/mpi_lu/pxlu.c

@@ -817,6 +817,7 @@ static void wait_tag_and_fetch_handle(starpu_tag_t tag, starpu_data_handle_t han
 //	fprintf(stderr, "Rank %d : tag %lx is done\n", rank, tag);
 
 	starpu_data_acquire(handle, STARPU_R);
+	starpu_data_release(handle);
 
 //	starpu_data_unregister(handle);
 }
@@ -899,8 +900,6 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size, unsig
 		starpu_iteration_pop();
 	}
 
-	int wait_ret = starpu_mpi_wait_for_all(MPI_COMM_WORLD);
-	STARPU_ASSERT(wait_ret == MPI_SUCCESS);
 	int barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
 

+ 6 - 6
mpi/examples/user_datatype/my_interface.c

@@ -45,7 +45,7 @@ void starpu_my_data_compare_codelet_cpu(void *descr[], void *_args)
 	*compare = (d0 == d1 && c0 == c1);
 }
 
-void _starpu_my_data_datatype_allocate(MPI_Datatype *mpi_datatype)
+void _starpu_my_data_datatype_allocate(unsigned node, MPI_Datatype *mpi_datatype)
 {
 	int ret;
 	int blocklengths[2] = {1, 1};
@@ -68,10 +68,10 @@ void _starpu_my_data_datatype_allocate(MPI_Datatype *mpi_datatype)
 	free(myinterface);
 }
 
-int starpu_my_data_datatype_allocate(starpu_data_handle_t handle, MPI_Datatype *mpi_datatype)
+int starpu_my_data_datatype_allocate(starpu_data_handle_t handle, unsigned node, MPI_Datatype *mpi_datatype)
 {
 	(void)handle;
-	_starpu_my_data_datatype_allocate(mpi_datatype);
+	_starpu_my_data_datatype_allocate(node, mpi_datatype);
 	return 0;
 }
 
@@ -80,7 +80,7 @@ void starpu_my_data_datatype_free(MPI_Datatype *mpi_datatype)
 	MPI_Type_free(mpi_datatype);
 }
 
-int starpu_my_data2_datatype_allocate(starpu_data_handle_t handle, MPI_Datatype *mpi_datatype)
+int starpu_my_data2_datatype_allocate(starpu_data_handle_t handle, unsigned node, MPI_Datatype *mpi_datatype)
 {
 	(void)handle;
 	(void)mpi_datatype;
@@ -315,7 +315,7 @@ void starpu_my_data_register(starpu_data_handle_t *handleptr, unsigned home_node
 	if (interface_data_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
 	{
 		interface_data_ops.interfaceid = starpu_data_interface_get_next_id();
-		starpu_mpi_interface_datatype_register(interface_data_ops.interfaceid, starpu_my_data_datatype_allocate, starpu_my_data_datatype_free);
+		starpu_mpi_interface_datatype_node_register(interface_data_ops.interfaceid, starpu_my_data_datatype_allocate, starpu_my_data_datatype_free);
 	}
 
 	struct starpu_my_data_interface data =
@@ -357,7 +357,7 @@ void starpu_my_data2_register(starpu_data_handle_t *handleptr, unsigned home_nod
 	if (interface_data2_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
 	{
 		interface_data2_ops.interfaceid = starpu_data_interface_get_next_id();
-		starpu_mpi_interface_datatype_register(interface_data2_ops.interfaceid, starpu_my_data2_datatype_allocate, starpu_my_data2_datatype_free);
+		starpu_mpi_interface_datatype_node_register(interface_data2_ops.interfaceid, starpu_my_data2_datatype_allocate, starpu_my_data2_datatype_free);
 	}
 
 	struct starpu_my_data_interface data =

+ 3 - 3
mpi/examples/user_datatype/my_interface.h

@@ -47,10 +47,10 @@ int starpu_my_data_interface_get_int(void *interface);
 #define STARPU_MY_DATA_GET_CHAR(interface)	starpu_my_data_interface_get_char(interface)
 #define STARPU_MY_DATA_GET_INT(interface)	starpu_my_data_interface_get_int(interface)
 
-void _starpu_my_data_datatype_allocate(MPI_Datatype *mpi_datatype);
-int starpu_my_data_datatype_allocate(starpu_data_handle_t handle, MPI_Datatype *mpi_datatype);
+void _starpu_my_data_datatype_allocate(unsigned node, MPI_Datatype *mpi_datatype);
+int starpu_my_data_datatype_allocate(starpu_data_handle_t handle, unsigned node, MPI_Datatype *mpi_datatype);
 void starpu_my_data_datatype_free(MPI_Datatype *mpi_datatype);
-int starpu_my_data2_datatype_allocate(starpu_data_handle_t handle, MPI_Datatype *mpi_datatype);
+int starpu_my_data2_datatype_allocate(starpu_data_handle_t handle, unsigned node, MPI_Datatype *mpi_datatype);
 void starpu_my_data2_datatype_free(MPI_Datatype *mpi_datatype);
 
 void starpu_my_data_display_codelet_cpu(void *descr[], void *_args);

+ 2 - 2
mpi/examples/user_datatype/user_datatype.c

@@ -62,7 +62,7 @@ int main(int argc, char **argv)
 	if (rank == 0)
 	{
 		MPI_Datatype mpi_datatype;
-		_starpu_my_data_datatype_allocate(&mpi_datatype);
+		_starpu_my_data_datatype_allocate(STARPU_MAIN_RAM, &mpi_datatype);
 		MPI_Send(&my0, 1, mpi_datatype, 1, 42, MPI_COMM_WORLD);
 		starpu_my_data_datatype_free(&mpi_datatype);
 	}
@@ -71,7 +71,7 @@ int main(int argc, char **argv)
 		MPI_Datatype mpi_datatype;
 		MPI_Status status;
 		struct starpu_my_data myx;
-		_starpu_my_data_datatype_allocate(&mpi_datatype);
+		_starpu_my_data_datatype_allocate(STARPU_MAIN_RAM, &mpi_datatype);
 		MPI_Recv(&myx, 1, mpi_datatype, 0, 42, MPI_COMM_WORLD, &status);
 		FPRINTF(stderr, "[mpi] Received value: '%c' %d\n", myx.c, myx.d);
 		starpu_my_data_datatype_free(&mpi_datatype);

+ 21 - 0
mpi/include/starpu_mpi.h

@@ -344,6 +344,7 @@ int starpu_mpi_isend_array_detached_unlock_tag_prio(unsigned array_size, starpu_
 int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, starpu_mpi_tag_t *data_tag, MPI_Comm *comm, starpu_tag_t tag);
 
 typedef int (*starpu_mpi_datatype_allocate_func_t)(starpu_data_handle_t, MPI_Datatype *);
+typedef int (*starpu_mpi_datatype_node_allocate_func_t)(starpu_data_handle_t, unsigned node, MPI_Datatype *);
 typedef void (*starpu_mpi_datatype_free_func_t)(MPI_Datatype *);
 
 /**
@@ -367,6 +368,26 @@ int starpu_mpi_datatype_register(starpu_data_handle_t handle, starpu_mpi_datatyp
 int starpu_mpi_interface_datatype_register(enum starpu_data_interface_id id, starpu_mpi_datatype_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func);
 
 /**
+   Register functions to create and free a MPI datatype for the given
+   handle.
+   Similar to starpu_mpi_datatype_register(), with an additional node
+   parameter passed to the allocation function.
+   It is important that the function is called before any
+   communication can take place for a data with the given handle. See
+   \ref ExchangingUserDefinedDataInterface for an example.
+*/
+int starpu_mpi_datatype_node_register(starpu_data_handle_t handle, starpu_mpi_datatype_node_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func);
+
+/**
+   Register functions to create and free a MPI datatype for the given
+   interface id.
+   Similar to starpu_mpi_interface_datatype_register(), with an additional
+   node parameter passed to the allocation function.
+   It is important that the function is called before any
+   communication can take place for a data using the given interface id. See
+   \ref ExchangingUserDefinedDataInterface for an example.
+*/
+int starpu_mpi_interface_datatype_node_register(enum starpu_data_interface_id id, starpu_mpi_datatype_node_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func);
+
+/**
    Unregister the MPI datatype functions stored for the interface of
    the given handle.
 */

+ 1 - 0
mpi/src/mpi/starpu_mpi_early_data.h

@@ -39,6 +39,7 @@ LIST_TYPE(_starpu_mpi_early_data_handle,
 	  struct _starpu_mpi_req *req;
 	  void *buffer;
 	  size_t size;
+	  unsigned buffer_node;
 	  int req_ready;
 	  struct _starpu_mpi_node_tag node_tag;
 	  starpu_pthread_mutex_t req_mutex;

+ 36 - 26
mpi/src/mpi/starpu_mpi_mpi.c

@@ -132,6 +132,7 @@ struct _starpu_mpi_early_data_cb_args
 	struct _starpu_mpi_req *req;
 	void *buffer;
 	size_t size;
+	unsigned buffer_node;
 };
 
 void _starpu_mpi_submit_ready_request_inc(struct _starpu_mpi_req *req)
@@ -173,7 +174,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 	if (req->reserved_size)
 	{
 		/* The core will have really allocated the reception buffer now, release our reservation */
-		starpu_memory_deallocate(STARPU_MAIN_RAM, req->reserved_size);
+		starpu_memory_deallocate(req->node, req->reserved_size);
 		req->reserved_size = 0;
 	}
 
@@ -197,12 +198,12 @@ void _starpu_mpi_submit_ready_request(void *arg)
 			if (req->registered_datatype == 1)
 			{
 				req->count = 1;
-				req->ptr = starpu_data_handle_to_pointer(req->data_handle, STARPU_MAIN_RAM);
+				req->ptr = starpu_data_handle_to_pointer(req->data_handle, req->node);
 			}
 			else
 			{
 				STARPU_ASSERT(req->count);
-				req->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, req->count, 0);
+				req->ptr = (void *)starpu_malloc_on_node_flags(req->node, req->count, 0);
 			}
 
 			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
@@ -249,11 +250,13 @@ void _starpu_mpi_submit_ready_request(void *arg)
 				cb_args->early_handle = early_data_handle->handle;
 				cb_args->buffer = early_data_handle->buffer;
 				cb_args->size = early_data_handle->size;
+				cb_args->buffer_node = early_data_handle->buffer_node;
 				cb_args->req = req;
 
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
 				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
-				starpu_data_acquire_on_node_cb(early_data_handle->handle,STARPU_MAIN_RAM,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
+				// FIXME: when buffer == NULL, do not hardcode acquiring on early_data_handle->buffer_node, to just acquire where the data happens to have been stored by MPI
+				starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(early_data_handle->handle,early_data_handle->buffer_node,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args,  1, 0, NULL, NULL);
 				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 			}
 			else
@@ -268,13 +271,13 @@ void _starpu_mpi_submit_ready_request(void *arg)
 					if (req->registered_datatype == 1)
 					{
 						req->count = 1;
-						req->ptr = starpu_data_handle_to_pointer(req->data_handle, STARPU_MAIN_RAM);
+						req->ptr = starpu_data_handle_to_pointer(req->data_handle, req->node);
 					}
 					else
 					{
 						req->count = sync_req->count;
 						STARPU_ASSERT(req->count);
-						req->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, req->count, 0);
+						req->ptr = (void *)starpu_malloc_on_node_flags(req->node, req->count, 0);
 					}
 					_starpu_mpi_req_list_push_front(&ready_recv_requests, req);
 					_STARPU_MPI_INC_READY_REQUESTS(+1);
@@ -431,7 +434,7 @@ void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 	{
 		int size, ret;
 		req->count = 1;
-		req->ptr = starpu_data_handle_to_pointer(req->data_handle, STARPU_MAIN_RAM);
+		req->ptr = starpu_data_handle_to_pointer(req->data_handle, req->node);
 
 		MPI_Type_size(req->datatype, &size);
 		req->backend->envelope->size = (starpu_ssize_t)req->count * size;
@@ -445,7 +448,7 @@ void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 		int ret;
 
  		// Do not pack the data, just try to find out the size
-		starpu_data_pack(req->data_handle, NULL, &(req->backend->envelope->size));
+		starpu_data_pack_node(req->data_handle, req->node, NULL, &(req->backend->envelope->size));
 
 		if (req->backend->envelope->size != -1)
  		{
@@ -458,7 +461,7 @@ void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
  		}
 
  		// Pack the data
- 		starpu_data_pack(req->data_handle, &req->ptr, &req->count);
+		starpu_data_pack_node(req->data_handle, req->node, &req->ptr, &req->count);
 		if (req->backend->envelope->size == -1)
  		{
  			// We know the size now, let's send it
@@ -879,14 +882,14 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 					int ret;
 					ret = MPI_Wait(&req->backend->size_req, MPI_STATUS_IGNORE);
 					STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Wait returning %s", _starpu_mpi_get_mpi_error_code(ret));
-					starpu_free_on_node_flags(STARPU_MAIN_RAM, (uintptr_t)req->ptr, req->count, 0);
+					starpu_free_on_node_flags(req->node, (uintptr_t)req->ptr, req->count, 0);
 					req->ptr = NULL;
 				}
 				else if (req->request_type == RECV_REQ)
 				{
 					// req->ptr is freed by starpu_data_unpack
-					starpu_data_unpack(req->data_handle, req->ptr, req->count);
-					starpu_memory_deallocate(STARPU_MAIN_RAM, req->count);
+					starpu_data_unpack_node(req->data_handle, req->node, req->ptr, req->count);
+					starpu_memory_deallocate(req->node, req->count);
 				}
 			}
 			else
@@ -930,44 +933,47 @@ static void _starpu_mpi_early_data_cb(void* arg)
 		/* Data has been received as a raw memory, it has to be unpacked */
 		struct starpu_data_interface_ops *itf_src = starpu_data_get_interface_ops(args->early_handle);
 		struct starpu_data_interface_ops *itf_dst = starpu_data_get_interface_ops(args->data_handle);
-		MPI_Datatype datatype = _starpu_mpi_datatype_get_user_defined_datatype(args->data_handle);
+		MPI_Datatype datatype = _starpu_mpi_datatype_get_user_defined_datatype(args->data_handle, args->req->node);
 
 		if (datatype)
 		{
 			int position=0;
-			void *ptr = starpu_data_get_local_ptr(args->data_handle);
+			void *ptr = starpu_data_handle_to_pointer(args->data_handle, args->req->node);
 			MPI_Unpack(args->buffer, itf_src->get_size(args->early_handle), &position, ptr, 1, datatype, args->req->node_tag.node.comm);
-			starpu_free_on_node_flags(STARPU_MAIN_RAM, (uintptr_t) args->buffer, args->size, 0);
+			starpu_free_on_node_flags(args->buffer_node, (uintptr_t) args->buffer, args->size, 0);
 			args->buffer = NULL;
 			_starpu_mpi_datatype_free(args->data_handle, &datatype);
 		}
 		else
 		{
 			STARPU_MPI_ASSERT_MSG(itf_dst->unpack_data, "The data interface does not define an unpack function\n");
-			itf_dst->unpack_data(args->data_handle, STARPU_MAIN_RAM, args->buffer, itf_src->get_size(args->early_handle));
+			// FIXME: args->buffer is in args->buffer_node, not req->node
+			// There is conflation between the memory node for the handle and the memory node for the buffer
+			// Actually we may not want unpack_data to free the buffer, for the case when we are participating to a collective send
+			itf_dst->unpack_data(args->data_handle, args->req->node, args->buffer, itf_src->get_size(args->early_handle));
 			args->buffer = NULL;
 		}
 	}
 	else
 	{
 		struct starpu_data_interface_ops *itf = starpu_data_get_interface_ops(args->early_handle);
-		void* itf_src = starpu_data_get_interface_on_node(args->early_handle, STARPU_MAIN_RAM);
-		void* itf_dst = starpu_data_get_interface_on_node(args->data_handle, STARPU_MAIN_RAM);
+		void* itf_src = starpu_data_get_interface_on_node(args->early_handle, args->buffer_node);
+		void* itf_dst = starpu_data_get_interface_on_node(args->data_handle, args->req->node);
 
 		if (!itf->copy_methods->ram_to_ram)
 		{
 			_STARPU_MPI_DEBUG(3, "Initiating any_to_any copy..\n");
-			itf->copy_methods->any_to_any(itf_src, STARPU_MAIN_RAM, itf_dst, STARPU_MAIN_RAM, NULL);
+			itf->copy_methods->any_to_any(itf_src, args->buffer_node, itf_dst, args->req->node, NULL);
 		}
 		else
 		{
 			_STARPU_MPI_DEBUG(3, "Initiating ram_to_ram copy..\n");
-			itf->copy_methods->ram_to_ram(itf_src, STARPU_MAIN_RAM, itf_dst, STARPU_MAIN_RAM);
+			itf->copy_methods->ram_to_ram(itf_src, args->buffer_node, itf_dst, args->req->node);
 		}
 	}
 
 	_STARPU_MPI_DEBUG(3, "Done, handling release of early_handle..\n");
-	starpu_data_release_on_node(args->early_handle, STARPU_MAIN_RAM);
+	starpu_data_release_on_node(args->early_handle, args->buffer_node);
 
 	_STARPU_MPI_DEBUG(3, "Done, handling unregister of early_handle..\n");
 	/* XXX: note that we have already freed the registered buffer above. In
@@ -1147,10 +1153,13 @@ static void _starpu_mpi_receive_early_data(struct _starpu_mpi_envelope *envelope
 	data_handle = _starpu_mpi_tag_get_data_handle_from_tag(envelope->data_tag);
 	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 
+	// TODO: rather select some memory node next to the NIC
+	unsigned buffer_node = STARPU_MAIN_RAM;
 	if (data_handle && starpu_data_get_interface_id(data_handle) < STARPU_MAX_INTERFACE_ID)
 	{
 		/* We know which data will receive it and we won't have to unpack, use just the same kind of data.  */
 		early_data_handle->buffer = NULL;
+		early_data_handle->buffer_node = buffer_node;
 		starpu_data_register_same(&early_data_handle->handle, data_handle);
 		//_starpu_mpi_early_data_add(early_data_handle);
 	}
@@ -1161,9 +1170,10 @@ static void _starpu_mpi_receive_early_data(struct _starpu_mpi_envelope *envelope
 		 * to the application when it post a receive for this tag
 		 */
 		_STARPU_MPI_DEBUG(3, "Posting a receive for a data of size %d which has not yet been registered\n", (int)envelope->size);
-		early_data_handle->buffer = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, envelope->size, 0);
+		early_data_handle->buffer = (void *)starpu_malloc_on_node_flags(buffer_node, envelope->size, 0);
 		early_data_handle->size = envelope->size;
-		starpu_variable_data_register(&early_data_handle->handle, STARPU_MAIN_RAM, (uintptr_t) early_data_handle->buffer, envelope->size);
+		early_data_handle->buffer_node = buffer_node;
+		starpu_variable_data_register(&early_data_handle->handle, buffer_node, (uintptr_t) early_data_handle->buffer, envelope->size);
 		//_starpu_mpi_early_data_add(early_data_handle);
 	}
 
@@ -1456,13 +1466,13 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 						if (early_request->registered_datatype == 1)
 						{
 							early_request->count = 1;
-							early_request->ptr = starpu_data_handle_to_pointer(early_request->data_handle, STARPU_MAIN_RAM);
+							early_request->ptr = starpu_data_handle_to_pointer(early_request->data_handle, early_request->node);
 						}
 						else
 						{
 							early_request->count = envelope->size;
-							early_request->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, early_request->count, 0);
-							starpu_memory_allocate(STARPU_MAIN_RAM, early_request->count, STARPU_MEMORY_OVERFLOW);
+							early_request->ptr = (void *)starpu_malloc_on_node_flags(early_request->node, early_request->count, 0);
+							starpu_memory_allocate(early_request->node, early_request->count, STARPU_MEMORY_OVERFLOW);
 
 							STARPU_MPI_ASSERT_MSG(early_request->ptr, "cannot allocate message of size %ld\n", early_request->count);
 						}

+ 5 - 5
mpi/src/nmad/starpu_mpi_nmad.c

@@ -138,7 +138,7 @@ void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 	if (req->registered_datatype == 1)
 	{
 		req->count = 1;
-		req->ptr = starpu_data_handle_to_pointer(req->data_handle, STARPU_MAIN_RAM);
+		req->ptr = starpu_data_handle_to_pointer(req->data_handle, req->node);
 
 		_starpu_mpi_isend_data_func(req);
 	}
@@ -185,7 +185,7 @@ void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
 	if (req->registered_datatype == 1)
 	{
 		req->count = 1;
-		req->ptr = starpu_data_handle_to_pointer(req->data_handle, STARPU_MAIN_RAM);
+		req->ptr = starpu_data_handle_to_pointer(req->data_handle, req->node);
 		_starpu_mpi_irecv_data_func(req);
 	}
 	else
@@ -344,9 +344,9 @@ void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req* req)
 		{
 			if (req->request_type == RECV_REQ)
 				// req->ptr is freed by starpu_data_unpack
-				starpu_data_unpack(req->data_handle, req->ptr, req->count);
+				starpu_data_unpack_node(req->data_handle, req->node, req->ptr, req->count);
 			else
-				starpu_free_on_node_flags(STARPU_MAIN_RAM, (uintptr_t) req->ptr, req->count, 0);
+				starpu_free_on_node_flags(req->node, (uintptr_t) req->ptr, req->count, 0);
 		}
 		else
 		{
@@ -445,7 +445,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 	if (req->reserved_size)
 	{
 		/* The core will have really allocated the reception buffer now, release our reservation */
-		starpu_memory_deallocate(STARPU_MAIN_RAM, req->reserved_size);
+		starpu_memory_deallocate(req->node, req->reserved_size);
 		req->reserved_size = 0;
 	}
 

+ 2 - 2
mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.c

@@ -50,7 +50,7 @@ void _starpu_mpi_isend_unknown_datatype(struct _starpu_mpi_req *req)
 
 	_STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag, 0);
 
-	starpu_data_pack(req->data_handle, &req->ptr, &req->count);
+	starpu_data_pack_node(req->data_handle, req->node, &req->ptr, &req->count);
 
 	req->backend->unknown_datatype_v[0].iov_base = &req->count;
 	req->backend->unknown_datatype_v[0].iov_len = sizeof(starpu_ssize_t);
@@ -102,7 +102,7 @@ static void _starpu_mpi_unknown_datatype_recv_callback(nm_sr_event_t event, cons
 		nm_sr_recv_peek(req->backend->session, &req->backend->data_request, &data_header);
 
 		// Now we know the size, allocate the buffer:
-		req->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, req->count, 0);
+		req->ptr = (void *)starpu_malloc_on_node_flags(req->node, req->count, 0);
 		STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld", req->count);
 
 		/* Last step: give this buffer to NewMadeleine to receive data

+ 7 - 3
mpi/src/starpu_mpi.c

@@ -36,6 +36,8 @@
 
 static void _starpu_mpi_isend_irecv_common(struct _starpu_mpi_req *req, enum starpu_data_access_mode mode, int sequential_consistency)
 {
+	unsigned node = STARPU_MAIN_RAM; // XXX For now
+
 	/* Asynchronously request StarPU to fetch the data in main memory: when
 	 * it is available in main memory, _starpu_mpi_submit_ready_request(req) is called and
 	 * the request is actually submitted */
@@ -47,19 +49,21 @@ static void _starpu_mpi_isend_irecv_common(struct _starpu_mpi_req *req, enum sta
 		if (size)
 		{
 			/* This will potentially block */
-			starpu_memory_allocate(STARPU_MAIN_RAM, size, STARPU_MEMORY_WAIT);
+			starpu_memory_allocate(node, size, STARPU_MEMORY_WAIT);
 			req->reserved_size = size;
+			/* This also decides where we will store the data */
+			req->node = node;
 		}
 	}
 
 	if (sequential_consistency)
 	{
-		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, STARPU_MAIN_RAM, mode, _starpu_mpi_submit_ready_request, (void *)req, 1 /*sequential consistency*/, 1, &req->pre_sync_jobid, &req->post_sync_jobid);
+		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_submit_ready_request, (void *)req, 1 /*sequential consistency*/, 1, &req->pre_sync_jobid, &req->post_sync_jobid);
 	}
 	else
 	{
 		/* post_sync_job_id has already been filled */
-		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, STARPU_MAIN_RAM, mode, _starpu_mpi_submit_ready_request, (void *)req, 0 /*sequential consistency*/, 1, &req->pre_sync_jobid, NULL);
+		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_submit_ready_request, (void *)req, 0 /*sequential consistency*/, 1, &req->pre_sync_jobid, NULL);
 	}
 }
 

+ 5 - 3
mpi/src/starpu_mpi_coop_sends.c

@@ -47,13 +47,13 @@ void _starpu_mpi_release_req_data(struct _starpu_mpi_req *req)
 			/* We were last, release data */
 			free(coop_sends->reqs_array);
 			free(coop_sends);
-			starpu_data_release_on_node(req->data_handle, STARPU_MAIN_RAM);
+			starpu_data_release_on_node(req->data_handle, req->node);
 		}
 	}
 	else
 	{
 		/* Trivial request */
-		starpu_data_release_on_node(req->data_handle, STARPU_MAIN_RAM);
+		starpu_data_release_on_node(req->data_handle, req->node);
 	}
 }
 
@@ -265,9 +265,11 @@ void _starpu_mpi_coop_send(starpu_data_handle_t data_handle, struct _starpu_mpi_
 	/* In case we created one for nothing after all */
 	free(tofree);
 
+	unsigned node = STARPU_MAIN_RAM; // XXX For now
+
 	if (first)
 		/* We were first, we are responsible for acquiring the data for everybody */
-		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, STARPU_MAIN_RAM, mode, _starpu_mpi_coop_sends_data_ready, coop_sends, sequential_consistency, 0, &coop_sends->pre_sync_jobid, NULL);
+		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_coop_sends_data_ready, coop_sends, sequential_consistency, 0, &coop_sends->pre_sync_jobid, NULL);
 	else
 		req->pre_sync_jobid = coop_sends->pre_sync_jobid;
 }

+ 77 - 36
mpi/src/starpu_mpi_datatype.c

@@ -22,6 +22,7 @@ struct _starpu_mpi_datatype_funcs
 {
 	enum starpu_data_interface_id id;
 	starpu_mpi_datatype_allocate_func_t allocate_datatype_func;
+	starpu_mpi_datatype_node_allocate_func_t allocate_datatype_node_func;
 	starpu_mpi_datatype_free_func_t free_datatype_func;
 	UT_hash_handle hh;
 };
@@ -42,14 +43,16 @@ void _starpu_mpi_datatype_shutdown(void)
  * 	Matrix
  */
 
-static int handle_to_datatype_matrix(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static int handle_to_datatype_matrix(starpu_data_handle_t data_handle, unsigned node, MPI_Datatype *datatype)
 {
+	struct starpu_matrix_interface *matrix_interface = starpu_data_get_interface_on_node(data_handle, node);
+
 	int ret;
 
-	unsigned nx = starpu_matrix_get_nx(data_handle);
-	unsigned ny = starpu_matrix_get_ny(data_handle);
-	unsigned ld = starpu_matrix_get_local_ld(data_handle);
-	size_t elemsize = starpu_matrix_get_elemsize(data_handle);
+	unsigned nx = STARPU_MATRIX_GET_NX(matrix_interface);
+	unsigned ny = STARPU_MATRIX_GET_NY(matrix_interface);
+	unsigned ld = STARPU_MATRIX_GET_LD(matrix_interface);
+	size_t elemsize = STARPU_MATRIX_GET_ELEMSIZE(matrix_interface);
 
 	ret = MPI_Type_vector(ny, nx*elemsize, ld*elemsize, MPI_BYTE, datatype);
 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_vector failed");
@@ -64,16 +67,18 @@ static int handle_to_datatype_matrix(starpu_data_handle_t data_handle, MPI_Datat
  * 	Block
  */
 
-static int handle_to_datatype_block(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static int handle_to_datatype_block(starpu_data_handle_t data_handle, unsigned node, MPI_Datatype *datatype)
 {
+	struct starpu_block_interface *block_interface = starpu_data_get_interface_on_node(data_handle, node);
+
 	int ret;
 
-	unsigned nx = starpu_block_get_nx(data_handle);
-	unsigned ny = starpu_block_get_ny(data_handle);
-	unsigned nz = starpu_block_get_nz(data_handle);
-	unsigned ldy = starpu_block_get_local_ldy(data_handle);
-	unsigned ldz = starpu_block_get_local_ldz(data_handle);
-	size_t elemsize = starpu_block_get_elemsize(data_handle);
+	unsigned nx = STARPU_BLOCK_GET_NX(block_interface);
+	unsigned ny = STARPU_BLOCK_GET_NY(block_interface);
+	unsigned nz = STARPU_BLOCK_GET_NZ(block_interface);
+	unsigned ldy = STARPU_BLOCK_GET_LDY(block_interface);
+	unsigned ldz = STARPU_BLOCK_GET_LDZ(block_interface);
+	size_t elemsize = STARPU_BLOCK_GET_ELEMSIZE(block_interface);
 
 	MPI_Datatype datatype_2dlayer;
 	ret = MPI_Type_vector(ny, nx*elemsize, ldy*elemsize, MPI_BYTE, &datatype_2dlayer);
@@ -95,18 +100,20 @@ static int handle_to_datatype_block(starpu_data_handle_t data_handle, MPI_Dataty
  * 	Tensor
  */
 
-static int handle_to_datatype_tensor(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static int handle_to_datatype_tensor(starpu_data_handle_t data_handle, unsigned node, MPI_Datatype *datatype)
 {
+	struct starpu_tensor_interface *tensor_interface = starpu_data_get_interface_on_node(data_handle, node);
+
 	int ret;
 
-	unsigned nx = starpu_tensor_get_nx(data_handle);
-	unsigned ny = starpu_tensor_get_ny(data_handle);
-	unsigned nz = starpu_tensor_get_nz(data_handle);
-	unsigned nt = starpu_tensor_get_nt(data_handle);
-	unsigned ldy = starpu_tensor_get_local_ldy(data_handle);
-	unsigned ldz = starpu_tensor_get_local_ldz(data_handle);
-	unsigned ldt = starpu_tensor_get_local_ldt(data_handle);
-	size_t elemsize = starpu_tensor_get_elemsize(data_handle);
+	unsigned nx = STARPU_TENSOR_GET_NX(tensor_interface);
+	unsigned ny = STARPU_TENSOR_GET_NY(tensor_interface);
+	unsigned nz = STARPU_TENSOR_GET_NZ(tensor_interface);
+	unsigned nt = STARPU_TENSOR_GET_NT(tensor_interface);
+	unsigned ldy = STARPU_TENSOR_GET_LDY(tensor_interface);
+	unsigned ldz = STARPU_TENSOR_GET_LDZ(tensor_interface);
+	unsigned ldt = STARPU_TENSOR_GET_LDT(tensor_interface);
+	size_t elemsize = STARPU_TENSOR_GET_ELEMSIZE(tensor_interface);
 
 	MPI_Datatype datatype_3dlayer;
 	ret = MPI_Type_vector(ny, nx*elemsize, ldy*elemsize, MPI_BYTE, &datatype_3dlayer);
@@ -135,12 +142,14 @@ static int handle_to_datatype_tensor(starpu_data_handle_t data_handle, MPI_Datat
  * 	Vector
  */
 
-static int handle_to_datatype_vector(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static int handle_to_datatype_vector(starpu_data_handle_t data_handle, unsigned node, MPI_Datatype *datatype)
 {
+	struct starpu_vector_interface *vector_interface = starpu_data_get_interface_on_node(data_handle, node);
+
 	int ret;
 
-	unsigned nx = starpu_vector_get_nx(data_handle);
-	size_t elemsize = starpu_vector_get_elemsize(data_handle);
+	unsigned nx = STARPU_VECTOR_GET_NX(vector_interface);
+	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(vector_interface);
 
 	ret = MPI_Type_contiguous(nx*elemsize, MPI_BYTE, datatype);
 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_contiguous failed");
@@ -155,11 +164,13 @@ static int handle_to_datatype_vector(starpu_data_handle_t data_handle, MPI_Datat
  * 	Variable
  */
 
-static int handle_to_datatype_variable(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static int handle_to_datatype_variable(starpu_data_handle_t data_handle, unsigned node, MPI_Datatype *datatype)
 {
+	struct starpu_variable_interface *variable_interface = starpu_data_get_interface_on_node(data_handle, node);
+
 	int ret;
 
-	size_t elemsize = starpu_variable_get_elemsize(data_handle);
+	size_t elemsize = STARPU_VARIABLE_GET_ELEMSIZE(variable_interface);
 
 	ret = MPI_Type_contiguous(elemsize, MPI_BYTE, datatype);
 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_contiguous failed");
@@ -174,10 +185,11 @@ static int handle_to_datatype_variable(starpu_data_handle_t data_handle, MPI_Dat
  * 	Void
  */
 
-static int handle_to_datatype_void(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static int handle_to_datatype_void(starpu_data_handle_t data_handle, unsigned node, MPI_Datatype *datatype)
 {
 	int ret;
 	(void)data_handle;
+	(void)node;
 
 	ret = MPI_Type_contiguous(0, MPI_BYTE, datatype);
 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_contiguous failed");
@@ -192,7 +204,7 @@ static int handle_to_datatype_void(starpu_data_handle_t data_handle, MPI_Datatyp
  *	Generic
  */
 
-static starpu_mpi_datatype_allocate_func_t handle_to_datatype_funcs[STARPU_MAX_INTERFACE_ID] =
+static starpu_mpi_datatype_node_allocate_func_t handle_to_datatype_funcs[STARPU_MAX_INTERFACE_ID] =
 {
 //#define DYNAMIC_MATRICES
 #ifndef DYNAMIC_MATRICES
@@ -208,7 +220,7 @@ static starpu_mpi_datatype_allocate_func_t handle_to_datatype_funcs[STARPU_MAX_I
 	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
 };
 
-MPI_Datatype _starpu_mpi_datatype_get_user_defined_datatype(starpu_data_handle_t data_handle)
+MPI_Datatype _starpu_mpi_datatype_get_user_defined_datatype(starpu_data_handle_t data_handle, unsigned node)
 {
 	enum starpu_data_interface_id id = starpu_data_get_interface_id(data_handle);
 	if (id < STARPU_MAX_INTERFACE_ID) return 0;
@@ -217,10 +229,14 @@ MPI_Datatype _starpu_mpi_datatype_get_user_defined_datatype(starpu_data_handle_t
 	STARPU_PTHREAD_MUTEX_LOCK(&_starpu_mpi_datatype_funcs_table_mutex);
 	HASH_FIND_INT(_starpu_mpi_datatype_funcs_table, &id, table);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_mpi_datatype_funcs_table_mutex);
-	if (table && table->allocate_datatype_func)
+	if (table && (table->allocate_datatype_node_func || table->allocate_datatype_func))
 	{
 		MPI_Datatype datatype;
-		int ret = table->allocate_datatype_func(data_handle, &datatype);
+		int ret;
+		if (table->allocate_datatype_node_func)
+			ret = table->allocate_datatype_node_func(data_handle, node, &datatype);
+		else
+			ret = table->allocate_datatype_func(data_handle, &datatype);
 		if (ret == 0)
 			return datatype;
 		else
@@ -235,10 +251,10 @@ void _starpu_mpi_datatype_allocate(starpu_data_handle_t data_handle, struct _sta
 
 	if (id < STARPU_MAX_INTERFACE_ID)
 	{
-		starpu_mpi_datatype_allocate_func_t func = handle_to_datatype_funcs[id];
+		starpu_mpi_datatype_node_allocate_func_t func = handle_to_datatype_funcs[id];
 		if (func)
 		{
-			func(data_handle, &req->datatype);
+			func(data_handle, req->node, &req->datatype);
 			req->registered_datatype = 1;
 		}
 		else
@@ -256,8 +272,12 @@ void _starpu_mpi_datatype_allocate(starpu_data_handle_t data_handle, struct _sta
 		STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_mpi_datatype_funcs_table_mutex);
 		if (table)
 		{
-			STARPU_ASSERT_MSG(table->allocate_datatype_func, "Handle To Datatype Function not defined for StarPU data interface %d", id);
-			int ret = table->allocate_datatype_func(data_handle, &req->datatype);
+			STARPU_ASSERT_MSG(table->allocate_datatype_node_func || table->allocate_datatype_func, "Handle To Datatype Function not defined for StarPU data interface %d", id);
+			int ret;
+			if (table->allocate_datatype_node_func)
+				ret = table->allocate_datatype_node_func(data_handle, req->node, &req->datatype);
+			else
+				ret = table->allocate_datatype_func(data_handle, &req->datatype);
 			if (ret == 0)
 				req->registered_datatype = 1;
 			else
@@ -362,7 +382,7 @@ void _starpu_mpi_datatype_free(starpu_data_handle_t data_handle, MPI_Datatype *d
 	/* else the datatype is not predefined by StarPU */
 }
 
-int starpu_mpi_interface_datatype_register(enum starpu_data_interface_id id, starpu_mpi_datatype_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func)
+int _starpu_mpi_interface_datatype_register(enum starpu_data_interface_id id, starpu_mpi_datatype_node_allocate_func_t allocate_datatype_node_func, starpu_mpi_datatype_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func)
 {
 	struct _starpu_mpi_datatype_funcs *table;
 
@@ -372,6 +392,7 @@ int starpu_mpi_interface_datatype_register(enum starpu_data_interface_id id, sta
 	HASH_FIND_INT(_starpu_mpi_datatype_funcs_table, &id, table);
 	if (table)
 	{
+		table->allocate_datatype_node_func = allocate_datatype_node_func;
 		table->allocate_datatype_func = allocate_datatype_func;
 		table->free_datatype_func = free_datatype_func;
 	}
@@ -379,6 +400,7 @@ int starpu_mpi_interface_datatype_register(enum starpu_data_interface_id id, sta
 	{
 		_STARPU_MPI_MALLOC(table, sizeof(struct _starpu_mpi_datatype_funcs));
 		table->id = id;
+		table->allocate_datatype_node_func = allocate_datatype_node_func;
 		table->allocate_datatype_func = allocate_datatype_func;
 		table->free_datatype_func = free_datatype_func;
 		HASH_ADD_INT(_starpu_mpi_datatype_funcs_table, id, table);
@@ -387,6 +409,25 @@ int starpu_mpi_interface_datatype_register(enum starpu_data_interface_id id, sta
 	return 0;
 }
 
+int starpu_mpi_interface_datatype_node_register(enum starpu_data_interface_id id, starpu_mpi_datatype_node_allocate_func_t allocate_datatype_node_func, starpu_mpi_datatype_free_func_t free_datatype_func)
+{
+	return _starpu_mpi_interface_datatype_register(id, allocate_datatype_node_func, NULL, free_datatype_func);
+}
+
+int starpu_mpi_interface_datatype_register(enum starpu_data_interface_id id, starpu_mpi_datatype_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func)
+{
+	return _starpu_mpi_interface_datatype_register(id, NULL, allocate_datatype_func, free_datatype_func);
+}
+
+int starpu_mpi_datatype_node_register(starpu_data_handle_t handle, starpu_mpi_datatype_node_allocate_func_t allocate_datatype_node_func, starpu_mpi_datatype_free_func_t free_datatype_func)
+{
+	enum starpu_data_interface_id id = starpu_data_get_interface_id(handle);
+	int ret;
+	ret = starpu_mpi_interface_datatype_node_register(id, allocate_datatype_node_func, free_datatype_func);
+	STARPU_ASSERT_MSG(handle->ops->handle_to_pointer || handle->ops->to_pointer, "The data interface must define the operation 'to_pointer'\n");
+	return ret;
+}
+
 int starpu_mpi_datatype_register(starpu_data_handle_t handle, starpu_mpi_datatype_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func)
 {
 	enum starpu_data_interface_id id = starpu_data_get_interface_id(handle);

+ 1 - 1
mpi/src/starpu_mpi_datatype.h

@@ -33,7 +33,7 @@ void _starpu_mpi_datatype_shutdown(void);
 void _starpu_mpi_datatype_allocate(starpu_data_handle_t data_handle, struct _starpu_mpi_req *req);
 void _starpu_mpi_datatype_free(starpu_data_handle_t data_handle, MPI_Datatype *datatype);
 
-MPI_Datatype _starpu_mpi_datatype_get_user_defined_datatype(starpu_data_handle_t data_handle);
+MPI_Datatype _starpu_mpi_datatype_get_user_defined_datatype(starpu_data_handle_t data_handle, unsigned node);
 
 #ifdef __cplusplus
 }

+ 1 - 0
mpi/src/starpu_mpi_private.h

@@ -225,6 +225,7 @@ LIST_TYPE(_starpu_mpi_req,
 	starpu_data_handle_t data_handle;
 
 	int prio;
+	unsigned node;	/* Which StarPU memory node this will read from / write to */
 
 	/** description of the data to be sent/received */
 	MPI_Datatype datatype;

+ 1 - 0
mpi/src/starpu_mpi_req.c

@@ -25,6 +25,7 @@ void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 	/* Initialize the request structure */
 	//(*req)->data_handle = NULL;
 	//(*req)->prio = 0;
+	(*req)->node = STARPU_MAIN_RAM; // XXX For now
 
 	//(*req)->datatype = 0;
 	//(*req)->datatype_name = NULL;

+ 9 - 4
src/datawizard/coherency.c

@@ -575,9 +575,14 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
 			{
 				_starpu_update_data_state(handle, dst_replicate, mode);
-				if (is_prefetch == STARPU_TASK_PREFETCH)
-					/* Make sure it stays there */
-					dst_replicate->mc->nb_tasks_prefetch++;
+				if (dst_replicate->mc)
+				{
+					if (is_prefetch == STARPU_TASK_PREFETCH)
+						/* Make sure it stays there */
+						dst_replicate->mc->nb_tasks_prefetch++;
+
+					_starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
+				}
 
 				_starpu_spin_unlock(&handle->header_lock);
 
@@ -745,7 +750,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
 
-	if (is_prefetch > STARPU_FETCH)
+	if (mode & STARPU_R && is_prefetch > STARPU_FETCH)
 	{
 		unsigned src_node_mask = 0;
 

+ 14 - 4
src/datawizard/interfaces/data_interface.c

@@ -1217,20 +1217,30 @@ int starpu_data_interface_get_next_id(void)
 	return _data_interface_number-1;
 }
 
-int starpu_data_pack(starpu_data_handle_t handle, void **ptr, starpu_ssize_t *count)
+int starpu_data_pack_node(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count)
 {
 	STARPU_ASSERT_MSG(handle->ops->pack_data, "The datatype interface %s (%d) does not have a pack operation", handle->ops->name, handle->ops->interfaceid);
-	return handle->ops->pack_data(handle, starpu_worker_get_local_memory_node(), ptr, count);
+	return handle->ops->pack_data(handle, node, ptr, count);
 }
 
-int starpu_data_unpack(starpu_data_handle_t handle, void *ptr, size_t count)
+int starpu_data_pack(starpu_data_handle_t handle, void **ptr, starpu_ssize_t *count)
+{
+	return starpu_data_pack_node(handle, starpu_worker_get_local_memory_node(), ptr, count);
+}
+
+int starpu_data_unpack_node(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
 {
 	STARPU_ASSERT_MSG(handle->ops->unpack_data, "The datatype interface %s (%d) does not have an unpack operation", handle->ops->name, handle->ops->interfaceid);
 	int ret;
-	ret = handle->ops->unpack_data(handle, starpu_worker_get_local_memory_node(), ptr, count);
+	ret = handle->ops->unpack_data(handle, node, ptr, count);
 	return ret;
 }
 
+int starpu_data_unpack(starpu_data_handle_t handle, void *ptr, size_t count)
+{
+	return starpu_data_unpack_node(handle, starpu_worker_get_local_memory_node(), ptr, count);
+}
+
 size_t starpu_data_get_size(starpu_data_handle_t handle)
 {
 	return handle->ops->get_size(handle);

+ 0 - 1
src/datawizard/user_interactions.c

@@ -72,7 +72,6 @@ struct user_interaction_wrapper
 	unsigned async;
 	int prio;
 	void (*callback)(void *);
-	void (*callback_fetch_data)(void *); // called after fetch_data
 	void *callback_arg;
 	struct starpu_task *pre_sync_task;
 	struct starpu_task *post_sync_task;

+ 4 - 0
src/drivers/mic/driver_mic_sink.c

@@ -223,6 +223,10 @@ void _starpu_mic_sink_bind_thread(const struct _starpu_mp_node *mp_node STARPU_A
 
 void (*_starpu_mic_sink_lookup (const struct _starpu_mp_node * node STARPU_ATTRIBUTE_UNUSED, char* func_name))(void)
 {
+#ifdef RTLD_DEFAULT
+	return dlsym(RTLD_DEFAULT, func_name);
+#else
 	void *dl_handle = dlopen(NULL, RTLD_NOW);
 	return dlsym(dl_handle, func_name);
+#endif
 }

+ 4 - 0
src/drivers/mpi/driver_mpi_sink.c

@@ -44,8 +44,12 @@ void _starpu_mpi_sink_deinit(struct _starpu_mp_node *node)
 
 void (*_starpu_mpi_sink_lookup (const struct _starpu_mp_node * node STARPU_ATTRIBUTE_UNUSED, char* func_name))(void)
 {
+#ifdef RTLD_DEFAULT
+        return dlsym(RTLD_DEFAULT, func_name);
+#else
         void *dl_handle = dlopen(NULL, RTLD_NOW);
         return dlsym(dl_handle, func_name);
+#endif
 }
 
 void _starpu_mpi_sink_launch_workers(struct _starpu_mp_node *node)

+ 8 - 4
tools/perfmodels/sampling/codelets/45/starpu_sgemm_gemm.attila

@@ -29,7 +29,7 @@
 #####
 # Model for cpu0_impl0 (Comb3)
 # number of entries
-7
+8
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -41,6 +41,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	3.328725e+05   	1.185902e+04   	2.563119e+07   	8.542747e+12   	77
 c00cf6b7	29491200       	7.077888e+09   	3.328725e+05   	1.185902e+04   	2.563119e+07   	8.542747e+12   	77
 78a2cc08	29491200       	7.077888e+09   	3.328725e+05   	1.185902e+04   	2.563119e+07   	8.542747e+12   	77
+a7cdf15b	44236800       	1.415578e+10   	6.657450e+05   	2.371804e+04   	5.126238e+07   	3.417099e+13   	77
 24c84a50	11059200       	1.769472e+09   	8.321812e+04   	2.964755e+03   	6.407798e+06   	5.339217e+11   	77
 0b0b0ce8	3686400        	2.621440e+08   	1.421718e+04   	3.409134e+02   	9.098993e+05   	1.294364e+10   	64
 4220e23d	14745600       	2.097152e+09   	1.008105e+05   	2.361630e+03   	8.064841e+06   	8.134670e+11   	80
@@ -67,7 +68,7 @@ c00cf6b7	29491200       	7.077888e+09   	3.328725e+05   	1.185902e+04   	2.56311
 #####
 # Model for cuda0_impl0 (Comb0)
 # number of entries
-7
+8
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -79,6 +80,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	1.123499e+04   	6.785566e+01   	1.190909e+06   	1.338033e+10   	106
 c00cf6b7	29491200       	7.077888e+09   	1.123499e+04   	6.785566e+01   	1.190909e+06   	1.338033e+10   	106
 78a2cc08	29491200       	7.077888e+09   	1.123499e+04   	6.785566e+01   	1.190909e+06   	1.338033e+10   	106
+a7cdf15b	44236800       	1.415578e+10   	2.246998e+04   	1.357113e+02   	2.381818e+06   	5.352132e+10   	106
 24c84a50	11059200       	1.769472e+09   	2.808747e+03   	1.696392e+01   	2.977272e+05   	8.362706e+08   	106
 0b0b0ce8	3686400        	2.621440e+08   	6.738679e+02   	4.393713e+01   	6.873452e+04   	4.651489e+07   	102
 4220e23d	14745600       	2.097152e+09   	5.557425e+03   	3.241733e+02   	5.835297e+05   	3.253957e+09   	105
@@ -105,7 +107,7 @@ c00cf6b7	29491200       	7.077888e+09   	1.123499e+04   	6.785566e+01   	1.19090
 #####
 # Model for cuda1_impl0 (Comb2)
 # number of entries
-7
+8
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -117,6 +119,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	1.123077e+04   	9.504466e+01   	1.179231e+06   	1.324463e+10   	105
 c00cf6b7	29491200       	7.077888e+09   	1.123077e+04   	9.504466e+01   	1.179231e+06   	1.324463e+10   	105
 78a2cc08	29491200       	7.077888e+09   	1.123077e+04   	9.504466e+01   	1.179231e+06   	1.324463e+10   	105
+a7cdf15b	44236800       	1.415578e+10   	2.246154e+04   	1.900893e+02   	2.358462e+06   	5.297852e+10   	105
 24c84a50	11059200       	1.769472e+09   	2.807693e+03   	2.376116e+01   	2.948078e+05   	8.277894e+08   	105
 0b0b0ce8	3686400        	2.621440e+08   	6.672056e+02   	3.376608e+01   	6.805497e+04   	4.552295e+07   	102
 4220e23d	14745600       	2.097152e+09   	5.553764e+03   	3.500896e+02   	5.831453e+05   	3.251521e+09   	105
@@ -143,7 +146,7 @@ c00cf6b7	29491200       	7.077888e+09   	1.123077e+04   	9.504466e+01   	1.17923
 #####
 # Model for cuda2_impl0 (Comb1)
 # number of entries
-7
+8
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -155,6 +158,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	1.124174e+04   	2.629960e+01   	1.180383e+06   	1.326963e+10   	105
 c00cf6b7	29491200       	7.077888e+09   	1.124174e+04   	2.629960e+01   	1.180383e+06   	1.326963e+10   	105
 78a2cc08	29491200       	7.077888e+09   	1.124174e+04   	2.629960e+01   	1.180383e+06   	1.326963e+10   	105
+a7cdf15b	44236800       	1.415578e+10   	2.248348e+04   	5.259920e+01   	2.360766e+06   	5.307852e+10   	105
 24c84a50	11059200       	1.769472e+09   	2.810435e+03   	6.574900e+00   	2.950958e+05   	8.293519e+08   	105
 0b0b0ce8	3686400        	2.621440e+08   	6.002221e+02   	2.259043e+01   	6.242310e+04   	3.752080e+07   	104
 4220e23d	14745600       	2.097152e+09   	5.577722e+03   	1.615194e+02   	5.912385e+05   	3.300529e+09   	106

+ 18 - 9
tools/perfmodels/sampling/codelets/45/starpu_sgemm_gemm.idgraf

@@ -28,7 +28,7 @@
 #####
 # Model for cuda0_impl0 (Comb2)
 # number of entries
-7
+8
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -42,6 +42,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	1.150361e+04   	5.884814e+02   	1.000814e+06   	1.154310e+10   	87
 c00cf6b7	29491200       	7.077888e+09   	1.150361e+04   	5.884814e+02   	1.000814e+06   	1.154310e+10   	87
 78a2cc08	29491200       	7.077888e+09   	1.150361e+04   	5.884814e+02   	1.000814e+06   	1.154310e+10   	87
+a7cdf15b	44236800       	1.415578e+10   	2.300722e+04   	1.176963e+03   	2.001628e+06   	4.617240e+10   	87
 24c84a50	11059200       	1.769472e+09   	2.875903e+03   	1.471204e+02   	2.502035e+05   	7.214438e+08   	87
 
 ####################
@@ -66,7 +67,7 @@ c00cf6b7	29491200       	7.077888e+09   	1.150361e+04   	5.884814e+02   	1.00081
 #####
 # Model for cuda1_impl0 (Comb4)
 # number of entries
-7
+8
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -80,6 +81,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	1.157020e+04   	6.521027e+02   	1.018178e+06   	1.181795e+10   	88
 c00cf6b7	29491200       	7.077888e+09   	1.157020e+04   	6.521027e+02   	1.018178e+06   	1.181795e+10   	88
 78a2cc08	29491200       	7.077888e+09   	1.157020e+04   	6.521027e+02   	1.018178e+06   	1.181795e+10   	88
+a7cdf15b	44236800       	1.415578e+10   	2.314040e+04   	1.304205e+03   	2.036356e+06   	4.727180e+10   	88
 24c84a50	11059200       	1.769472e+09   	2.892550e+03   	1.630257e+02   	2.545445e+05   	7.386219e+08   	88
 
 ####################
@@ -104,7 +106,7 @@ c00cf6b7	29491200       	7.077888e+09   	1.157020e+04   	6.521027e+02   	1.01817
 #####
 # Model for cuda2_impl0 (Comb6)
 # number of entries
-7
+8
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -118,6 +120,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	1.162826e+04   	6.757302e+02   	1.023286e+06   	1.193922e+10   	88
 c00cf6b7	29491200       	7.077888e+09   	1.162826e+04   	6.757302e+02   	1.023286e+06   	1.193922e+10   	88
 78a2cc08	29491200       	7.077888e+09   	1.162826e+04   	6.757302e+02   	1.023286e+06   	1.193922e+10   	88
+a7cdf15b	44236800       	1.415578e+10   	2.325652e+04   	1.351460e+03   	2.046572e+06   	4.775688e+10   	88
 24c84a50	11059200       	1.769472e+09   	2.907065e+03   	1.689325e+02   	2.558215e+05   	7.462012e+08   	88
 
 ####################
@@ -142,7 +145,7 @@ c00cf6b7	29491200       	7.077888e+09   	1.162826e+04   	6.757302e+02   	1.02328
 #####
 # Model for cuda3_impl0 (Comb7)
 # number of entries
-7
+8
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -156,6 +159,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	1.150498e+04   	4.254093e+02   	9.894285e+05   	1.139892e+10   	86
 c00cf6b7	29491200       	7.077888e+09   	1.150498e+04   	4.254093e+02   	9.894285e+05   	1.139892e+10   	86
 78a2cc08	29491200       	7.077888e+09   	1.150498e+04   	4.254093e+02   	9.894285e+05   	1.139892e+10   	86
+a7cdf15b	44236800       	1.415578e+10   	2.300996e+04   	8.508186e+02   	1.978857e+06   	4.559568e+10   	86
 24c84a50	11059200       	1.769472e+09   	2.876245e+03   	1.063523e+02   	2.473571e+05   	7.124325e+08   	86
 
 ####################
@@ -180,7 +184,7 @@ c00cf6b7	29491200       	7.077888e+09   	1.150498e+04   	4.254093e+02   	9.89428
 #####
 # Model for cuda4_impl0 (Comb0)
 # number of entries
-7
+8
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -194,6 +198,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	1.146770e+04   	1.768909e+02   	1.100899e+06   	1.262778e+10   	96
 c00cf6b7	29491200       	7.077888e+09   	1.146770e+04   	1.768909e+02   	1.100899e+06   	1.262778e+10   	96
 78a2cc08	29491200       	7.077888e+09   	1.146770e+04   	1.768909e+02   	1.100899e+06   	1.262778e+10   	96
+a7cdf15b	44236800       	1.415578e+10   	2.293540e+04   	3.537818e+02   	2.201798e+06   	5.051112e+10   	96
 24c84a50	11059200       	1.769472e+09   	2.866925e+03   	4.422272e+01   	2.752248e+05   	7.892362e+08   	96
 
 ####################
@@ -218,7 +223,7 @@ c00cf6b7	29491200       	7.077888e+09   	1.146770e+04   	1.768909e+02   	1.10089
 #####
 # Model for cuda5_impl0 (Comb1)
 # number of entries
-7
+8
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -232,6 +237,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	1.149102e+04   	5.375188e+02   	1.114629e+06   	1.283625e+10   	97
 c00cf6b7	29491200       	7.077888e+09   	1.149102e+04   	5.375188e+02   	1.114629e+06   	1.283625e+10   	97
 78a2cc08	29491200       	7.077888e+09   	1.149102e+04   	5.375188e+02   	1.114629e+06   	1.283625e+10   	97
+a7cdf15b	44236800       	1.415578e+10   	2.298204e+04   	1.075038e+03   	2.229258e+06   	5.134500e+10   	97
 24c84a50	11059200       	1.769472e+09   	2.872755e+03   	1.343797e+02   	2.786572e+05   	8.022656e+08   	97
 
 ####################
@@ -256,7 +262,7 @@ c00cf6b7	29491200       	7.077888e+09   	1.149102e+04   	5.375188e+02   	1.11462
 #####
 # Model for cuda6_impl0 (Comb3)
 # number of entries
-7
+8
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -270,6 +276,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	1.155069e+04   	5.660846e+02   	1.108866e+06   	1.283893e+10   	96
 c00cf6b7	29491200       	7.077888e+09   	1.155069e+04   	5.660846e+02   	1.108866e+06   	1.283893e+10   	96
 78a2cc08	29491200       	7.077888e+09   	1.155069e+04   	5.660846e+02   	1.108866e+06   	1.283893e+10   	96
+a7cdf15b	44236800       	1.415578e+10   	2.310138e+04   	1.132169e+03   	2.217732e+06   	5.135572e+10   	96
 24c84a50	11059200       	1.769472e+09   	2.887673e+03   	1.415212e+02   	2.772165e+05   	8.024331e+08   	96
 
 ####################
@@ -294,7 +301,7 @@ c00cf6b7	29491200       	7.077888e+09   	1.155069e+04   	5.660846e+02   	1.10886
 #####
 # Model for cuda7_impl0 (Comb5)
 # number of entries
-7
+8
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -308,6 +315,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	1.144012e+04   	2.531108e+02   	1.109691e+06   	1.270122e+10   	97
 c00cf6b7	29491200       	7.077888e+09   	1.144012e+04   	2.531108e+02   	1.109691e+06   	1.270122e+10   	97
 78a2cc08	29491200       	7.077888e+09   	1.144012e+04   	2.531108e+02   	1.109691e+06   	1.270122e+10   	97
+a7cdf15b	44236800       	1.415578e+10   	2.288024e+04   	5.062216e+02   	2.219382e+06   	5.080488e+10   	97
 24c84a50	11059200       	1.769472e+09   	2.860030e+03   	6.327770e+01   	2.774228e+05   	7.938262e+08   	97
 
 ####################
@@ -332,7 +340,7 @@ c00cf6b7	29491200       	7.077888e+09   	1.144012e+04   	2.531108e+02   	1.10969
 #####
 # Model for cpu0_impl0 (Comb8)
 # number of entries
-7
+8
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -346,5 +354,6 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	3.621356e+05   	7.764608e+03   	8.329119e+06   	3.017657e+12   	23
 c00cf6b7	29491200       	7.077888e+09   	3.621356e+05   	7.764608e+03   	8.329119e+06   	3.017657e+12   	23
 78a2cc08	29491200       	7.077888e+09   	3.621356e+05   	7.764608e+03   	8.329119e+06   	3.017657e+12   	23
+a7cdf15b	44236800       	1.415578e+10   	7.242712e+05   	1.552922e+04   	1.665824e+07   	1.207063e+13   	23
 24c84a50	11059200       	1.769472e+09   	9.053390e+04   	1.941152e+03   	2.082280e+06   	1.886036e+11   	23
 

+ 8 - 4
tools/perfmodels/sampling/codelets/45/starpu_sgemm_gemm.mirage

@@ -29,7 +29,7 @@
 #####
 # Model for cpu0_impl0 (Comb3)
 # number of entries
-7
+8
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -42,6 +42,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	3.550396e+05   	8.949994e+03   	2.840317e+07   	1.009066e+13   	80
 c00cf6b7	29491200       	7.077888e+09   	3.550396e+05   	8.949994e+03   	2.840317e+07   	1.009066e+13   	80
 78a2cc08	29491200       	7.077888e+09   	3.550396e+05   	8.949994e+03   	2.840317e+07   	1.009066e+13   	80
+a7cdf15b	44236800       	1.415578e+10   	7.100792e+05   	1.789999e+04   	5.680634e+07   	4.036264e+13   	80
 24c84a50	11059200       	1.769472e+09   	8.875990e+04   	2.237499e+03   	7.100792e+06   	6.306662e+11   	80
 4220e23d	14745600       	2.097152e+09   	1.078112e+05   	1.983800e+03   	8.624897e+06   	9.301755e+11   	80
 
@@ -67,7 +68,7 @@ c00cf6b7	29491200       	7.077888e+09   	3.550396e+05   	8.949994e+03   	2.84031
 #####
 # Model for cuda0_impl0 (Comb1)
 # number of entries
-7
+8
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -80,6 +81,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	1.151398e+04   	9.050114e+01   	1.220482e+06   	1.405348e+10   	106
 c00cf6b7	29491200       	7.077888e+09   	1.151398e+04   	9.050114e+01   	1.220482e+06   	1.405348e+10   	106
 78a2cc08	29491200       	7.077888e+09   	1.151398e+04   	9.050114e+01   	1.220482e+06   	1.405348e+10   	106
+a7cdf15b	44236800       	1.415578e+10   	2.302796e+04   	1.810023e+02   	2.440964e+06   	5.621392e+10   	106
 24c84a50	11059200       	1.769472e+09   	2.878495e+03   	2.262529e+01   	3.051205e+05   	8.783425e+08   	106
 4220e23d	14745600       	2.097152e+09   	5.574713e+03   	3.353004e+02   	5.909196e+05   	3.306125e+09   	106
 
@@ -105,7 +107,7 @@ c00cf6b7	29491200       	7.077888e+09   	1.151398e+04   	9.050114e+01   	1.22048
 #####
 # Model for cuda1_impl0 (Comb0)
 # number of entries
-7
+8
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -118,6 +120,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	1.150036e+04   	8.404527e+01   	1.207538e+06   	1.388786e+10   	105
 c00cf6b7	29491200       	7.077888e+09   	1.150036e+04   	8.404527e+01   	1.207538e+06   	1.388786e+10   	105
 78a2cc08	29491200       	7.077888e+09   	1.150036e+04   	8.404527e+01   	1.207538e+06   	1.388786e+10   	105
+a7cdf15b	44236800       	1.415578e+10   	2.300072e+04   	1.680905e+02   	2.415076e+06   	5.555144e+10   	105
 24c84a50	11059200       	1.769472e+09   	2.875090e+03   	2.101132e+01   	3.018845e+05   	8.679912e+08   	105
 4220e23d	14745600       	2.097152e+09   	5.579034e+03   	3.672012e+02   	5.857985e+05   	3.282348e+09   	105
 
@@ -143,7 +146,7 @@ c00cf6b7	29491200       	7.077888e+09   	1.150036e+04   	8.404527e+01   	1.20753
 #####
 # Model for cuda2_impl0 (Comb2)
 # number of entries
-7
+8
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -156,6 +159,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	1.148096e+04   	7.289415e+01   	1.205501e+06   	1.384086e+10   	105
 c00cf6b7	29491200       	7.077888e+09   	1.148096e+04   	7.289415e+01   	1.205501e+06   	1.384086e+10   	105
 78a2cc08	29491200       	7.077888e+09   	1.148096e+04   	7.289415e+01   	1.205501e+06   	1.384086e+10   	105
+a7cdf15b	44236800       	1.415578e+10   	2.296192e+04   	1.457883e+02   	2.411002e+06   	5.536344e+10   	105
 24c84a50	11059200       	1.769472e+09   	2.870240e+03   	1.822354e+01   	3.013752e+05   	8.650538e+08   	105
 4220e23d	14745600       	2.097152e+09   	5.580581e+03   	3.970717e+02   	5.859610e+05   	3.286558e+09   	105
 

+ 10 - 5
tools/perfmodels/sampling/codelets/45/starpu_sgemm_gemm.sirocco

@@ -28,7 +28,7 @@
 #####
 # Model for cuda0_impl0 (Comb2)
 # number of entries
-8
+9
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -40,6 +40,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	2.745578e+03   	3.064191e+02   	6.616844e+05   	1.839335e+09   	241
 c00cf6b7	29491200       	7.077888e+09   	2.745578e+03   	3.064191e+02   	6.616844e+05   	1.839335e+09   	241
 78a2cc08	29491200       	7.077888e+09   	2.745578e+03   	3.064191e+02   	6.616844e+05   	1.839335e+09   	241
+a7cdf15b	44236800       	1.415578e+10   	5.491156e+03   	6.128382e+02   	1.323369e+06   	7.357340e+09   	241
 24c84a50	11059200       	1.769472e+09   	6.863945e+02   	7.660478e+01   	1.654211e+05   	1.149584e+08   	241
 0b0b0ce8	3686400        	2.621440e+08   	1.582927e+02   	3.333442e+01   	3.434951e+04   	5.678402e+06   	217
 4220e23d	14745600       	2.097152e+09   	8.206871e+02   	1.017181e+02   	1.148962e+05   	9.574235e+07   	140
@@ -67,7 +68,7 @@ c00cf6b7	29491200       	7.077888e+09   	2.745578e+03   	3.064191e+02   	6.61684
 #####
 # Model for cuda3_impl0 (Comb1)
 # number of entries
-8
+9
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -79,6 +80,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	2.686428e+03   	2.002215e+02   	6.716071e+05   	1.814247e+09   	250
 c00cf6b7	29491200       	7.077888e+09   	2.686428e+03   	2.002215e+02   	6.716071e+05   	1.814247e+09   	250
 78a2cc08	29491200       	7.077888e+09   	2.686428e+03   	2.002215e+02   	6.716071e+05   	1.814247e+09   	250
+a7cdf15b	44236800       	1.415578e+10   	5.372856e+03   	4.004430e+02   	1.343214e+06   	7.256988e+09   	251
 24c84a50	11059200       	1.769472e+09   	6.716070e+02   	5.005537e+01   	1.679018e+05   	1.133904e+08   	250
 0b0b0ce8	3686400        	2.621440e+08   	1.630480e+02   	3.438768e+01   	3.097912e+04   	5.275762e+06   	190
 4220e23d	14745600       	2.097152e+09   	8.448030e+02   	7.773742e+01   	2.433033e+05   	2.072837e+08   	288
@@ -106,7 +108,7 @@ c00cf6b7	29491200       	7.077888e+09   	2.686428e+03   	2.002215e+02   	6.71607
 #####
 # Model for cuda1_impl0 (Comb0)
 # number of entries
-8
+9
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -118,6 +120,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	2.791098e+03   	3.147711e+02   	6.503258e+05   	1.838209e+09   	233
 c00cf6b7	29491200       	7.077888e+09   	2.791098e+03   	3.147711e+02   	6.503258e+05   	1.838209e+09   	233
 78a2cc08	29491200       	7.077888e+09   	2.791098e+03   	3.147711e+02   	6.503258e+05   	1.838209e+09   	233
+a7cdf15b	44236800       	1.415578e+10   	5.582196e+03   	6.295422e+02   	1.300652e+06   	7.352836e+09   	233
 24c84a50	11059200       	1.769472e+09   	6.977745e+02   	7.869277e+01   	1.625815e+05   	1.148881e+08   	233
 0b0b0ce8	3686400        	2.621440e+08   	1.624855e+02   	3.298013e+01   	2.940987e+04   	4.975550e+06   	181
 4220e23d	14745600       	2.097152e+09   	8.152506e+02   	1.017614e+02   	1.173961e+05   	9.719839e+07   	144
@@ -145,7 +148,7 @@ c00cf6b7	29491200       	7.077888e+09   	2.791098e+03   	3.147711e+02   	6.50325
 #####
 # Model for cuda2_impl0 (Comb3)
 # number of entries
-8
+9
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -157,6 +160,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	2.754203e+03   	2.682327e+02   	6.830422e+05   	1.899080e+09   	248
 c00cf6b7	29491200       	7.077888e+09   	2.754203e+03   	2.682327e+02   	6.830422e+05   	1.899080e+09   	248
 78a2cc08	29491200       	7.077888e+09   	2.754203e+03   	2.682327e+02   	6.830422e+05   	1.899080e+09   	248
+a7cdf15b	44236800       	1.415578e+10   	5.508406e+03   	5.364654e+02   	1.366084e+06   	7.596320e+09   	248
 24c84a50	11059200       	1.769472e+09   	6.885507e+02   	6.705818e+01   	1.707605e+05   	1.186925e+08   	248
 0b0b0ce8	3686400        	2.621440e+08   	1.622246e+02   	3.553894e+01   	3.714942e+04   	6.315779e+06   	229
 4220e23d	14745600       	2.097152e+09   	8.611626e+02   	9.290485e+01   	2.411255e+05   	2.100651e+08   	280
@@ -184,7 +188,7 @@ c00cf6b7	29491200       	7.077888e+09   	2.754203e+03   	2.682327e+02   	6.83042
 #####
 # Model for cpu0_impl0 (Comb4)
 # number of entries
-8
+9
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -196,6 +200,7 @@ nan            	nan            	nan
 9c6670ef	29491200       	7.077888e+09   	1.712078e+05   	4.163047e+04   	2.773567e+07   	5.029326e+12   	162
 c00cf6b7	29491200       	7.077888e+09   	1.712078e+05   	4.163047e+04   	2.773567e+07   	5.029326e+12   	162
 78a2cc08	29491200       	7.077888e+09   	1.712078e+05   	4.163047e+04   	2.773567e+07   	5.029326e+12   	162
+a7cdf15b	44236800       	1.415578e+10   	3.424156e+05   	8.326094e+04   	5.547134e+07   	2.011730e+13   	162
 24c84a50	11059200       	1.769472e+09   	4.280195e+04   	1.040762e+04   	6.933918e+06   	3.143329e+11   	162
 0b0b0ce8	3686400        	2.621440e+08   	6.441655e+03   	1.152866e+03   	3.220827e+05   	2.141201e+09   	50
 4220e23d	14745600       	2.097152e+09   	4.927734e+04   	1.166029e+04   	5.913281e+06   	3.077063e+11   	120