Kaynağa Gözat

new merge with the trunk

Andra Hugo 12 yıl önce
ebeveyn
işleme
a6a9a0ac35

+ 6 - 0
ChangeLog

@@ -36,6 +36,9 @@ New features:
   * SOCL
         - Manual mapping of commands on specific devices is now possible
   * New interface: COO matrix.
+  * Data interfaces: The pack operation of user-defined data interfaces
+    now takes a new parameter, count, which must be set to the size of
+    the buffer created by packing the data.
   * MPI:
         - Communication statistics for MPI can only be enabled at
 	  execution time by defining the environment variable
@@ -50,6 +53,9 @@ New features:
         - Collective detached operations have new parameters, a
 	  callback function and an argument. This is to be consistent
 	  with the detached point-to-point communications.
+        - When exchanging user-defined data interfaces, the size of
+	  the data is the size returned by the pack operation, i.e.
+	  data with dynamic size can now be exchanged with StarPU-MPI.
 
 Changes:
   * Fix the block filter functions.

+ 57 - 59
configure.ac

@@ -1451,88 +1451,86 @@ AC_ARG_ENABLE(blas-lib,
  ])
 
 if test x$blas_lib = xmaybe -o x$blas_lib = xgoto; then
-AC_ARG_WITH(goto-dir, [AS_HELP_STRING([--with-goto-dir=<dir>], [specify GotoBLAS lib location])],
-	[
-		blas_lib=goto
-		gotodir=$withval
-		AC_SUBST(GOTODIR, $gotodir)
-
-		CPPFLAGS="${CPPFLAGS} -I$gotodir/ "
-		LDFLAGS="${LDFLAGS} -L$gotodir/ "
+   AC_ARG_WITH(goto-dir, [AS_HELP_STRING([--with-goto-dir=<dir>], [specify GotoBLAS lib location])],
+   	[
+	    blas_lib=goto
+	    gotodir=$withval
+	    AC_SUBST(GOTODIR, $gotodir)
+
+	    CPPFLAGS="${CPPFLAGS} -I$gotodir/ "
+	    LDFLAGS="${LDFLAGS} -L$gotodir/ "
 	]
 	)
 
-if test x$blas_lib = xgoto; then
-STARPU_CHECK_LIB(BLAS, gfortran, main,,)
-STARPU_CHECK_LIB(BLAS, ifcore, main,,)
-# Perhaps that GotoBLAS2 is available instead (so that we have libgotoblas2.{so,a})
-STARPU_CHECK_LIB(BLAS, goto2, sgemm_,, [havegoto2=no], [$STARPU_BLAS_LDFLAGS])
-if test x$havegoto2 = xno; then
-STARPU_CHECK_LIB(BLAS, goto, sgemm_,,AC_MSG_ERROR([cannot find goto lib]), [$STARPU_BLAS_LDFLAGS])
-fi
-AC_DEFINE(STARPU_GOTO, [1], [use STARPU_GOTO library])
-fi
-
+   if test x$blas_lib = xgoto; then
+       STARPU_CHECK_LIB(BLAS, gfortran, main,,)
+       STARPU_CHECK_LIB(BLAS, ifcore, main,,)
+       # Perhaps that GotoBLAS2 is available instead (so that we have libgotoblas2.{so,a})
+       STARPU_CHECK_LIB(BLAS, goto2, sgemm_,, [havegoto2=no], [$STARPU_BLAS_LDFLAGS])
+       if test x$havegoto2 = xno; then
+	   STARPU_CHECK_LIB(BLAS, goto, sgemm_,,AC_MSG_ERROR([cannot find goto lib]), [$STARPU_BLAS_LDFLAGS])
+       fi
+       AC_DEFINE(STARPU_GOTO, [1], [use STARPU_GOTO library])
+   fi
 fi
 
 if test x$blas_lib = xmaybe -o x$blas_lib = xatlas; then
-AC_ARG_WITH(atlas-dir, [AS_HELP_STRING([--with-atlas-dir=<dir>], [specify ATLAS lib location])],
+    AC_ARG_WITH(atlas-dir, [AS_HELP_STRING([--with-atlas-dir=<dir>], [specify ATLAS lib location])],
 	[
-		AC_MSG_CHECKING(STARPU_ATLAS location)
-		blas_lib=atlas
-		atlasdir=$withval
-		AC_MSG_RESULT($atlasdir)
-		AC_SUBST(ATLASDIR, $atlasdir)
-
-		CPPFLAGS="${CPPFLAGS} -I$atlasdir/include/ "
-		LDFLAGS="${LDFLAGS} -L$atlasdir/lib/ "
+	    AC_MSG_CHECKING(STARPU_ATLAS location)
+	    blas_lib=atlas
+	    atlasdir=$withval
+	    AC_MSG_RESULT($atlasdir)
+	    AC_SUBST(ATLASDIR, $atlasdir)
+
+	    CPPFLAGS="${CPPFLAGS} -I$atlasdir/include/ "
+	    LDFLAGS="${LDFLAGS} -L$atlasdir/lib/ "
 	]
-	)
-
-if test x$blas_lib = xatlas; then
-# test whether STARPU_ATLAS is actually available
-AC_CHECK_HEADER([cblas.h],,AC_MSG_ERROR([cannot find atlas headers]))
-STARPU_CHECK_LIB(BLAS, atlas, ATL_sgemm,,AC_MSG_ERROR([cannot find atlas lib]),)
-STARPU_CHECK_LIB(BLAS, cblas, cblas_sgemm,,AC_MSG_ERROR([cannot find atlas lib]),[-latlas])
-AC_DEFINE(STARPU_ATLAS, [1], [use STARPU_ATLAS library])
-fi
-
+    )
+
+    if test x$blas_lib = xatlas; then
+	# test whether STARPU_ATLAS is actually available
+	AC_CHECK_HEADER([cblas.h],,AC_MSG_ERROR([cannot find atlas headers]))
+	STARPU_CHECK_LIB(BLAS, atlas, ATL_sgemm,,AC_MSG_ERROR([cannot find atlas lib]),)
+	STARPU_CHECK_LIB(BLAS, cblas, cblas_sgemm,,AC_MSG_ERROR([cannot find atlas lib]),[-latlas])
+	AC_DEFINE(STARPU_ATLAS, [1], [use STARPU_ATLAS library])
+    fi
 fi
 
 if test x$blas_lib = xmaybe; then
-	# Should we use MKL ?
-	AC_ARG_WITH(mkl-cflags, [AS_HELP_STRING([--with-mkl-cflags], [specify MKL compilation flags])],
-		[
-			CPPFLAGS="${CPPFLAGS} $withval"
-			blas_lib=mkl
-		])
+    # Should we use MKL ?
+    AC_ARG_WITH(mkl-cflags, [AS_HELP_STRING([--with-mkl-cflags], [specify MKL compilation flags])],
+	[
+	    CPPFLAGS="${CPPFLAGS} $withval"
+	    blas_lib=mkl
+	    ])
 
-	AC_ARG_WITH(mkl-ldflags, [AS_HELP_STRING([--with-mkl-ldflags], [specify MKL linking flags])],
-		[
-			LDFLAGS="${LDFLAGS} $withval"
-			blas_lib=mkl
-		])
-	if test x$blas_lib = xmkl; then
-	        AC_DEFINE(STARPU_MKL, [1], [use MKL library])
-	fi
+    AC_ARG_WITH(mkl-ldflags, [AS_HELP_STRING([--with-mkl-ldflags], [specify MKL linking flags])],
+	[
+	    LDFLAGS="${LDFLAGS} $withval"
+	    blas_lib=mkl
+	    ])
+    if test x$blas_lib = xmkl; then
+	AC_DEFINE(STARPU_MKL, [1], [use MKL library])
+    fi
 fi
 
 if test x$blas_lib = xmaybe; then
-     #perhaps it is possible to use some BLAS lib from the system
-     use_system_blas=no
-     STARPU_SEARCH_LIBS(BLAS,[sgemm_],[blas],use_system_blas=yes,,)
-     if test x$use_system_blas = xyes; then
+    #perhaps it is possible to use some BLAS lib from the system
+    use_system_blas=no
+    STARPU_SEARCH_LIBS(BLAS,[sgemm_],[blas],use_system_blas=yes,,)
+    if test x$use_system_blas = xyes; then
         AC_DEFINE(STARPU_SYSTEM_BLAS, [1], [use refblas library])
 	blas_lib=system
-     elif test x"$BLAS_LIBS" != x; then
+    elif test x"$BLAS_LIBS" != x; then
         AC_DEFINE(STARPU_SYSTEM_BLAS, [1], [use user defined library])
         STARPU_BLAS_LDFLAGS="$BLAS_LIBS"
         AC_SUBST(STARPU_BLAS_LDFLAGS)
         blas_lib=system
         AC_ARG_VAR([BLAS_LIBS], [linker flags for blas])
-     else
+    else
 	blas_lib=none
-     fi
+    fi
 fi
 
 AM_CONDITIONAL(ATLAS_BLAS_LIB, test x$blas_lib = xatlas)

+ 4 - 4
doc/chapters/advanced-api.texi

@@ -73,11 +73,11 @@ todo
 @item @code{struct starpu_multiformat_data_interface_ops* (*get_mf_ops)(void *data_interface)}
 todo
 
-@item @code{int (*pack_data)(starpu_data_handle_t handle, uint32_t node, void **ptr)}
-Pack the data handle into a contiguous buffer at the address @code{ptr}
+@item @code{int (*pack_data)(starpu_data_handle_t handle, uint32_t node, void **ptr, size_t *count)}
+Pack the data handle into a contiguous buffer at the address @code{ptr} and set the size of the newly created buffer in @code{count}
 
-@item @code{int (*unpack_data)(starpu_data_handle_t handle, uint32_t node, void *ptr)}
-Unpack the data handle from the contiguous buffer at the address @code{ptr}
+@item @code{int (*unpack_data)(starpu_data_handle_t handle, uint32_t node, void *ptr, size_t count)}
+Unpack the data handle from the contiguous buffer at the address @code{ptr} of size @var{count}
 
 @end table
 @end deftp

+ 5 - 4
doc/chapters/basic-api.texi

@@ -767,10 +767,11 @@ The function also sets @var{count} to the size of the data handle by calling
 @code{starpu_handle_get_size()}.
 @end deftypefun
 
-@deftypefun int starpu_handle_unpack_data (starpu_data_handle_t @var{handle}, {void *}@var{ptr})
-Copy in @var{handle} the data located at @var{ptr} as described by the
-interface of the data. The interface registered at @var{handle} must
-define a unpacking operation (@pxref{struct starpu_data_interface_ops}).
+@deftypefun int starpu_handle_unpack_data (starpu_data_handle_t @var{handle}, {void *}@var{ptr}, size_t @var{count})
+Unpack in @var{handle} the data located at @var{ptr} of size
+@var{count} as described by the interface of the data. The interface
+registered at @var{handle} must define an unpacking operation
+(@pxref{struct starpu_data_interface_ops}).
 @end deftypefun
 
 @node Accessing Variable Data Interfaces

+ 5 - 4
doc/chapters/mpi-support.texi

@@ -259,21 +259,22 @@ of data interface} can also be used within StarPU-MPI and exchanged
 between nodes. Two functions need to be defined through
 the type @code{struct starpu_data_interface_ops} (@pxref{Data
 Interface API}). The pack function takes a handle and returns a
-contiguous memory buffer where data to be conveyed to another node
+contiguous memory buffer, along with its size, where data to be conveyed to another node
 should be copied. The reversed operation is implemented in the unpack
 function which takes a contiguous memory buffer and recreates the data
 handle.
 
 @cartouche
 @smallexample
-static int complex_pack_data(starpu_data_handle_t handle, uint32_t node, void **ptr)
+static int complex_pack_data(starpu_data_handle_t handle, uint32_t node, void **ptr, size_t *count)
 @{
   STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
   struct starpu_complex_interface *complex_interface =
     (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, node);
 
-  *ptr = malloc(complex_get_size(handle));
+  *count = complex_get_size(handle);
+  *ptr = malloc(*count);
   memcpy(*ptr, complex_interface->real, complex_interface->nx*sizeof(double));
   memcpy(*ptr+complex_interface->nx*sizeof(double), complex_interface->imaginary,
          complex_interface->nx*sizeof(double));
@@ -285,7 +286,7 @@ static int complex_pack_data(starpu_data_handle_t handle, uint32_t node, void **
 
 @cartouche
 @smallexample
-static int complex_unpack_data(starpu_data_handle_t handle, uint32_t node, void *ptr)
+static int complex_unpack_data(starpu_data_handle_t handle, uint32_t node, void *ptr, size_t count)
 @{
   STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 

+ 2 - 0
examples/Makefile.am

@@ -749,6 +749,8 @@ interface_complex_SOURCES	=	\
 if STARPU_USE_CUDA
 interface_complex_SOURCES	+=	\
 	interface/complex_kernels.cu
+interface/complex_kernels.o: interface/complex_kernels.cu
+	$(NVCC) $< -c -o $@ $(NVCCFLAGS) -arch sm_13
 endif
 
 if STARPU_USE_OPENCL

+ 26 - 3
examples/interface/complex.c

@@ -18,6 +18,28 @@
 #include "complex_interface.h"
 #include "complex_codelet.h"
 
+/*
+ * Codelet can_execute hook: tell whether worker `workerid` may run the task.
+ *
+ * Returns 1 for OpenCL workers.  When CUDA support is compiled in, the
+ * device properties of the worker are queried and 1 is returned only if
+ * the device reports major >= 2 or minor >= 3 (compute capability 1.3 or
+ * newer, i.e. double precision support); 0 otherwise.  Without CUDA
+ * support there is no device restriction to enforce, so 1 is returned
+ * instead of falling off the end of a non-void function.
+ *
+ * `task` and `nimpl` are not used.
+ *
+ * NOTE(review): the CUDA branch assumes every non-OpenCL worker reaching
+ * it is a CUDA worker -- confirm against the codelet's `where` settings.
+ */
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
+{
+       (void) task;
+       (void) nimpl;
+
+       if (starpu_worker_get_type(workerid) == STARPU_OPENCL_WORKER)
+               return 1;
+
+#ifdef STARPU_USE_CUDA
+       /* Cuda device */
+       const struct cudaDeviceProp *props;
+       props = starpu_cuda_get_device_properties(workerid);
+       if (props->major >= 2 || props->minor >= 3)
+       {
+               /* At least compute capability 1.3, supports doubles */
+               return 1;
+       }
+       else
+       {
+               /* Old card does not support doubles */
+               return 0;
+       }
+#else
+       /* No CUDA support compiled in: nothing to restrict */
+       return 1;
+#endif
+}
+
 #ifdef STARPU_USE_CUDA
 extern void copy_complex_codelet_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 #endif
@@ -34,10 +56,10 @@ struct starpu_codelet cl_copy =
 	.opencl_funcs = {copy_complex_codelet_opencl, NULL},
 #endif
 	.nbuffers = 2,
-	.modes = {STARPU_R, STARPU_W}
+	.modes = {STARPU_R, STARPU_W},
+	.can_execute = can_execute
 };
 
-
 #ifdef STARPU_USE_OPENCL
 struct starpu_opencl_program opencl_program;
 #endif
@@ -95,7 +117,6 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) goto enodev;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
-
 	ret = starpu_insert_task(&cl_compare,
 				 STARPU_R, handle1,
 				 STARPU_R, handle2,
@@ -103,6 +124,8 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) goto enodev;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
+#warning get the comparison result and return it as the application return code
+
 	starpu_task_wait_for_all();
 
 #ifdef STARPU_USE_OPENCL

+ 4 - 3
examples/interface/complex_interface.c

@@ -171,21 +171,22 @@ static void *complex_handle_to_pointer(starpu_data_handle_t handle, uint32_t nod
 	return (void*) complex_interface->real;
 }
 
-static int complex_pack_data(starpu_data_handle_t handle, uint32_t node, void **ptr)
+static int complex_pack_data(starpu_data_handle_t handle, uint32_t node, void **ptr, size_t *count)
 {
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
 	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
-	*ptr = malloc(complex_get_size(handle));
+	*count = complex_get_size(handle);
+	*ptr = malloc(*count);
 	memcpy(*ptr, complex_interface->real, complex_interface->nx*sizeof(double));
 	memcpy(*ptr+complex_interface->nx*sizeof(double), complex_interface->imaginary, complex_interface->nx*sizeof(double));
 
 	return 0;
 }
 
-static int complex_unpack_data(starpu_data_handle_t handle, uint32_t node, void *ptr)
+static int complex_unpack_data(starpu_data_handle_t handle, uint32_t node, void *ptr, size_t count)
 {
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 

+ 3 - 3
include/starpu_data_interfaces.h

@@ -133,9 +133,9 @@ struct starpu_data_interface_ops
 	struct starpu_multiformat_data_interface_ops* (*get_mf_ops)(void *data_interface);
 
 	/* Pack the data handle into a contiguous buffer at the address ptr and store the size of the buffer in count */
-	int (*pack_data)(starpu_data_handle_t handle, uint32_t node, void **ptr);
+        int (*pack_data)(starpu_data_handle_t handle, uint32_t node, void **ptr, size_t *count);
 	/* Unpack the data handle from the contiguous buffer at the address ptr */
-	int (*unpack_data)(starpu_data_handle_t handle, uint32_t node, void *ptr);
+	int (*unpack_data)(starpu_data_handle_t handle, uint32_t node, void *ptr, size_t count);
 };
 
 /* Return the next available id for a data interface */
@@ -435,7 +435,7 @@ void starpu_multiformat_data_register(starpu_data_handle_t *handle, uint32_t hom
 enum starpu_data_interface_id starpu_handle_get_interface_id(starpu_data_handle_t handle);
 
 int starpu_handle_pack_data(starpu_data_handle_t handle, void **ptr, size_t *count);
-int starpu_handle_unpack_data(starpu_data_handle_t handle, void *ptr);
+int starpu_handle_unpack_data(starpu_data_handle_t handle, void *ptr, size_t count);
 size_t starpu_handle_get_size(starpu_data_handle_t handle);
 
 /* Lookup a ram pointer into a StarPU handle */

+ 3 - 6
mpi/examples/Makefile.am

@@ -51,17 +51,14 @@ examplebindir = $(libdir)/starpu/mpi
 examplebin_PROGRAMS =
 
 if STARPU_USE_CUDA
-# TODO define NVCCFLAGS
-NVCC ?= nvcc
-
-NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_builddir)/include
+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(HWLOC_CFLAGS)
 
 .cu.cubin:
 	$(MKDIR_P) `dirname $@`
-	$(NVCC) -cubin $< -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS)
+	$(NVCC) -cubin $< -o $@ $(NVCCFLAGS)
 
 .cu.o:
-	$(NVCC) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I$(top_srcdir)/include/  -I$(top_builddir)/include/
+	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
 endif
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS)

+ 1 - 1
mpi/examples/cholesky/mpi_cholesky_codelets.c

@@ -90,7 +90,7 @@ void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, in
 				starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)matA[x][y],
 						ld, size/nblocks, size/nblocks, sizeof(float));
 			}
-			/* TODO: make better test to only registering what is needed */
+#warning TODO: make better test to only register what is needed
 			else
 			{
 				/* I don't own that index, but will need it for my computations */

+ 71 - 30
mpi/src/starpu_mpi.c

@@ -24,8 +24,9 @@
 #include <starpu_mpi_stats.h>
 #include <starpu_mpi_insert_task.h>
 
-/* TODO find a better way to select the polling method (perhaps during the
- * configuration) */
+#ifdef STARPU_DEVEL
+#  warning TODO find a better way to select the polling method (perhaps during the configuration)
+#endif
 //#define USE_STARPU_ACTIVITY	1
 
 static void _starpu_mpi_submit_new_mpi_request(void *arg);
@@ -33,6 +34,10 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req);
 #ifdef STARPU_MPI_VERBOSE
 static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type);
 #endif
+static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle,
+							int dest, int mpi_tag, MPI_Comm comm,
+							unsigned detached, void (*callback)(void *), void *arg);
+static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg);
 
 /* The list of requests that have been newly submitted by the application */
 static struct _starpu_mpi_req_list *new_requests;
@@ -109,23 +114,13 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle
 /*                                                      */
 /********************************************************/
 
-static void _starpu_mpi_isend_func(struct _starpu_mpi_req *req)
+static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 {
         _STARPU_MPI_LOG_IN();
 
-	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype);
-	if (req->needs_unpacking)
-	{
-		starpu_handle_pack_data(req->data_handle, &req->ptr, &req->count);
-	}
-	else
-	{
-		req->count = 1;
-		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
-	}
 	STARPU_ASSERT(req->ptr);
 
-        _STARPU_MPI_DEBUG("post MPI isend tag %d dst %d ptr %p datatype %p count %d req %p\n", req->mpi_tag, req->srcdst, req->ptr, req->datatype, req->count, &req->request);
+        _STARPU_MPI_DEBUG("post MPI isend tag %d dst %d ptr %p datatype %p count %d req %p\n", req->mpi_tag, req->srcdst, req->ptr, req->datatype, (int)req->count, &req->request);
 
 	_starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, req->count);
 
@@ -142,11 +137,37 @@ static void _starpu_mpi_isend_func(struct _starpu_mpi_req *req)
         _STARPU_MPI_LOG_OUT();
 }
 
+/*
+ * Callback handed to the detached send of the packed-data size (see
+ * _starpu_mpi_isend_size_func).  `arg` is the original send request;
+ * the packed data itself is posted from here.
+ */
+static void _starpu_mpi_isend_size_callback(void *arg)
+{
+	struct _starpu_mpi_req *data_req = arg;
+
+	_starpu_mpi_isend_data_func(data_req);
+}
+
+/*
+ * First stage of a send request.
+ *
+ * starpu_mpi_handle_to_datatype() fills req->datatype and its return value
+ * is stored in req->needs_unpacking.  When no packing is needed the data
+ * is posted immediately.  Otherwise the handle is packed, and the size of
+ * the packed buffer is sent first through a detached send whose callback
+ * (_starpu_mpi_isend_size_callback) posts the packed data.
+ */
+static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
+{
+	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype);
+	if (!req->needs_unpacking)
+	{
+		/* Send one element of req->datatype straight from the handle's local pointer */
+		req->count = 1;
+		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
+		_starpu_mpi_isend_data_func(req);
+	}
+	else
+	{
+		starpu_data_handle_t count_handle;
+
+		/* Pack into req->ptr; the pack operation stores the buffer size in req->count.
+		 * NOTE(review): the return value of the pack operation is not checked. */
+		starpu_handle_pack_data(req->data_handle, &req->ptr, &req->count);
+		/* Expose req->count as a variable handle so that it can be sent with the
+		 * same tag/communicator as a detached request (5th argument set to 1) */
+		starpu_variable_data_register(&count_handle, 0, (uintptr_t)&req->count, sizeof(req->count));
+		_starpu_mpi_isend_common(count_handle, req->srcdst, req->mpi_tag, req->comm, 1, _starpu_mpi_isend_size_callback, req);
+		starpu_data_unregister_submit(count_handle);
+	}
+}
+
 static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle,
 							int dest, int mpi_tag, MPI_Comm comm,
 							unsigned detached, void (*callback)(void *), void *arg)
 {
-	return _starpu_mpi_isend_irecv_common(data_handle, dest, mpi_tag, comm, detached, callback, arg, SEND_REQ, _starpu_mpi_isend_func, STARPU_R);
+	return _starpu_mpi_isend_irecv_common(data_handle, dest, mpi_tag, comm, detached, callback, arg, SEND_REQ, _starpu_mpi_isend_size_func, STARPU_R);
 }
 
 int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int mpi_tag, MPI_Comm comm)
@@ -195,24 +216,13 @@ int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI
 /*                                                      */
 /********************************************************/
 
-static void _starpu_mpi_irecv_func(struct _starpu_mpi_req *req)
+static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 {
         _STARPU_MPI_LOG_IN();
 
-	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype);
-	if (req->needs_unpacking == 1)
-	{
-		req->count = starpu_handle_get_size(req->data_handle);
-		req->ptr = malloc(req->count);
-	}
-	else
-	{
-		req->count = 1;
-		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
-	}
 	STARPU_ASSERT(req->ptr);
 
-	_STARPU_MPI_DEBUG("post MPI irecv tag %d src %d data %p ptr %p req %p datatype %p\n", req->mpi_tag, req->srcdst, req->data_handle, req->ptr, &req->request, req->datatype);
+	_STARPU_MPI_DEBUG("post MPI irecv tag %d src %d data %p ptr %p datatype %p count %d req %p \n", req->mpi_tag, req->srcdst, req->data_handle, req->ptr, req->datatype, (int)req->count, &req->request);
 
         req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
         STARPU_ASSERT(req->ret == MPI_SUCCESS);
@@ -225,9 +235,40 @@ static void _starpu_mpi_irecv_func(struct _starpu_mpi_req *req)
         _STARPU_MPI_LOG_OUT();
 }
 
+/*
+ * Callback handed to the detached receive of the packed-data size (see
+ * _starpu_mpi_irecv_size_func).  `arg` is the original receive request:
+ * a buffer of req->count bytes is allocated and the receive of the packed
+ * data is posted.
+ */
+static void _starpu_mpi_irecv_size_callback(void *arg)
+{
+	struct _starpu_mpi_req *req = (struct _starpu_mpi_req *) arg;
+#ifdef STARPU_DEVEL
+#  warning TODO: are we sure that req->count can be used as we have not released count_handle?
+#endif
+	/* The malloc() result is not checked here; _starpu_mpi_irecv_data_func
+	 * asserts that req->ptr is non-NULL before using it */
+	req->ptr = malloc(req->count);
+	_starpu_mpi_irecv_data_func(req);
+}
+
+/*
+ * First stage of a receive request.
+ *
+ * starpu_mpi_handle_to_datatype() fills req->datatype and its return value
+ * is stored in req->needs_unpacking.  When no unpacking is needed the
+ * receive is posted immediately into the handle's local pointer.
+ * Otherwise the size of the incoming packed buffer is received first into
+ * req->count through a detached receive whose callback
+ * (_starpu_mpi_irecv_size_callback) allocates the buffer and posts the
+ * receive of the packed data.
+ */
+static void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
+{
+        _STARPU_MPI_LOG_IN();
+
+	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype);
+	if (!req->needs_unpacking)
+	{
+		/* Receive one element of req->datatype straight into the handle's local pointer */
+		req->count = 1;
+		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
+		_starpu_mpi_irecv_data_func(req);
+	}
+	else
+	{
+		starpu_data_handle_t count_handle;
+
+		/* Expose req->count as a variable handle so that the size can be received
+		 * with the same tag/communicator as a detached request (5th argument set to 1) */
+		starpu_variable_data_register(&count_handle, 0, (uintptr_t)&req->count, sizeof(req->count));
+		_starpu_mpi_irecv_common(count_handle, req->srcdst, req->mpi_tag, req->comm, 1, _starpu_mpi_irecv_size_callback, req);
+		starpu_data_unregister_submit(count_handle);
+	}
+
+	/* Balance the _STARPU_MPI_LOG_IN() above, as the *_data_func functions do */
+        _STARPU_MPI_LOG_OUT();
+}
+
 static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg)
 {
-	return _starpu_mpi_isend_irecv_common(data_handle, source, mpi_tag, comm, detached, callback, arg, RECV_REQ, _starpu_mpi_irecv_func, STARPU_W);
+	return _starpu_mpi_isend_irecv_common(data_handle, source, mpi_tag, comm, detached, callback, arg, RECV_REQ, _starpu_mpi_irecv_size_func, STARPU_W);
 }
 
 int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, int mpi_tag, MPI_Comm comm)
@@ -517,7 +558,7 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
         if (req->request_type != BARRIER_REQ)
 	{
 		if (req->needs_unpacking)
-			starpu_handle_unpack_data(req->data_handle, req->ptr);
+			starpu_handle_unpack_data(req->data_handle, req->ptr, req->count);
 		else
 			MPI_Type_free(&req->datatype);
                 starpu_data_release(req->data_handle);

+ 2 - 2
mpi/src/starpu_mpi_collective.c

@@ -47,7 +47,7 @@ int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, i
 	MPI_Comm_rank(comm, &rank);
 
 #ifdef STARPU_DEVEL
-#warning callback_arg needs to be free-ed
+#warning TODO: callback_arg needs to be free-ed
 #endif
 	callback_func = _callback_collective;
 	callback_arg = malloc(sizeof(struct _callback_arg));
@@ -115,7 +115,7 @@ int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, in
 	MPI_Comm_rank(comm, &rank);
 
 #ifdef STARPU_DEVEL
-#warning callback_arg needs to be free-ed
+#warning TODO: callback_arg needs to be free-ed
 #endif
 	callback_func = _callback_collective;
 	callback_arg = malloc(sizeof(struct _callback_arg));

+ 3 - 2
mpi/src/starpu_mpi_insert_task.c

@@ -293,8 +293,9 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 				HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
 				if (already_received)
 				{
-					/* Somebody else will write to the data, so discard our cached copy if any */
-					/* TODO: starpu_mpi could just remember itself. */
+#ifdef STARPU_DEVEL
+#  warning TODO: Somebody else will write to the data, so discard our cached copy if any. starpu_mpi could just remember itself.
+#endif
 					_STARPU_MPI_DEBUG("Clearing receive cache for data %p\n", data);
 					HASH_DEL(received_data[mpi_rank], already_received);
 					starpu_data_invalidate_submit(data);

+ 3 - 6
mpi/tests/Makefile.am

@@ -37,17 +37,14 @@ examplebindir = $(libdir)/starpu/examples/mpi
 examplebin_PROGRAMS =
 
 if STARPU_USE_CUDA
-# TODO define NVCCFLAGS
-NVCC ?= nvcc
-
-NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_builddir)/include
+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(HWLOC_CFLAGS)
 
 .cu.cubin:
 	$(MKDIR_P) `dirname $@`
-	$(NVCC) -cubin $< -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS)
+	$(NVCC) -cubin $< -o $@ $(NVCCFLAGS)
 
 .cu.o:
-	$(NVCC) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I$(top_srcdir)/include/  -I$(top_builddir)/include/
+	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
 endif
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS)

+ 1 - 1
src/core/workers.h

@@ -79,7 +79,7 @@ struct _starpu_worker
 	unsigned worker_is_running;
 	unsigned worker_is_initialized;
 	enum _starpu_worker_status status; /* what is the worker doing now ? (eg. CALLBACK) */
-	char name[48];
+	char name[64];
 	char short_name[10];
 	unsigned run_by_starpu; /* Is this run by StarPU or directly by the application ? */
 

+ 1 - 1
src/datawizard/interfaces/coo_interface.c

@@ -374,7 +374,7 @@ allocate_coo_buffer_on_node(void *data_interface, uint32_t dst_node)
 		if (STARPU_UNLIKELY(addr_rows == NULL))
 			goto fail_rows;
 		addr_values = (uintptr_t) malloc(n_values * elemsize);
-		if (STARPU_UNLIKELY(addr_values == NULL))
+		if (STARPU_UNLIKELY(addr_values == (uintptr_t) NULL))
 			goto fail_values;
 		break;
 	}

+ 3 - 4
src/datawizard/interfaces/data_interface.c

@@ -684,14 +684,13 @@ int starpu_data_interface_get_next_id(void)
 int starpu_handle_pack_data(starpu_data_handle_t handle, void **ptr, size_t *count)
 {
 	STARPU_ASSERT(handle->ops->pack_data);
-	*count = starpu_handle_get_size(handle);
-	return handle->ops->pack_data(handle, _starpu_get_local_memory_node(), ptr);
+	return handle->ops->pack_data(handle, _starpu_get_local_memory_node(), ptr, count);
 }
 
-int starpu_handle_unpack_data(starpu_data_handle_t handle, void *ptr)
+int starpu_handle_unpack_data(starpu_data_handle_t handle, void *ptr, size_t count)
 {
 	STARPU_ASSERT(handle->ops->unpack_data);
-	return handle->ops->unpack_data(handle, _starpu_get_local_memory_node(), ptr);
+	return handle->ops->unpack_data(handle, _starpu_get_local_memory_node(), ptr, count);
 }
 
 size_t starpu_handle_get_size(starpu_data_handle_t handle)