Browse Source

Merge branch 'master' into knobs

Olivier Aumage 6 years ago
parent
commit
f78e43fdf2

+ 15 - 1
ChangeLog

@@ -20,7 +20,21 @@ StarPU 1.4.0 (svn revision xxxx)
 ==============================================
 ==============================================
 New features:
 New features:
 
 
-StarPU 1.3.0 (svn revision xxxx)
+StarPU 1.3.2 (git revision xxx)
+==============================================
+
+StarPU 1.3.1 (git revision 01949488b4f8e6fe26d2c200293b8aae5876b038)
+==============================================
+
+Small features:
+  * Add starpu_filter_nparts_compute_chunk_size_and_offset helper.
+  * Add starpu_bcsr_filter_canonical_block_child_ops.
+
+Small changes:
+  * Improve detection of NVML availability. Do not only check that the
+    library is available, also check that the compiled code can be run.
+
+StarPU 1.3.0 (git revision 24ca83c6dbb102e1cfc41db3bb21c49662067062)
 ==============================================
 ==============================================
 
 
 New features:
 New features:

+ 33 - 9
configure.ac

@@ -92,6 +92,9 @@ AC_PROG_MKDIR_P
 AC_CHECK_PROGS(PROG_STAT,gstat stat)
 AC_CHECK_PROGS(PROG_STAT,gstat stat)
 AC_CHECK_PROGS(PROG_DATE,gdate date)
 AC_CHECK_PROGS(PROG_DATE,gdate date)
 
 
+dnl locate pkg-config
+PKG_PROG_PKG_CONFIG
+
 if test x$enable_perf_debug = xyes; then
 if test x$enable_perf_debug = xyes; then
     enable_shared=no
     enable_shared=no
 fi
 fi
@@ -176,7 +179,7 @@ if test x$enable_simgrid = xyes ; then
 	AC_CHECK_TYPES([smx_actor_t], [AC_DEFINE([STARPU_HAVE_SMX_ACTOR_T], [1], [Define to 1 if you have the smx_actor_t type.])], [], [[#include <simgrid/simix.h>]])
 	AC_CHECK_TYPES([smx_actor_t], [AC_DEFINE([STARPU_HAVE_SMX_ACTOR_T], [1], [Define to 1 if you have the smx_actor_t type.])], [], [[#include <simgrid/simix.h>]])
 
 
 	# Latest functions
 	# Latest functions
-	AC_CHECK_FUNCS([MSG_process_attach MSG_zone_get_hosts MSG_process_self_name])
+	AC_CHECK_FUNCS([MSG_process_attach MSG_zone_get_hosts MSG_process_self_name MSG_process_userdata_init])
 	AC_CHECK_FUNCS([xbt_mutex_try_acquire smpi_process_set_user_data sg_zone_get_by_name sg_link_name sg_host_route sg_host_self sg_host_speed simcall_process_create sg_config_continue_after_help])
 	AC_CHECK_FUNCS([xbt_mutex_try_acquire smpi_process_set_user_data sg_zone_get_by_name sg_link_name sg_host_route sg_host_self sg_host_speed simcall_process_create sg_config_continue_after_help])
 	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
 	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
 	AC_CHECK_DECLS([smpi_process_set_user_data], [], [], [[#include <smpi/smpi.h>]])
 	AC_CHECK_DECLS([smpi_process_set_user_data], [], [], [[#include <smpi/smpi.h>]])
@@ -1337,18 +1340,39 @@ if test x$enable_cuda = xyes; then
 		      STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lcusparse"])
 		      STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lcusparse"])
 	AC_CHECK_DECLS([cusparseSetStream], [], [], [[#include <cusparse.h>]])
 	AC_CHECK_DECLS([cusparseSetStream], [], [], [[#include <cusparse.h>]])
 
 
-	AC_CHECK_HEADER([nvml.h],
-	  [AC_CHECK_LIB([nvidia-ml], [nvmlDeviceGetTotalEnergyConsumption],
-		        [AC_DEFINE([HAVE_LIBNVIDIA_ML], [1], [Define to 1 if you have the nvidia-ml library])
-		         STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lnvidia-ml"])])
-        CPPFLAGS="${SAVED_CPPFLAGS}"
+	LDFLAGS="${LDFLAGS} -lnvidia-ml"
+	dnl NVML is an optional dependency: first check that a program using
+	dnl nvml.h can be compiled, then that it can actually be run.
+	dnl A failure of either step (or being unable to run the test because
+	dnl we are cross-compiling) only disables NVML support through
+	dnl have_valid_nvml=no, it must not abort the whole configure run.
+	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
+		[[#include <nvml.h>]],
+		[[nvmlInit();]]
+		)],
+	    [
+	      AC_RUN_IFELSE([AC_LANG_PROGRAM(
+	        [[#include <nvml.h>]],
+		[[nvmlInit();]]
+		)],
+		[have_valid_nvml="yes"],
+		[
+	          AC_MSG_RESULT([NVML found and can be compiled, but compiled application can not be run, you are probably on a machine without the CUDA driver])
+	          have_valid_nvml="no"
+		],
+		[have_valid_nvml="no"])
+	    ],
+	    [
+	    AC_MSG_NOTICE([nvml.h could not be compiled, NVML support is disabled])
+	    have_valid_nvml="no"
+	    ]
+	)
+	if test x$have_valid_nvml = xyes ; then
+		AC_DEFINE([HAVE_LIBNVIDIA_ML], [1], [Define to 1 if you have the nvidia-ml library])
+		STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lnvidia-ml"
+	fi
+	AC_MSG_CHECKING(whether nvidia-ml should be used)
+	AC_MSG_RESULT($have_valid_nvml)
+
+	CPPFLAGS="${SAVED_CPPFLAGS}"
 	LDFLAGS="${SAVED_LDFLAGS}"
 	LDFLAGS="${SAVED_LDFLAGS}"
 	LIBS="${SAVED_LIBS}"
 	LIBS="${SAVED_LIBS}"
 fi
 fi
 
 
-dnl Hey dude, are you around?
-PKG_PROG_PKG_CONFIG
-
 have_magma=no
 have_magma=no
 if test x$enable_cuda = xyes; then
 if test x$enable_cuda = xyes; then
 	PKG_CHECK_MODULES([MAGMA],  [magma], [
 	PKG_CHECK_MODULES([MAGMA],  [magma], [

+ 13 - 1
doc/doxygen/chapters/310_data_management.doxy

@@ -355,13 +355,16 @@ __kernel void opencl_kernel(__global int *vector, unsigned offset)
 }
 }
 \endcode
 \endcode
 
 
+When the sub-data is not of the same type as the original data, the
+starpu_data_filter::get_child_ops field needs to be set appropriately for StarPU
+to know which type should be used.
+
 StarPU provides various interfaces and filters for matrices, vectors, etc.,
 StarPU provides various interfaces and filters for matrices, vectors, etc.,
 but applications can also write their own data interfaces and filters, see
 but applications can also write their own data interfaces and filters, see
 <c>examples/interface</c> and <c>examples/filters/custom_mf</c> for an example,
 <c>examples/interface</c> and <c>examples/filters/custom_mf</c> for an example,
 and see \ref DefiningANewDataInterface and \ref DefiningANewDataFilter
 and see \ref DefiningANewDataInterface and \ref DefiningANewDataFilter
 for documentation.
 for documentation.
 
 
-
 \section AsynchronousPartitioning Asynchronous Partitioning
 \section AsynchronousPartitioning Asynchronous Partitioning
 
 
 The partitioning functions described in the previous section are synchronous:
 The partitioning functions described in the previous section are synchronous:
@@ -480,6 +483,8 @@ additional filters can be defined by the application. The principle is that the
 filter function just fills the memory location of the <c>i-th</c> subpart of a data.
 filter function just fills the memory location of the <c>i-th</c> subpart of a data.
 Examples are provided in <c>src/datawizard/interfaces/*_filters.c</c>,
 Examples are provided in <c>src/datawizard/interfaces/*_filters.c</c>,
 and see \ref starpu_data_filter::filter_func for the details.
 and see \ref starpu_data_filter::filter_func for the details.
+The starpu_filter_nparts_compute_chunk_size_and_offset() helper can be used to
+compute the division of pieces of data.
 
 
 \section DataReduction Data Reduction
 \section DataReduction Data Reduction
 
 
@@ -790,6 +795,13 @@ void starpu_complex_data_register(starpu_data_handle_t *handle,
 }
 }
 \endcode
 \endcode
 
 
+The <c>starpu_complex_interface</c> structure is here used just to store the
+parameters that the user provided to <c>starpu_complex_data_register</c>.
+starpu_data_register() will first allocate the handle, and
+then pass the <c>starpu_complex_interface</c> structure to the
+starpu_data_interface_ops::register_data_handle method, which records them
+within the data handle (it is called once per node by starpu_data_register()).
+
 Different operations need to be defined for a data interface through
 Different operations need to be defined for a data interface through
 the type starpu_data_interface_ops. We only define here the basic
 the type starpu_data_interface_ops. We only define here the basic
 operations needed to run simple applications. The source code for the
 operations needed to run simple applications. The source code for the

+ 7 - 1
doc/doxygen/chapters/401_out_of_core.doxy

@@ -52,6 +52,8 @@ machine memory size, but part of it is taken by the kernel, the system,
 daemons, and the application's own allocated data, whose size can not be
 daemons, and the application's own allocated data, whose size can not be
 predicted. That is why the user needs to specify what StarPU can afford.
 predicted. That is why the user needs to specify what StarPU can afford.
 
 
+Some out-of-core tests are worth reading, see <c>tests/disk/*.c</c>.
+
 \section UseANewDiskMemory Use a new disk memory
 \section UseANewDiskMemory Use a new disk memory
 
 
 To use a disk memory node, you have to register it with this function:
 To use a disk memory node, you have to register it with this function:
@@ -119,7 +121,11 @@ starpu_task_insert(cl_fill_with_data, STARPU_W, h, 0);
 
 
 Which makes StarPU automatically do the allocation when the task running
 Which makes StarPU automatically do the allocation when the task running
 cl_fill_with_data gets executed. And then if it needs to, it will be able to
 cl_fill_with_data gets executed. And then if it needs to, it will be able to
-release it after having pushed the data to the disk.
+release it after having pushed the data to the disk. Since no initial buffer is
+provided to starpu_matrix_data_register(), the handle does not have any initial
+value right after this call, and thus the very first task using the handle needs
+to use the ::STARPU_W mode like above; ::STARPU_R or ::STARPU_RW would not make
+sense.
 
 
 By default, StarPU will try to push any data handle to the disk. 
 By default, StarPU will try to push any data handle to the disk. 
 To specify whether a given handle should be pushed to the disk,
 To specify whether a given handle should be pushed to the disk,

+ 2 - 2
doc/doxygen/chapters/470_simgrid.doxy

@@ -19,9 +19,9 @@
 /*! \page SimGridSupport SimGrid Support
 /*! \page SimGridSupport SimGrid Support
 
 
 StarPU can use Simgrid in order to simulate execution on an arbitrary
 StarPU can use Simgrid in order to simulate execution on an arbitrary
-platform. This was tested with simgrid from 3.11 to 3.16, and 3.18 to 3.20.
+platform. This was tested with simgrid from 3.11 to 3.16, and 3.18 to 3.22.
 Other versions may have compatibility issues. 3.17 notably does not build at
 Other versions may have compatibility issues. 3.17 notably does not build at
-all.
+all. MPI simulation does not work with version 3.22.
 
 
 \section Preparing Preparing Your Application For Simulation
 \section Preparing Preparing Your Application For Simulation
 
 

+ 8 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -660,6 +660,14 @@ for that environment variable to be used, and the
 STARPU_MPI_DRIVER_CALL_FREQUENCY environment variable set to a positive value.
 STARPU_MPI_DRIVER_CALL_FREQUENCY environment variable set to a positive value.
 </dd>
 </dd>
 
 
+<dt>STARPU_SIMGRID_TRANSFER_COST</dt>
+<dd>
+\anchor STARPU_SIMGRID_TRANSFER_COST
+\addindex __env__STARPU_SIMGRID_TRANSFER_COST
+When set to 1 (which is the default), data transfers (over PCI bus, typically) are taken into account
+in simgrid mode.
+</dd>
+
 <dt>STARPU_SIMGRID_CUDA_MALLOC_COST</dt>
 <dt>STARPU_SIMGRID_CUDA_MALLOC_COST</dt>
 <dd>
 <dd>
 \anchor STARPU_SIMGRID_CUDA_MALLOC_COST
 \anchor STARPU_SIMGRID_CUDA_MALLOC_COST

+ 2 - 1
examples/Makefile.am

@@ -971,7 +971,8 @@ endif
 
 
 interface_complex_SOURCES	=	\
 interface_complex_SOURCES	=	\
 	interface/complex.c		\
 	interface/complex.c		\
-	interface/complex_interface.c
+	interface/complex_interface.c	\
+	interface/complex_filters.c
 if STARPU_USE_CUDA
 if STARPU_USE_CUDA
 interface_complex_SOURCES	+=	\
 interface_complex_SOURCES	+=	\
 	interface/complex_kernels.cu
 	interface/complex_kernels.cu

+ 74 - 1
examples/interface/complex.c

@@ -2,7 +2,7 @@
  *
  *
  * Copyright (C) 2012,2013                                Inria
  * Copyright (C) 2012,2013                                Inria
  * Copyright (C) 2012,2013,2015,2017                      CNRS
  * Copyright (C) 2012,2013,2015,2017                      CNRS
- * Copyright (C) 2013,2014                                Université de Bordeaux
+ * Copyright (C) 2013,2014,2019                           Université de Bordeaux
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -83,6 +83,7 @@ int main(void)
 	int ret = 0;
 	int ret = 0;
 	starpu_data_handle_t handle1;
 	starpu_data_handle_t handle1;
 	starpu_data_handle_t handle2;
 	starpu_data_handle_t handle2;
+	starpu_data_handle_t handle3;
 
 
 	double real = 45.0;
 	double real = 45.0;
 	double imaginary = 12.0;
 	double imaginary = 12.0;
@@ -92,6 +93,10 @@ int main(void)
 	int compare;
 	int compare;
 	int *compare_ptr = &compare;
 	int *compare_ptr = &compare;
 
 
+	starpu_data_handle_t vectorh;
+	struct starpu_vector_interface *vectori;
+	double *vector;
+
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV) return 77;
 	if (ret == -ENODEV) return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
@@ -112,6 +117,7 @@ int main(void)
 	if (ret == -ENODEV) goto end;
 	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
 
+	/* Compare two different complex numbers.  */
 	ret = starpu_task_insert(&cl_compare,
 	ret = starpu_task_insert(&cl_compare,
 				 STARPU_R, handle1,
 				 STARPU_R, handle1,
 				 STARPU_R, handle2,
 				 STARPU_R, handle2,
@@ -126,6 +132,7 @@ int main(void)
 	     goto end;
 	     goto end;
 	}
 	}
 
 
+	/* Copy one into the other.  */
 	ret = starpu_task_insert(&cl_copy,
 	ret = starpu_task_insert(&cl_copy,
 				 STARPU_R, handle1,
 				 STARPU_R, handle1,
 				 STARPU_W, handle2,
 				 STARPU_W, handle2,
@@ -141,6 +148,7 @@ int main(void)
 	if (ret == -ENODEV) goto end;
 	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
 
+	/* And compare again.  */
 	ret = starpu_task_insert(&cl_compare,
 	ret = starpu_task_insert(&cl_compare,
 				 STARPU_R, handle1,
 				 STARPU_R, handle1,
 				 STARPU_R, handle2,
 				 STARPU_R, handle2,
@@ -156,6 +164,71 @@ int main(void)
 	     FPRINTF(stderr, "Complex numbers should be similar\n");
 	     FPRINTF(stderr, "Complex numbers should be similar\n");
 	}
 	}
 
 
+	/* Put another value again */
+	starpu_data_acquire(handle2, STARPU_W);
+	copy_real = 78.0;
+	copy_imaginary = 77.0;
+	starpu_data_release(handle2);
+
+	/* Create a vector of two complex numbers.  */
+	starpu_complex_data_register(&handle3, -1, 0, 0, 2);
+
+	/* Split it in two pieces (thus one complex each).  */
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_complex_filter_block,
+		.nchildren = 2,
+	};
+	starpu_data_partition(handle3, &f);
+
+	/* Copy the two complex numbers, one into each part */
+	ret = starpu_task_insert(&cl_copy,
+				 STARPU_R, handle1,
+				 STARPU_W, starpu_data_get_sub_data(handle3, 1, 0),
+				 0);
+	if (ret == -ENODEV) goto end;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+	ret = starpu_task_insert(&cl_copy,
+				 STARPU_R, handle2,
+				 STARPU_W, starpu_data_get_sub_data(handle3, 1, 1),
+				 0);
+	if (ret == -ENODEV) goto end;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+	/* Gather the two pieces.  */
+	starpu_data_unpartition(handle3, STARPU_MAIN_RAM);
+
+	/* Show it.  */
+	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle3", strlen("handle3")+1, STARPU_R, handle3, 0);
+
+	/* Get the real and imaginary vectors.  */
+	struct starpu_data_filter fcanon =
+	{
+		.filter_func = starpu_complex_filter_canonical,
+		.nchildren = 2,
+		.get_child_ops = starpu_complex_filter_canonical_child_ops,
+	};
+	starpu_data_partition(handle3, &fcanon);
+
+	/* Check the corresponding data.  */
+	vectorh = starpu_data_get_sub_data(handle3, 1, 0);
+	starpu_data_acquire(vectorh, STARPU_R);
+	vectori = starpu_data_get_interface_on_node(vectorh, STARPU_MAIN_RAM);
+	vector = (double*) vectori->ptr;
+	STARPU_ASSERT_MSG(vector[0] == 45., "Bogus value: %f instead of %f", vector[0], 45.);
+	STARPU_ASSERT_MSG(vector[1] == 78., "Bogus value: %f instead of %f", vector[1], 78.);
+	starpu_data_release(vectorh);
+
+	vectorh = starpu_data_get_sub_data(handle3, 1, 1);
+	starpu_data_acquire(vectorh, STARPU_R);
+	vectori = starpu_data_get_interface_on_node(vectorh, STARPU_MAIN_RAM);
+	vector = (double*) vectori->ptr;
+	STARPU_ASSERT_MSG(vector[0] == 12., "Bogus value: %f instead of %f", vector[0], 12.);
+	STARPU_ASSERT_MSG(vector[1] == 77., "Bogus value: %f instead of %f", vector[1], 77.);
+	starpu_data_release(vectorh);
+
+	starpu_data_unpartition(handle3, STARPU_MAIN_RAM);
+
 end:
 end:
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 	{
 	{

+ 73 - 0
examples/interface/complex_filters.c

@@ -0,0 +1,73 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+#include "complex_interface.h"
+
+/* Filter function: make child number id (out of nchunks) describe a contiguous slice of the father complex vector.  */
+void starpu_complex_filter_block(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
+{
+	struct starpu_complex_interface *parent = father_interface;
+	struct starpu_complex_interface *piece = child_interface;
+	uint32_t total = parent->nx;
+	uint32_t piece_nx;
+	size_t shift;
+
+	STARPU_ASSERT_MSG(nchunks <= total, "%u parts for %u elements", nchunks, total);
+
+	/* Let the helper determine the element count and the offset of this
+	 * piece, for double-sized elements and a leading dimension of 1.  */
+	starpu_filter_nparts_compute_chunk_size_and_offset(total, nchunks, sizeof(double), id, 1,
+						     &piece_nx, &shift);
+	piece->nx = piece_nx;
+
+	/* When the father has no real buffer, the child pointers are left untouched.  */
+	if (!parent->real)
+		return;
+
+	piece->real = (void*) ((uintptr_t) parent->real + shift);
+	piece->imaginary = (void*) ((uintptr_t) parent->imaginary + shift);
+}
+
+/* Filter function: expose the real array (id 0) or the imaginary array (id 1) of the father complex through a vector interface of doubles.  */
+void starpu_complex_filter_canonical(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
+{
+	struct starpu_complex_interface *parent = father_interface;
+	struct starpu_vector_interface *vec = child_interface;
+
+	STARPU_ASSERT_MSG(nchunks == 2, "complex can only be split into two pieces");
+	STARPU_ASSERT_MSG(id < 2, "complex has only two pieces");
+
+	/* Shape of the vector: as many doubles as the father has elements.  */
+	vec->id = STARPU_VECTOR_INTERFACE_ID;
+	vec->nx = parent->nx;
+	vec->elemsize = sizeof(double);
+	vec->allocsize = vec->nx * vec->elemsize;
+	vec->slice_base = 0;
+
+	/* Piece 0 points to the real array, piece 1 to the imaginary one.  */
+	vec->ptr = (id == 0) ? (uintptr_t) parent->real : (uintptr_t) parent->imaginary;
+
+	/* the complex interface doesn't support dev_handle/offset */
+	vec->dev_handle = vec->ptr;
+	vec->offset = 0;
+}
+
+/* Return the interface operations of the children produced by
+ * starpu_complex_filter_canonical(): the vector operations are returned
+ * whatever the filter and the child index, hence both parameters are unused.  */
+struct starpu_data_interface_ops *starpu_complex_filter_canonical_child_ops(STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, STARPU_ATTRIBUTE_UNUSED unsigned child)
+{
+	return &starpu_interface_vector_ops;
+}

+ 7 - 0
examples/interface/complex_interface.h

@@ -38,4 +38,11 @@ int starpu_complex_get_nx(starpu_data_handle_t handle);
 #define STARPU_COMPLEX_GET_IMAGINARY(interface)	(((struct starpu_complex_interface *)(interface))->imaginary)
 #define STARPU_COMPLEX_GET_IMAGINARY(interface)	(((struct starpu_complex_interface *)(interface))->imaginary)
 #define STARPU_COMPLEX_GET_NX(interface)	(((struct starpu_complex_interface *)(interface))->nx)
 #define STARPU_COMPLEX_GET_NX(interface)	(((struct starpu_complex_interface *)(interface))->nx)
 
 
+/* Split complex vector into smaller complex vectors */
+void starpu_complex_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nchunks);
+
+/* Split complex into two simple vectors */
+void starpu_complex_filter_canonical(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nchunks);
+struct starpu_data_interface_ops *starpu_complex_filter_canonical_child_ops(struct starpu_data_filter *f, unsigned child);
+
 #endif /* __COMPLEX_INTERFACE_H */
 #endif /* __COMPLEX_INTERFACE_H */

+ 2 - 7
examples/spmv/dw_block_spmv.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2008-2015,2017                           Université de Bordeaux
+ * Copyright (C) 2008-2015,2017,2019                      Université de Bordeaux
  * Copyright (C) 2012,2013                                Inria
  * Copyright (C) 2012,2013                                Inria
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010-2017                                CNRS
  * Copyright (C) 2010-2017                                CNRS
@@ -126,11 +126,6 @@ unsigned get_bcsr_nchildren(struct starpu_data_filter *f, starpu_data_handle_t h
   return (unsigned)starpu_bcsr_get_nnz(handle);
   return (unsigned)starpu_bcsr_get_nnz(handle);
 }
 }
 
 
-struct starpu_data_interface_ops *get_bcsr_child_ops(struct starpu_data_filter *f, unsigned child)
-{
-  return &starpu_interface_matrix_ops;
-}
-
 void call_filters(void)
 void call_filters(void)
 {
 {
 
 
@@ -140,7 +135,7 @@ void call_filters(void)
 	bcsr_f.filter_func    = starpu_bcsr_filter_canonical_block;
 	bcsr_f.filter_func    = starpu_bcsr_filter_canonical_block;
 	bcsr_f.get_nchildren = get_bcsr_nchildren;
 	bcsr_f.get_nchildren = get_bcsr_nchildren;
 	/* the children use a matrix interface ! */
 	/* the children use a matrix interface ! */
-	bcsr_f.get_child_ops = get_bcsr_child_ops;
+	bcsr_f.get_child_ops = starpu_bcsr_filter_canonical_block_child_ops;
 
 
 	vector_in_f.filter_func = starpu_vector_filter_block;
 	vector_in_f.filter_func = starpu_vector_filter_block;
 	vector_in_f.nchildren  = size/c;
 	vector_in_f.nchildren  = size/c;

+ 19 - 1
include/starpu_data_filters.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2011                                     Antoine Lucas
  * Copyright (C) 2011                                     Antoine Lucas
- * Copyright (C) 2009-2012,2014,2015,2017                 Université de Bordeaux
+ * Copyright (C) 2009-2012,2014,2015,2017,2019            Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010-2013,2015,2017,2018,2019            CNRS
  * Copyright (C) 2010-2013,2015,2017,2018,2019            CNRS
  * Copyright (C) 2011                                     Inria
  * Copyright (C) 2011                                     Inria
@@ -315,8 +315,14 @@ void starpu_data_partition_not_automatic(starpu_data_handle_t handle);
 
 
 /**
 /**
    Partition a block-sparse matrix into dense matrices.
    Partition a block-sparse matrix into dense matrices.
+   starpu_data_filter::get_child_ops needs to be set to
+   starpu_bcsr_filter_canonical_block_child_ops()
 */
 */
 void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+/**
+   Return the child_ops of the partition obtained with starpu_bcsr_filter_canonical_block(),
+   to be set as starpu_data_filter::get_child_ops.
+*/
+struct starpu_data_interface_ops *starpu_bcsr_filter_canonical_block_child_ops(struct starpu_data_filter *f, unsigned child);
 
 
 /** @} */
 /** @} */
 
 
@@ -505,6 +511,18 @@ void starpu_block_filter_depth_block(void *father_interface, void *child_interfa
 */
 */
 void starpu_block_filter_depth_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_block_filter_depth_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 
+/**
+   Given an integer \p n (e.g. a number of elements), \p nparts the number of
+   parts it must be divided in, and \p id the part currently considered,
+   determine the \p chunk_size and the \p offset of that part, taking into
+   account \p elemsize, the size of the elements stored in the data structure,
+   and \p ld, the leading dimension, which is most often 1.
+ */
+void
+starpu_filter_nparts_compute_chunk_size_and_offset(unsigned n, unsigned nparts,
+					     size_t elemsize, unsigned id,
+					     unsigned ld, unsigned *chunk_size,
+					     size_t *offset);
+
 /** @} */
 /** @} */
 
 
 /** @} */
 /** @} */

+ 46 - 6
include/starpu_data_interfaces.h

@@ -361,6 +361,13 @@ struct starpu_data_copy_methods
 	   must return <c>-EAGAIN</c> if any of the starpu_interface_copy()
 	   must return <c>-EAGAIN</c> if any of the starpu_interface_copy()
 	   calls has returned <c>-EAGAIN</c> (i.e. at least some transfer is
 	   calls has returned <c>-EAGAIN</c> (i.e. at least some transfer is
 	   still ongoing), and return 0 otherwise.
 	   still ongoing), and return 0 otherwise.
+
+	   This can only be implemented if the interface has ready-to-send
+	   data blocks. If the interface is more involved than
+	   this, i.e. it needs to collect pieces of data before
+	   transferring, starpu_data_interface_ops::pack_data and
+	   starpu_data_interface_ops::unpack_data should be implemented instead,
+	   and the core will just transfer the resulting data buffer.
 	*/
 	*/
 	int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 	int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 };
 };
@@ -396,19 +403,30 @@ struct starpu_data_interface_ops
 	   home node, pointers should be left as NULL except on the \p home_node, for
 	   home node, pointers should be left as NULL except on the \p home_node, for
 	   which the pointers should be copied from the given \p data_interface, which
 	   which the pointers should be copied from the given \p data_interface, which
 	   was filled with the application's pointers.
 	   was filled with the application's pointers.
+
+	   This method is mandatory.
 	*/
 	*/
 	void		 (*register_data_handle)	(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
 	void		 (*register_data_handle)	(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
 
 
 	/**
 	/**
 	   Allocate data for the interface on a given node. This should use
 	   Allocate data for the interface on a given node. This should use
-	   starpu_malloc_on_node to perform the allocation(s), and fill the pointers
+	   starpu_malloc_on_node() to perform the allocation(s), and fill the pointers
 	   in the data interface. It should return the size of the allocated memory, or
 	   in the data interface. It should return the size of the allocated memory, or
 	   -ENOMEM if memory could not be allocated.
 	   -ENOMEM if memory could not be allocated.
+
+	   Note that the memory node can be CPU memory, GPU memory, or even disk
+	   area. The result returned by starpu_malloc_on_node() should be just
+	   stored as uintptr_t without trying to interpret it since it may be a
+	   GPU pointer, a disk descriptor, etc.
+
+	   This method is mandatory to be able to support memory nodes.
 	*/
 	*/
 	starpu_ssize_t	 (*allocate_data_on_node)	(void *data_interface, unsigned node);
 	starpu_ssize_t	 (*allocate_data_on_node)	(void *data_interface, unsigned node);
 
 
 	/**
 	/**
 	   Free data of the interface on a given node.
 	   Free data of the interface on a given node.
+
+	   This method is mandatory to be able to support memory nodes.
 	*/
 	*/
 	void 		 (*free_data_on_node)		(void *data_interface, unsigned node);
 	void 		 (*free_data_on_node)		(void *data_interface, unsigned node);
 
 
@@ -421,6 +439,11 @@ struct starpu_data_interface_ops
 
 
 	/**
 	/**
 	   Struct with pointer to functions for performing ram/cuda/opencl synchronous and asynchronous transfers.
 	   Struct with pointer to functions for performing ram/cuda/opencl synchronous and asynchronous transfers.
+
+	   This field is mandatory to be able to support memory
+	   nodes, except disk nodes which can be supported by just
+	   implementing starpu_data_interface_ops::pack_data and
+	   starpu_data_interface_ops::unpack_data.
 	*/
 	*/
 	const struct starpu_data_copy_methods *copy_methods;
 	const struct starpu_data_copy_methods *copy_methods;
 
 
@@ -428,11 +451,17 @@ struct starpu_data_interface_ops
 	   @deprecated
 	   @deprecated
 	   Use starpu_data_interface_ops::to_pointer instead.
 	   Use starpu_data_interface_ops::to_pointer instead.
 	   Return the current pointer (if any) for the handle on the given node.
 	   Return the current pointer (if any) for the handle on the given node.
+
+	   This method is only required if starpu_data_interface_ops::to_pointer
+	   is not implemented.
 	*/
 	*/
 	void * 		 (*handle_to_pointer)		(starpu_data_handle_t handle, unsigned node);
 	void * 		 (*handle_to_pointer)		(starpu_data_handle_t handle, unsigned node);
 
 
 	/**
 	/**
 	   Return the current pointer (if any) for the given interface on the given node.
 	   Return the current pointer (if any) for the given interface on the given node.
+
+	   This method is only required for starpu_data_handle_to_pointer()
+	   and starpu_data_get_local_ptr(), and for disk support.
 	*/
 	*/
 	void * 		 (*to_pointer)			(void *data_interface, unsigned node);
 	void * 		 (*to_pointer)			(void *data_interface, unsigned node);
 
 
@@ -443,7 +472,7 @@ struct starpu_data_interface_ops
 	int 		 (*pointer_is_inside)		(void *data_interface, unsigned node, void *ptr);
 	int 		 (*pointer_is_inside)		(void *data_interface, unsigned node, void *ptr);
 
 
 	/**
 	/**
-	   Return an estimation of the size of data, for performance models.
+	   Return an estimation of the size of data, for performance models and tracing feedback.
 	*/
 	*/
 	size_t 		 (*get_size)			(starpu_data_handle_t handle);
 	size_t 		 (*get_size)			(starpu_data_handle_t handle);
 
 
@@ -456,7 +485,9 @@ struct starpu_data_interface_ops
 	size_t 		 (*get_alloc_size)		(starpu_data_handle_t handle);
 	size_t 		 (*get_alloc_size)		(starpu_data_handle_t handle);
 
 
 	/**
 	/**
-	  Return a 32bit footprint which characterizes the data size and layout (nx, ny, ld, elemsize, etc.), to be used for indexing performance models.
+	  Return a 32bit footprint which characterizes the data size and layout (nx, ny, ld, elemsize, etc.), required for indexing performance models.
+
+	  starpu_hash_crc32c_be() and the like can be used to produce this 32bit value from various types of values.
 	*/
 	*/
 	uint32_t 	 (*footprint)			(starpu_data_handle_t handle);
 	uint32_t 	 (*footprint)			(starpu_data_handle_t handle);
 
 
@@ -470,15 +501,15 @@ struct starpu_data_interface_ops
 
 
 	/**
 	/**
 	   Compare the data size and layout of two interfaces (nx, ny, ld, elemsize,
 	   Compare the data size and layout of two interfaces (nx, ny, ld, elemsize,
-	   etc.), to be used for indexing performance models.. It should return 1 if
-	   the two interfaces size and layout match, and 0 otherwise.
+	   etc.), to be used for indexing performance models. It should return 1 if
+	   the two interfaces size and layout match computation-wise, and 0 otherwise.
 	*/
 	*/
 	int 		 (*compare)			(void *data_interface_a, void *data_interface_b);
 	int 		 (*compare)			(void *data_interface_a, void *data_interface_b);
 
 
 	/**
 	/**
 	   Compare the data allocation of two interfaces, to be used for indexing
 	   Compare the data allocation of two interfaces, to be used for indexing
 	   the allocation cache. It should return
 	   the allocation cache. It should return
-	   1 if the two interfaces are allocation-compatible, and 0 otherwise.
+	   1 if the two interfaces are allocation-compatible, i.e. basically have the same alloc_size, and 0 otherwise.
 	   If not specified, the starpu_data_interface_ops::compare method is
 	   If not specified, the starpu_data_interface_ops::compare method is
 	   used instead.
 	   used instead.
 	*/
 	*/
@@ -486,6 +517,7 @@ struct starpu_data_interface_ops
 
 
 	/**
 	/**
 	   Dump the sizes of a handle to a file.
 	   Dump the sizes of a handle to a file.
+	   This is required for performance models.
 	*/
 	*/
 	void 		 (*display)			(starpu_data_handle_t handle, FILE *f);
 	void 		 (*display)			(starpu_data_handle_t handle, FILE *f);
 
 
@@ -493,6 +525,7 @@ struct starpu_data_interface_ops
 	   Describe the data into a string in a brief way, such as one
 	   Describe the data into a string in a brief way, such as one
 	   letter to describe the type of data, and the data
 	   letter to describe the type of data, and the data
 	   dimensions.
 	   dimensions.
+	   This is required for tracing feedback.
 	*/
 	*/
 	starpu_ssize_t	 (*describe)			(void *data_interface, char *buf, size_t size);
 	starpu_ssize_t	 (*describe)			(void *data_interface, char *buf, size_t size);
 
 
@@ -531,6 +564,13 @@ struct starpu_data_interface_ops
 	   copy the data in the buffer but just set count to the size of the
 	   copy the data in the buffer but just set count to the size of the
 	   buffer which would have been allocated. The special value -1
 	   buffer which would have been allocated. The special value -1
 	   indicates the size is yet unknown.
 	   indicates the size is yet unknown.
+
+	   This method (and starpu_data_interface_ops::unpack_data) is required
+	   for disk support if the starpu_data_copy_methods::any_to_any method
+	   is not implemented (because the in-memory data layout is too
+	   complex).
+
+	   This is also required for MPI support if there is no registered MPI data type.
 	*/
 	*/
 	int (*pack_data) (starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count);
 	int (*pack_data) (starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count);
 
 

+ 8 - 4
src/core/perfmodel/perfmodel_bus.c

@@ -593,9 +593,10 @@ static int find_cpu_from_numa_node(hwloc_obj_t obj)
 	{
 	{
 		current = current->first_child;
 		current = current->first_child;
 
 
-		/* If we don't find a "PU" obj before the leave, this means
-		 * hwloc does not know whether there are CPU or not. */
-		STARPU_ASSERT(current);
+                /* If we don't find a "PU" obj before the leaf, perhaps we are
+                 * just not allowed to use it. */
+                if (!current)
+                        return -1;
 	}
 	}
 
 
 	STARPU_ASSERT(current->depth == HWLOC_OBJ_PU);
 	STARPU_ASSERT(current->depth == HWLOC_OBJ_PU);
@@ -620,7 +621,7 @@ static void measure_bandwidth_between_numa_nodes_and_dev(int dev, struct dev_tim
 		dev_timing_per_numanode[timing_numa_index].numa_id = numa_id;
 		dev_timing_per_numanode[timing_numa_index].numa_id = numa_id;
 
 
 		/* Choose one CPU connected to this NUMA node */
 		/* Choose one CPU connected to this NUMA node */
-		unsigned cpu_id = 0;
+		int cpu_id = 0;
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
 		hwloc_obj_t obj = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NUMANODE, numa_id);
 		hwloc_obj_t obj = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NUMANODE, numa_id);
 
 
@@ -638,6 +639,9 @@ static void measure_bandwidth_between_numa_nodes_and_dev(int dev, struct dev_tim
 			cpu_id = find_cpu_from_numa_node(hwloc_get_root_obj(hwtopology));
 			cpu_id = find_cpu_from_numa_node(hwloc_get_root_obj(hwtopology));
 #endif
 #endif
 
 
+		if (cpu_id < 0)
+			continue;
+
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		if (strncmp(type, "CUDA", 4) == 0)
 		if (strncmp(type, "CUDA", 4) == 0)
 			measure_bandwidth_between_host_and_dev_on_numa_with_cuda(dev, numa_id, cpu_id, dev_timing_per_numanode);
 			measure_bandwidth_between_host_and_dev_on_numa_with_cuda(dev, numa_id, cpu_id, dev_timing_per_numanode);

+ 3 - 0
src/core/simgrid.c

@@ -391,6 +391,9 @@ void _starpu_simgrid_init_early(int *argc STARPU_ATTRIBUTE_UNUSED, char ***argv
 #ifndef STARPU_STATIC_ONLY
 #ifndef STARPU_STATIC_ONLY
 		_STARPU_ERROR("Simgrid currently does not support privatization for dynamically-linked libraries in SMPI. Please reconfigure and build StarPU with --disable-shared");
 		_STARPU_ERROR("Simgrid currently does not support privatization for dynamically-linked libraries in SMPI. Please reconfigure and build StarPU with --disable-shared");
 #endif
 #endif
+#ifdef HAVE_MSG_PROCESS_USERDATA_INIT
+		MSG_process_userdata_init();
+#endif
 		void **tsd;
 		void **tsd;
 		_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
 		_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
 		MSG_process_set_data(MSG_process_self(), tsd);
 		MSG_process_set_data(MSG_process_self(), tsd);

+ 1 - 7
src/datawizard/filters.c

@@ -1042,14 +1042,8 @@ void _starpu_data_partition_access_submit(starpu_data_handle_t target, int write
 	_starpu_data_partition_access_look_up(target, NULL, write);
 	_starpu_data_partition_access_look_up(target, NULL, write);
 }
 }
 
 
-/*
- * Given an integer N, NPARTS the number of parts it must be divided in, ID the
- * part currently considered, determines the CHUNK_SIZE and the OFFSET, taking
- * into account the size of the elements stored in the data structure ELEMSIZE
- * and LD, the leading dimension.
- */
 void
 void
-_starpu_filter_nparts_compute_chunk_size_and_offset(unsigned n, unsigned nparts,
+starpu_filter_nparts_compute_chunk_size_and_offset(unsigned n, unsigned nparts,
 					     size_t elemsize, unsigned id,
 					     size_t elemsize, unsigned id,
 					     unsigned ld, unsigned *chunk_size,
 					     unsigned ld, unsigned *chunk_size,
 					     size_t *offset)
 					     size_t *offset)

+ 5 - 0
src/datawizard/interfaces/bcsr_filters.c

@@ -50,3 +50,8 @@ void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_inte
 		matrix_child->ptr = (uintptr_t)&nzval[firstentry + ptr_offset];
 		matrix_child->ptr = (uintptr_t)&nzval[firstentry + ptr_offset];
 	}
 	}
 }
 }
+
+/* Return the interface operations to use for the children produced by
+ * starpu_bcsr_filter_canonical_block(): they are handled as matrices, so
+ * this always returns the matrix operations. Neither the filter F nor the
+ * CHILD index is consulted. */
+struct starpu_data_interface_ops *starpu_bcsr_filter_canonical_block_child_ops(STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, STARPU_ATTRIBUTE_UNUSED unsigned child)
+{
+	return &starpu_interface_matrix_ops;
+}

+ 7 - 7
src/datawizard/interfaces/block_filters.c

@@ -2,7 +2,7 @@
  *
  *
  * Copyright (C) 2012                                     Inria
  * Copyright (C) 2012                                     Inria
  * Copyright (C) 2010,2011,2013,2015,2017                 CNRS
  * Copyright (C) 2010,2011,2013,2015,2017                 CNRS
- * Copyright (C) 2011-2014,2016                           Université de Bordeaux
+ * Copyright (C) 2011-2014,2016, 2019                           Université de Bordeaux
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -35,7 +35,7 @@ void starpu_block_filter_block(void *father_interface, void *child_interface, ST
 
 
 	uint32_t chunk_size;
 	uint32_t chunk_size;
 	size_t offset;
 	size_t offset;
-	_starpu_filter_nparts_compute_chunk_size_and_offset(nx, nparts, elemsize, id, 1,
+	starpu_filter_nparts_compute_chunk_size_and_offset(nx, nparts, elemsize, id, 1,
 				       &chunk_size, &offset);
 				       &chunk_size, &offset);
 
 
 	STARPU_ASSERT_MSG(block_father->id == STARPU_BLOCK_INTERFACE_ID, "%s can only be applied on a block data", __func__);
 	STARPU_ASSERT_MSG(block_father->id == STARPU_BLOCK_INTERFACE_ID, "%s can only be applied on a block data", __func__);
@@ -74,7 +74,7 @@ void starpu_block_filter_block_shadow(void *father_interface, void *child_interf
 
 
 	uint32_t child_nx;
 	uint32_t child_nx;
 	size_t offset;
 	size_t offset;
-	_starpu_filter_nparts_compute_chunk_size_and_offset(nx, nparts, elemsize, id, 1,
+	starpu_filter_nparts_compute_chunk_size_and_offset(nx, nparts, elemsize, id, 1,
 						     &child_nx, &offset);
 						     &child_nx, &offset);
 	
 	
 
 
@@ -111,7 +111,7 @@ void starpu_block_filter_vertical_block(void *father_interface, void *child_inte
 
 
 	uint32_t child_ny;
 	uint32_t child_ny;
 	size_t offset;
 	size_t offset;
-	_starpu_filter_nparts_compute_chunk_size_and_offset(ny, nparts, elemsize, id, block_father->ldy,
+	starpu_filter_nparts_compute_chunk_size_and_offset(ny, nparts, elemsize, id, block_father->ldy,
 				       &child_ny, &offset);
 				       &child_ny, &offset);
 
 
 	STARPU_ASSERT_MSG(block_father->id == STARPU_BLOCK_INTERFACE_ID, "%s can only be applied on a block data", __func__);
 	STARPU_ASSERT_MSG(block_father->id == STARPU_BLOCK_INTERFACE_ID, "%s can only be applied on a block data", __func__);
@@ -151,7 +151,7 @@ void starpu_block_filter_vertical_block_shadow(void *father_interface, void *chi
 	uint32_t child_ny;
 	uint32_t child_ny;
 	size_t offset;
 	size_t offset;
 
 
-	_starpu_filter_nparts_compute_chunk_size_and_offset(ny, nparts, elemsize, id,
+	starpu_filter_nparts_compute_chunk_size_and_offset(ny, nparts, elemsize, id,
 						     block_father->ldy,
 						     block_father->ldy,
 						     &child_ny, &offset);
 						     &child_ny, &offset);
 
 
@@ -189,7 +189,7 @@ void starpu_block_filter_depth_block(void *father_interface, void *child_interfa
 	uint32_t child_nz;
 	uint32_t child_nz;
 	size_t offset;
 	size_t offset;
 
 
-	_starpu_filter_nparts_compute_chunk_size_and_offset(nz, nparts, elemsize, id,
+	starpu_filter_nparts_compute_chunk_size_and_offset(nz, nparts, elemsize, id,
 				       block_father->ldz, &child_nz, &offset);
 				       block_father->ldz, &child_nz, &offset);
 
 
 	STARPU_ASSERT_MSG(block_father->id == STARPU_BLOCK_INTERFACE_ID, "%s can only be applied on a block data", __func__);
 	STARPU_ASSERT_MSG(block_father->id == STARPU_BLOCK_INTERFACE_ID, "%s can only be applied on a block data", __func__);
@@ -229,7 +229,7 @@ void starpu_block_filter_depth_block_shadow(void *father_interface, void *child_
 	uint32_t child_nz;
 	uint32_t child_nz;
 	size_t offset;
 	size_t offset;
 
 
-	_starpu_filter_nparts_compute_chunk_size_and_offset(nz, nparts, elemsize, id,
+	starpu_filter_nparts_compute_chunk_size_and_offset(nz, nparts, elemsize, id,
 						     block_father->ldz,
 						     block_father->ldz,
 						     &child_nz, &offset);
 						     &child_nz, &offset);
 
 

+ 2 - 2
src/datawizard/interfaces/csr_filters.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2008-2011,2013,2014,2016                 Université de Bordeaux
+ * Copyright (C) 2008-2011,2013,2014,2016, 2019                 Université de Bordeaux
  * Copyright (C) 2012                                     Inria
  * Copyright (C) 2012                                     Inria
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010,2011,2013,2015,2017                 CNRS
  * Copyright (C) 2010,2011,2013,2015,2017                 CNRS
@@ -41,7 +41,7 @@ void starpu_csr_filter_vertical_block(void *father_interface, void *child_interf
 	  STARPU_MIN(chunk_size, nrow - id*chunk_size);
 	  STARPU_MIN(chunk_size, nrow - id*chunk_size);
 	/* TODO: the formula for the chunk size is probably wrong: we should
 	/* TODO: the formula for the chunk size is probably wrong: we should
 	 * probably do this instead, and write a test.
 	 * probably do this instead, and write a test.
-	_starpu_filter_nparts_compute_chunk_size_and_offset(nrow, nparts, elemsize,
+	starpu_filter_nparts_compute_chunk_size_and_offset(nrow, nparts, elemsize,
 						     id, 1, &chunk_size, NULL);
 						     id, 1, &chunk_size, NULL);
 	 */
 	 */
 
 

+ 4 - 4
src/datawizard/interfaces/matrix_filters.c

@@ -38,7 +38,7 @@ void starpu_matrix_filter_block(void *father_interface, void *child_interface, S
 	uint32_t child_nx;
 	uint32_t child_nx;
 	size_t offset;
 	size_t offset;
 
 
-	_starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1,
+	starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1,
 						     &child_nx, &offset);
 						     &child_nx, &offset);
 
 
 	STARPU_ASSERT_MSG(matrix_father->id == STARPU_MATRIX_INTERFACE_ID, "%s can only be applied on a matrix data", __func__);
 	STARPU_ASSERT_MSG(matrix_father->id == STARPU_MATRIX_INTERFACE_ID, "%s can only be applied on a matrix data", __func__);
@@ -82,7 +82,7 @@ void starpu_matrix_filter_block_shadow(void *father_interface, void *child_inter
 	uint32_t child_nx;
 	uint32_t child_nx;
 	size_t offset;
 	size_t offset;
 
 
-	_starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1,
+	starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1,
 						     &child_nx, &offset);
 						     &child_nx, &offset);
 
 
 	child_nx += 2 * shadow_size;
 	child_nx += 2 * shadow_size;
@@ -122,7 +122,7 @@ void starpu_matrix_filter_vertical_block(void *father_interface, void *child_int
 	uint32_t child_ny;
 	uint32_t child_ny;
 	size_t offset;
 	size_t offset;
 
 
-	_starpu_filter_nparts_compute_chunk_size_and_offset(ny, nchunks, elemsize, id,
+	starpu_filter_nparts_compute_chunk_size_and_offset(ny, nchunks, elemsize, id,
 						     matrix_father->ld,
 						     matrix_father->ld,
 						     &child_ny, &offset);
 						     &child_ny, &offset);
 
 
@@ -162,7 +162,7 @@ void starpu_matrix_filter_vertical_block_shadow(void *father_interface, void *ch
 	uint32_t child_ny;
 	uint32_t child_ny;
 	size_t offset;
 	size_t offset;
 
 
-	_starpu_filter_nparts_compute_chunk_size_and_offset(ny, nchunks, elemsize, id,
+	starpu_filter_nparts_compute_chunk_size_and_offset(ny, nchunks, elemsize, id,
 						     matrix_father->ld,
 						     matrix_father->ld,
 						     &child_ny, &offset);
 						     &child_ny, &offset);
 	child_ny += 2 * shadow_size;
 	child_ny += 2 * shadow_size;

+ 2 - 2
src/datawizard/interfaces/vector_filters.c

@@ -33,7 +33,7 @@ void starpu_vector_filter_block(void *father_interface, void *child_interface, S
 
 
 	uint32_t child_nx;
 	uint32_t child_nx;
 	size_t offset;
 	size_t offset;
-	_starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1,
+	starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1,
 						     &child_nx, &offset);
 						     &child_nx, &offset);
 
 
 	STARPU_ASSERT_MSG(vector_father->id == STARPU_VECTOR_INTERFACE_ID, "%s can only be applied on a vector data", __func__);
 	STARPU_ASSERT_MSG(vector_father->id == STARPU_VECTOR_INTERFACE_ID, "%s can only be applied on a vector data", __func__);
@@ -68,7 +68,7 @@ void starpu_vector_filter_block_shadow(void *father_interface, void *child_inter
 
 
 	uint32_t child_nx;
 	uint32_t child_nx;
 	size_t offset;
 	size_t offset;
-	_starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1,
+	starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1,
 						     &child_nx, &offset);
 						     &child_nx, &offset);
 	child_nx += 2*shadow_size;
 	child_nx += 2*shadow_size;
 
 

+ 9 - 4
src/profiling/bound.c

@@ -2,7 +2,7 @@
  *
  *
  * Copyright (C) 2011,2012,2014                           Inria
  * Copyright (C) 2011,2012,2014                           Inria
  * Copyright (C) 2010-2017                                Université de Bordeaux
  * Copyright (C) 2010-2017                                Université de Bordeaux
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2017, 2019                          CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2011                                     Télécom-SudParis
  *
  *
@@ -378,7 +378,7 @@ static struct bound_task *find_job(unsigned long id)
 }
 }
 
 
 /* Job J depends on previous job of id ID (which is already finished) */
 /* Job J depends on previous job of id ID (which is already finished) */
-void _starpu_bound_job_id_dep(starpu_data_handle_t handle, struct _starpu_job *j, unsigned long id)
+void _starpu_bound_job_id_dep_size(size_t size, struct _starpu_job *j, unsigned long id)
 {
 {
 	struct bound_task *t, *dep_t;
 	struct bound_task *t, *dep_t;
 	int i;
 	int i;
@@ -410,7 +410,7 @@ void _starpu_bound_job_id_dep(starpu_data_handle_t handle, struct _starpu_job *j
 		if (t->deps[i].dep == dep_t)
 		if (t->deps[i].dep == dep_t)
 		{
 		{
 			/* Found, just add size */
 			/* Found, just add size */
-			t->deps[i].size += _starpu_data_get_size(handle);
+			t->deps[i].size += size;
 			break;
 			break;
 		}
 		}
 	if (i == t->depsn)
 	if (i == t->depsn)
@@ -418,11 +418,16 @@ void _starpu_bound_job_id_dep(starpu_data_handle_t handle, struct _starpu_job *j
 		/* Not already there, add */
 		/* Not already there, add */
 		_STARPU_REALLOC(t->deps, ++t->depsn * sizeof(t->deps[0]));
 		_STARPU_REALLOC(t->deps, ++t->depsn * sizeof(t->deps[0]));
 		t->deps[t->depsn-1].dep = dep_t;
 		t->deps[t->depsn-1].dep = dep_t;
-		t->deps[t->depsn-1].size = _starpu_data_get_size(handle);
+		t->deps[t->depsn-1].size = size;
 	}
 	}
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 }
 }
 
 
+/* Same as _starpu_bound_job_id_dep_size(), with the size taken from HANDLE */
+void _starpu_bound_job_id_dep(starpu_data_handle_t handle, struct _starpu_job *j, unsigned long id)
+{
+	size_t handle_size = _starpu_data_get_size(handle);
+
+	_starpu_bound_job_id_dep_size(handle_size, j, id);
+}
+
 void starpu_bound_stop(void)
 void starpu_bound_stop(void)
 {
 {
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);

+ 9 - 2
tools/starpu_smpirun.in

@@ -3,7 +3,7 @@
 #
 #
 # Copyright (C) 2017                                     CNRS
 # Copyright (C) 2017                                     CNRS
 # Copyright (C) 2016                                     Inria
 # Copyright (C) 2016                                     Inria
-# Copyright (C) 2014-2016                                Université de Bordeaux
+# Copyright (C) 2014-2016,2019                            Université de Bordeaux
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
 # it under the terms of the GNU Lesser General Public License as published by
@@ -42,6 +42,13 @@ else
 	DASH=_
 	DASH=_
 fi
 fi
 
 
+if [ "$SMPI_MAJOR" -ge 4 -o \( "$SMPI_MAJOR" = 3 -a "$SMPI_MINOR" -ge 16 \) ]
+then
+	PRIV_OPT="--cfg=smpi/privatization:yes"
+else
+	PRIV_OPT="--cfg=smpi/privatize${DASH}global${DASH}variables:yes"
+fi
+
 # When executed from source, take xslt from source
 # When executed from source, take xslt from source
 [ "$0" -ef $BUILDDIR/starpu_smpirun ] && STARPU_XSLTDIR=$SOURCE_DATADIR
 [ "$0" -ef $BUILDDIR/starpu_smpirun ] && STARPU_XSLTDIR=$SOURCE_DATADIR
 
 
@@ -115,7 +122,7 @@ EOF
 
 
 STACKSIZE=$(ulimit -s)
 STACKSIZE=$(ulimit -s)
 [ "$STACKSIZE" != unlimited ] || STACKSIZE=8192
 [ "$STACKSIZE" != unlimited ] || STACKSIZE=8192
-$SMPIRUN -platform $PLATFORM -hostfile $MPI_HOSTFILE -np $NP "$@" --cfg=smpi/privatize${DASH}global${DASH}variables:yes --cfg=smpi/simulate${DASH}computation:no --cfg=contexts/stack${DASH}size:$STACKSIZE
+$SMPIRUN -platform $PLATFORM -hostfile $MPI_HOSTFILE -np $NP "$@" $PRIV_OPT --cfg=smpi/simulate${DASH}computation:no --cfg=contexts/stack${DASH}size:$STACKSIZE
 RET=$?
 RET=$?
 
 
 rm -f $PLATFORM
 rm -f $PLATFORM