소스 검색

Merge remote-tracking branch 'gitlab/master' into ft_checkpoint

# Conflicts:
#	mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
#	mpi/examples/matrix_decomposition/mpi_decomposition_params.c
Romain LION 5 년 전
부모
커밋
c079ddad21

+ 16 - 7
configure.ac

@@ -892,8 +892,8 @@ AC_CHECK_FUNCS([mkdtemp])
 
 AC_CHECK_FUNCS([pread pwrite])
 
-AC_ARG_ENABLE(hdf5, [AS_HELP_STRING([--disable-hdf5], [disable HDF5 support])],
-                    enable_hdf5=$enableval, enable_hdf5=maybe)
+AC_ARG_ENABLE(hdf5, [AS_HELP_STRING([--enable-hdf5], [enable HDF5 support])],
+                    enable_hdf5=$enableval, enable_hdf5=no)
 
 if test "x$enable_hdf5" != xno ; then
 	AC_ARG_WITH(hdf5-include-dir,
@@ -952,8 +952,11 @@ fi
 
 if test  "x$enable_libraries_hdf5" = "xyes" -a "x$enable_include_hdf5" = "xyes" -a "x$enable_hdf5" != "xno"; then
         AC_DEFINE([STARPU_HAVE_HDF5], [1], [Define to 1 if you have the <hdf5.h> header file.])
+	enable_hdf5=yes
+else
+	enable_hdf5=no
 fi
-AM_CONDITIONAL(STARPU_HAVE_HDF5, test  "x$enable_libraries_hdf5" = "xyes" -a "x$enable_include_hdf5" = "xyes" -a "x$enable_hdf5" != "xno")
+AM_CONDITIONAL(STARPU_HAVE_HDF5, test "x$enable_hdf5" = "xyes")
 
 
 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
@@ -2449,8 +2452,8 @@ AC_SUBST(STARPU_EXPORT_DYNAMIC)
 # Computes the maximum number of different kernels a message-passing sink
 # can lookup for and launch.
 AC_MSG_CHECKING(Maximum number of message-passing kernels)
-AC_ARG_ENABLE(maxmpkernels, [AS_HELP_STRING([
-	      -enable-maxmpkernels=<number>],
+AC_ARG_ENABLE(maxmpkernels, [AS_HELP_STRING(
+	      [--enable-maxmpkernels=<number>],
 	      [maximum number of kernels a message-passing sink can lookup
 	      for and execute])],
 	      maxmpkernels=$enableval, maxmpkernels=10)
@@ -3037,6 +3040,9 @@ fi
 AC_ARG_ENABLE(mlr, [AS_HELP_STRING([--disable-mlr],
 			[Disable multiple linear regression models])],
 			enable_mlr=$enableval, enable_mlr=$default_enable_mlr)
+AC_ARG_ENABLE(mlr-system-blas, [AS_HELP_STRING([--enable-mlr-system-blas],
+			[Make the multiple linear regression models use the system BLAS instead of min-dgels])],
+			enable_mlr_blas=$enableval, enable_mlr_blas=no)
 
 AC_MSG_CHECKING(whether multiple linear regression models are disabled)
 if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
@@ -3047,11 +3053,11 @@ if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
 	if test x$blas_lib = xnone ; then
 	   use_system_lapack=no
 	fi
-	if test x$use_system_lapack = xyes; then
+	if test x$enable_mlr_blas = xyes -a x$use_system_lapack = xyes; then
 	   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use reflapack library])
 		LDFLAGS="-llapack $LDFLAGS"
 	else
-		if test x$blas_lib = xmkl; then
+		if test x$enable_mlr_blas = xyes -a x$blas_lib = xmkl; then
 		   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use mkl library])
 		else
 			AC_MSG_CHECKING(whether min-dgels is linked)
@@ -3070,6 +3076,7 @@ if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
 				else
 					if test ! -d $PWD/min-dgels; then
 						cp -r $srcdir/min-dgels $PWD/
+						chmod +rwX -R $PWD/min-dgels
 					fi
 					AC_MSG_RESULT(yes)
 					DGELS_LIBS="-Wl,--start-group $STARPU_BUILD_DIR/min-dgels/build/minlibblas.a $STARPU_BUILD_DIR/min-dgels/build/minlibdgels.a $STARPU_BUILD_DIR/min-dgels/build/minlibf2c.a -Wl,--end-group"
@@ -3557,6 +3564,7 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh tests/microbenchs/
+  test -e tests/microbenchs/bandwidth_scheds.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/bandwidth_scheds.sh tests/microbenchs/
   mkdir -p tests/energy
   test -e tests/energy/static.sh || ln -sf $ac_abs_top_srcdir/tests/energy/static.sh tests/energy/
   test -e tests/energy/dynamic.sh || ln -sf $ac_abs_top_srcdir/tests/energy/dynamic.sh tests/energy/
@@ -3732,6 +3740,7 @@ AC_MSG_NOTICE([
                Scheduler Hypervisor:                          $build_sc_hypervisor
                simgrid enabled:                               $enable_simgrid
                ayudame enabled:                               $ayu_msg
+               HDF5 enabled:                                  $enable_hdf5
 	       Native fortran support:                        $enable_build_fortran
 	       Native MPI fortran support:                    $use_mpi_fort
 	       Support for multiple linear regression models: $support_mlr

+ 5 - 1
doc/doxygen/chapters/370_online_performance_tools.doxy

@@ -375,7 +375,11 @@ parameter are stored in <c>.starpu/sampling/codelets/tmp/</c>
 directory. These files are reused when \ref STARPU_CALIBRATE
 environment variable is set to <c>1</c>, to recompute coefficients
 based on the current, but also on the previous
-executions. Additionally, when multiple linear regression models are
+executions. By default StarPU uses a lightweight bundled dgels implementation, but the
+\ref enable-mlr-system-blas "--enable-mlr-system-blas" configure option can be
+used to make StarPU use the dgels routine of a system-provided BLAS/LAPACK library instead.
+
+Additionally, when multiple linear regression models are
 disabled (using \ref disable-mlr "--disable-mlr" configure option) or when the
 <c>model->combinations</c> are not defined, StarPU will still write
 output files into <c>.starpu/sampling/codelets/tmp/</c> to allow

+ 12 - 4
doc/doxygen/chapters/510_configure_options.doxy

@@ -571,11 +571,11 @@ Specify the blas library to be used by some of the examples. Librairies availabl
 Enable linking with LevelDB if available
 </dd>
 
-<dt>--disable-hdf5</dt>
+<dt>--enable-hdf5</dt>
 <dd>
-\anchor disable-hdf5
-\addindex __configure__--disable-hdf5
-Disable building HDF5 support.
+\anchor enable-hdf5
+\addindex __configure__--enable-hdf5
+Enable building HDF5 support.
 </dd>
 
 <dt>--with-hdf5-include-dir=<c>path</c></dt>
@@ -768,6 +768,14 @@ this parameter is 10. Experimental.
 Allow to disable multiple linear regression models (see \ref PerformanceModelExample)
 </dd>
 
+<dt>--enable-mlr-system-blas</dt>
+<dd>
+\anchor enable-mlr-system-blas
+\addindex __configure__--enable-mlr-system-blas
+Make multiple linear regression models use the system-provided BLAS for dgels instead of
+the bundled min-dgels implementation, which is the default (see \ref PerformanceModelExample)
+</dd>
+
 </dl>
 
 */

+ 4 - 0
min-dgels/Makefile.in

@@ -14,6 +14,10 @@ all:
 	cd $(ADDITIONAL) && $(CC) -c -fPIC *.c && ar cr ../build/minlibdgels.a *.o && ranlib ../build/minlibdgels.a
 
 install:
+installcheck:
+uninstall:
+distuninstallcheck:
+dvi:
 
 clean:
 	-cd $(CLAPACK) && $(MAKE) clean && rm -rf *~

+ 0 - 152
min-dgels/additional/blaswrap.h

@@ -5,156 +5,4 @@
 #ifndef __BLASWRAP_H
 #define __BLASWRAP_H
 
-#ifndef NO_BLAS_WRAP
- 
-/* BLAS1 routines */
-#define _starpu_srotg_ f2c_srotg
-#define _starpu_crotg_ f2c_crotg
-#define _starpu_drotg_ f2c_drotg
-#define _starpu_zrotg_ f2c_zrotg
-#define _starpu_srotmg_ f2c_srotmg
-#define _starpu_drotmg_ f2c_drotmg
-#define _starpu_srot_ f2c_srot
-#define _starpu_drot_ f2c_drot
-#define _starpu_srotm_ f2c_srotm
-#define _starpu_drotm_ f2c_drotm
-#define _starpu_sswap_ f2c_sswap
-#define _starpu_dswap_ f2c_dswap
-#define _starpu_cswap_ f2c_cswap
-#define _starpu_zswap_ f2c_zswap
-#define _starpu_sscal_ f2c_sscal
-#define _starpu_dscal_ f2c_dscal
-#define _starpu_cscal_ f2c_cscal
-#define _starpu_zscal_ f2c_zscal
-#define _starpu_csscal_ f2c_csscal
-#define _starpu_zdscal_ f2c_zdscal
-#define _starpu_scopy_ f2c_scopy
-#define _starpu_dcopy_ f2c_dcopy
-#define _starpu_ccopy_ f2c_ccopy
-#define _starpu_zcopy_ f2c_zcopy
-#define _starpu_saxpy_ f2c_saxpy
-#define _starpu_daxpy_ f2c_daxpy
-#define _starpu_caxpy_ f2c_caxpy
-#define _starpu_zaxpy_ f2c_zaxpy
-#define _starpu_sdot_ f2c_sdot
-#define _starpu_ddot_ f2c_ddot
-#define _starpu_cdotu_ f2c_cdotu
-#define _starpu_zdotu_ f2c_zdotu
-#define _starpu_cdotc_ f2c_cdotc
-#define _starpu_zdotc_ f2c_zdotc
-#define _starpu_snrm2_ f2c_snrm2
-#define _starpu_dnrm2_ f2c_dnrm2
-#define _starpu_scnrm2_ f2c_scnrm2
-#define _starpu_dznrm2_ f2c_dznrm2
-#define _starpu_sasum_ f2c_sasum
-#define _starpu_dasum_ f2c_dasum
-#define _starpu_scasum_ f2c_scasum
-#define _starpu_dzasum_ f2c_dzasum
-#define _starpu_isamax_ f2c_isamax
-#define _starpu_idamax_ f2c_idamax
-#define _starpu_icamax_ f2c_icamax
-#define _starpu_izamax_ f2c_izamax
- 
-/* BLAS2 routines */
-#define _starpu_sgemv_ f2c_sgemv
-#define _starpu_dgemv_ f2c_dgemv
-#define _starpu_cgemv_ f2c_cgemv
-#define _starpu_zgemv_ f2c_zgemv
-#define _starpu_sgbmv_ f2c_sgbmv
-#define _starpu_dgbmv_ f2c_dgbmv
-#define _starpu_cgbmv_ f2c_cgbmv
-#define _starpu_zgbmv_ f2c_zgbmv
-#define _starpu_chemv_ f2c_chemv
-#define _starpu_zhemv_ f2c_zhemv
-#define _starpu_chbmv_ f2c_chbmv
-#define _starpu_zhbmv_ f2c_zhbmv
-#define _starpu_chpmv_ f2c_chpmv
-#define _starpu_zhpmv_ f2c_zhpmv
-#define _starpu_ssymv_ f2c_ssymv
-#define _starpu_dsymv_ f2c_dsymv
-#define _starpu_ssbmv_ f2c_ssbmv
-#define _starpu_dsbmv_ f2c_dsbmv
-#define _starpu_sspmv_ f2c_sspmv
-#define _starpu_dspmv_ f2c_dspmv
-#define _starpu_strmv_ f2c_strmv
-#define _starpu_dtrmv_ f2c_dtrmv
-#define _starpu_ctrmv_ f2c_ctrmv
-#define _starpu_ztrmv_ f2c_ztrmv
-#define _starpu_stbmv_ f2c_stbmv
-#define _starpu_dtbmv_ f2c_dtbmv
-#define _starpu_ctbmv_ f2c_ctbmv
-#define _starpu_ztbmv_ f2c_ztbmv
-#define _starpu_stpmv_ f2c_stpmv
-#define _starpu_dtpmv_ f2c_dtpmv
-#define _starpu_ctpmv_ f2c_ctpmv
-#define _starpu_ztpmv_ f2c_ztpmv
-#define _starpu_strsv_ f2c_strsv
-#define _starpu_dtrsv_ f2c_dtrsv
-#define _starpu_ctrsv_ f2c_ctrsv
-#define _starpu_ztrsv_ f2c_ztrsv
-#define _starpu_stbsv_ f2c_stbsv
-#define _starpu_dtbsv_ f2c_dtbsv
-#define _starpu_ctbsv_ f2c_ctbsv
-#define _starpu_ztbsv_ f2c_ztbsv
-#define _starpu_stpsv_ f2c_stpsv
-#define _starpu_dtpsv_ f2c_dtpsv
-#define _starpu_ctpsv_ f2c_ctpsv
-#define _starpu_ztpsv_ f2c_ztpsv
-#define _starpu_sger_ f2c_sger
-#define _starpu_dger_ f2c_dger
-#define _starpu_cgeru_ f2c_cgeru
-#define _starpu_zgeru_ f2c_zgeru
-#define _starpu_cgerc_ f2c_cgerc
-#define _starpu_zgerc_ f2c_zgerc
-#define _starpu_cher_ f2c_cher
-#define _starpu_zher_ f2c_zher
-#define _starpu_chpr_ f2c_chpr
-#define _starpu_zhpr_ f2c_zhpr
-#define _starpu_cher2_ f2c_cher2
-#define _starpu_zher2_ f2c_zher2
-#define _starpu_chpr2_ f2c_chpr2
-#define _starpu_zhpr2_ f2c_zhpr2
-#define _starpu_ssyr_ f2c_ssyr
-#define _starpu_dsyr_ f2c_dsyr
-#define _starpu_sspr_ f2c_sspr
-#define _starpu_dspr_ f2c_dspr
-#define _starpu_ssyr2_ f2c_ssyr2
-#define _starpu_dsyr2_ f2c_dsyr2
-#define _starpu_sspr2_ f2c_sspr2
-#define _starpu_dspr2_ f2c_dspr2
- 
-/* BLAS3 routines */
-#define _starpu_sgemm_ f2c_sgemm
-#define _starpu_dgemm_ f2c_dgemm
-#define _starpu_cgemm_ f2c_cgemm
-#define _starpu_zgemm_ f2c_zgemm
-#define _starpu_ssymm_ f2c_ssymm
-#define _starpu_dsymm_ f2c_dsymm
-#define _starpu_csymm_ f2c_csymm
-#define _starpu_zsymm_ f2c_zsymm
-#define _starpu_chemm_ f2c_chemm
-#define _starpu_zhemm_ f2c_zhemm
-#define _starpu_ssyrk_ f2c_ssyrk
-#define _starpu_dsyrk_ f2c_dsyrk
-#define _starpu_csyrk_ f2c_csyrk
-#define _starpu_zsyrk_ f2c_zsyrk
-#define _starpu_cherk_ f2c_cherk
-#define _starpu_zherk_ f2c_zherk
-#define _starpu_ssyr2k_ f2c_ssyr2k
-#define _starpu_dsyr2k_ f2c_dsyr2k
-#define _starpu_csyr2k_ f2c_csyr2k
-#define _starpu_zsyr2k_ f2c_zsyr2k
-#define _starpu_cher2k_ f2c_cher2k
-#define _starpu_zher2k_ f2c_zher2k
-#define _starpu_strmm_ f2c_strmm
-#define _starpu_dtrmm_ f2c_dtrmm
-#define _starpu_ctrmm_ f2c_ctrmm
-#define _starpu_ztrmm_ f2c_ztrmm
-#define _starpu_strsm_ f2c_strsm
-#define _starpu_dtrsm_ f2c_dtrsm
-#define _starpu_ctrsm_ f2c_ctrsm
-#define _starpu_ztrsm_ f2c_ztrsm
-
-#endif /* NO_BLAS_WRAP */
-
 #endif /* __BLASWRAP_H */

+ 0 - 152
min-dgels/base/INCLUDE/blaswrap.h

@@ -5,156 +5,4 @@
 #ifndef __BLASWRAP_H
 #define __BLASWRAP_H
 
-#ifndef NO_BLAS_WRAP
- 
-/* BLAS1 routines */
-#define _starpu_srotg_ f2c_srotg
-#define _starpu_crotg_ f2c_crotg
-#define _starpu_drotg_ f2c_drotg
-#define _starpu_zrotg_ f2c_zrotg
-#define _starpu_srotmg_ f2c_srotmg
-#define _starpu_drotmg_ f2c_drotmg
-#define _starpu_srot_ f2c_srot
-#define _starpu_drot_ f2c_drot
-#define _starpu_srotm_ f2c_srotm
-#define _starpu_drotm_ f2c_drotm
-#define _starpu_sswap_ f2c_sswap
-#define _starpu_dswap_ f2c_dswap
-#define _starpu_cswap_ f2c_cswap
-#define _starpu_zswap_ f2c_zswap
-#define _starpu_sscal_ f2c_sscal
-#define _starpu_dscal_ f2c_dscal
-#define _starpu_cscal_ f2c_cscal
-#define _starpu_zscal_ f2c_zscal
-#define _starpu_csscal_ f2c_csscal
-#define _starpu_zdscal_ f2c_zdscal
-#define _starpu_scopy_ f2c_scopy
-#define _starpu_dcopy_ f2c_dcopy
-#define _starpu_ccopy_ f2c_ccopy
-#define _starpu_zcopy_ f2c_zcopy
-#define _starpu_saxpy_ f2c_saxpy
-#define _starpu_daxpy_ f2c_daxpy
-#define _starpu_caxpy_ f2c_caxpy
-#define _starpu_zaxpy_ f2c_zaxpy
-#define _starpu_sdot_ f2c_sdot
-#define _starpu_ddot_ f2c_ddot
-#define _starpu_cdotu_ f2c_cdotu
-#define _starpu_zdotu_ f2c_zdotu
-#define _starpu_cdotc_ f2c_cdotc
-#define _starpu_zdotc_ f2c_zdotc
-#define _starpu_snrm2_ f2c_snrm2
-#define _starpu_dnrm2_ f2c_dnrm2
-#define _starpu_scnrm2_ f2c_scnrm2
-#define _starpu_dznrm2_ f2c_dznrm2
-#define _starpu_sasum_ f2c_sasum
-#define _starpu_dasum_ f2c_dasum
-#define _starpu_scasum_ f2c_scasum
-#define _starpu_dzasum_ f2c_dzasum
-#define _starpu_isamax_ f2c_isamax
-#define _starpu_idamax_ f2c_idamax
-#define _starpu_icamax_ f2c_icamax
-#define _starpu_izamax_ f2c_izamax
- 
-/* BLAS2 routines */
-#define _starpu_sgemv_ f2c_sgemv
-#define _starpu_dgemv_ f2c_dgemv
-#define _starpu_cgemv_ f2c_cgemv
-#define _starpu_zgemv_ f2c_zgemv
-#define _starpu_sgbmv_ f2c_sgbmv
-#define _starpu_dgbmv_ f2c_dgbmv
-#define _starpu_cgbmv_ f2c_cgbmv
-#define _starpu_zgbmv_ f2c_zgbmv
-#define _starpu_chemv_ f2c_chemv
-#define _starpu_zhemv_ f2c_zhemv
-#define _starpu_chbmv_ f2c_chbmv
-#define _starpu_zhbmv_ f2c_zhbmv
-#define _starpu_chpmv_ f2c_chpmv
-#define _starpu_zhpmv_ f2c_zhpmv
-#define _starpu_ssymv_ f2c_ssymv
-#define _starpu_dsymv_ f2c_dsymv
-#define _starpu_ssbmv_ f2c_ssbmv
-#define _starpu_dsbmv_ f2c_dsbmv
-#define _starpu_sspmv_ f2c_sspmv
-#define _starpu_dspmv_ f2c_dspmv
-#define _starpu_strmv_ f2c_strmv
-#define _starpu_dtrmv_ f2c_dtrmv
-#define _starpu_ctrmv_ f2c_ctrmv
-#define _starpu_ztrmv_ f2c_ztrmv
-#define _starpu_stbmv_ f2c_stbmv
-#define _starpu_dtbmv_ f2c_dtbmv
-#define _starpu_ctbmv_ f2c_ctbmv
-#define _starpu_ztbmv_ f2c_ztbmv
-#define _starpu_stpmv_ f2c_stpmv
-#define _starpu_dtpmv_ f2c_dtpmv
-#define _starpu_ctpmv_ f2c_ctpmv
-#define _starpu_ztpmv_ f2c_ztpmv
-#define _starpu_strsv_ f2c_strsv
-#define _starpu_dtrsv_ f2c_dtrsv
-#define _starpu_ctrsv_ f2c_ctrsv
-#define _starpu_ztrsv_ f2c_ztrsv
-#define _starpu_stbsv_ f2c_stbsv
-#define _starpu_dtbsv_ f2c_dtbsv
-#define _starpu_ctbsv_ f2c_ctbsv
-#define _starpu_ztbsv_ f2c_ztbsv
-#define _starpu_stpsv_ f2c_stpsv
-#define _starpu_dtpsv_ f2c_dtpsv
-#define _starpu_ctpsv_ f2c_ctpsv
-#define _starpu_ztpsv_ f2c_ztpsv
-#define _starpu_sger_ f2c_sger
-#define _starpu_dger_ f2c_dger
-#define _starpu_cgeru_ f2c_cgeru
-#define _starpu_zgeru_ f2c_zgeru
-#define _starpu_cgerc_ f2c_cgerc
-#define _starpu_zgerc_ f2c_zgerc
-#define _starpu_cher_ f2c_cher
-#define _starpu_zher_ f2c_zher
-#define _starpu_chpr_ f2c_chpr
-#define _starpu_zhpr_ f2c_zhpr
-#define _starpu_cher2_ f2c_cher2
-#define _starpu_zher2_ f2c_zher2
-#define _starpu_chpr2_ f2c_chpr2
-#define _starpu_zhpr2_ f2c_zhpr2
-#define _starpu_ssyr_ f2c_ssyr
-#define _starpu_dsyr_ f2c_dsyr
-#define _starpu_sspr_ f2c_sspr
-#define _starpu_dspr_ f2c_dspr
-#define _starpu_ssyr2_ f2c_ssyr2
-#define _starpu_dsyr2_ f2c_dsyr2
-#define _starpu_sspr2_ f2c_sspr2
-#define _starpu_dspr2_ f2c_dspr2
- 
-/* BLAS3 routines */
-#define _starpu_sgemm_ f2c_sgemm
-#define _starpu_dgemm_ f2c_dgemm
-#define _starpu_cgemm_ f2c_cgemm
-#define _starpu_zgemm_ f2c_zgemm
-#define _starpu_ssymm_ f2c_ssymm
-#define _starpu_dsymm_ f2c_dsymm
-#define _starpu_csymm_ f2c_csymm
-#define _starpu_zsymm_ f2c_zsymm
-#define _starpu_chemm_ f2c_chemm
-#define _starpu_zhemm_ f2c_zhemm
-#define _starpu_ssyrk_ f2c_ssyrk
-#define _starpu_dsyrk_ f2c_dsyrk
-#define _starpu_csyrk_ f2c_csyrk
-#define _starpu_zsyrk_ f2c_zsyrk
-#define _starpu_cherk_ f2c_cherk
-#define _starpu_zherk_ f2c_zherk
-#define _starpu_ssyr2k_ f2c_ssyr2k
-#define _starpu_dsyr2k_ f2c_dsyr2k
-#define _starpu_csyr2k_ f2c_csyr2k
-#define _starpu_zsyr2k_ f2c_zsyr2k
-#define _starpu_cher2k_ f2c_cher2k
-#define _starpu_zher2k_ f2c_zher2k
-#define _starpu_strmm_ f2c_strmm
-#define _starpu_dtrmm_ f2c_dtrmm
-#define _starpu_ctrmm_ f2c_ctrmm
-#define _starpu_ztrmm_ f2c_ztrmm
-#define _starpu_strsm_ f2c_strsm
-#define _starpu_dtrsm_ f2c_dtrsm
-#define _starpu_ctrsm_ f2c_ctrsm
-#define _starpu_ztrsm_ f2c_ztrsm
-
-#endif /* NO_BLAS_WRAP */
-
 #endif /* __BLASWRAP_H */

+ 63 - 37
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -198,7 +198,7 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
 /* TODO: generate from compiler polyhedral analysis of classical algorithm */
 static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, int nodes)
 {
-	unsigned k, m, n;
+	unsigned k, m, n, i;
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 	unsigned nn = size/nblocks;
 
@@ -222,6 +222,13 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
 						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
 						       0);
+
+				if (m == n)
+				{
+					/* Nobody else will need it */
+					starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
+					starpu_data_wont_use(data_handles[m][k]);
+				}
 			}
 			k = n;
 			if (m > n)
@@ -243,27 +250,26 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 						       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
 						       0);
 			}
+
 		}
+
+		/* We won't need it any more */
+		starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
+		starpu_data_wont_use(data_handles[n][n]);
+
 		if (n%checkpoint_period==checkpoint_period-1)
 			starpu_mpi_submit_checkpoint_template(*checkpoint_p, (int)(nblocks - 2*n));
+
 		starpu_iteration_pop();
 	}
 
-	/* Submit flushes, StarPU will fit them according to the progress */
-	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
-	for (m = 0; m < nblocks; m++)
-		for (n = 0; n < nblocks ; n++)
-		{
-			if (starpu_data_get_home_node(data_handles[m][n])>=0)
-				starpu_data_wont_use(data_handles[m][n]);
-		}
 }
 
 /* TODO: generate from compiler polyhedral analysis of classical algorithm */
 static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int rank, int nodes)
 {
 	unsigned a, c;
-	unsigned k, m, n;
+	unsigned k, m, n, i;
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 	unsigned nn = size/nblocks;
 
@@ -301,6 +307,13 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
 						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
 						       0);
+
+				if (m == nblocks-1)
+				{
+					/* Nobody else will need it */
+					starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
+					starpu_data_wont_use(data_handles[n][k]);
+				}
 			}
 
 			/* k = n */
@@ -323,6 +336,13 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 						       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
 						       0);
 			}
+
+			if (m == nblocks - 1)
+			{
+				/* We do not need the potrf result any more */
+				starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
+				starpu_data_wont_use(data_handles[n][n]);
+			}
 		}
 
 		/* column within second antidiagonal for a */
@@ -345,6 +365,13 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
 						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
 						       0);
+
+				if (m == nblocks-1)
+				{
+					/* Nobody else will need it */
+					starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
+					starpu_data_wont_use(data_handles[n][k]);
+				}
 			}
 			/* non-diagonal block, solve */
 			k = n;
@@ -354,21 +381,19 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 					       STARPU_RW, data_handles[m][k],
 					       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
 					       0);
+
+			if (m == nblocks - 1)
+			{
+				/* We do not need the potrf result any more */
+				starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
+				starpu_data_wont_use(data_handles[n][n]);
+			}
 		}
 
 		if (a%checkpoint_period==checkpoint_period-1)
 			starpu_mpi_submit_checkpoint_template(*checkpoint_p, (int)(2*nblocks -4*a));
 		starpu_iteration_pop();
 	}
-
-	/* Submit flushes, StarPU will fit them according to the progress */
-	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
-	for (m = 0; m < nblocks; m++)
-		for (n = 0; n < nblocks ; n++)
-		{
-			if (starpu_data_get_home_node(data_handles[m][n])>=0)
-				starpu_data_wont_use(data_handles[m][n]);
-		}
 }
 
 /* TODO: generate from compiler polyhedral analysis of classical algorithm */
@@ -380,7 +405,7 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 	unsigned nn = size/nblocks;
 
 	/*
-	 * This is basically similar to above, except that we shift k according to the priorities set in the algorithm, so that prio ~ 2*a or 2*a+1
+	 * This is basically similar to above, except that we shift k according to the priorities set in the algorithm, so that gemm prio ~= 2*nblocks - a
 	 * double-antidiagonal number:
 	 * - a=0 contains (0,0) plus (1,0)
 	 * - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
@@ -394,16 +419,13 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 	{
 		starpu_iteration_push(a);
 
-		for (k = 0; k < nblocks; k++)
+		for (k = 0; k < (int) nblocks; k++)
 		{
 			n = k;
 			/* Should be m = a-k-n; for potrf and trsm to respect
 			   priorities, but needs to be this for dependencies */
 			m = a-2*k-n;
 
-			if (m < 0 || m >= nblocks)
-				continue;
-
 			if (m == n)
 			{
 				/* diagonal block, factorize */
@@ -413,7 +435,7 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 						       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
 						       0);
 			}
-			else
+			else if (m >= n && m < (int) nblocks)
 			{
 				/* non-diagonal block, solve */
 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
@@ -424,13 +446,20 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 						       0);
 			}
 
+			if (m == (int) nblocks - 1)
+			{
+				/* We do not need the potrf result any more */
+				starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
+				starpu_data_wont_use(data_handles[n][n]);
+			}
+
 			/* column within antidiagonal for a */
-			for (n = k + 1; n < nblocks; n++)
+			for (n = k + 1; n < (int) nblocks; n++)
 			{
 				/* row */
 				m = a-2*k-n;
 
-				if (m >= n && m < nblocks)
+				if (m >= n && m < (int) nblocks)
 				{
 					/* Update */
 					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
@@ -440,6 +469,12 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
 							       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
 							       0);
+					if (m == (int) nblocks - 1)
+					{
+						/* Nobody else will need it */
+						starpu_data_wont_use(data_handles[n][k]);
+						starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
+					}
 				}
 			}
 
@@ -449,15 +484,6 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 			starpu_mpi_submit_checkpoint_template(*checkpoint_p, (int)(2*nblocks - a));
 		starpu_iteration_pop();
 	}
-
-	/* Submit flushes, StarPU will fit them according to the progress */
-	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
-	for (m = 0; m < nblocks; m++)
-		for (n = 0; n < nblocks ; n++)
-		{
-			if (starpu_data_get_home_node(data_handles[m][n])>=0)
-				starpu_data_wont_use(data_handles[m][n]);
-		}
 }
 
 /*
@@ -546,7 +572,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	if (rank == 0)
 	{
 		*timing = end - start;
-		*flops = (1.0f*size*size*size)/3.0f;
+		*flops = FLOPS_SPOTRF(size);
 	}
 }
 

+ 15 - 12
mpi/examples/matrix_decomposition/mpi_decomposition_params.c

@@ -57,69 +57,72 @@ void parse_args(int argc, char **argv, int nodes)
                         size = strtol(argv[++i], &argptr, 10);
                 }
 
-                if (strcmp(argv[i], "-dblockx") == 0)
+                else if (strcmp(argv[i], "-dblockx") == 0)
                 {
                         char *argptr;
                         dblockx = strtol(argv[++i], &argptr, 10);
                 }
 
-                if (strcmp(argv[i], "-dblocky") == 0)
+                else if (strcmp(argv[i], "-dblocky") == 0)
                 {
                         char *argptr;
                         dblocky = strtol(argv[++i], &argptr, 10);
                 }
 
-                if (strcmp(argv[i], "-nblocks") == 0)
+                else if (strcmp(argv[i], "-nblocks") == 0)
                 {
                         char *argptr;
                         nblocks = strtol(argv[++i], &argptr, 10);
                 }
 
-                if (strcmp(argv[i], "-nbigblocks") == 0)
+                else if (strcmp(argv[i], "-nbigblocks") == 0)
                 {
                         char *argptr;
                         nbigblocks = strtol(argv[++i], &argptr, 10);
                 }
 
-                if (strcmp(argv[i], "-columns") == 0)
+                else if (strcmp(argv[i], "-columns") == 0)
                 {
                         submission = COLUMNS;
                 }
 
-                if (strcmp(argv[i], "-antidiagonals") == 0)
+                else if (strcmp(argv[i], "-antidiagonals") == 0)
                 {
                         submission = ANTIDIAGONALS;
                 }
 
-                if (strcmp(argv[i], "-prios") == 0)
+                else if (strcmp(argv[i], "-prios") == 0)
                 {
                         submission = PRIOS;
                 }
 
-                if (strcmp(argv[i], "-no-prio") == 0)
+                else if (strcmp(argv[i], "-no-prio") == 0)
                 {
                         noprio = 1;
                 }
 
-		        if (strcmp(argv[i], "-checkpoint-period") == 0)
+		        else if (strcmp(argv[i], "-checkpoint-period") == 0)
 		        {
 				        char *argptr;
 		                checkpoint_period = strtol(argv[++i], &argptr, 10);
 		        }
 
-                if (strcmp(argv[i], "-check") == 0)
+                else if (strcmp(argv[i], "-check") == 0)
                 {
                         check = 1;
                 }
 
-                if (strcmp(argv[i], "-display") == 0)
+                else if (strcmp(argv[i], "-display") == 0)
                 {
                         display = 1;
                 }
 
-                if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
+                else
+                /* if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) */
                 {
                         printf("usage : %s [-size size] [-nblocks nblocks] [-columns] [-antidiagonals] [-prios] [-no-prio] [-display] [-check]\n", argv[0]);
+                        fprintf(stderr,"Currently selected: %ux%u and %ux%u blocks\n", size, size, nblocks, nblocks);
+                        exit(0);
                 }
         }
 

+ 11 - 0
src/common/fxt.c

@@ -27,6 +27,7 @@ unsigned long _starpu_job_cnt = 0;
 #ifdef STARPU_USE_FXT
 #include <common/fxt.h>
 #include <starpu_fxt.h>
+#include <sys/stat.h>
 
 #ifdef STARPU_HAVE_WINDOWS
 #include <windows.h>
@@ -95,6 +96,16 @@ static void _starpu_profile_set_tracefile(void)
 	char *fxt_prefix = starpu_getenv("STARPU_FXT_PREFIX");
 	if (!fxt_prefix)
 	     fxt_prefix = "/tmp/";
+	else
+	{
+		// Check if the given folder really exists:
+		struct stat folder_stat;
+		if (stat(fxt_prefix, &folder_stat) < 0 || !S_ISDIR(folder_stat.st_mode))
+		{
+			_STARPU_MSG("%s is not a valid directory.\n", fxt_prefix);
+			_starpu_abort();
+		}
+	}
 
 	user = starpu_getenv("USER");
 	if (!user)

+ 56 - 15
src/sched_policies/work_stealing_policy.c

@@ -71,10 +71,16 @@ struct locality_entry
 
 struct _starpu_work_stealing_data_per_worker
 {
+	char fill1[STARPU_CACHELINE_SIZE];
+	/* This is read-mostly, only updated when the queue becomes empty or
+	 * becomes non-empty, to make it generally cheap to check */
+	unsigned notask;	/* whether the queue is empty */
+	char fill2[STARPU_CACHELINE_SIZE];
+
 	struct _starpu_prio_deque queue;
 	int running;
 	int *proxlist;
-	int busy;
+	int busy;	/* Whether this worker is working on a task */
 
 #ifdef USE_LOCALITY_TASKS
 	/* This records the same as queue, but hashed by data accessed with locality flag.  */
@@ -131,11 +137,12 @@ static int select_victim_round_robin(struct _starpu_work_stealing_data *ws, unsi
 		/* Here helgrind would shout that this is unprotected, but we
 		 * are fine with getting outdated values, this is just an
 		 * estimation */
-		ntasks = ws->per_worker[workerids[worker]].queue.ntasks;
-
-		if (ntasks && (ws->per_worker[workerids[worker]].busy
-					   || starpu_worker_is_blocked_in_parallel(workerids[worker])))
-			break;
+		if (!ws->per_worker[workerids[worker]].notask)
+		{
+			if (ws->per_worker[workerids[worker]].busy
+						   || starpu_worker_is_blocked_in_parallel(workerids[worker]))
+				break;
+		}
 
 		worker = (worker + 1) % nworkers;
 		if (worker == ws->last_pop_worker)
@@ -327,15 +334,31 @@ static struct starpu_task *ws_pick_task(struct _starpu_work_stealing_data *ws, i
 	{
 		/* found an interesting task, try to pick it! */
 		if (_starpu_prio_deque_pop_this_task(&data_source->queue, target, best_task))
+		{
+			if (!data_source->queue.ntasks)
+			{
+				STARPU_ASSERT(ws->per_worker[source].notask == 0);
+				ws->per_worker[source].notask = 1;
+			}
 			return best_task;
+		}
 	}
 
 	/* Didn't find an interesting task, or couldn't run it :( */
 	int skipped;
+	struct starpu_task *task;
+
 	if (source != target)
-		return _starpu_prio_deque_deque_task_for_worker(&data_source->queue, target, &skipped);
+		task = _starpu_prio_deque_deque_task_for_worker(&data_source->queue, target, &skipped);
 	else
-		return _starpu_prio_deque_pop_task_for_worker(&data_source->queue, target, &skipped);
+		task = _starpu_prio_deque_pop_task_for_worker(&data_source->queue, target, &skipped);
+
+	if (task && !data_source->queue.ntasks) /* only when this pop actually drained the queue; an already-empty queue has notask == 1 */
+	{
+		STARPU_ASSERT(ws->per_worker[source].notask == 0);
+		ws->per_worker[source].notask = 1;
+	}
+	return task;
 }
 
 /* Called when popping a task from a queue */
@@ -371,10 +394,18 @@ static void locality_pushed_task(struct _starpu_work_stealing_data *ws STARPU_AT
 static struct starpu_task *ws_pick_task(struct _starpu_work_stealing_data *ws, int source, int target)
 {
 	int skipped;
+	struct starpu_task *task;
 	if (source != target)
-		return _starpu_prio_deque_deque_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
+		task = _starpu_prio_deque_deque_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
 	else
-		return _starpu_prio_deque_pop_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
+		task = _starpu_prio_deque_pop_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
+
+	if (task && !ws->per_worker[source].queue.ntasks) /* only when this pop actually drained the queue; an already-empty queue has notask == 1 */
+	{
+		STARPU_ASSERT(ws->per_worker[source].notask == 0);
+		ws->per_worker[source].notask = 1;
+	}
+	return task;
 }
 /* Called when popping a task from a queue */
 static void locality_popped_task(struct _starpu_work_stealing_data *ws STARPU_ATTRIBUTE_UNUSED, struct starpu_task *task STARPU_ATTRIBUTE_UNUSED, int workerid STARPU_ATTRIBUTE_UNUSED, unsigned sched_ctx_id STARPU_ATTRIBUTE_UNUSED)
@@ -530,7 +561,8 @@ static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 	struct starpu_task *task = NULL;
 	unsigned workerid = starpu_worker_get_id_check();
 
-	ws->per_worker[workerid].busy = 0;
+	if (ws->per_worker[workerid].busy)
+		ws->per_worker[workerid].busy = 0;
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 	if (STARPU_RUNNING_ON_VALGRIND || !_starpu_prio_deque_is_empty(&ws->per_worker[workerid].queue))
@@ -617,7 +649,8 @@ static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 		if (!task)
 			return NULL;
 	}
-	ws->per_worker[workerid].busy = !!task;
+	if (ws->per_worker[workerid].busy != !!task)
+		ws->per_worker[workerid].busy = !!task;
 	return task;
 }
 
@@ -648,6 +681,11 @@ int ws_push_task(struct starpu_task *task)
 	record_data_locality(task, workerid);
 	STARPU_ASSERT_MSG(ws->per_worker[workerid].running, "workerid=%d, ws=%p\n", workerid, ws);
 	_starpu_prio_deque_push_back_task(&ws->per_worker[workerid].queue, task);
+	if (ws->per_worker[workerid].queue.ntasks == 1)
+	{
+		STARPU_ASSERT(ws->per_worker[workerid].notask == 1);
+		ws->per_worker[workerid].notask = 0;
+	}
 	locality_pushed_task(ws, task, workerid, sched_ctx_id);
 
 	starpu_push_task_end(task);
@@ -676,10 +714,12 @@ static void ws_add_workers(unsigned sched_ctx_id, int *workerids,unsigned nworke
 		int workerid = workerids[i];
 		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
 		_starpu_prio_deque_init(&ws->per_worker[workerid].queue);
+		ws->per_worker[workerid].notask = 1;
 		ws->per_worker[workerid].running = 1;
 
 		/* Tell helgrind that we are fine with getting outdated values,
 		 * this is just an estimation */
+		STARPU_HG_DISABLE_CHECKING(ws->per_worker[workerid].notask);
 		STARPU_HG_DISABLE_CHECKING(ws->per_worker[workerid].queue.ntasks);
 		ws->per_worker[workerid].busy = 0;
 		STARPU_HG_DISABLE_CHECKING(ws->per_worker[workerid].busy);
@@ -760,11 +800,12 @@ static int lws_select_victim(struct _starpu_work_stealing_data *ws, unsigned sch
 	for (i = 0; i < nworkers; i++)
 	{
 		int neighbor = ws->per_worker[workerid].proxlist[i];
+		if (ws->per_worker[neighbor].notask)
+			continue;
                 /* FIXME: do not keep looking again and again at some worker
                  * which has tasks, but that can't execute on me */
-		int ntasks = ws->per_worker[neighbor].queue.ntasks;
-		if (ntasks && (ws->per_worker[neighbor].busy
-					   || starpu_worker_is_blocked_in_parallel(neighbor)))
+		if (ws->per_worker[neighbor].busy
+					   || starpu_worker_is_blocked_in_parallel(neighbor))
 			return neighbor;
 	}
 	return -1;

+ 1 - 2
src/util/execute_on_all.c

@@ -107,8 +107,7 @@ void starpu_execute_on_each_worker_ex(void (*func)(void *), void *arg, uint32_t
 	unsigned nworkers = starpu_worker_get_count();
 	struct starpu_task *tasks[STARPU_NMAXWORKERS];
 
-	/* This method only work on CPU, CUDA, OPENCL */
-	STARPU_ASSERT((where & ~STARPU_CPU & ~STARPU_CUDA & ~STARPU_OPENCL) == 0);
+	STARPU_ASSERT_MSG((where & ~STARPU_CPU & ~STARPU_CUDA & ~STARPU_OPENCL) == 0, "This function is implemented only on CPU, CUDA, OpenCL");
 
 	/* create a wrapper codelet */
 	struct starpu_codelet wrapper_cl =

+ 1 - 0
src/util/starpu_data_cpy.c

@@ -159,6 +159,7 @@ int _starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_h
 	task->callback_func = callback_func;
 	task->callback_arg = callback_arg;
 
+	/* FIXME: priority!! */
 	STARPU_TASK_SET_HANDLE(task, dst_handle, 0);
 	STARPU_TASK_SET_HANDLE(task, src_handle, 1);
 

+ 6 - 2
tests/Makefile.am

@@ -43,6 +43,7 @@ EXTRA_DIST =					\
 	microbenchs/parallel_independent_heterogeneous_tasks.sh	\
 	microbenchs/parallel_independent_homogeneous_tasks_data.sh	\
 	microbenchs/parallel_independent_homogeneous_tasks.sh	\
+	microbenchs/bandwidth_scheds.sh		\
 	energy/static.sh			\
 	energy/dynamic.sh			\
 	energy/perfs.gp				\
@@ -73,7 +74,7 @@ EXTRA_DIST =					\
 	model-checking/starpu-mc.sh.in
 
 CLEANFILES = 					\
-	*.gcno *.gcda *.linkinfo core starpu_idle_microsec.log *.mod *.png *.output tasks.rec perfs.rec perfs2.rec fortran90/starpu_mod.f90
+	*.gcno *.gcda *.linkinfo core starpu_idle_microsec.log *.mod *.png *.output tasks.rec perfs.rec perfs2.rec fortran90/starpu_mod.f90 bandwidth-*.dat
 
 BUILT_SOURCES =
 SUBDIRS =
@@ -173,6 +174,7 @@ myPROGRAMS +=					\
 	helper/execute_on_all			\
 	microbenchs/display_structures_size	\
 	microbenchs/local_pingpong		\
+	microbenchs/bandwidth			\
 	overlap/overlap				\
 	sched_ctx/sched_ctx_list		\
 	sched_ctx/sched_ctx_policy_data		\
@@ -412,7 +414,8 @@ examplebin_PROGRAMS = \
 	microbenchs/sync_tasks_overhead		\
 	microbenchs/tasks_overhead		\
 	microbenchs/tasks_size_overhead		\
-	microbenchs/local_pingpong
+	microbenchs/local_pingpong		\
+	microbenchs/bandwidth
 examplebin_SCRIPTS = \
 	microbenchs/tasks_data_overhead.sh \
 	microbenchs/sync_tasks_data_overhead.sh \
@@ -444,6 +447,7 @@ endif
 if !STARPU_USE_MPI_MASTER_SLAVE
 SHELL_TESTS += \
 	datawizard/locality.sh \
+	microbenchs/bandwidth_scheds.sh \
 	overlap/overlap.sh
 endif
 

+ 342 - 0
tests/microbenchs/bandwidth.c

@@ -0,0 +1,342 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+
+#include <starpu.h>
+#include "../helper.h"
+
+/*
+ * Measure the memory bandwidth available to kernels depending on the number of
+ * kernels and number of idle workers.
+ */
+
+#ifdef STARPU_QUICK_CHECK
+static size_t size = 1024;	/* Bytes copied by each memcpy; -s overrides it (value given in MB) */
+static unsigned cpustep = 4;	/* Increment of the busy-core count between two output rows; -c overrides it */
+#else
+/* Must be bigger than available cache size per core, 64MiB should be enough */
+static size_t size = 64UL << 20;
+static unsigned cpustep = 1;
+#endif
+
+static unsigned noalone = 0;	/* Set by -a: main() then skips the PAUSE/NOP/SYNC reference runs */
+static unsigned iter = 30;	/* Number of memcpy repetitions timed by bw_func(); -n overrides it */
+static unsigned total_ncpus;	/* starpu_cpu_worker_get_count() as seen by main() */
+static starpu_pthread_barrier_t barrier_begin, barrier_end;	/* Re-initialised by every bench() run */
+static float *result;	/* Per-buffer-index bandwidth written by bw_func(), summed by bench() */
+static void **buffers;	/* Indexed by logical core number */
+static char padding1[STARPU_CACHELINE_SIZE];	/* NOTE(review): presumably meant to keep `finished' alone on its cache line; the layout is not guaranteed by the language -- confirm */
+static volatile char finished;	/* Set by bw_func() after barrier_end, polled by nop_func()/sync_func() */
+static char padding2[STARPU_CACHELINE_SIZE];
+
+static unsigned interleave(unsigned i);
+
+/* Initialize the buffer locally
+ *
+ * main() dispatches this on every CPU worker through
+ * starpu_execute_on_each_worker_ex(). It allocates the calling worker's slot
+ * of buffers[] (2*size bytes: bw_func() copies the first half onto the second
+ * half) and clears it from the worker itself.
+ * `foo' is the opaque argument of the dispatch call and is not used. */
+void initialize_buffer(void *foo)
+{
+	/* The _check variant asserts that we really run inside a worker, so a
+	 * negative "not a worker" id can never be turned into a huge index. */
+	unsigned id = starpu_worker_get_id_check();
+	/* buffers[] was sized with total_ncpus entries by main() */
+	STARPU_ASSERT(id < total_ncpus);
+#ifdef STARPU_HAVE_POSIX_MEMALIGN
+	int ret = posix_memalign(&buffers[id], getpagesize(), 2*size);
+	STARPU_ASSERT(ret == 0);
+#else
+	buffers[id] = malloc(2*size);
+	/* Fail loudly here rather than crashing in the memset below */
+	STARPU_ASSERT(buffers[id] != NULL);
+#endif
+	memset(buffers[id], 0, 2*size);
+}
+
+/* Actual transfer codelet
+ *
+ * `arg' carries the index of the buffer to use, which is also the result[]
+ * slot to fill; bench() stores it in cl_arg as an integer cast to a pointer.
+ * `descr' is unused since the codelet declares no data buffer.
+ *
+ * All bandwidth tasks start together on barrier_begin, copy the first half of
+ * their buffer onto its second half `iter' times, and record size*iter
+ * divided by the elapsed time.
+ * NOTE(review): starpu_timing_now() presumably returns microseconds, which
+ * would make the stored value bytes/us, i.e. MB/s -- confirm. */
+void bw_func(void *descr[], void *arg)
+{
+	int id = (uintptr_t) arg;
+	void *src = buffers[id];
+	void *dst = (void*) ((uintptr_t)src + size);	/* Second half of the 2*size allocation */
+	unsigned i;
+	double start, stop;
+
+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
+	start = starpu_timing_now();
+	for (i = 0; i < iter; i++)
+	{
+		memcpy(dst, src, size);
+		STARPU_SYNCHRONIZE();
+	}
+	stop = starpu_timing_now();
+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_end);	/* Every bandwidth task has finished copying past this point */
+	finished = 1;	/* Lets nop_func()/sync_func() leave their polling loop */
+
+	result[id] = (size*iter) / (stop - start);
+}
+
+static struct starpu_codelet bw_codelet =
+{
+	.cpu_funcs = {bw_func},
+	.model = NULL,
+	.nbuffers = 0,	/* The buffer is reached through cl_arg, not through a StarPU data handle */
+};
+
+/* Codelet that waits for completion while doing lots of cpu yields (nop).
+ *
+ * bench() submits it on the workers that do not run a bandwidth task when the
+ * sleep type is NOP. It joins barrier_begin like the bandwidth tasks, then
+ * only looks at `finished' once every 1000000 STARPU_UYIELD() calls.
+ * Both parameters are unused. */
+void nop_func(void *descr[], void *arg)
+{
+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
+	while (!finished)
+	{
+		unsigned i;
+		for (i = 0; i < 1000000; i++)
+			STARPU_UYIELD();
+		STARPU_SYNCHRONIZE();
+	}
+}
+
+static struct starpu_codelet nop_codelet =
+{
+	.cpu_funcs = {nop_func},
+	.model = NULL,
+	.nbuffers = 0,
+};
+
+/* Codelet that waits for completion while aggressively reading the finished variable.
+ *
+ * bench() submits it on the workers that do not run a bandwidth task when the
+ * sleep type is SYNC. It joins barrier_begin like the bandwidth tasks, then
+ * re-reads `finished' on every loop iteration.
+ * NOTE(review): STARPU_VALGRIND_YIELD() presumably only yields when running
+ * under valgrind -- confirm. Both parameters are unused. */
+void sync_func(void *descr[], void *arg)
+{
+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
+	while (!finished)
+	{
+		STARPU_VALGRIND_YIELD();
+		STARPU_SYNCHRONIZE();
+	}
+}
+
+static struct starpu_codelet sync_codelet =
+{
+	.cpu_funcs = {sync_func},
+	.model = NULL,
+	.nbuffers = 0,
+};
+
+/* Print the command-line synopsis and the per-option help on stderr, then
+ * terminate the process with EXIT_FAILURE. argv[0] is used as program name. */
+static void usage(char **argv)
+{
+	static const char *const option_help[] =
+	{
+		"\t-n niter\tNumber of iterations\n",
+		"\t-s size\tBuffer size in MB\n",
+		"\t-c cpustep\tCpu number increment\n",
+		"\t-a Do not run the alone test\n",
+	};
+	size_t line;
+
+	fprintf(stderr, "Usage: %s [-n niter] [-s size (MB)] [-c cpustep] [-a]\n", argv[0]);
+	for (line = 0; line < sizeof(option_help) / sizeof(option_help[0]); line++)
+		fputs(option_help[line], stderr);
+	exit(EXIT_FAILURE);
+}
+
+/* Convert the current getopt() argument into a strictly positive number.
+ * Anything else (zero, negative, non-numeric) prints the usage and exits. */
+static unsigned parse_positive(char **argv)
+{
+	int value = atoi(optarg);
+	if (value <= 0)
+		usage(argv);
+	return (unsigned) value;
+}
+
+/* Parse the command line into the file-scope tunables:
+ *   -n niter    -> iter
+ *   -s size     -> size (given in MB, stored in bytes)
+ *   -c cpustep  -> cpustep
+ *   -a          -> noalone
+ *   -h          -> print the usage and exit
+ * Numeric values must be strictly positive: a zero cpustep would make the
+ * loop of main() never advance, and negative values would wrap around once
+ * stored in the unsigned tunables. */
+static void parse_args(int argc, char **argv)
+{
+	int c;
+	while ((c = getopt(argc, argv, "n:s:c:ah")) != -1)
+	switch(c)
+	{
+		case 'n':
+			iter = parse_positive(argv);
+			break;
+		case 's':
+			/* Widen before shifting so that the MB -> bytes
+			 * conversion is done on size_t */
+			size = (size_t) parse_positive(argv) << 20;
+			break;
+		case 'c':
+			cpustep = parse_positive(argv);
+			break;
+		case 'a':
+			noalone = 1;
+			break;
+		case 'h':
+			usage(argv);
+			break;
+	}
+}
+
+/* Map position i (0 <= i < total_ncpus) to a core number so that consecutive
+ * positions are spread out: the first half of the positions gets the even
+ * numbers, the second half the odd ones (e.g. 0,2,4,1,3 for 5 cpus).
+ *
+ * The mapping has to be a permutation of 0..total_ncpus-1: bench() uses it to
+ * pin exactly one task per worker, and the tasks wait for each other on a
+ * barrier. */
+static unsigned interleave(unsigned i)
+{
+	/* TODO: rather distribute over hierarchy */
+	if (total_ncpus > 1)
+	{
+		/* Round up: with total_ncpus/2 and an odd count, the last
+		 * position wrapped onto an already used core (5 cpus: both 1
+		 * and 4 mapped to 2) and the last core was never used. The
+		 * result is unchanged for even counts. */
+		unsigned half = (total_ncpus + 1) / 2;
+		return (i % half)*2 + i / half;
+	}
+	else
+		return 0;
+}
+
+enum sleep_type {	/* What bench() does with the cores that do not run a bandwidth task */
+	PAUSE,	/* No task for them; main() only starts as many workers as bandwidth tasks */
+	NOP,	/* They run nop_codelet */
+	SYNC,	/* They run sync_codelet */
+	SCHED,	/* No task for them, but main() starts all workers, which are left to the scheduler */
+};
+
+/* Run one measurement: start StarPU with `ncpus' CPU workers only, pin one
+ * bandwidth task on each of `nbusy' of them and return the sum of the
+ * bandwidths they measured.
+ *
+ * argc, argv: forwarded to starpu_initialize()
+ * nbusy:      number of bandwidth tasks
+ * ncpus:      number of CPU workers to start
+ * intl:       non-zero to spread the tasks with interleave() instead of using
+ *             contiguous worker numbers
+ * sleep:      what the workers without a bandwidth task do, see
+ *             enum sleep_type
+ *
+ * Returns STARPU_TEST_SKIPPED when StarPU reports -ENODEV. */
+static float bench(int *argc, char ***argv, unsigned nbusy, unsigned ncpus, int intl, enum sleep_type sleep)
+{
+	int ret;
+	unsigned i;
+	struct starpu_conf conf;
+	float bw;
+
+	starpu_conf_init(&conf);
+	conf.precedence_over_environment_variables = 1;
+	conf.ncuda = 0;
+	conf.nopencl = 0;
+	conf.nmic = 0;
+	conf.nmpi_ms = 0;
+	conf.ncpus = ncpus;
+
+	if (intl && sleep == PAUSE)
+	{
+		/* Only the busy workers exist in this case, so interleave by
+		 * binding the workers themselves rather than by choosing
+		 * which worker gets a task */
+		conf.use_explicit_workers_bindid = 1;
+		for (i = 0; i < ncpus; i++)
+			conf.workers_bindid[i] = interleave(i);
+	}
+
+	ret = starpu_initialize(&conf, argc, argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	if (sleep == PAUSE || sleep == SCHED)
+		/* In these cases we don't have a task on each cpu */
+		STARPU_PTHREAD_BARRIER_INIT(&barrier_begin, NULL, nbusy);
+	else
+		STARPU_PTHREAD_BARRIER_INIT(&barrier_begin, NULL, ncpus);
+
+	/* Only the bandwidth tasks wait on the end barrier */
+	STARPU_PTHREAD_BARRIER_INIT(&barrier_end, NULL, nbusy);
+
+	finished = 0;
+	for (i = 0; i < ncpus; i++)
+		result[i] = NAN;
+
+	for (i = 0; i < nbusy; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		task->cl = &bw_codelet;
+
+		/* cl_arg selects the buffer and the result[] slot */
+		if (intl)
+			task->cl_arg = (void*) (uintptr_t) interleave(i);
+		else
+			task->cl_arg = (void*) (uintptr_t) i;
+
+		task->execute_on_a_specific_worker = 1;
+		if (intl && sleep != PAUSE) /* In the pause case we interleaved above */
+			task->workerid = interleave(i);
+		else
+			task->workerid = i;
+
+		ret = starpu_task_submit(task);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	if (sleep != PAUSE && sleep != SCHED)
+	{
+		/* Add waiting tasks */
+		for ( ; i < ncpus; i++)
+		{
+			struct starpu_task *task = starpu_task_create();
+			switch (sleep)
+			{
+			case NOP:
+				task->cl = &nop_codelet;
+				break;
+			case SYNC:
+				task->cl = &sync_codelet;
+				break;
+			default:
+				STARPU_ASSERT(0);
+			}
+			task->execute_on_a_specific_worker = 1;
+			/* Use the same placement rule as the bandwidth tasks
+			 * above, so that a waiting task never lands on a
+			 * worker that already holds a bandwidth task */
+			if (intl)
+				task->workerid = interleave(i);
+			else
+				task->workerid = i;
+			ret = starpu_task_submit(task);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		}
+	}
+
+
+	starpu_task_wait_for_all();
+	starpu_shutdown();
+
+	/* The workers are gone, nobody can still be inside the barriers.
+	 * bench() is called many times, so release them before the next run
+	 * initialises them again. */
+	STARPU_PTHREAD_BARRIER_DESTROY(&barrier_begin);
+	STARPU_PTHREAD_BARRIER_DESTROY(&barrier_end);
+
+	for (bw = 0., i = 0; i < nbusy; i++)
+	{
+		if (intl)
+			bw += result[interleave(i)];
+		else
+			bw += result[i];
+	}
+	return bw;
+}
+
+/* Entry point.
+ *
+ * A first StarPU session counts the CPU workers and makes each of them
+ * allocate its own buffer. Then, for n = cpustep, 2*cpustep, ... busy cores,
+ * one table row is printed with the aggregated bandwidth (divided by 1000)
+ * of every configuration: contiguous alone, scheduler, ratio in %,
+ * interleaved alone, +nop, +sync, +scheduler, ratio in % vs nop.
+ * With -a the "alone" reference runs are skipped and reported as 0, so the
+ * ratio columns of these rows are not meaningful.
+ *
+ * Returns EXIT_SUCCESS, or STARPU_TEST_SKIPPED when no CPU worker is
+ * available. */
+int main(int argc, char **argv)
+{
+	int ret;
+	unsigned n;
+	struct starpu_conf conf;
+	float alone, alone_int, alone_int_nop, alone_int_sync, sched, sched_int;
+
+	parse_args(argc, argv);
+
+	starpu_conf_init(&conf);
+	conf.precedence_over_environment_variables = 1;
+	conf.ncuda = 0;
+	conf.nopencl = 0;
+	conf.nmic = 0;
+	conf.nmpi_ms = 0;
+
+	ret = starpu_initialize(&conf, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	total_ncpus = starpu_cpu_worker_get_count();
+
+	if (total_ncpus == 0)
+	{
+		/* Nothing to measure: leave before allocating anything */
+		starpu_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	buffers = malloc(total_ncpus * sizeof(*buffers));
+	STARPU_ASSERT(buffers != NULL);
+	/* Each worker allocates and touches its own buffer */
+	starpu_execute_on_each_worker_ex(initialize_buffer, NULL, STARPU_CPU, "init_buffer");
+	starpu_shutdown();
+
+	result = malloc(total_ncpus * sizeof(result[0]));
+	STARPU_ASSERT(result != NULL);
+
+	printf("# nw\ta comp.\t+sched\teff%%\ta scat.\t+nop\t+sync\t+sched\teff%% vs nop\n");
+	for (n = cpustep; n <= total_ncpus; n += cpustep)
+	{
+		if (noalone)
+		{
+			alone = 0.;
+			alone_int = 0.;
+			alone_int_nop = 0.;
+			alone_int_sync = 0.;
+		}
+		else
+		{
+			alone = bench(&argc, &argv, n, n, 0, PAUSE);
+			alone_int = bench(&argc, &argv, n, n, 1, PAUSE);
+			alone_int_nop = bench(&argc, &argv, n, total_ncpus, 1, NOP);
+			alone_int_sync = bench(&argc, &argv, n, total_ncpus, 1, SYNC);
+		}
+		sched = bench(&argc, &argv, n, total_ncpus, 0, SCHED);
+		sched_int = bench(&argc, &argv, n, total_ncpus, 1, SCHED);
+		/* n is unsigned, hence %u */
+		printf("%u\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n",
+				n,
+				alone/1000,
+				sched/1000, sched*100/alone,
+				alone_int/1000,
+				alone_int_nop/1000,
+				alone_int_sync/1000,
+				sched_int/1000, sched_int*100/alone_int_nop);
+		fflush(stdout);
+	}
+
+	free(result);
+
+	for (n = 0; n < total_ncpus; n++)
+		free(buffers[n]);
+	/* The array of pointers itself has to be released too */
+	free(buffers);
+
+	return EXIT_SUCCESS;
+}

+ 75 - 0
tests/microbenchs/bandwidth_scheds.sh

@@ -0,0 +1,75 @@
+#!/bin/bash
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2016-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+# Run the bandwidth microbenchmark once per scheduler, keep each output in
+# bandwidth-<sched>.dat and plot them all together with gnuplot.
+# Extra command-line arguments are forwarded to the benchmark.
+
+set -e
+
+# Schedulers to test: the one forced through STARPU_SCHED, otherwise all the
+# ones listed by starpu_sched_display. DEFAULT is the scheduler whose data file
+# provides the scheduler-independent reference curves.
+if [ -n "$STARPU_SCHED" ]
+then
+	SCHEDS=$STARPU_SCHED
+	DEFAULT=$STARPU_SCHED
+else
+	SCHEDS=`$(dirname "$0")/../../tools/starpu_sched_display`
+	DEFAULT=eager
+fi
+
+# Do not inherit a stray value from the environment
+fast=
+
+if [ -n "$STARPU_BENCH_DIR" ]; then
+	cat > bandwidth.gp << EOF
+set term svg font ",12" size 1500,500 linewidth 0.5
+set output "bandwidth.svg"
+set pointsize 0.3
+EOF
+else
+	# Quick run: the iteration count option of the benchmark is -n
+	fast="-n 3 -c 4"
+	cat > bandwidth.gp << EOF
+set term postscript eps enhanced color font ",18"
+set output "bandwidth.eps"
+set size 2,1
+EOF
+fi
+
+cat >> bandwidth.gp << EOF
+set key outside
+set ylabel "GB/s"
+set xlabel "ncores"
+
+plot \\
+	"bandwidth-$DEFAULT.dat" using 1:5 with lines title "alone interleave", \\
+	"bandwidth-$DEFAULT.dat" using 1:6 with lines title "nop", \\
+	"bandwidth-$DEFAULT.dat" using 1:7 with lines title "sync", \\
+	"bandwidth-$DEFAULT.dat" using 1:2 with lines title "alone contiguous", \\
+EOF
+
+type=1
+for sched in $SCHEDS
+do
+	# The reference curves above are only read from the DEFAULT data file,
+	# so the other schedulers skip them (-a)
+	if [ "$sched" != eager -a "$sched" != "$SCHEDS" ]; then
+		extra=-a
+	else
+		extra=
+	fi
+
+	# $fast and $extra are deliberately left unquoted: they hold several
+	# (or zero) options
+	STARPU_BACKOFF_MIN=0 STARPU_BACKOFF_MAX=0 STARPU_SCHED=$sched $STARPU_LAUNCH $(dirname "$0")/bandwidth $fast $extra "$@" | tee bandwidth-$sched.dat
+	echo "\"bandwidth-$sched.dat\" using 1:3 with linespoints lt $type pt $type title \"$sched\", \\" >> bandwidth.gp
+	echo "\"bandwidth-$sched.dat\" using 1:8 with linespoints lt $type pt $type notitle, \\" >> bandwidth.gp
+	type=$((type+1))
+done
+
+if gnuplot bandwidth.gp ; then
+	if [ -n "$STARPU_BENCH_DIR" ]; then
+		# The svg terminal selected above writes bandwidth.svg
+		cp bandwidth.svg "$STARPU_BENCH_DIR/"
+	fi
+fi