Browse Source

Merge branch 'master' into ft_checkpoint

Nathalie Furmento 5 years ago
parent
commit
68fea1ee9b
100 changed files with 3051 additions and 1046 deletions
  1. 1 0
      AUTHORS
  2. 9 0
      ChangeLog
  3. 5 5
      Makefile.am
  4. 78 63
      configure.ac
  5. 1 1
      contrib/ci.inria.fr/job-0-tarball.sh
  6. 3 2
      contrib/ci.inria.fr/job-1-check.sh
  7. 19 5
      doc/doxygen/Makefile.am
  8. 1 0
      doc/doxygen/chapters/470_simgrid.doxy
  9. 24 0
      doc/doxygen/chapters/501_environment_variables.doxy
  10. 20 1
      doc/doxygen/chapters/510_configure_options.doxy
  11. 34 0
      doc/doxygen/dev/starpu_check_include.sh
  12. 14 6
      doc/doxygen_dev/Makefile.am
  13. 15 17
      examples/Makefile.am
  14. 2 0
      examples/mlr/mlr.c
  15. 4 6
      examples/stencil/Makefile.am
  16. 16 0
      include/starpu.h
  17. 9 1
      include/starpu_data.h
  18. 26 2
      include/starpu_helper.h
  19. 11 0
      include/starpu_util.h
  20. 5 1
      julia/Makefile.am
  21. 9 9
      julia/Manifest.toml
  22. 16 4
      julia/examples/Makefile.am
  23. 41 31
      julia/examples/axpy/axpy.jl
  24. 7 6
      julia/examples/callback/callback.jl
  25. 15 0
      julia/examples/check_deps/check_deps.jl
  26. 20 0
      julia/examples/cholesky/cholesky.sh
  27. 52 0
      julia/examples/cholesky/cholesky_codelets.jl
  28. 154 0
      julia/examples/cholesky/cholesky_common.jl
  29. 71 0
      julia/examples/cholesky/cholesky_implicit.jl
  30. 79 0
      julia/examples/cholesky/cholesky_native.jl
  31. 93 0
      julia/examples/cholesky/cholesky_tag.jl
  32. 3 3
      julia/examples/dependency/end_dep.jl
  33. 3 3
      julia/examples/dependency/tag_dep.jl
  34. 3 3
      julia/examples/dependency/task_dep.jl
  35. 7 1
      julia/examples/execute.sh.in
  36. 144 0
      julia/examples/gemm/gemm.jl
  37. 22 0
      julia/examples/gemm/gemm.sh
  38. 146 0
      julia/examples/gemm/gemm_bare.jl
  39. 56 0
      julia/examples/gemm/gemm_native.jl
  40. 1 1
      julia/examples/mandelbrot/cpu_mandelbrot.c
  41. 4 1
      julia/examples/mandelbrot/mandelbrot.jl
  42. 1 1
      julia/examples/mult/cpu_mult.c
  43. 15 28
      julia/examples/mult/mult.jl
  44. 2 2
      julia/examples/task_insert_color/task_insert_color.jl
  45. 1 1
      julia/examples/variable/variable.jl
  46. 21 17
      julia/examples/vector_scal/vector_scal.jl
  47. 1 1
      julia/src/Makefile.am
  48. 12 3
      julia/src/StarPU.jl
  49. 78 400
      julia/src/blas.c
  50. 116 134
      julia/src/blas.h
  51. 15 0
      julia/src/blas.jl
  52. 15 0
      julia/src/blas_wrapper.c
  53. 25 8
      julia/src/compiler/c.jl
  54. 301 25
      julia/src/compiler/cuda.jl
  55. 351 3
      julia/src/compiler/expression_manipulation.jl
  56. 24 16
      julia/src/compiler/expressions.jl
  57. 18 17
      julia/src/compiler/file_generation.jl
  58. 4 0
      julia/src/data.jl
  59. 13 11
      julia/src/dynamic_compiler/Makefile.am
  60. 7 6
      julia/src/globals.jl
  61. 4 1
      julia/src/init.jl
  62. 9 0
      julia/src/openblas_ldflags.jl
  63. 109 22
      julia/src/task.jl
  64. 13 5
      julia/src/translate_headers.jl
  65. 8 1
      mpi/Makefile.am
  66. 5 23
      mpi/examples/Makefile.am
  67. 2 1
      mpi/examples/matrix_mult/mm.c
  68. 1 0
      mpi/src/starpu_mpi.c
  69. 34 1
      mpi/src/starpu_mpi_init.c
  70. 11 10
      mpi/tests/Makefile.am
  71. 1 1
      mpi/tests/abstract_sendrecv_bench.c
  72. 1 2
      mpi/tests/abstract_sendrecv_bench.h
  73. 1 1
      mpi/tests/bench_helper.c
  74. 1 1
      mpi/tests/bench_helper.h
  75. 1 1
      mpi/tests/burst.c
  76. 59 3
      mpi/tests/burst_gemm.c
  77. 1 1
      mpi/tests/burst_helper.c
  78. 1 1
      mpi/tests/burst_helper.h
  79. 44 0
      mpi/tests/display_bindings.c
  80. 52 4
      mpi/tests/gemm_helper.c
  81. 3 1
      mpi/tests/gemm_helper.h
  82. 1 1
      mpi/tests/nothing.c
  83. 1 1
      mpi/tests/sendrecv_parallel_tasks_bench.c
  84. 2 2
      sc_hypervisor/examples/Makefile.am
  85. 3 5
      socl/examples/Makefile.am
  86. 138 3
      src/common/utils.c
  87. 14 9
      src/core/perfmodel/perfmodel_history.c
  88. 123 45
      src/core/perfmodel/regression.c
  89. 2 0
      src/core/topology.c
  90. 0 1
      src/core/topology.h
  91. 17 0
      src/core/workers.c
  92. 16 2
      src/core/workers.h
  93. 5 0
      src/datawizard/coherency.h
  94. 24 1
      src/datawizard/interfaces/data_interface.c
  95. 3 5
      src/drivers/driver_common/driver_common.c
  96. 20 16
      src/profiling/profiling.c
  97. 17 11
      src/sched_policies/component_heft.c
  98. 28 12
      src/sched_policies/component_heteroprio.c
  99. 13 7
      src/sched_policies/component_mct.c
  100. 0 0
      src/sched_policies/deque_modeling_policy_data_aware.c

+ 1 - 0
AUTHORS

@@ -12,6 +12,7 @@ Danjean Vincent, University Grenoble Alpes, <Vincent.Danjean@ens-lyon.org>
 Denis Alexandre, Inria, <alexandre.denis@inria.fr>
 Eyraud-Dubois Lionel, Inria, <lionel.eyraud-dubois@inria.fr>
 Furmento Nathalie, CNRS, <nathalie.furmento@labri.fr>
+Guermouche Amina, Télécom SudParis, <amina.guermouche@inria.fr>
 Guilbaud Adrien, Inria, <adrien.guilbaud@inria.fr>
 Henry Sylvain, Université de Bordeaux, <sylvain.henry@inria.fr>
 Hugo Andra, Université de Bordeaux/Inria, <andra.hugo@inria.fr>

+ 9 - 0
ChangeLog

@@ -56,6 +56,15 @@ Small features:
   * Add STARPU_LIMIT_BANDWIDTH environment variable.
   * Add field starpu_conf::precedence_over_environment_variables to ignore
     environment variables when parameters are set directly in starpu_conf
+  * Add starpu_data_get_coordinates_array
+  * New STARPU_BACKOFF_MIN and STARPU_BACKOFF_MAX environment variables to set
+    the exponential backoff limits of the number of cycles to pause while
+    drivers are spinning.
+  * Add STARPU_DISPLAY_BINDINGS environment variable and
+    starpu_display_bindings() function to display all bindings on the machine by
+    calling hwloc-ps
+Small changes:
+  * New configure option --disable-build-doc-pdf
 
 StarPU 1.3.3 (git revision 11afc5b007fe1ab1c729b55b47a5a98ef7f3cfad)
 ====================================================================

+ 5 - 5
Makefile.am

@@ -27,7 +27,7 @@ SUBDIRS += src
 
 SUBDIRS += tools
 
-if BUILD_TESTS
+if STARPU_BUILD_TESTS
 SUBDIRS += tests
 endif
 
@@ -37,19 +37,19 @@ if STARPU_USE_MPI
 SUBDIRS += mpi
 endif
 
-if BUILD_EXAMPLES
+if STARPU_BUILD_EXAMPLES
 SUBDIRS += examples
 endif
 
-if BUILD_SOCL
+if STARPU_BUILD_SOCL
 SUBDIRS += socl
 endif
 
-if BUILD_STARPUFFT
+if STARPU_BUILD_STARPUFFT
 SUBDIRS += starpufft
 endif
 
-if BUILD_STARPURM
+if STARPU_BUILD_STARPURM
 SUBDIRS += starpurm
 endif
 

File diff suppressed because it is too large
+ 78 - 63
configure.ac


+ 1 - 1
contrib/ci.inria.fr/job-0-tarball.sh

@@ -21,7 +21,7 @@ export LD_LIBRARY_PATH=/home/ci/usr/local/lib:$LD_LIBRARY_PATH
 ./autogen.sh
 if test -d build ; then chmod -R 777 build && rm -rf build ; fi
 mkdir build && cd build
-../configure
+../configure --enable-build-doc-pdf
 make V=1
 make dist
 cp *gz ..

+ 3 - 2
contrib/ci.inria.fr/job-1-check.sh

@@ -63,12 +63,13 @@ fi
 export CC=gcc
 
 CONFIGURE_OPTIONS="--enable-debug --enable-verbose --enable-mpi-check --disable-build-doc"
+CONFIGURE_CHECK=""
 day=$(date +%u)
 if test $day -le 5
 then
     CONFIGURE_CHECK="--enable-quick-check"
-else
-    CONFIGURE_CHECK="--enable-long-check"
+#else
+    # we do a normal check, a long check takes too long on VM nodes
 fi
 ../configure $CONFIGURE_OPTIONS $CONFIGURE_CHECK  $STARPU_CONFIGURE_OPTIONS
 

+ 19 - 5
doc/doxygen/Makefile.am

@@ -29,10 +29,15 @@ txtdir   = $(docdir)/manual
 
 EXTRA_DIST =
 
-if BUILD_DOC
+if STARPU_BUILD_DOC
+if STARPU_BUILD_DOC_PDF
 all: $(DOX_HTML_DIR) $(DOX_PDF)
 EXTRA_DIST += $(DOX_HTML_DIR) $(DOX_PDF)
 txt_DATA = $(DOX_PDF)
+else
+all: $(DOX_HTML_DIR)
+EXTRA_DIST += $(DOX_HTML_DIR)
+endif
 DOX_HTML_SRCDIR=$(DOX_HTML_DIR)
 install-exec-hook:
 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html
@@ -40,9 +45,8 @@ install-exec-hook:
 uninstall-hook:
 	rm -rf $(DESTDIR)$(docdir)/manual/html
 else
-if AVAILABLE_DOC
-EXTRA_DIST += $(top_srcdir)/doc/doxygen/html $(top_srcdir)/doc/doxygen/starpu.pdf
-txt_DATA = $(top_srcdir)/doc/doxygen/starpu.pdf
+if STARPU_AVAILABLE_DOC
+EXTRA_DIST += $(top_srcdir)/doc/doxygen/html
 DOX_HTML_SRCDIR=$(top_srcdir)/doc/doxygen/html
 install-exec-hook:
 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html
@@ -50,6 +54,10 @@ install-exec-hook:
 uninstall-hook:
 	rm -rf $(DESTDIR)$(docdir)/manual/html
 endif
+if STARPU_AVAILABLE_DOC_PDF
+EXTRA_DIST += $(top_srcdir)/doc/doxygen/starpu.pdf
+txt_DATA = $(top_srcdir)/doc/doxygen/starpu.pdf
+endif
 endif
 
 chapters =	\
@@ -136,7 +144,7 @@ images = 	\
 	chapters/images/tasks_size_overhead.png \
 	chapters/images/temanejo.png
 
-if BUILD_DOC
+if STARPU_BUILD_DOC
 starpu_config.h: $(top_srcdir)/include/starpu_config.h.in
 	@$(SED) 's/#undef \(.*\)/#define \1 1/' $< > $@
 
@@ -200,7 +208,9 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/include/starpu_expert.h		\
 	$(top_srcdir)/include/starpu_fxt.h		\
 	$(top_srcdir)/include/starpu_hash.h		\
+	$(top_srcdir)/include/starpu_helper.h		\
 	$(top_srcdir)/include/starpu_mic.h		\
+	$(top_srcdir)/include/starpu_mpi_ms.h		\
 	$(top_srcdir)/include/starpu_mod.f90		\
 	$(top_srcdir)/include/starpu_opencl.h		\
 	$(top_srcdir)/include/starpu_openmp.h		\
@@ -227,6 +237,8 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/include/starpu_util.h		\
 	$(top_srcdir)/include/starpu_worker.h		\
 	$(top_srcdir)/include/fstarpu_mod.f90		\
+	$(top_srcdir)/include/schedulers/starpu_heteroprio.h	\
+	$(top_srcdir)/starpufft/include/starpufft.h 	\
 	$(top_srcdir)/mpi/include/starpu_mpi.h 		\
 	$(top_srcdir)/mpi/include/starpu_mpi_lb.h	\
 	$(top_srcdir)/mpi/include/fstarpu_mpi_mod.f90		\
@@ -253,6 +265,8 @@ $(DOX_TAG): $(dox_inputs)
 	@$(SED) -i '/\\begin{titlepage}/,$$d' $(DOX_LATEX_DIR)/refman.tex
 	@cat $(top_srcdir)/doc/doxygen/refman.tex >> $(DOX_LATEX_DIR)/refman.tex
 
+$(DOX_HTML_DIR): $(DOX_TAG)
+
 $(DOX_PDF): $(DOX_TAG) refman.tex
 	@cp $(top_srcdir)/doc/doxygen/chapters/version.sty $(DOX_LATEX_DIR)
 	@cp $(top_srcdir)/doc/doxygen/chapters/images/*pdf $(DOX_LATEX_DIR)

+ 1 - 0
doc/doxygen/chapters/470_simgrid.doxy

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 24 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2016       Uppsala University
+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -405,6 +406,20 @@ and friends.  The default is Enabled.
 This permits to test the performance effect of memory pinning.
 </dd>
 
+<dt>STARPU_BACKOFF_MIN</dt>
+<dd>
+\anchor STARPU_BACKOFF_MIN
+\addindex __env__STARPU_BACKOFF_MIN
+Set minimum exponential backoff of number of cycles to pause when spinning. Default value is 1.
+</dd>
+
+<dt>STARPU_BACKOFF_MAX</dt>
+<dd>
+\anchor STARPU_BACKOFF_MAX
+\addindex __env__STARPU_BACKOFF_MAX
+Set maximum exponential backoff of number of cycles to pause when spinning. Default value is 32.
+</dd>
+
 <dt>STARPU_MIC_SINK_PROGRAM_NAME</dt>
 <dd>
 \anchor STARPU_MIC_SINK_PROGRAM_NAME
@@ -1351,6 +1366,15 @@ application has crashed. Setting this variable to a value other than 1
 will disable this behaviour. This should be done on JVM systems which
 may use these signals for their own needs.
 The flag can also be set through the field starpu_conf::catch_signals.
+</dd>
+
+<dt>STARPU_DISPLAY_BINDINGS</dt>
+<dd>
+\anchor STARPU_DISPLAY_BINDINGS
+\addindex __env__STARPU_DISPLAY_BINDINGS
+Display the binding of all processes and threads running on the machine. If MPI is enabled, display the binding of each node.<br>
+Users can manually display the binding by calling starpu_display_bindings().
+</dd>
 </dl>
 
 \section ConfiguringTheHypervisor Configuring The Hypervisor

+ 20 - 1
doc/doxygen/chapters/510_configure_options.doxy

@@ -115,7 +115,19 @@ Specify <c>hwloc</c> should not be used by StarPU.
 \addindex __configure__--disable-build-doc
 Disable the creation of the documentation. This should be done on a
 machine which does not have the tools <c>doxygen</c> and <c>latex</c>
-(plus the packages <c>latex-xcolor</c> and <c>texlive-latex-extra</c>).
+(plus the packages <c>latex-xcolor</c> and
+<c>texlive-latex-extra</c>).
+</dd>
+
+<dt>--enable-build-doc-pdf</dt>
+<dd>
+\anchor enable-build-doc-pdf
+\addindex __configure__--enable-build-doc-pdf
+By default, only the HTML documentation is generated. Use this option
+to also enable the generation of the PDF documentation. This should be
+done on a machine which does have the tools <c>doxygen</c> and <c>latex</c>
+(plus the packages <c>latex-xcolor</c> and
+<c>texlive-latex-extra</c>).
 </dd>
 
 <dt>--disable-icc</dt>
@@ -370,6 +382,13 @@ used by StarPU data structures.
 Disable the build of libstarpumpi. By default, it is enabled when MPI is found.
 </dd>
 
+<dt>--enable-mpi</dt>
+<dd>
+\anchor enable-mpi
+\addindex __configure__--enable-mpi
+Enable the build of libstarpumpi. This is necessary when using Simgrid+MPI.
+</dd>
+
 <dt>--with-mpicc=<c>path</c></dt>
 <dd>
 \anchor with-mpicc

+ 34 - 0
doc/doxygen/dev/starpu_check_include.sh

@@ -0,0 +1,34 @@
+#!/bin/bash
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+dir=$(dirname $0)
+
+cd $dir/../../../
+for d in $(find . -name include -not -wholename "*/build/*")
+do
+    for f in $(find $d -name "*h")
+    do
+	for i in doxygen-config.cfg.in Makefile.am
+	do
+	    x=`grep $f $dir/../$i`
+	    if test -z "$x"
+	    then
+		echo $f missing in $i
+	    fi
+	done
+    done
+done

+ 14 - 6
doc/doxygen_dev/Makefile.am

@@ -29,10 +29,15 @@ txtdir   = $(docdir)/manual
 
 EXTRA_DIST =
 
-if BUILD_DOC
+if STARPU_BUILD_DOC
+if STARPU_BUILD_DOC_PDF
 all: $(DOX_HTML_DIR) $(DOX_PDF)
 EXTRA_DIST += $(DOX_HTML_DIR) $(DOX_PDF)
 txt_DATA = $(DOX_PDF)
+else
+all: $(DOX_HTML_DIR)
+EXTRA_DIST += $(DOX_HTML_DIR)
+endif
 DOX_HTML_SRCDIR=$(DOX_HTML_DIR)
 install-exec-hook:
 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html_dev
@@ -40,9 +45,8 @@ install-exec-hook:
 uninstall-hook:
 	rm -rf $(DESTDIR)$(docdir)/manual/html_dev
 else
-if AVAILABLE_DOC
-EXTRA_DIST += $(top_srcdir)/doc/doxygen_dev/html_dev $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
-txt_DATA = $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
+if STARPU_AVAILABLE_DOC
+EXTRA_DIST += $(top_srcdir)/doc/doxygen_dev/html_dev
 DOX_HTML_SRCDIR=$(top_srcdir)/doc/doxygen_dev/html_dev
 install-exec-hook:
 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html_dev
@@ -50,6 +54,10 @@ install-exec-hook:
 uninstall-hook:
 	rm -rf $(DESTDIR)$(docdir)/manual/html_dev
 endif
+if STARPU_AVAILABLE_DOC_PDF
+EXTRA_DIST += $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
+txt_DATA = $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
+endif
 endif
 
 chapters =	\
@@ -58,7 +66,7 @@ chapters =	\
 
 images =
 
-if BUILD_DOC
+if STARPU_BUILD_DOC
 config.h: $(top_srcdir)/src/common/config.h.in
 	@$(SED) 's/#undef \(.*\)/#define \1 1/' $< > $@
 	@$(SED) -i '1s/^/\/\*\* \@file \*\/\n/' $@
@@ -191,7 +199,7 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/src/core/drivers.h	\
 	$(top_srcdir)/src/core/workers.h
 
-$(DOX_HTML_DIR): $(DOX_TAG) refman.tex
+$(DOX_HTML_DIR): $(DOX_TAG)
 	@$(MKDIR_P) $(DOX_HTML_DIR)
 
 $(DOX_TAG): $(dox_inputs)

+ 15 - 17
examples/Makefile.am

@@ -153,16 +153,13 @@ SHELL_TESTS =
 if !STARPU_USE_MPI_MASTER_SLAVE
 SHELL_TESTS			+=	scheduler/schedulers.sh
 SHELL_TESTS			+=	scheduler/schedulers_context.sh
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 SHELL_TESTS			+=	mult/sgemm.sh
 endif
 endif
 
-if STARPU_HAVE_WINDOWS
 check_PROGRAMS		=	$(STARPU_EXAMPLES)
-else
-check_PROGRAMS		=	$(LOADER) $(STARPU_EXAMPLES)
-endif
+noinst_PROGRAMS		=
 
 if !STARPU_HAVE_WINDOWS
 ## test loader program
@@ -171,6 +168,7 @@ LOADER			=	loader
 loader_CPPFLAGS 	=	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 LOADER_BIN		=	$(abs_top_builddir)/examples/$(LOADER)
 loader_SOURCES		=	../tests/loader.c
+noinst_PROGRAMS		+=	loader
 else
 LOADER			=
 LOADER_BIN		=	$(top_builddir)/examples/loader-cross.sh
@@ -304,7 +302,7 @@ endif
 endif
 endif
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 STARPU_EXAMPLES +=				\
 	mult/sgemm 				\
 	mult/dgemm				\
@@ -344,7 +342,7 @@ endif
 
 if !STARPU_SIMGRID
 
-if MKL_BLAS_LIB
+if STARPU_MKL_BLAS_LIB
 STARPU_EXAMPLES +=				\
 	lu/lu_example_complex_float		\
 	lu/lu_example_complex_double		\
@@ -646,7 +644,7 @@ endif
 # AXPY example #
 ################
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 axpy_axpy_SOURCES =				\
 	axpy/axpy.c				\
 	common/blas.c
@@ -665,7 +663,7 @@ endif
 # Mult example #
 ################
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 
 mult_sgemm_SOURCES = 				\
 	mult/sgemm.c				\
@@ -687,7 +685,7 @@ endif
 # Cholesky example #
 ####################
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 
 cholesky_cholesky_tag_SOURCES =			\
 	cholesky/cholesky_tag.c			\
@@ -742,7 +740,7 @@ endif
 # LU example #
 ##############
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 
 lu_lu_example_float_SOURCES =			\
 	lu/lu_example_float.c			\
@@ -784,7 +782,7 @@ lu_lu_implicit_example_double_SOURCES =		\
 lu_lu_implicit_example_double_LDADD =		\
 	$(STARPU_BLAS_LDFLAGS)
 
-if MKL_BLAS_LIB
+if STARPU_MKL_BLAS_LIB
 lu_lu_example_complex_float_SOURCES =		\
 	lu/lu_example_complex_float.c		\
 	lu/clu.c				\
@@ -837,7 +835,7 @@ endif
 # Heat example #
 ################
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 
 heat_heat_SOURCES =				\
 	heat/heat.c				\
@@ -861,7 +859,7 @@ endif
 # CG example #
 ##############
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 
 cg_cg_SOURCES =					\
 	cg/cg.c					\
@@ -1013,7 +1011,7 @@ examplebin_PROGRAMS +=				\
 	mandelbrot/mandelbrot
 
 mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
-if HAVE_X11
+if STARPU_HAVE_X11
 mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
 mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) $(X_LIBS) -lX11 $(X_EXTRA_LIBS)
 endif
@@ -1062,7 +1060,7 @@ endif
 # OpenGL interoperability #
 ###########################
 
-if HAVE_OPENGL
+if STARPU_HAVE_OPENGL
 examplebin_PROGRAMS +=				\
 	gl_interop/gl_interop			\
 	gl_interop/gl_interop_idle
@@ -1084,7 +1082,7 @@ endif
 # pipeline example #
 ####################
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 pipeline_pipeline_SOURCES	=	\
 	pipeline/pipeline.c		\
 	common/blas.c

+ 2 - 0
examples/mlr/mlr.c

@@ -110,7 +110,9 @@ static struct starpu_perfmodel cl_model_init =
    template.
  */
 
+/* M^2 * N^1 * K^0 */
 static unsigned combi1 [3]		= {	2,	1,	0 };
+/* M^0 * N^3 * K^1 */
 static unsigned combi2 [3]		= {	0,	3,	1 };
 
 static unsigned *combinations[] = { combi1, combi2 };

+ 4 - 6
examples/stencil/Makefile.am

@@ -20,7 +20,7 @@ LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) @LIBS@ $(FXT_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
 AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@ $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
-if USE_MPI
+if STARPU_USE_MPI
 LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 AM_CPPFLAGS += -I$(top_srcdir)/mpi/include
 if STARPU_SIMGRID
@@ -56,14 +56,11 @@ endif
 # What to install and what to check #
 #####################################
 
-if STARPU_HAVE_WINDOWS
 check_PROGRAMS	=	$(STARPU_EXAMPLES)
-else
-check_PROGRAMS	=	$(LOADER) $(STARPU_EXAMPLES)
-endif
+noinst_PROGRAMS	=
 
 if !STARPU_SIMGRID
-if USE_MPI
+if STARPU_USE_MPI
 if STARPU_MPI_CHECK
 TESTS		=	$(STARPU_EXAMPLES)
 endif
@@ -79,6 +76,7 @@ LOADER			=	loader
 loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 LOADER_BIN		=	./$(LOADER)
 loader_SOURCES		=	../../tests/loader.c
+noinst_PROGRAMS		+=	loader
 else
 LOADER			=
 LOADER_BIN		=	$(top_builddir)/examples/stencil/loader-cross.sh

+ 16 - 0
include/starpu.h

@@ -111,6 +111,12 @@ struct starpu_conf
 	int magic;
 
 	/**
+	   @private
+	   Tell starpu_init() if MPI will be initialized later.
+	*/
+	int will_use_mpi;
+
+	/**
 	   Name of the scheduling policy. This can also be specified
 	   with the environment variable \ref STARPU_SCHED. (default =
 	   <c>NULL</c>).
@@ -441,6 +447,16 @@ struct starpu_conf
 	   performance counters after initialization
 	 */
 	unsigned start_perf_counter_collection;
+
+	/**
+	   Minimum spinning backoff of drivers. Default value: \c 1
+	 */
+	unsigned driver_spinning_backoff_min;
+
+	/**
+	   Maximum spinning backoff of drivers. Default value: \c 32
+	 */
+	unsigned driver_spinning_backoff_max;
 };
 
 /**

+ 9 - 1
include/starpu_data.h

@@ -123,7 +123,7 @@ void starpu_data_set_name(starpu_data_handle_t handle, const char *name);
    tools. \p dimensions is the size of the \p dims array. This can be
    for instance the tile coordinates within a big matrix.
 */
-void starpu_data_set_coordinates_array(starpu_data_handle_t handle, int dimensions, int dims[]);
+void starpu_data_set_coordinates_array(starpu_data_handle_t handle, unsigned dimensions, int dims[]);
 
 /**
    Set the coordinates of the data, to be shown in various profiling
@@ -133,6 +133,14 @@ void starpu_data_set_coordinates_array(starpu_data_handle_t handle, int dimensio
 void starpu_data_set_coordinates(starpu_data_handle_t handle, unsigned dimensions, ...);
 
 /**
+   Get the coordinates of the data, as set by a previous call to
+   starpu_data_set_coordinates_array() or starpu_data_set_coordinates().
+   \p dimensions is the size of the \p dims array.
+   This returns the actual number of returned coordinates.
+*/
+unsigned starpu_data_get_coordinates_array(starpu_data_handle_t handle, unsigned dimensions, int dims[]);
+
+/**
    Unregister a data \p handle from StarPU. If the data was
    automatically allocated by StarPU because the home node was -1, all
    automatically allocated buffers are freed. Otherwise, a valid copy

+ 26 - 2
include/starpu_helper.h

@@ -50,6 +50,20 @@ extern int _starpu_silent;
 char *starpu_getenv(const char *str);
 
 /**
+   If the environment variable \c str is defined and its value is contained in the array \c strings, return the array position.
+   Raise an error if the environment variable \c str is defined with a value not in \c strings
+   Return \c defvalue if the environment variable \c str is not defined.
+ */
+int starpu_get_env_string_var_default(const char *str, const char *strings[], int defvalue);
+
+/**
+   If the environment variable \c str is defined with a well-defined size value, return the value as a size in bytes. Expected size qualifiers are b, B, k, K, m, M, g, G. The default qualifier is K.
+   If the environment variable \c str is not defined or is empty, return \c defval
+   Raise an error if the value of the environment variable \c str is not well-defined.
+ */
+int starpu_get_env_size_default(const char *str, int defval);
+
+/**
    Return the integer value of the environment variable named \p str.
    Return 0 otherwise (the variable does not exist or has a
    non-integer value).
@@ -66,7 +80,8 @@ static __starpu_inline int starpu_get_env_number(const char *str)
 		char *pcheck;
 
 		val = strtol(strval, &pcheck, 10);
-		if (*pcheck) {
+		if (*pcheck)
+		{
 			fprintf(stderr,"The %s environment variable must contain an integer\n", str);
 			STARPU_ABORT();
 		}
@@ -103,7 +118,8 @@ static __starpu_inline float starpu_get_env_float_default(const char *str, float
 		char *pcheck;
 
 		val = strtof(strval, &pcheck);
-		if (*pcheck) {
+		if (*pcheck)
+		{
 			fprintf(stderr,"The %s environment variable must contain a float\n", str);
 			STARPU_ABORT();
 		}
@@ -166,6 +182,14 @@ double starpu_timing_now(void);
 */
 int starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_handle, int asynchronous, void (*callback_func)(void*), void *callback_arg);
 
+/**
+   Call hwloc-ps to display the binding of each process and thread running on
+   the machine.<br>
+   Use the environment variable \ref STARPU_DISPLAY_BINDINGS to automatically
+   call this function at the beginning of the execution of StarPU.
+*/
+void starpu_display_bindings(void);
+
 /** @} */
 
 #ifdef __cplusplus

+ 11 - 0
include/starpu_util.h

@@ -598,6 +598,17 @@ STARPU_ATOMIC_SOMETHING64(or, old | value)
 #define STARPU_WMB() STARPU_SYNCHRONIZE()
 #endif
 
+#if defined(__i386__) || defined(__x86_64__)
+#define STARPU_CACHELINE_SIZE 64
+#elif defined(__ppc__) || defined(__ppc64__) || defined(__ia64__)
+#define STARPU_CACHELINE_SIZE 128
+#elif defined(__s390__) || defined(__s390x__)
+#define STARPU_CACHELINE_SIZE 256
+#else
+/* Conservative default */
+#define STARPU_CACHELINE_SIZE 1024
+#endif
+
 #ifdef _WIN32
 /* Try to fetch the system definition of timespec */
 #include <sys/types.h>

+ 5 - 1
julia/Makefile.am

@@ -15,7 +15,11 @@
 #
 include $(top_srcdir)/starpu.mk
 
-SUBDIRS = src examples
+SUBDIRS = src
+
+if STARPU_BUILD_EXAMPLES
+SUBDIRS += examples
+endif
 
 EXTRA_DIST = README
 

+ 9 - 9
julia/Manifest.toml

@@ -7,28 +7,28 @@ uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
 deps = ["Libdl", "Random", "Test"]
 git-tree-sha1 = "6f457df38ae2ba239d5e43b80493bb907de826b2"
 repo-rev = "655e9862947d17423f2fb91ea1014e1cb73c1be1"
-repo-url = "https://github.com/analytech-solutions/CBinding.jl.git"
+repo-url = "https://github.com/analytech-solutions/CBinding.jl"
 uuid = "d43a6710-96b8-4a2d-833c-c424785e5374"
 version = "0.8.1"
 
 [[CEnum]]
-git-tree-sha1 = "62847acab40e6855a9b5905ccb99c2b5cf6b3ebb"
+git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
 uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
-version = "0.2.0"
+version = "0.4.1"
 
 [[Clang]]
 deps = ["CEnum", "DataStructures", "LLVM_jll", "Libdl"]
-git-tree-sha1 = "45013227beea038ecc17e8c07cd7c7b05ed26067"
-repo-rev = "master"
-repo-url = "https://github.com/phuchant/Clang.jl.git"
+git-tree-sha1 = "2142a3a54faa28f08edb7b16bde2d3d32b1f3785"
+repo-rev = "29ad279"
+repo-url = "https://github.com/phuchant/Clang.jl"
 uuid = "40e3b903-d033-50b4-a0cc-940c62c95e31"
-version = "0.11.0"
+version = "0.11.1"
 
 [[DataStructures]]
 deps = ["InteractiveUtils", "OrderedCollections"]
-git-tree-sha1 = "6166ecfaf2b8bbf2b68d791bc1d54501f345d314"
+git-tree-sha1 = "af6d9c86e191c917c2276fbede1137e8ea20157f"
 uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
-version = "0.17.15"
+version = "0.17.17"
 
 [[Dates]]
 deps = ["Printf"]

+ 16 - 4
julia/examples/Makefile.am

@@ -15,6 +15,8 @@
 #
 include $(top_srcdir)/starpu.mk
 
+noinst_PROGRAMS		=
+
 if STARPU_HAVE_WINDOWS
 LOADER_BIN		=
 else
@@ -22,6 +24,7 @@ loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 if !STARPU_SIMGRID
 LOADER			=	loader
 LOADER_BIN		=	$(abs_top_builddir)/julia/examples/$(LOADER)
+noinst_PROGRAMS		+=	loader
 endif
 loader_SOURCES		=	../../tests/loader.c
 endif
@@ -44,12 +47,21 @@ EXTRA_DIST =					\
 	callback/callback.sh			\
 	check_deps/check_deps.jl		\
 	check_deps/check_deps.sh		\
+	cholesky/cholesky_codelets.jl		\
+	cholesky/cholesky_common.jl		\
+	cholesky/cholesky_native.jl		\
+	cholesky/cholesky_implicit.jl		\
+	cholesky/cholesky_tag.jl		\
+	cholesky/cholesky.sh			\
 	dependency/end_dep.jl			\
 	dependency/end_dep.sh			\
 	dependency/tag_dep.jl			\
 	dependency/tag_dep.sh			\
 	dependency/task_dep.sh			\
 	dependency/task_dep.jl			\
+	gemm/gemm.jl				\
+	gemm/gemm_native.jl			\
+	gemm/gemm.sh				\
 	mandelbrot/mandelbrot_native.jl		\
 	mandelbrot/mandelbrot.jl		\
 	mandelbrot/mandelbrot.sh		\
@@ -92,11 +104,9 @@ check_PROGRAMS = $(LOADER) $(starpu_julia_EXAMPLES)
 SHELL_TESTS	=
 STARPU_JULIA_EXAMPLES	=
 
-if BUILD_EXAMPLES
 examplebin_PROGRAMS 	+=	$(STARPU_JULIA_EXAMPLES)
 
 TESTS			=	$(SHELL_TESTS) $(STARPU_JULIA_EXAMPLES)
-endif
 
 ######################
 #      Examples      #
@@ -127,6 +137,8 @@ SHELL_TESTS			+=	dependency/tag_dep.sh
 SHELL_TESTS			+=	dependency/task_dep.sh
 SHELL_TESTS			+=	dependency/end_dep.sh
 
-if !NO_BLAS_LIB
-SHELL_TESTS			+= axpy/axpy.sh
+if !STARPU_NO_BLAS_LIB
+SHELL_TESTS			+=	axpy/axpy.sh
+SHELL_TESTS			+=	cholesky/cholesky.sh
+SHELL_TESTS			+=	gemm/gemm.sh
 endif

+ 41 - 31
julia/examples/axpy/axpy.jl

@@ -14,7 +14,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
 using StarPU
-
+using Printf
 const EPSILON = 1e-6
 
 function check(alpha, X, Y)
@@ -26,36 +26,27 @@ function check(alpha, X, Y)
     end
 end
 
-function main()
-    N = 16 * 1024 * 1024
-    NBLOCKS = 8
-    alpha = 3.41
-
-    starpu_init()
-    starpu_cublas_init()
+@target STARPU_CPU+STARPU_CUDA
+@codelet function axpy(X :: Vector{Float32}, Y :: Vector{Float32}, alpha ::Float32) :: Nothing
+    STARPU_SAXPY(length(X), alpha, X, 1, Y, 1)
+    return
+end
 
+function axpy(N, NBLOCKS, alpha, display = true)
     X = Array(fill(1.0f0, N))
     Y = Array(fill(4.0f0, N))
 
     starpu_memory_pin(X)
     starpu_memory_pin(Y)
 
-    println("BEFORE x[0] = ", X[1])
-    println("BEFORE y[0] = ", Y[1])
-
     block_filter = starpu_data_filter(STARPU_VECTOR_FILTER_BLOCK, NBLOCKS)
 
-    perfmodel = starpu_perfmodel(
-        perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
-        symbol = "history_perf"
-    )
+    if display
+        println("BEFORE x[0] = ", X[1])
+        println("BEFORE y[0] = ", Y[1])
+    end
 
-    cl = starpu_codelet(
-        cpu_func = STARPU_SAXPY,
-        cuda_func = STARPU_SAXPY,
-        modes = [STARPU_R, STARPU_RW],
-        perfmodel = perfmodel
-    )
+    t_start = time_ns()
 
     @starpu_block let
         hX,hY = starpu_data_register(X, Y)
@@ -63,26 +54,45 @@ function main()
         starpu_data_partition(hX, block_filter)
         starpu_data_partition(hY, block_filter)
 
-        t_start = time_ns()
-
         for b in 1:NBLOCKS
-            task = starpu_task(cl = cl, handles = [hX[b],hY[b]], cl_arg=(Float32(alpha),),
-                               tag=starpu_tag_t(b))
-            starpu_task_submit(task)
+            starpu_task_insert(codelet_name = "axpy",
+                               handles = [hX[b], hY[b]],
+                               cl_arg = (Float32(alpha),),
+                               tag = starpu_tag_t(b),
+                               modes = [STARPU_R, STARPU_RW])
         end
+
         starpu_task_wait_for_all()
+    end
 
-        t_end = time_ns()
-        timing = (t_end - t_start) / 1000
+    t_end = time_ns()
 
-        println("timing -> ", timing, " us ", 3*N*4/timing, "MB/s")
+    timing = (t_end-t_start)/1000
 
+    if display
+        @printf("timing -> %d us %.2f MB/s\n", timing, 3*N*4/timing)
+        println("AFTER y[0] = ", Y[1], " (ALPHA=", alpha, ")")
     end
 
-    println("AFTER y[0] = ", Y[1], " (ALPHA=", alpha, ")")
-
     check(alpha, X, Y)
 
+    starpu_memory_unpin(X)
+    starpu_memory_unpin(Y)
+end
+
+function main()
+    N = 16 * 1024 * 1024
+    NBLOCKS = 8
+    alpha = 3.41
+
+    starpu_init()
+    starpu_cublas_init()
+
+    # warmup
+    axpy(10, 1, alpha, false)
+
+    axpy(N, NBLOCKS, alpha)
+
     starpu_shutdown()
 end
 

+ 7 - 6
julia/examples/callback/callback.jl

@@ -37,9 +37,7 @@ function variable_with_starpu(val ::Ref{Int32})
     )
 
     cl = starpu_codelet(
-        cpu_func = CPU_CODELETS["variable"],
-        # cuda_func = CUDA_CODELETS["matrix_mult"],
-        #opencl_func="ocl_matrix_mult",
+        cpu_func = "variable",
         modes = [STARPU_RW],
         perfmodel = perfmodel
     )
@@ -47,8 +45,11 @@ function variable_with_starpu(val ::Ref{Int32})
     @starpu_block let
 	hVal = starpu_data_register(val)
 
-        task = starpu_task(cl = cl, handles = [hVal], callback=callback, callback_arg=(cl, [hVal]))
-        starpu_task_submit(task)
+        starpu_task_insert(codelet_name = "variable",
+                           cl = cl,
+                           handles = [hVal],
+                           callback = callback,
+                           callback_arg = (cl, [hVal]))
 
         starpu_task_wait_for_all()
     end
@@ -63,7 +64,7 @@ function display()
     if v[] == 42
         println("result is correct")
     else
-        println("result is incorret")
+        error("result is incorret")
     end
 end
 

+ 15 - 0
julia/examples/check_deps/check_deps.jl

@@ -1,3 +1,18 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
 import Pkg
 
 try

+ 20 - 0
julia/examples/cholesky/cholesky.sh

@@ -0,0 +1,20 @@
+#!/bin/bash
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+# Stop at the first failing example: without this the exit status of the
+# script is only that of the last run, so an earlier failed check goes unnoticed.
+set -e
+
+# Quoted so that a checkout path containing spaces still works.
+EXAMPLES_DIR="$(dirname "$0")/.."
+
+"$EXAMPLES_DIR"/execute.sh cholesky/cholesky_native.jl -quickcheck
+"$EXAMPLES_DIR"/execute.sh cholesky/cholesky_implicit.jl -quickcheck
+"$EXAMPLES_DIR"/execute.sh cholesky/cholesky_tag.jl -quickcheck

+ 52 - 0
julia/examples/cholesky/cholesky_codelets.jl

@@ -0,0 +1,52 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+# History-based performance models, one per Cholesky kernel, each identified
+# by its own symbol.
+chol_model11 = starpu_perfmodel(
+    perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+    symbol = "chol_model11"
+)
+
+chol_model21 = starpu_perfmodel(
+    perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+    symbol = "chol_model21"
+)
+
+chol_model22 = starpu_perfmodel(
+    perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+    symbol = "chol_model22"
+)
+
+# Codelets for the three kernels. Each one names the same function for its
+# CPU and CUDA implementations, and lists one access mode per data handle the
+# kernel receives (1, 2 and 3 handles respectively).
+
+# u11: one block, read-write.
+cl_11 = starpu_codelet(
+    cpu_func = "u11",
+    cuda_func = "u11",
+    modes = [STARPU_RW],
+    color = 0xffff00,
+    perfmodel = chol_model11
+)
+# u21: first block read-only, second block read-write.
+cl_21 = starpu_codelet(
+    cpu_func = "u21",
+    cuda_func = "u21",
+    modes = [STARPU_R, STARPU_RW],
+    color = 0x8080ff,
+    perfmodel = chol_model21
+)
+# u22: two read-only blocks, third block read-write.
+cl_22 = starpu_codelet(
+    cpu_func = "u22",
+    cuda_func = "u22",
+    modes = [STARPU_R, STARPU_R, STARPU_RW],
+    color = 0x00ff00,
+    perfmodel = chol_model22
+)

+ 154 - 0
julia/examples/cholesky/cholesky_common.jl

@@ -0,0 +1,154 @@
+# Standard kernels for the Cholesky factorization
+# U22 is the gemm update
+# U21 is the trsm update
+# U11 is the cholesky factorization
+
+# u11: in-place factorization of the diagonal block `sub11`.
+# For every column z: replace the pivot by its square root, scale the entries
+# below the pivot by 1/pivot (SSCAL), then apply a symmetric rank-1 update
+# with alpha = -1 on the lower part of the trailing submatrix (SSYR).
+@target STARPU_CPU+STARPU_CUDA
+@codelet function u11(sub11 :: Matrix{Float32}) :: Nothing
+    nx :: Int32 = width(sub11)
+    ld :: Int32 = ld(sub11)
+
+    # z is a 0-based column index; Julia indexing below uses z+1
+    for z in 0:nx-1
+        lambda11 :: Float32 = sqrt(sub11[z+1,z+1])
+        sub11[z+1,z+1] = lambda11
+
+        alpha ::Float32 = 1.0f0 / lambda11
+        # X: the nx-z-1 entries of column z located below the pivot
+        X :: Vector{Float32} = view(sub11, z+2:z+2+(nx-z-2), z+1)
+        STARPU_SSCAL(nx-z-1, alpha, X, 1)
+
+        alpha = -1.0f0
+        # A: trailing square submatrix, rows and columns after the pivot
+        A :: Matrix{Float32} = view(sub11, z+2:z+2+(nx-z-2), z+2:z+2+(nx-z-2))
+	STARPU_SSYR("L", nx-z-1, alpha, X, 1, A, ld)
+    end
+    return
+end
+
+# u21: triangular solve updating `sub21` in place against the factorized
+# diagonal block `sub11` (read only).
+# STRSM is called with side "R", lower triangle "L", transposed "T",
+# non-unit diagonal "N" and alpha = 1.
+@target STARPU_CPU+STARPU_CUDA
+@codelet function u21(sub11 :: Matrix{Float32},
+                      sub21 :: Matrix{Float32}) :: Nothing
+    ld11 :: Int32 = ld(sub11)
+    ld21 :: Int32 = ld(sub21)
+    nx21 :: Int32 = width(sub21)
+    ny21 :: Int32 = height(sub21)
+    alpha :: Float32 = 1.0f0
+    STARPU_STRSM("R", "L", "T", "N", nx21, ny21, alpha, sub11, ld11, sub21, ld21)
+    return
+end
+
+# u22: GEMM update of the trailing block,
+#     center := center - left * transpose(right)
+# `left` and `right` are only read, `center` is updated in place.
+# Each operand is passed to SGEMM together with its own leading dimension.
+@target STARPU_CPU+STARPU_CUDA
+@codelet function u22(left   :: Matrix{Float32},
+                      right  :: Matrix{Float32},
+                      center :: Matrix{Float32}) :: Nothing
+    dx :: Int32 = width(center)
+    dy :: Int32 = height(center)
+    dz :: Int32 = width(left)
+    ld21 :: Int32 = ld(left)
+    # ld12 goes with `right` and ld22 with `center` in the SGEMM call below
+    ld12 :: Int32 = ld(right)
+    ld22 :: Int32 = ld(center)
+    alpha :: Float32 = -1.0f0
+    beta :: Float32 = 1.0f0
+    STARPU_SGEMM("N", "T", dy, dx, dz, alpha, left, ld21, right, ld12, beta, center, ld22)
+    return
+end
+
+# Tag of the u11 task of step k: kind marker 1 in bits 60 and above, k in the
+# low bits.
+@inline function tag11(k)
+    return starpu_tag_t((UInt64(1)<<60) | UInt64(k))
+end
+
+# Tag of a u21 task: kind marker 3 in bits 60 and above, k shifted to bit 32,
+# j in the low bits. The fields stay disjoint as long as j < 2^32 and k < 2^28.
+@inline function tag21(k, j)
+    return starpu_tag_t((UInt64(3)<<60) | (UInt64(k)<<32) |  UInt64(j))
+end
+
+# Tag of a u22 task: kind marker 4 in bits 60 and above, k shifted to bit 32,
+# i shifted to bit 16, j in the low bits. The fields stay disjoint as long as
+# i and j are below 2^16 and k is below 2^28.
+@inline function tag22(k, i, j)
+    return starpu_tag_t((UInt64(4)<<60) | (UInt64(k)<<32) | (UInt64(i)<<16) |  UInt64(j))
+end
+
+# Verify a factorization stored in the lower triangle of `mat`.
+# The strict upper triangle of `mat` is zeroed (so `mat` is modified), the
+# lower triangle of mat * transpose(mat) is rebuilt with syrk!, and every
+# entry is compared with the matrix generated in main(). Raises an error on
+# the first entry whose relative error exceeds 1e-4.
+function check(mat::Matrix{Float32})
+    n = size(mat, 1)
+
+    # keep only the lower factor
+    for i in 1:n, j in i+1:n
+        mat[i, j] = 0.0f0
+    end
+
+    test_mat ::Matrix{Float32} = zeros(Float32, n, n)
+
+    syrk!('L', 'N', 1.0f0, mat, 0.0f0, test_mat)
+
+    # only the lower triangle of test_mat has been written by syrk!
+    for i in 1:n, j in 1:i
+        orig = (1.0f0/(1.0f0+(i-1)+(j-1))) + ((i == j) ? 1.0f0*n : 0.0f0)
+        err = abs(test_mat[i,j] - orig) / orig
+        if err > 0.0001
+            got = test_mat[i,j]
+            expected = orig
+            error("[$i, $j] -> $got != $expected (err $err)")
+        end
+    end
+
+    println(stderr, "Verification successful !")
+end
+
+# Remove every tag that a factorization with `nblocks` blocks per dimension
+# may have used: one tag11 per step, one tag21 per block below the diagonal,
+# and one tag22 per block of the lower part of the trailing submatrix.
+function clean_tags(nblocks)
+    for step in 1:nblocks
+        starpu_tag_remove(tag11(step))
+
+        for row in step+1:nblocks
+            starpu_tag_remove(tag21(step, row))
+
+            # columns of the trailing submatrix up to and including the diagonal
+            for col in step+1:row
+                starpu_tag_remove(tag22(step, row, col))
+            end
+        end
+    end
+end
+
+# Build a size_p x size_p test matrix, factorize it with `cholesky` (which this
+# file does not define: it is expected from the file that includes this one),
+# and print "size<TAB>time in ms<TAB>GFlop/s".
+#   verify  - when true, run check() on the result
+#   verbose - when true, display the matrix before and after the factorization
+function main(size_p :: Int, nblocks :: Int; verify = false, verbose = false)
+    mat :: Matrix{Float32} = zeros(Float32, size_p, size_p)
+
+    # create a simple symmetric positive definite matrix:
+    # Hilbert-like entries 1/(i+j-1) (1-based i, j), plus size_p on the diagonal
+
+    for i in 1:size_p
+        for j in 1:size_p
+            mat[i, j] = 1.0f0 / (1.0f0+(i-1)+(j-1)) + ((i == j) ? 1.0f0*size_p : 0.0f0)
+        end
+    end
+
+    if verbose
+        display(mat)
+    end
+
+    starpu_memory_pin(mat)
+
+    # the measured interval covers the whole cholesky() call
+    t_start = time_ns()
+
+    cholesky(mat, size_p, nblocks)
+
+    t_end = time_ns()
+
+    starpu_memory_unpin(mat)
+
+    # n^3/3 floating point operations; time_ms*1000 is microseconds, so
+    # flop/us is MFlop/s and the final division by 1000 gives GFlop/s
+    flop = (1.0*size_p*size_p*size_p)/3.0
+    time_ms = (t_end-t_start) / 1e6
+    gflops = flop/(time_ms*1000)/1000
+    println("$size_p\t$time_ms\t$gflops")
+
+    clean_tags(nblocks)
+
+    if verbose
+        display(mat)
+    end
+
+    if verify
+        check(mat)
+    end
+end

+ 71 - 0
julia/examples/cholesky/cholesky_implicit.jl

@@ -0,0 +1,71 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+using LinearAlgebra.BLAS
+
+include("cholesky_common.jl")
+
+# Tiled Cholesky factorization of `mat`, in place, submitting one task per
+# block without declaring any explicit dependency between tasks.
+# `size` is not used in the body. `nblocks` is the number of blocks per
+# dimension.
+function cholesky(mat :: Matrix{Float32}, size, nblocks)
+    # provides cl_11, cl_21 and cl_22; the file is evaluated again on every call
+    include("cholesky_codelets.jl")
+
+    horiz = starpu_data_filter(STARPU_MATRIX_FILTER_BLOCK, nblocks)
+    vert = starpu_data_filter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nblocks)
+
+    @starpu_block let
+        h_mat = starpu_data_register(mat)
+        # two-level partitioning: blocks are then addressed as h_mat[row, col]
+        starpu_data_map_filters(h_mat, horiz, vert)
+
+        for k in 1:nblocks
+
+            starpu_iteration_push(k)
+
+            # factorize the diagonal block of step k
+            starpu_task_insert(cl = cl_11, handles = [h_mat[k, k]], tag_only = tag11(k))
+
+            # update the blocks of column k located below the diagonal
+            # NOTE(review): tag21 is called with (m, k) here while clean_tags()
+            # uses tag21(k, m) -- confirm which order is intended
+            for m in k+1:nblocks
+                starpu_task_insert(cl = cl_21, handles = [h_mat[k, k], h_mat[m, k]], tag_only = tag21(m, k))
+            end
+            starpu_data_wont_use(h_mat[k, k])
+
+            # update the lower part of the trailing submatrix
+            for m in k+1:nblocks
+                for n in k+1:nblocks
+                    if n <= m
+                        starpu_task_insert(cl = cl_22, handles = [h_mat[m, k], h_mat[n, k], h_mat[m, n]], tag_only= tag22(k, m, n))
+                    end
+                end
+                starpu_data_wont_use(h_mat[m, k])
+            end
+
+            starpu_iteration_pop()
+        end
+
+        starpu_task_wait_for_all()
+    end
+end
+
+# Script body: initialise StarPU and cuBLAS, print the column header, run the
+# benchmark and shut StarPU down.
+starpu_init()
+starpu_cublas_init()
+
+println("# size\tms\tGFlops")
+
+# "-quickcheck" as first argument: a single verified 1024x1024 run with 8
+# blocks per dimension; otherwise sizes 1024 to 15360 in steps of 1024 with
+# 16 blocks per dimension and no verification
+if length(ARGS) > 0 && ARGS[1] == "-quickcheck"
+    main(1024, 8, verify = true)
+else
+    for size in 1024:1024:15360
+        main(size, 16)
+    end
+end
+
+starpu_shutdown()

+ 79 - 0
julia/examples/cholesky/cholesky_native.jl

@@ -0,0 +1,79 @@
+using LinearAlgebra
+
+# Verify a factorization stored in the upper triangle of `mat`.
+# The strict lower triangle of `mat` is zeroed (so `mat` is modified), the
+# lower triangle of transpose(mat) * mat is rebuilt with syrk!, and every
+# entry is compared with the matrix generated in main(). Raises an error on
+# the first entry whose relative error exceeds 1e-4.
+function check(mat::Matrix{Float32})
+    n = size(mat, 1)
+
+    # keep only the upper factor
+    for i in 1:n, j in 1:i-1
+        mat[i, j] = 0.0f0
+    end
+
+    test_mat ::Matrix{Float32} = zeros(Float32, n, n)
+
+    BLAS.syrk!('L', 'T', 1.0f0, mat, 0.0f0, test_mat)
+
+    # only the lower triangle of test_mat has been written by syrk!
+    for i in 1:n, j in 1:i
+        orig = (1.0f0/(1.0f0+(i-1)+(j-1))) + ((i == j) ? 1.0f0*n : 0.0f0)
+        err = abs(test_mat[i,j] - orig) / orig
+        if err > 0.0001
+            got = test_mat[i,j]
+            expected = orig
+            error("[$i, $j] -> $got != $expected (err $err)")
+        end
+    end
+
+    println(stderr, "Verification successful !")
+end
+
+# Reference run without StarPU: build a size_p x size_p test matrix, factorize
+# it in place with cholesky!, and print "size<TAB>time in ms<TAB>GFlop/s".
+#   verify  - when true, run check() on the result
+#   verbose - when true, display the matrix before and after the factorization
+function main(size_p :: Int; verify = false, verbose = false)
+    mat = zeros(Float32, size_p, size_p)
+    # create a simple symmetric positive definite matrix:
+    # Hilbert-like entries 1/(i+j-1) (1-based i, j), plus size_p on the diagonal
+
+    for i in 1:size_p
+        for j in 1:size_p
+            mat[i, j] = 1.0f0 / (1.0f0+(i-1)+(j-1)) + ((i == j) ? 1.0f0*size_p : 0.0f0)
+        end
+    end
+
+    if verbose
+        display(mat)
+    end
+
+    t_start = time_ns()
+
+    cholesky!(mat)
+
+    t_end = time_ns()
+
+    # n^3/3 floating point operations; time_ms*1000 is microseconds, so
+    # flop/us is MFlop/s and the final division by 1000 gives GFlop/s
+    flop = (1.0*size_p*size_p*size_p)/3.0
+    time_ms = (t_end-t_start) / 1e6
+    gflops = flop/(time_ms*1000)/1000
+    println("$size_p\t$time_ms\t$gflops")
+
+    if verbose
+        display(mat)
+    end
+
+    if verify
+        check(mat)
+    end
+end
+
+# Script body: print the column header, then either a single verified
+# 1024x1024 run ("-quickcheck" as first argument) or a sweep over sizes 1024
+# to 15360 in steps of 1024 without verification.
+println("# size\tms\tGFlops")
+
+if length(ARGS) > 0 && ARGS[1] == "-quickcheck"
+    main(1024, verify = true)
+else
+    for size in 1024:1024:15360
+        main(size)
+    end
+end
+

+ 93 - 0
julia/examples/cholesky/cholesky_tag.jl

@@ -0,0 +1,93 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+using LinearAlgebra.BLAS
+
+include("cholesky_common.jl")
+
+# Tiled Cholesky factorization of `mat`, in place, with every dependency
+# between tasks declared explicitly through tags.
+# `size` is not used in the body. `nblocks` is the number of blocks per
+# dimension.
+function cholesky(mat :: Matrix{Float32}, size, nblocks)
+    # provides cl_11, cl_21 and cl_22; the file is evaluated again on every call
+    include("cholesky_codelets.jl")
+
+    horiz = starpu_data_filter(STARPU_MATRIX_FILTER_BLOCK, nblocks)
+    vert = starpu_data_filter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nblocks)
+
+    @starpu_block let
+        h_mat = starpu_data_register(mat)
+        # sequential consistency flag set to 0 on the handle: the ordering of
+        # the tasks comes from the tag dependencies declared below
+        starpu_data_set_sequential_consistency_flag(h_mat, 0)
+        starpu_data_map_filters(h_mat, horiz, vert)
+
+        # The first u11 task is only created here; it is submitted after the
+        # loop, once all other tasks and their dependencies are in place.
+        entry_task = starpu_task(cl = cl_11,
+                                 handles = [h_mat[1, 1]],
+                                 tag = tag11(1))
+
+        for k in 1:nblocks
+
+            starpu_iteration_push(k)
+
+            if k > 1
+                # enforce dependencies...
+                # u11 of step k waits for the step k-1 update of block (k, k)
+                starpu_tag_declare_deps(tag11(k), tag22(k-1, k, k))
+                starpu_task_insert(cl = cl_11,
+                                   handles = [h_mat[k, k]],
+                                   tag = tag11(k))
+            end
+
+            for m in k+1:nblocks
+                # enforce dependencies...
+                # u21 on block (m, k) waits for u11 of step k and, after the
+                # first step, for the step k-1 update of that block
+                if k > 1
+                    starpu_tag_declare_deps(tag21(k, m), tag11(k), tag22(k-1, m, k))
+                else
+                    starpu_tag_declare_deps(tag21(k, m), tag11(k))
+                end
+
+                starpu_task_insert(cl = cl_21, handles = [h_mat[k, k], h_mat[m, k]], tag = tag21(k, m))
+
+                for n in k+1:nblocks
+                    if n <= m
+                        # enforce dependencies...
+                        # u22 on block (m, n) waits for both u21 tasks it reads
+                        # from and, after the first step, for the previous
+                        # update of the same block
+                        if k > 1
+                            starpu_tag_declare_deps(tag22(k, m, n), tag22(k-1, m, n), tag21(k, n), tag21(k, m))
+                        else
+                            starpu_tag_declare_deps(tag22(k, m, n), tag21(k, n), tag21(k, m))
+                        end
+
+                        starpu_task_insert(cl = cl_22, handles = [h_mat[m, k], h_mat[n, k], h_mat[m, n]], tag = tag22(k, m, n))
+                    end
+                end
+            end
+
+            starpu_iteration_pop()
+        end
+
+        # start the whole graph, then wait for the last diagonal factorization
+        starpu_task_submit(entry_task)
+        starpu_tag_wait(tag11(nblocks))
+    end
+end
+
+# Script body: initialise StarPU and cuBLAS, print the column header, run the
+# benchmark and shut StarPU down.
+starpu_init()
+starpu_cublas_init()
+
+println("# size\tms\tGFlops")
+
+# "-quickcheck" as first argument: a single verified 1024x1024 run with 8
+# blocks per dimension; otherwise sizes 1024 to 15360 in steps of 1024 with
+# 16 blocks per dimension and no verification
+if length(ARGS) > 0 && ARGS[1] == "-quickcheck"
+    main(1024, 8, verify = true)
+else
+    for size in 1024:1024:15360
+        main(size, 16)
+    end
+end
+
+starpu_shutdown()

+ 3 - 3
julia/examples/dependency/end_dep.jl

@@ -53,16 +53,16 @@ function main()
         )
 
         clA = starpu_codelet(
-            cpu_func = CPU_CODELETS["codeletA"],
+            cpu_func = "codeletA",
             perfmodel = perfmodel
         )
         clB = starpu_codelet(
-            cpu_func = CPU_CODELETS["codeletB"],
+            cpu_func = "codeletB",
             modes = [STARPU_RW],
             perfmodel = perfmodel
         )
         clC = starpu_codelet(
-            cpu_func = CPU_CODELETS["codeletC"],
+            cpu_func = "codeletC",
             modes = [STARPU_RW],
             perfmodel = perfmodel
         )

+ 3 - 3
julia/examples/dependency/tag_dep.jl

@@ -75,17 +75,17 @@ function main()
     )
 
         clA = starpu_codelet(
-            cpu_func = CPU_CODELETS["codeletA"],
+            cpu_func = "codeletA",
             modes = [STARPU_RW],
             perfmodel = perfmodel
         )
         clB = starpu_codelet(
-            cpu_func = CPU_CODELETS["codeletB"],
+            cpu_func = "codeletB",
             modes = [STARPU_RW],
             perfmodel = perfmodel
         )
         clC = starpu_codelet(
-            cpu_func = CPU_CODELETS["codeletC"],
+            cpu_func = "codeletC",
             modes = [STARPU_RW],
             perfmodel = perfmodel
         )

+ 3 - 3
julia/examples/dependency/task_dep.jl

@@ -43,17 +43,17 @@ function main()
         )
 
         clA = starpu_codelet(
-            cpu_func = CPU_CODELETS["codeletA"],
+            cpu_func = "codeletA",
             modes = [STARPU_RW],
             perfmodel = perfmodel
         )
         clB = starpu_codelet(
-            cpu_func = CPU_CODELETS["codeletB"],
+            cpu_func = "codeletB",
             modes = [STARPU_RW],
             perfmodel = perfmodel
         )
         clC = starpu_codelet(
-            cpu_func = CPU_CODELETS["codeletC"],
+            cpu_func = "codeletC",
             modes = [STARPU_RW],
             perfmodel = perfmodel
         )

+ 7 - 1
julia/examples/execute.sh.in

@@ -19,11 +19,17 @@ set -x
 export JULIA_LOAD_PATH=@STARPU_SRC_DIR@/julia:$JULIA_LOAD_PATH
 export STARPU_BUILD_DIR=@STARPU_BUILD_DIR@
 export STARPU_SRC_DIR=@STARPU_SRC_DIR@
-export STARPU_JULIA_LIB=@STARPU_BUILD_DIR@/julia/src/.libs/libstarpujulia-1.3.so
+export STARPU_JULIA_LIB=@STARPU_BUILD_DIR@/julia/src/.libs/libstarpujulia-1.3
 export STARPU_JULIA_BUILD=@STARPU_BUILD_DIR@/julia
+export LD_LIBRARY_PATH=@STARPU_BUILD_DIR@/julia/src/.libs/:$LD_LIBRARY_PATH
 export JULIA_NUM_THREADS=8
+export STARPU_NOPENCL=0
+export STARPU_SCHED=dmda
+
 srcdir=@STARPU_SRC_DIR@/julia/examples
 
+rm -f genc*.c gencuda*.cu genc*.o
+
 if test "$1" == "-calllib"
 then
     shift

+ 144 - 0
julia/examples/gemm/gemm.jl

@@ -0,0 +1,144 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+using LinearAlgebra.BLAS
+
+# gemm codelet: a single SGEMM call without transposition on A, B and C
+# (C := alpha*A*B + beta*C in BLAS terms). Sizes and leading dimensions are
+# read from the matrices themselves.
+@target STARPU_CPU+STARPU_CUDA
+@codelet function gemm(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32) :: Nothing
+
+    M :: Int32 = height(A)
+    N :: Int32 = width(B)
+    K :: Int32 = width(A)
+    lda :: Int32 = ld(A)
+    ldb :: Int32 = ld(B)
+    ldc :: Int32 = ld(C)
+    STARPU_SGEMM("N", "N", M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)
+
+    return
+end
+
+# Run the "gemm" codelet 10 times over a block partitioning of A, B and C and
+# return the shortest wall-clock time of one round, in nanoseconds.
+# B is split into `nslicesx` vertical blocks, A into `nslicesy` blocks, and C
+# with both filters; one task is inserted per (taskx, tasky) pair.
+# C is accessed read-write by every round, so it accumulates over the 10 rounds.
+function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32, nslicesx, nslicesy)
+    # 0 means "no measurement yet"
+    tmin=0
+    vert = starpu_data_filter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesx)
+    horiz = starpu_data_filter(STARPU_MATRIX_FILTER_BLOCK, nslicesy)
+    @starpu_block let
+        hA,hB,hC = starpu_data_register(A, B, C)
+        starpu_data_partition(hB, vert)
+        starpu_data_partition(hA, horiz)
+        starpu_data_map_filters(hC, vert, horiz)
+
+        for _ in (1 : 10 )
+            t=time_ns()
+            @starpu_sync_tasks begin
+                for taskx in (1 : nslicesx)
+                    for tasky in (1 : nslicesy)
+                        starpu_task_insert(codelet_name = "gemm",
+                                           handles = [hA[tasky], hB[taskx], hC[taskx, tasky]],
+                                           cl_arg = (alpha, beta),
+                                           modes = [STARPU_R, STARPU_R, STARPU_RW])
+                    end
+                end
+            end
+            t=time_ns()-t
+            if (tmin==0 || tmin>t)
+                tmin=t
+            end
+        end
+    end
+    return tmin
+end
+
+
+# Return true when every entry of A and B agrees within the relative
+# tolerance `eps` (relative to the larger of the two magnitudes). On the first
+# mismatch, print both entries and return false.
+# The iteration bounds come from A only.
+# Not called anywhere in this file.
+function approximately_equals(
+    A :: Matrix{Cfloat},
+    B :: Matrix{Cfloat},
+    eps = 1e-2
+)
+    (height, width) = size(A)
+
+    for j in (1 : width)
+        for i in (1 : height)
+            if (abs(A[i,j] - B[i,j]) > eps * max(abs(B[i,j]), abs(A[i,j])))
+                println("A[$i,$j] : $(A[i,j]), B[$i,$j] : $(B[i,j])")
+                return false
+            end
+        end
+    end
+
+    return true
+end
+
+# Compare C with a reference computed by BLAS.
+# `expected` is updated in place with 10 successive gemm! calls, matching the
+# 10 rounds run by multiply_with_starpu. An error is raised on the first entry
+# of C whose relative error exceeds 1e-4.
+function check(expected, A, B, C, alpha, beta)
+    for _ in 1 : 10
+        gemm!('N', 'N', alpha, A, B, beta, expected)
+    end
+
+    height,width = size(C)
+    for i in 1:height
+        for j in 1:width
+            got = C[i, j]
+            want = expected[i, j]
+
+            # divide by the magnitude: with a signed denominator a negative
+            # reference value would make the error negative and never fail
+            err = abs(want - got) / abs(want)
+            if err > 0.0001
+                error("[$i, $j] -> $got != $want (err $err)")
+            end
+        end
+    end
+end
+
+# For every dimension in start_dim:step_dim:stop_dim, multiply two random
+# dim x dim matrices with StarPU, write "dim gflops" to `io` and to standard
+# output, then verify the result with check().
+# The rate is derived from the fastest round reported by multiply_with_starpu
+# (nanoseconds) and 2*dim^3 floating point operations.
+function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
+    for dim in (start_dim : step_dim : stop_dim)
+        A = Array(rand(Cfloat, dim, dim))
+        B = Array(rand(Cfloat, dim, dim))
+        C = zeros(Float32, dim, dim)
+        # untouched copy of the initial C, used as the reference accumulator
+        C_ref = copy(C)
+        starpu_memory_pin(A)
+        starpu_memory_pin(B)
+        starpu_memory_pin(C)
+        alpha = 4.0f0
+        beta = 2.0f0
+        mt =  multiply_with_starpu(A, B, C, alpha, beta, nslicesx, nslicesy)
+        gflop = 2 * dim * dim * dim * 1.e-9
+        gflops = gflop / (mt * 1.e-9)
+        println(io,"$dim $gflops")
+        println("$dim $gflops")
+        starpu_memory_unpin(A)
+        starpu_memory_unpin(B)
+        starpu_memory_unpin(C)
+        check(C_ref, A, B, C, alpha, beta)
+    end
+end
+
+# Script body. The results go to the file named by the first command-line
+# argument, or to "x.dat" when no argument is given.
+filename = isempty(ARGS) ? "x.dat" : ARGS[1]
+
+starpu_init()
+starpu_cublas_init()
+# same number of blocks in both dimensions: ceil(sqrt(number of workers))
+nblock_x = Int32(ceil(sqrt(starpu_worker_get_count())))
+nblock_y = nblock_x
+# the do-block form closes the file even when a run throws (e.g. check() fails)
+open(filename,"w") do io
+    compute_times(io,64,512,4096,nblock_x,nblock_y)
+end
+
+starpu_shutdown()
+

+ 22 - 0
julia/examples/gemm/gemm.sh

@@ -0,0 +1,22 @@
+#!/bin/bash
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+$(dirname $0)/../execute.sh gemm/gemm_native.jl
+
+export OMP_NUM_THREADS=1
+$(dirname $0)/../execute.sh gemm/gemm.jl
+

+ 146 - 0
julia/examples/gemm/gemm_bare.jl

@@ -0,0 +1,146 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+using LinearAlgebra.BLAS
+
+# gemm codelet: a single SGEMM call without transposition on A, B and C
+# (C := alpha*A*B + beta*C in BLAS terms). Sizes and leading dimensions are
+# read from the matrices themselves.
+@target STARPU_CPU+STARPU_CUDA
+@codelet function gemm(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32) :: Nothing
+
+    M :: Int32 = height(A)
+    N :: Int32 = width(B)
+    K :: Int32 = width(A)
+    lda :: Int32 = ld(A)
+    ldb :: Int32 = ld(B)
+    ldc :: Int32 = ld(C)
+    STARPU_SGEMM("N", "N", M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)
+
+    return
+end
+
+# "Bare" variant: build one codelet and one task by hand on the whole,
+# unpartitioned matrices, submit that same task 10 times, and return the
+# shortest wall-clock time of one submit-and-wait round, in nanoseconds.
+# `nslicesx` and `nslicesy` are not used in the body; neither is `scale`.
+function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32, nslicesx, nslicesy)
+    scale= 3
+    tmin=0
+    hA,hB,hC = starpu_data_register(A, B, C)
+    # (second, redundant initialisation; 0 means "no measurement yet")
+    tmin=0
+    perfmodel = starpu_perfmodel(
+        perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+        symbol = "gemm"
+    )
+    # the CUDA function name is left empty
+    cl = starpu_codelet(
+        cpu_func  = "gemm",
+        cuda_func = "",
+        modes =[STARPU_R,STARPU_R,STARPU_RW], 
+        perfmodel = perfmodel,
+    )
+    task = starpu_task(cl = cl, handles =[hA,hB,hC], cl_arg = (alpha,beta), callback = nothing,
+		callback_arg = nothing, tag = nothing, tag_only = nothing,
+                       sequential_consistency = true,
+                       detach = 1, color = nothing, where = nothing)
+
+
+    for i in (1 : 10 )
+        t=time_ns()
+        # submits the underlying c_task field directly instead of going
+        # through the Julia-level starpu_task_submit(task) kept below
+starpu_task_submit(Ref(task.c_task))
+        #starpu_task_submit(task)
+        starpu_task_wait_for_all()
+        t=time_ns()-t
+	if (tmin==0 || tmin>t)
+           tmin=t
+        end
+    end
+    # handles are unregistered explicitly here
+    starpu_data_unregister(hA)
+    starpu_data_unregister(hB)
+    starpu_data_unregister(hC)
+    return tmin
+end
+
+
+# Return true when every entry of A and B agrees within the relative
+# tolerance `eps` (relative to the larger of the two magnitudes). On the first
+# mismatch, print both entries and return false.
+function approximately_equals(
+    A :: Matrix{Cfloat},
+    B :: Matrix{Cfloat},
+    eps = 1e-2
+)
+    nrows, ncols = size(A)
+
+    # column-major traversal, bounds taken from A
+    for j in 1:ncols, i in 1:nrows
+        gap = abs(A[i,j] - B[i,j])
+        scale = max(abs(B[i,j]), abs(A[i,j]))
+        if gap > eps * scale
+            println("A[$i,$j] : $(A[i,j]), B[$i,$j] : $(B[i,j])")
+            return false
+        end
+    end
+
+    return true
+end
+
+# Compare C with a reference computed by BLAS.
+# `expected` is updated in place with 10 successive gemm! calls, matching the
+# 10 submissions done by multiply_with_starpu. An error is raised on the first
+# entry of C whose relative error exceeds 1e-4.
+function check(expected, A, B, C, alpha, beta)
+    for _ in 1 : 10
+        gemm!('N', 'N', alpha, A, B, beta, expected)
+    end
+
+    height,width = size(C)
+    for i in 1:height
+        for j in 1:width
+            got = C[i, j]
+            want = expected[i, j]
+
+            # divide by the magnitude: with a signed denominator a negative
+            # reference value would make the error negative and never fail
+            err = abs(want - got) / abs(want)
+            if err > 0.0001
+                error("[$i, $j] -> $got != $want (err $err)")
+            end
+        end
+    end
+end
+
+# For every dimension in start_dim:step_dim:stop_dim, multiply two random
+# dim x dim matrices with StarPU and write "dim gflops" to `io` and to
+# standard output. The rate is derived from the fastest round reported by
+# multiply_with_starpu (nanoseconds) and 2*dim^3 floating point operations.
+# The result is not verified: the check() call is commented out.
+function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
+    for dim in (start_dim : step_dim : stop_dim)
+        A = Array(rand(Cfloat, dim, dim))
+        B = Array(rand(Cfloat, dim, dim))
+        C = zeros(Float32, dim, dim)
+        # only needed by the disabled check() below
+        C_ref = copy(C)
+        starpu_memory_pin(A)
+        starpu_memory_pin(B)
+        starpu_memory_pin(C)
+        alpha = 4.0f0
+        beta = 2.0f0
+        mt =  multiply_with_starpu(A, B, C, alpha, beta, nslicesx, nslicesy)
+        gflop = 2 * dim * dim * dim * 1.e-9
+        gflops = gflop / (mt * 1.e-9)
+        # computed but never used afterwards
+        size=dim*dim*dim*4*3/1024/1024
+        println(io,"$dim $gflops")
+        println("$dim $gflops")
+        starpu_memory_unpin(A)
+        starpu_memory_unpin(B)
+        starpu_memory_unpin(C)
+        #check(C_ref, A, B, C, alpha, beta)
+    end
+end
+
+# Script body. The results go to the file named by the first command-line
+# argument, or to "x.dat" when no argument is given.
+filename = isempty(ARGS) ? "x.dat" : ARGS[1]
+
+starpu_init()
+starpu_cublas_init()
+# the do-block form closes the file even when a run throws
+open(filename,"w") do io
+    compute_times(io,64,512,4096,1,1)
+end
+
+starpu_shutdown()
+

+ 56 - 0
julia/examples/gemm/gemm_native.jl

@@ -0,0 +1,56 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using LinearAlgebra.BLAS
+
+# Call BLAS gemm! 10 times on the same operands (C accumulates across calls)
+# and return the shortest wall-clock time of one call, in nanoseconds.
+function gemm_without_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32)
+    # 0 means "no measurement yet"
+    best = 0
+    for _ in 1:10
+        started = time_ns()
+        gemm!('N', 'N', alpha, A, B, beta, C)
+        elapsed = time_ns() - started
+        if best == 0 || elapsed < best
+            best = elapsed
+        end
+    end
+    return best
+end
+
+
+# For every dimension in start_dim:step_dim:stop_dim, multiply two random
+# dim x dim matrices with plain BLAS and write "dim gflops" to `io` and to
+# standard output. The rate is derived from the fastest call reported by
+# gemm_without_starpu (nanoseconds) and 2*dim^3 floating point operations.
+function compute_times(io,start_dim, step_dim, stop_dim)
+    for dim in (start_dim : step_dim : stop_dim)
+        A = Array(rand(Cfloat, dim, dim))
+        B = Array(rand(Cfloat, dim, dim))
+        C = zeros(Float32, dim, dim)
+        alpha = 4.0f0
+        beta = 2.0f0
+        mt =  gemm_without_starpu(A, B, C, alpha, beta)
+        gflop = 2 * dim * dim * dim * 1.e-9
+        gflops = gflop / (mt * 1.e-9)
+        # computed but never used afterwards
+        size=dim*dim*dim*4*3/1024/1024
+        println(io,"$dim $gflops")
+        println("$dim $gflops")
+    end
+end
+
+# Script body. The results go to the file named by the first command-line
+# argument, or to "x.dat" when no argument is given.
+filename = isempty(ARGS) ? "x.dat" : ARGS[1]
+
+# the do-block form closes the file even when a run throws
+open(filename,"w") do io
+    compute_times(io,64,512,4096)
+end
+

+ 1 - 1
julia/examples/mandelbrot/cpu_mandelbrot.c

@@ -71,7 +71,7 @@ void cpu_mandelbrot(void *descr[], void *cl_arg)
 }
 
 char* CPU = "cpu_mandelbrot";
-char* GPU = "gpu_mandelbrot";
+char* GPU = "";
 extern char *starpu_find_function(char *name, char *device)
 {
 	if (!strcmp(device,"gpu")) return GPU;

+ 4 - 1
julia/examples/mandelbrot/mandelbrot.jl

@@ -70,7 +70,10 @@ function mandelbrot_with_starpu(A ::Matrix{Int64}, cr ::Float64, ci ::Float64, d
 	starpu_data_partition(hA,horiz)
 
 	@starpu_sync_tasks for taskx in (1 : nslicesx)
-                @starpu_async_cl mandelbrot(hA[taskx]) [STARPU_W] (cr, ci, Int64((taskx-1)*dim/nslicesx), dim)
+            starpu_task_insert(codelet_name = "mandelbrot",
+                               handles = [hA[taskx]],
+                               modes = [STARPU_W],
+                               cl_arg = (cr, ci, Int64((taskx-1)*dim/nslicesx), dim))
 	end
     end
 end

+ 1 - 1
julia/examples/mult/cpu_mult.c

@@ -93,7 +93,7 @@ void cpu_mult(void *descr[], void *cl_arg)
 }
 
 char* CPU = "cpu_mult";
-char* GPU = "gpu_mult";
+char* GPU = "";
 extern char *starpu_find_function(char *name, char *device)
 {
 	if (!strcmp(device,"gpu")) return GPU;

+ 15 - 28
julia/examples/mult/mult.jl

@@ -82,27 +82,16 @@ function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: M
         starpu_data_partition(hA, horiz)
         starpu_data_map_filters(hC, vert, horiz)
         tmin=0
-        perfmodel = starpu_perfmodel(
-            perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
-            symbol = "history_perf"
-        )
-        cl = starpu_codelet(
-            cpu_func = CPU_CODELETS["matrix_mult"],
-            # cuda_func = CUDA_CODELETS["matrix_mult"],
-            #opencl_func="ocl_matrix_mult",
-            modes = [STARPU_R, STARPU_R, STARPU_W],
-            perfmodel = perfmodel
-        )
 
         for i in (1 : 10 )
             t=time_ns()
             @starpu_sync_tasks begin
                 for taskx in (1 : nslicesx)
                     for tasky in (1 : nslicesy)
-                        handles = [hA[tasky], hB[taskx], hC[taskx, tasky]]
-                        task = starpu_task(cl = cl, handles = handles, cl_arg=(Int32(stride),))
-                        starpu_task_submit(task)
-                        #@starpu_async_cl matrix_mult(hA[tasky], hB[taskx], hC[taskx, tasky])
+                        starpu_task_insert(codelet_name = "matrix_mult",
+                                           modes = [STARPU_R, STARPU_R, STARPU_W],
+                                           handles = [hA[tasky], hB[taskx], hC[taskx, tasky]],
+                                           cl_arg = (Int32(stride),))
                     end
                 end
             end
@@ -116,23 +105,20 @@ function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: M
 end
 
 
-function approximately_equals(
-    A :: Matrix{Cfloat},
-    B :: Matrix{Cfloat},
-    eps = 1e-2
-)
-    (height, width) = size(A)
+# Verify that C holds the product A * B.
+# Every entry of C is compared with the reference product using a relative
+# error bound of 1e-4; the first mismatch raises an error.
+function check(A, B, C)
+    expected = A * B
+    height,width = size(C)
+    for i in 1:height
+        for j in 1:width
+            got = C[i, j]
+            want = expected[i, j]
 
-    for j in (1 : width)
-        for i in (1 : height)
-            if (abs(A[i,j] - B[i,j]) > eps * max(abs(B[i,j]), abs(A[i,j])))
-                println("A[$i,$j] : $(A[i,j]), B[$i,$j] : $(B[i,j])")
-                return false
+            # Scale by the magnitude of the reference value so that negative
+            # references are checked too; fall back to the absolute error
+            # when the reference is exactly zero.
+            scale = abs(want)
+            err = scale > 0 ? abs(want - got) / scale : abs(want - got)
+            # written as !(<=) so that a NaN error is reported as a failure
+            if !(err <= 0.0001)
+                error("[$i] -> $got != $want (err $err)")
             end
         end
     end
-
-    return true
 end
 
 function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy, stride)
@@ -145,6 +131,7 @@ function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy, str
         size=dim*dim*4*3/1024/1024
         println(io,"$size $flops")
         println("$size $flops")
+        check(A, B, C)
     end
 end
 

+ 2 - 2
julia/examples/task_insert_color/task_insert_color.jl

@@ -35,13 +35,13 @@ function task_insert_color_with_starpu(val ::Ref{Int32})
         )
 
         cl1 = starpu_codelet(
-            cpu_func = CPU_CODELETS["task_insert_color"],
+            cpu_func = "task_insert_color",
             modes = [STARPU_RW],
             perfmodel = perfmodel
         )
 
         cl2 = starpu_codelet(
-            cpu_func = CPU_CODELETS["task_insert_color"],
+            cpu_func = "task_insert_color",
             modes = [STARPU_RW],
             perfmodel = perfmodel,
             color = 0x0000FF

+ 1 - 1
julia/examples/variable/variable.jl

@@ -44,7 +44,7 @@ function display(niter)
     if foo[] == niter
         println("result is correct")
     else
-        println("result is incorret")
+        error("result is incorret")
     end
 end
 

+ 21 - 17
julia/examples/vector_scal/vector_scal.jl

@@ -36,28 +36,15 @@ function vector_scal_with_starpu(v :: Vector{Float32}, m :: Int32, k :: Float32,
     @starpu_block let
         hV = starpu_data_register(v)
         tmin=0
-        perfmodel = starpu_perfmodel(
-            perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
-            symbol = "history_perf"
-        )
-        cl = starpu_codelet(
-            cpu_func = CPU_CODELETS["vector_scal"],
-            # cuda_func = CUDA_CODELETS["vector_scal"],
-            #opencl_func="ocl_matrix_mult",
-            modes = [STARPU_RW],
-            perfmodel = perfmodel
-        )
 
         for i in (1 : 1)
             t=time_ns()
             @starpu_sync_tasks begin
-                handles = [hV]
-                task = starpu_task(cl = cl, handles = handles, cl_arg=(m, k, l))
-                starpu_task_submit(task)
+                starpu_task_insert(codelet_name = "vector_scal",
+                                   modes = [STARPU_RW],
+                                   handles = [hV],
+                                   cl_arg=(m, k, l))
             end
-            # @starpu_sync_tasks for task in (1:1)
-            #     @starpu_async_cl vector_scal(hV, STARPU_RW, [m, k, l])
-            # end
             t=time_ns()-t
             if (tmin==0 || tmin>t)
                 tmin=t
@@ -67,9 +54,24 @@ function vector_scal_with_starpu(v :: Vector{Float32}, m :: Int32, k :: Float32,
     return tmin
 end
 
+# Verify the result of the vector scaling.
+# `ref` is the input vector before the computation and `res` the vector
+# after it; every element of `res` must equal ref[i] * m + (k + l) within a
+# relative error of 1e-4. The first mismatch raises an error.
+function check(ref, res, m, k, l)
+    expected = ref .* m .+ (k+l)
+
+    for i in 1:length(expected)
+        got = res[i]
+        want = expected[i]
+
+        # Scale by the magnitude of the reference value so that negative
+        # references are checked too; fall back to the absolute error when
+        # the reference is exactly zero.
+        scale = abs(want)
+        err = scale > 0 ? abs(want - got) / scale : abs(want - got)
+        # written as !(<=) so that a NaN error is reported as a failure
+        if !(err <= 0.0001)
+            error("[$i] -> $got != $want (err $err)")
+        end
+    end
+end
+
 function compute_times(io,start_dim, step_dim, stop_dim)
     for size in (start_dim : step_dim : stop_dim)
         V = Array(rand(Cfloat, size))
+        V_ref = copy(V)
         starpu_memory_pin(V)
 
         m :: Int32 = 10
@@ -85,6 +87,8 @@ function compute_times(io,start_dim, step_dim, stop_dim)
         println("OUTPUT ", V[1:10])
         println(io,"$size $mt")
         println("$size $mt")
+
+        check(V_ref, V, m, k, l)
     end
 end
 

+ 1 - 1
julia/src/Makefile.am

@@ -19,7 +19,7 @@ include $(top_srcdir)/starpu-notests.mk
 CLEANFILES = *.gcno *.gcda
 
 AM_CFLAGS = $(GLOBAL_AM_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(HWLOC_CFLAGS) $(FXT_CFLAGS) -fPIC
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS)
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ `@JULIA@ $(top_srcdir)/julia/src/openblas_ldflags.jl`
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_builddir)/src -I$(top_srcdir)/src -I$(top_srcdir)/julia/src
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) -no-undefined
 

+ 12 - 3
julia/src/StarPU.jl

@@ -26,12 +26,13 @@ const starpu_wrapper_library_name=fstarpu_task_library_name()
 
 include("translate_headers.jl")
 
-if !isfile((@__DIR__)*"/../gen/libstarpu_common.jl") || !isfile((@__DIR__)*"/../gen/libstarpu_api.jl")
+# Regenerate the bindings when a generated file is missing, or when
+# translate_headers.jl (included above from this file's directory) is newer
+# than the generated API file. The script is located with @__DIR__:
+# joining onto @__FILE__ would name a path below StarPU.jl itself, which
+# never exists, so the staleness test could never fire.
+if !isfile(joinpath(fstarpu_build_dir(), "julia/gen/libstarpu_common.jl")) || !isfile(joinpath(fstarpu_build_dir(), "julia/gen/libstarpu_api.jl")) ||
+    mtime(joinpath(@__DIR__, "translate_headers.jl")) > mtime(joinpath(fstarpu_build_dir(), "julia/gen/libstarpu_api.jl"))
     starpu_translate_headers()
 end
 
-include("../gen/libstarpu_common.jl")
-include("../gen/libstarpu_api.jl")
+include(joinpath(fstarpu_build_dir(), "julia/gen/libstarpu_common.jl"))
+include(joinpath(fstarpu_build_dir(), "julia/gen/libstarpu_api.jl"))
 include("globals.jl")
 
 include("compiler/include.jl")
@@ -85,6 +86,8 @@ export starpu_data_get_sub_data
 export starpu_data_partition
 export starpu_data_unpartition
 export starpu_data_map_filters
+export starpu_data_wont_use
+export starpu_task_insert
 export starpu_task_wait_for_all
 export starpu_task_submit
 export starpu_task_end_dep_add
@@ -93,6 +96,7 @@ export starpu_task_declare_deps
 export starpu_task_declare_end_deps
 export starpu_task_wait_for_n_submitted
 export starpu_task_destroy
+export starpu_tag_remove
 export starpu_tag_wait
 export starpu_tag_notify_from_apps
 export starpu_iteration_pop
@@ -108,5 +112,10 @@ export starpu_data_get_default_sequential_consistency_flag
 export starpu_data_set_default_sequential_consistency_flag
 export starpu_data_get_sequential_consistency_flag
 export starpu_data_set_sequential_consistency_flag
+export starpu_worker_get_count
+export starpu_cpu_worker_get_count
+export starpu_cuda_worker_get_count
+export starpu_opencl_worker_get_count
+export starpu_mic_worker_get_count
 
 end

+ 78 - 400
julia/src/blas.c

@@ -17,500 +17,178 @@
 #include <ctype.h>
 #include <stdio.h>
 
-#include <starpu.h>
 #include "blas.h"
 
-/*
-    This files contains BLAS wrappers for the different BLAS implementations
-  (eg. REFBLAS, ATLAS, GOTOBLAS ...). We assume a Fortran orientation as most
-  libraries do not supply C-based ordering.
- */
-
-#ifdef STARPU_ATLAS
-
-inline void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, 
-			float alpha, const float *A, int lda, const float *B, int ldb, 
-			float beta, float *C, int ldc)
-{
-	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
-	enum CBLAS_TRANSPOSE tb = (toupper(transb[0]) == 'N')?CblasNoTrans:CblasTrans;
-
-	cblas_sgemm(CblasColMajor, ta, tb,
-			M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);				
-}
-
-inline void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
-			double alpha, double *A, int lda, double *B, int ldb, 
-			double beta, double *C, int ldc)
-{
-	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
-	enum CBLAS_TRANSPOSE tb = (toupper(transb[0]) == 'N')?CblasNoTrans:CblasTrans;
-
-	cblas_dgemm(CblasColMajor, ta, tb,
-			M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);				
-}
-
-inline void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda, float *X, int incX, float beta, float *Y, int incY)
-{
-	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
-
-	cblas_sgemv(CblasColMajor, ta, M, N, alpha, A, lda,
-					X, incX, beta, Y, incY);
-}
-
-inline void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda, double *X, int incX, double beta, double *Y, int incY)
-{
-	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
-
-	cblas_dgemv(CblasColMajor, ta, M, N, alpha, A, lda,
-					X, incX, beta, Y, incY);
-}
-
-inline float STARPU_SASUM(int N, float *X, int incX)
-{
-	return cblas_sasum(N, X, incX);
-}
-
-inline double STARPU_DASUM(int N, double *X, int incX)
-{
-	return cblas_dasum(N, X, incX);
-}
-
-void STARPU_SSCAL(int N, float alpha, float *X, int incX)
-{
-	cblas_sscal(N, alpha, X, incX);
-}
-
-void STARPU_DSCAL(int N, double alpha, double *X, int incX)
-{
-	cblas_dscal(N, alpha, X, incX);
-}
-
-void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
-                   const char *diag, const int m, const int n,
-                   const float alpha, const float *A, const int lda,
-                   float *B, const int ldb)
+inline void STARPU_SGEMM(char *transa, char *transb, BLASINT M, BLASINT N, BLASINT K, 
+			float alpha, const float *A, BLASINT lda, const float *B, BLASINT ldb, 
+			float beta, float *C, BLASINT ldc)
 {
-	enum CBLAS_SIDE side_ = (toupper(side[0]) == 'L')?CblasLeft:CblasRight;
-	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
-	enum CBLAS_TRANSPOSE transa_ = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
-	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
-
-	cblas_strsm(CblasColMajor, side_, uplo_, transa_, diag_, m, n, alpha, A, lda, B, ldb);
-}
-
-void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
-                   const char *diag, const int m, const int n,
-                   const double alpha, const double *A, const int lda,
-                   double *B, const int ldb)
-{
-	enum CBLAS_SIDE side_ = (toupper(side[0]) == 'L')?CblasLeft:CblasRight;
-	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
-	enum CBLAS_TRANSPOSE transa_ = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
-	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
-
-	cblas_dtrsm(CblasColMajor, side_, uplo_, transa_, diag_, m, n, alpha, A, lda, B, ldb);
-}
-
-void STARPU_SSYR (const char *uplo, const int n, const float alpha,
-                  const float *x, const int incx, float *A, const int lda)
-{
-	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
-
-	cblas_ssyr(CblasColMajor, uplo_, n, alpha, x, incx, A, lda); 
-}
-
-void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
-                   const int k, const float alpha, const float *A,
-                   const int lda, const float beta, float *C,
-                   const int ldc)
-{
-	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
-	enum CBLAS_TRANSPOSE trans_ = (toupper(trans[0]) == 'N')?CblasNoTrans:CblasTrans;
-	
-	cblas_ssyrk(CblasColMajor, uplo_, trans_, n, k, alpha, A, lda, beta, C, ldc); 
-}
-
-void STARPU_SGER(const int m, const int n, const float alpha,
-                  const float *x, const int incx, const float *y,
-                  const int incy, float *A, const int lda)
-{
-	cblas_sger(CblasColMajor, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void STARPU_DGER(const int m, const int n, const double alpha,
-                  const double *x, const int incx, const double *y,
-                  const int incy, double *A, const int lda)
-{
-	cblas_dger(CblasColMajor, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
-                   const int n, const float *A, const int lda, float *x, 
-                   const int incx)
-{
-	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
-	enum CBLAS_TRANSPOSE trans_ = (toupper(trans[0]) == 'N')?CblasNoTrans:CblasTrans;
-	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
-
-	cblas_strsv(CblasColMajor, uplo_, trans_, diag_, n, A, lda, x, incx);
-}
-
-void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int m, const int n,
-                 const float alpha, const float *A, const int lda,
-                 float *B, const int ldb)
-{
-	enum CBLAS_SIDE side_ = (toupper(side[0]) == 'L')?CblasLeft:CblasRight;
-	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
-	enum CBLAS_TRANSPOSE transA_ = (toupper(transA[0]) == 'N')?CblasNoTrans:CblasTrans;
-	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
-
-	cblas_strmm(CblasColMajor, side_, uplo_, transA_, diag_, m, n, alpha, A, lda, B, ldb);
-}
-
-void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int m, const int n,
-                 const double alpha, const double *A, const int lda,
-                 double *B, const int ldb)
-{
-	enum CBLAS_SIDE side_ = (toupper(side[0]) == 'L')?CblasLeft:CblasRight;
-	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
-	enum CBLAS_TRANSPOSE transA_ = (toupper(transA[0]) == 'N')?CblasNoTrans:CblasTrans;
-	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
-
-	cblas_dtrmm(CblasColMajor, side_, uplo_, transA_, diag_, m, n, alpha, A, lda, B, ldb);
-}
-
-void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
-                 const int n, const float *A, const int lda, float *X,
-                 const int incX)
-{
-	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
-	enum CBLAS_TRANSPOSE transA_ = (toupper(transA[0]) == 'N')?CblasNoTrans:CblasTrans;
-	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
-
-	cblas_strmv(CblasColMajor, uplo_, transA_, diag_, n, A, lda, X, incX);
-}
-
-void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
-{
-	cblas_saxpy(n, alpha, X, incX, Y, incY);
-}
-
-void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
-{
-	cblas_daxpy(n, alpha, X, incX, Y, incY);
-}
-
-int STARPU_ISAMAX (const int n, float *X, const int incX)
-{
-    int retVal;
-    retVal = cblas_isamax(n, X, incX);
-    return retVal;
-}
-
-int STARPU_IDAMAX (const int n, double *X, const int incX)
-{
-    int retVal;
-    retVal = cblas_idamax(n, X, incX);
-    return retVal;
-}
-
-float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
-{
-	return cblas_sdot(n, x, incx, y, incy);
-}
-
-double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy)
-{
-	return cblas_ddot(n, x, incx, y, incy);
-}
-
-void STARPU_SSWAP(const int n, float *x, const int incx, float *y, const int incy)
-{
-	cblas_sswap(n, x, incx, y, incy);
-}
-
-void STARPU_DSWAP(const int n, double *x, const int incx, double *y, const int incy)
-{
-	cblas_dswap(n, x, incx, y, incy);
-}
-
-#elif defined(STARPU_GOTO) || defined(STARPU_OPENBLAS) || defined(STARPU_SYSTEM_BLAS) || defined(STARPU_MKL) || defined(STARPU_ARMPL)
-
-inline void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, 
-			float alpha, const float *A, int lda, const float *B, int ldb, 
-			float beta, float *C, int ldc)
-{
-	sgemm_(transa, transb, &M, &N, &K, &alpha,
+	sgemm_64_(transa, transb, &M, &N, &K, &alpha,
 			 A, &lda, B, &ldb,
 			 &beta, C, &ldc);	
 }
 
-inline void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
-			double alpha, double *A, int lda, double *B, int ldb, 
-			double beta, double *C, int ldc)
+inline void STARPU_DGEMM(char *transa, char *transb, BLASINT M, BLASINT N, BLASINT K, 
+			double alpha, double *A, BLASINT lda, double *B, BLASINT ldb, 
+			double beta, double *C, BLASINT ldc)
 {
-	dgemm_(transa, transb, &M, &N, &K, &alpha,
+	dgemm_64_(transa, transb, &M, &N, &K, &alpha,
 			 A, &lda, B, &ldb,
 			 &beta, C, &ldc);	
 }
 
 
-inline void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
-		float *X, int incX, float beta, float *Y, int incY)
+inline void STARPU_SGEMV(char *transa, BLASINT M, BLASINT N, float alpha, float *A, BLASINT lda,
+		float *X, BLASINT incX, float beta, float *Y, BLASINT incY)
 {
-	sgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+	sgemv_64_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
 }
 
-inline void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
-		double *X, int incX, double beta, double *Y, int incY)
+inline void STARPU_DGEMV(char *transa, BLASINT M, BLASINT N, double alpha, double *A, BLASINT lda,
+		double *X, BLASINT incX, double beta, double *Y, BLASINT incY)
 {
-	dgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+	dgemv_64_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
 }
 
-inline float STARPU_SASUM(int N, float *X, int incX)
+inline float STARPU_SASUM(BLASINT N, float *X, BLASINT incX)
 {
-	return sasum_(&N, X, &incX);
+	return sasum_64_(&N, X, &incX);
 }
 
-inline double STARPU_DASUM(int N, double *X, int incX)
+inline double STARPU_DASUM(BLASINT N, double *X, BLASINT incX)
 {
-	return dasum_(&N, X, &incX);
+	return dasum_64_(&N, X, &incX);
 }
 
-void STARPU_SSCAL(int N, float alpha, float *X, int incX)
+void STARPU_SSCAL(BLASINT N, float alpha, float *X, BLASINT incX)
 {
-	sscal_(&N, &alpha, X, &incX);
+	sscal_64_(&N, &alpha, X, &incX);
 }
 
-void STARPU_DSCAL(int N, double alpha, double *X, int incX)
+void STARPU_DSCAL(BLASINT N, double alpha, double *X, BLASINT incX)
 {
-	dscal_(&N, &alpha, X, &incX);
+	dscal_64_(&N, &alpha, X, &incX);
 }
 
 void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
-                   const char *diag, const int m, const int n,
-                   const float alpha, const float *A, const int lda,
-                   float *B, const int ldb)
+                   const char *diag, const BLASINT m, const BLASINT n,
+                   const float alpha, const float *A, const BLASINT lda,
+                   float *B, const BLASINT ldb)
 {
-	strsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+	strsm_64_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 }
 
 void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
-                   const char *diag, const int m, const int n,
-                   const double alpha, const double *A, const int lda,
-                   double *B, const int ldb)
+                   const char *diag, const BLASINT m, const BLASINT n,
+                   const double alpha, const double *A, const BLASINT lda,
+                   double *B, const BLASINT ldb)
 {
-	dtrsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+	dtrsm_64_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 }
 
-void STARPU_SSYR (const char *uplo, const int n, const float alpha,
-                  const float *x, const int incx, float *A, const int lda)
+void STARPU_SSYR (const char *uplo, const BLASINT n, const float alpha,
+                  const float *x, const BLASINT incx, float *A, const BLASINT lda)
 {
-	ssyr_(uplo, &n, &alpha, x, &incx, A, &lda); 
+	ssyr_64_(uplo, &n, &alpha, x, &incx, A, &lda); 
 }
 
-void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
-                   const int k, const float alpha, const float *A,
-                   const int lda, const float beta, float *C,
-                   const int ldc)
+void STARPU_SSYRK (const char *uplo, const char *trans, const BLASINT n,
+                   const BLASINT k, const float alpha, const float *A,
+                   const BLASINT lda, const float beta, float *C,
+                   const BLASINT ldc)
 {
-	ssyrk_(uplo, trans, &n, &k, &alpha, A, &lda, &beta, C, &ldc); 
+	ssyrk_64_(uplo, trans, &n, &k, &alpha, A, &lda, &beta, C, &ldc); 
 }
 
-void STARPU_SGER(const int m, const int n, const float alpha,
-                  const float *x, const int incx, const float *y,
-                  const int incy, float *A, const int lda)
+void STARPU_SGER(const BLASINT m, const BLASINT n, const float alpha,
+                  const float *x, const BLASINT incx, const float *y,
+                  const BLASINT incy, float *A, const BLASINT lda)
 {
-	sger_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
+	sger_64_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
 }
 
-void STARPU_DGER(const int m, const int n, const double alpha,
-                  const double *x, const int incx, const double *y,
-                  const int incy, double *A, const int lda)
+void STARPU_DGER(const BLASINT m, const BLASINT n, const double alpha,
+                  const double *x, const BLASINT incx, const double *y,
+                  const BLASINT incy, double *A, const BLASINT lda)
 {
-	dger_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
+	dger_64_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
 }
 
 void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
-                   const int n, const float *A, const int lda, float *x, 
-                   const int incx)
+                   const BLASINT n, const float *A, const BLASINT lda, float *x, 
+                   const BLASINT incx)
 {
-	strsv_(uplo, trans, diag, &n, A, &lda, x, &incx);
+	strsv_64_(uplo, trans, diag, &n, A, &lda, x, &incx);
 }
 
 void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int m, const int n,
-                 const float alpha, const float *A, const int lda,
-                 float *B, const int ldb)
+                 const char *diag, const BLASINT m, const BLASINT n,
+                 const float alpha, const float *A, const BLASINT lda,
+                 float *B, const BLASINT ldb)
 {
-	strmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+	strmm_64_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 }
 
 void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int m, const int n,
-                 const double alpha, const double *A, const int lda,
-                 double *B, const int ldb)
+                 const char *diag, const BLASINT m, const BLASINT n,
+                 const double alpha, const double *A, const BLASINT lda,
+                 double *B, const BLASINT ldb)
 {
-	dtrmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+	dtrmm_64_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 }
 
 void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
-                 const int n, const float *A, const int lda, float *X,
-                 const int incX)
+                 const BLASINT n, const float *A, const BLASINT lda, float *X,
+                 const BLASINT incX)
 {
-	strmv_(uplo, transA, diag, &n, A, &lda, X, &incX);
+	strmv_64_(uplo, transA, diag, &n, A, &lda, X, &incX);
 }
 
-void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
+void STARPU_SAXPY(const BLASINT n, const float alpha, float *X, const BLASINT incX, float *Y, const BLASINT incY)
 {
-	saxpy_(&n, &alpha, X, &incX, Y, &incY);
+	saxpy_64_(&n, &alpha, X, &incX, Y, &incY);
 }
 
-void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
+void STARPU_DAXPY(const BLASINT n, const double alpha, double *X, const BLASINT incX, double *Y, const BLASINT incY)
 {
-	daxpy_(&n, &alpha, X, &incX, Y, &incY);
+	daxpy_64_(&n, &alpha, X, &incX, Y, &incY);
 }
 
-int STARPU_ISAMAX (const int n, float *X, const int incX)
+BLASINT STARPU_ISAMAX (const BLASINT n, float *X, const BLASINT incX)
 {
-    int retVal;
-    retVal = isamax_ (&n, X, &incX);
+    BLASINT retVal;
+    retVal = isamax_64_ (&n, X, &incX);
     return retVal;
 }
 
-int STARPU_IDAMAX (const int n, double *X, const int incX)
+BLASINT STARPU_IDAMAX (const BLASINT n, double *X, const BLASINT incX)
 {
-    int retVal;
-    retVal = idamax_ (&n, X, &incX);
+    BLASINT retVal;
+    retVal = idamax_64_ (&n, X, &incX);
     return retVal;
 }
 
-float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
+float STARPU_SDOT(const BLASINT n, const float *x, const BLASINT incx, const float *y, const BLASINT incy)
 {
 	float retVal = 0;
 
 	/* GOTOBLAS will return a FLOATRET which is a double, not a float */
-	retVal = (float)sdot_(&n, x, &incx, y, &incy);
+	retVal = (float)sdot_64_(&n, x, &incx, y, &incy);
 
 	return retVal;
 }
 
-double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy)
-{
-	return ddot_(&n, x, &incx, y, &incy);
-}
-
-void STARPU_SSWAP(const int n, float *X, const int incX, float *Y, const int incY)
-{
-	sswap_(&n, X, &incX, Y, &incY);
-}
-
-void STARPU_DSWAP(const int n, double *X, const int incX, double *Y, const int incY)
+double STARPU_DDOT(const BLASINT n, const double *x, const BLASINT incx, const double *y, const BLASINT incy)
 {
-	dswap_(&n, X, &incX, Y, &incY);
+	return ddot_64_(&n, x, &incx, y, &incy);
 }
 
-#if defined(STARPU_MKL) || defined(STARPU_ARMPL)
-void STARPU_SPOTRF(const char*uplo, const int n, float *a, const int lda)
+void STARPU_SSWAP(const BLASINT n, float *X, const BLASINT incX, float *Y, const BLASINT incY)
 {
-	int info = 0;
-	spotrf_(uplo, &n, a, &lda, &info);
+	sswap_64_(&n, X, &incX, Y, &incY);
 }
 
-void STARPU_DPOTRF(const char*uplo, const int n, double *a, const int lda)
+void STARPU_DSWAP(const BLASINT n, double *X, const BLASINT incX, double *Y, const BLASINT incY)
 {
-	int info = 0;
-	dpotrf_(uplo, &n, a, &lda, &info);
+	dswap_64_(&n, X, &incX, Y, &incY);
 }
-#endif
-
-#elif defined(STARPU_SIMGRID)
-inline void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, 
-			float alpha, const float *A, int lda, const float *B, int ldb, 
-			float beta, float *C, int ldc) { }
-
-inline void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
-			double alpha, double *A, int lda, double *B, int ldb, 
-			double beta, double *C, int ldc) { }
-
-inline void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
-		float *X, int incX, float beta, float *Y, int incY) { }
-
-inline void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
-		double *X, int incX, double beta, double *Y, int incY) { }
-
-inline float STARPU_SASUM(int N, float *X, int incX) { return 0.; }
-
-inline double STARPU_DASUM(int N, double *X, int incX) { return 0.; }
-
-void STARPU_SSCAL(int N, float alpha, float *X, int incX) { }
-
-void STARPU_DSCAL(int N, double alpha, double *X, int incX) { }
-
-void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
-                   const char *diag, const int m, const int n,
-                   const float alpha, const float *A, const int lda,
-                   float *B, const int ldb) { }
-
-void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
-                   const char *diag, const int m, const int n,
-                   const double alpha, const double *A, const int lda,
-                   double *B, const int ldb) { }
-
-void STARPU_SSYR (const char *uplo, const int n, const float alpha,
-                  const float *x, const int incx, float *A, const int lda) { }
-
-void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
-                   const int k, const float alpha, const float *A,
-                   const int lda, const float beta, float *C,
-                   const int ldc) { }
-
-void STARPU_SGER(const int m, const int n, const float alpha,
-                  const float *x, const int incx, const float *y,
-                  const int incy, float *A, const int lda) { }
-
-void STARPU_DGER(const int m, const int n, const double alpha,
-                  const double *x, const int incx, const double *y,
-                  const int incy, double *A, const int lda) { }
-
-void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
-                   const int n, const float *A, const int lda, float *x, 
-                   const int incx) { }
-
-void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int m, const int n,
-                 const float alpha, const float *A, const int lda,
-                 float *B, const int ldb) { }
-
-void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int m, const int n,
-                 const double alpha, const double *A, const int lda,
-                 double *B, const int ldb) { }
-
-void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
-                 const int n, const float *A, const int lda, float *X,
-                 const int incX) { }
-
-void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY) { }
-
-void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY) { }
-
-int STARPU_ISAMAX (const int n, float *X, const int incX) { return 0; }
-
-int STARPU_IDAMAX (const int n, double *X, const int incX) { return 0; }
-
-float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy) { return 0.; }
-
-double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy) { return 0.; }
-
-void STARPU_SSWAP(const int n, float *X, const int incX, float *Y, const int incY) { }
-
-void STARPU_DSWAP(const int n, double *X, const int incX, double *Y, const int incY) { }
-
-void STARPU_SPOTRF(const char*uplo, const int n, float *a, const int lda) { }
-
-void STARPU_DPOTRF(const char*uplo, const int n, double *a, const int lda) { }
-#endif

+ 116 - 134
julia/src/blas.h

@@ -17,150 +17,132 @@
 #ifndef __BLAS_H__
 #define __BLAS_H__
 
-#include <starpu.h>
+#include <stdint.h>
 
-#if defined(STARPU_ATLAS) || defined(STARPU_HAVE_CBLAS_H)
-#include <cblas.h>
-#endif
+#define BLASINT int64_t
 
-void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, float alpha, const float *A, int lda, 
-		const float *B, int ldb, float beta, float *C, int ldc);
-void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, double alpha, double *A, int lda, 
-		double *B, int ldb, double beta, double *C, int ldc);
-void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
-		float *X, int incX, float beta, float *Y, int incY);
-void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
-		double *X, int incX, double beta, double *Y, int incY);
-float STARPU_SASUM(int N, float *X, int incX);
-double STARPU_DASUM(int N, double *X, int incX);
-void STARPU_SSCAL(int N, float alpha, float *X, int incX);
-void STARPU_DSCAL(int N, double alpha, double *X, int incX);
+void STARPU_SGEMM(char *transa, char *transb, BLASINT M, BLASINT N, BLASINT K, float alpha, const float *A, BLASINT lda, 
+		const float *B, BLASINT ldb, float beta, float *C, BLASINT ldc);
+void STARPU_DGEMM(char *transa, char *transb, BLASINT M, BLASINT N, BLASINT K, double alpha, double *A, BLASINT lda, 
+		double *B, BLASINT ldb, double beta, double *C, BLASINT ldc);
+void STARPU_SGEMV(char *transa, BLASINT M, BLASINT N, float alpha, float *A, BLASINT lda,
+		float *X, BLASINT incX, float beta, float *Y, BLASINT incY);
+void STARPU_DGEMV(char *transa, BLASINT M, BLASINT N, double alpha, double *A, BLASINT lda,
+		double *X, BLASINT incX, double beta, double *Y, BLASINT incY);
+float STARPU_SASUM(BLASINT N, float *X, BLASINT incX);
+double STARPU_DASUM(BLASINT N, double *X, BLASINT incX);
+void STARPU_SSCAL(BLASINT N, float alpha, float *X, BLASINT incX);
+void STARPU_DSCAL(BLASINT N, double alpha, double *X, BLASINT incX);
 void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
-                   const char *diag, const int m, const int n,
-                   const float alpha, const float *A, const int lda,
-                   float *B, const int ldb);
+                   const char *diag, const BLASINT m, const BLASINT n,
+                   const float alpha, const float *A, const BLASINT lda,
+                   float *B, const BLASINT ldb);
 void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
-                   const char *diag, const int m, const int n,
-                   const double alpha, const double *A, const int lda,
-                   double *B, const int ldb);
-void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
-			double alpha, double *A, int lda, double *B, int ldb, 
-			double beta, double *C, int ldc);
-void STARPU_SSYR (const char *uplo, const int n, const float alpha,
-                  const float *x, const int incx, float *A, const int lda);
-void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
-                   const int k, const float alpha, const float *A,
-                   const int lda, const float beta, float *C,
-                   const int ldc);
-void STARPU_SGER (const int m, const int n, const float alpha,
-                  const float *x, const int incx, const float *y,
-                  const int incy, float *A, const int lda);
-void STARPU_DGER(const int m, const int n, const double alpha,
-                  const double *x, const int incx, const double *y,
-                  const int incy, double *A, const int lda);
+                   const char *diag, const BLASINT m, const BLASINT n,
+                   const double alpha, const double *A, const BLASINT lda,
+                   double *B, const BLASINT ldb);
+void STARPU_SSYR (const char *uplo, const BLASINT n, const float alpha,
+                  const float *x, const BLASINT incx, float *A, const BLASINT lda);
+void STARPU_SSYRK (const char *uplo, const char *trans, const BLASINT n,
+                   const BLASINT k, const float alpha, const float *A,
+                   const BLASINT lda, const float beta, float *C,
+                   const BLASINT ldc);
+void STARPU_SGER (const BLASINT m, const BLASINT n, const float alpha,
+                  const float *x, const BLASINT incx, const float *y,
+                  const BLASINT incy, float *A, const BLASINT lda);
+void STARPU_DGER(const BLASINT m, const BLASINT n, const double alpha,
+                  const double *x, const BLASINT incx, const double *y,
+                  const BLASINT incy, double *A, const BLASINT lda);
 void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
-                   const int n, const float *A, const int lda, float *x, 
-                   const int incx);
+                   const BLASINT n, const float *A, const BLASINT lda, float *x, 
+                   const BLASINT incx);
 void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int m, const int n,
-                 const float alpha, const float *A, const int lda,
-                 float *B, const int ldb);
+                 const char *diag, const BLASINT m, const BLASINT n,
+                 const float alpha, const float *A, const BLASINT lda,
+                 float *B, const BLASINT ldb);
 void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int m, const int n,
-                 const double alpha, const double *A, const int lda,
-                 double *B, const int ldb);
+                 const char *diag, const BLASINT m, const BLASINT n,
+                 const double alpha, const double *A, const BLASINT lda,
+                 double *B, const BLASINT ldb);
 void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
-                 const int n, const float *A, const int lda, float *X,
-                 const int incX);
-void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incy);
-void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY);
-int STARPU_ISAMAX (const int n, float *X, const int incX);
-int STARPU_IDAMAX (const int n, double *X, const int incX);
-float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy);
-double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy);
-void STARPU_SSWAP(const int n, float *x, const int incx, float *y, const int incy);
-void STARPU_DSWAP(const int n, double *x, const int incx, double *y, const int incy);
+                 const BLASINT n, const float *A, const BLASINT lda, float *X,
+                 const BLASINT incX);
+void STARPU_SAXPY(const BLASINT n, const float alpha, float *X, const BLASINT incX, float *Y, const BLASINT incy);
+void STARPU_DAXPY(const BLASINT n, const double alpha, double *X, const BLASINT incX, double *Y, const BLASINT incY);
+BLASINT STARPU_ISAMAX (const BLASINT n, float *X, const BLASINT incX);
+BLASINT STARPU_IDAMAX (const BLASINT n, double *X, const BLASINT incX);
+float STARPU_SDOT(const BLASINT n, const float *x, const BLASINT incx, const float *y, const BLASINT incy);
+double STARPU_DDOT(const BLASINT n, const double *x, const BLASINT incx, const double *y, const BLASINT incy);
+void STARPU_SSWAP(const BLASINT n, float *x, const BLASINT incx, float *y, const BLASINT incy);
+void STARPU_DSWAP(const BLASINT n, double *x, const BLASINT incx, double *y, const BLASINT incy);
 
-#if defined(STARPU_MKL) || defined(STARPU_ARMPL)
-void STARPU_SPOTRF(const char*uplo, const int n, float *a, const int lda);
-void STARPU_DPOTRF(const char*uplo, const int n, double *a, const int lda);
-#endif
 
-#if defined(STARPU_GOTO) || defined(STARPU_OPENBLAS) || defined(STARPU_SYSTEM_BLAS) || defined(STARPU_MKL) || defined(STARPU_ARMPL)
-
-extern void sgemm_ (const char *transa, const char *transb, const int *m,
-                   const int *n, const int *k, const float *alpha, 
-                   const float *A, const int *lda, const float *B, 
-                   const int *ldb, const float *beta, float *C, 
-                   const int *ldc);
-extern void dgemm_ (const char *transa, const char *transb, const int *m,
-                   const int *n, const int *k, const double *alpha, 
-                   const double *A, const int *lda, const double *B, 
-                   const int *ldb, const double *beta, double *C, 
-                   const int *ldc);
-extern void sgemv_(const char *trans, const int *m, const int *n, const float *alpha,
-                   const float *a, const int *lda, const float *x, const int *incx, 
-                   const float *beta, float *y, const int *incy);
-extern void dgemv_(const char *trans, const int *m, const int *n, const double *alpha,
-                   const double *a, const int *lda, const double *x, const int *incx,
-                   const double *beta, double *y, const int *incy);
-extern void ssyr_ (const char *uplo, const int *n, const float *alpha,
-                  const float *x, const int *incx, float *A, const int *lda);
-extern void ssyrk_ (const char *uplo, const char *trans, const int *n,
-                   const int *k, const float *alpha, const float *A,
-                   const int *lda, const float *beta, float *C,
-                   const int *ldc);
-extern void strsm_ (const char *side, const char *uplo, const char *transa, 
-                   const char *diag, const int *m, const int *n,
-                   const float *alpha, const float *A, const int *lda,
-                   float *B, const int *ldb);
-extern void dtrsm_ (const char *side, const char *uplo, const char *transa, 
-                   const char *diag, const int *m, const int *n,
-                   const double *alpha, const double *A, const int *lda,
-                   double *B, const int *ldb);
-extern double sasum_ (const int *n, const float *x, const int *incx);
-extern double dasum_ (const int *n, const double *x, const int *incx);
-extern void sscal_ (const int *n, const float *alpha, float *x,
-                   const int *incx);
-extern void dscal_ (const int *n, const double *alpha, double *x,
-                   const int *incx);
-extern void sger_(const int *m, const int *n, const float *alpha,
-                  const float *x, const int *incx, const float *y,
-                  const int *incy, float *A, const int *lda);
-extern void dger_(const int *m, const int *n, const double *alpha,
-                  const double *x, const int *incx, const double *y,
-                  const int *incy, double *A, const int *lda);
-extern void strsv_ (const char *uplo, const char *trans, const char *diag, 
-                   const int *n, const float *A, const int *lda, float *x, 
-                   const int *incx);
-extern void strmm_(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int *m, const int *n,
-                 const float *alpha, const float *A, const int *lda,
-                 float *B, const int *ldb);
-extern void dtrmm_(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int *m, const int *n,
-                 const double *alpha, const double *A, const int *lda,
-                 double *B, const int *ldb);
-extern void strmv_(const char *uplo, const char *transA, const char *diag,
-                 const int *n, const float *A, const int *lda, float *X,
-                 const int *incX);
-extern void saxpy_(const int *n, const float *alpha, const float *X, const int *incX,
-		float *Y, const int *incy);
-extern void daxpy_(const int *n, const double *alpha, const double *X, const int *incX,
-		double *Y, const int *incy);
-extern int isamax_(const int *n, const float *X, const int *incX);
-extern int idamax_(const int *n, const double *X, const int *incX);
+extern void sgemm_64_ (const char *transa, const char *transb, const BLASINT *m,
+                   const BLASINT *n, const BLASINT *k, const float *alpha, 
+                   const float *A, const BLASINT *lda, const float *B, 
+                   const BLASINT *ldb, const float *beta, float *C, 
+                   const BLASINT *ldc);
+extern void dgemm_64_ (const char *transa, const char *transb, const BLASINT *m,
+                   const BLASINT *n, const BLASINT *k, const double *alpha, 
+                   const double *A, const BLASINT *lda, const double *B, 
+                   const BLASINT *ldb, const double *beta, double *C, 
+                   const BLASINT *ldc);
+extern void sgemv_64_(const char *trans, const BLASINT *m, const BLASINT *n, const float *alpha,
+                   const float *a, const BLASINT *lda, const float *x, const BLASINT *incx, 
+                   const float *beta, float *y, const BLASINT *incy);
+extern void dgemv_64_(const char *trans, const BLASINT *m, const BLASINT *n, const double *alpha,
+                   const double *a, const BLASINT *lda, const double *x, const BLASINT *incx,
+                   const double *beta, double *y, const BLASINT *incy);
+extern void ssyr_64_ (const char *uplo, const BLASINT *n, const float *alpha,
+                  const float *x, const BLASINT *incx, float *A, const BLASINT *lda);
+extern void ssyrk_64_ (const char *uplo, const char *trans, const BLASINT *n,
+                   const BLASINT *k, const float *alpha, const float *A,
+                   const BLASINT *lda, const float *beta, float *C,
+                   const BLASINT *ldc);
+extern void strsm_64_ (const char *side, const char *uplo, const char *transa, 
+                   const char *diag, const BLASINT *m, const BLASINT *n,
+                   const float *alpha, const float *A, const BLASINT *lda,
+                   float *B, const BLASINT *ldb);
+extern void dtrsm_64_ (const char *side, const char *uplo, const char *transa, 
+                   const char *diag, const BLASINT *m, const BLASINT *n,
+                   const double *alpha, const double *A, const BLASINT *lda,
+                   double *B, const BLASINT *ldb);
+extern double sasum_64_ (const BLASINT *n, const float *x, const BLASINT *incx);
+extern double dasum_64_ (const BLASINT *n, const double *x, const BLASINT *incx);
+extern void sscal_64_ (const BLASINT *n, const float *alpha, float *x,
+                   const BLASINT *incx);
+extern void dscal_64_ (const BLASINT *n, const double *alpha, double *x,
+                   const BLASINT *incx);
+extern void sger_64_(const BLASINT *m, const BLASINT *n, const float *alpha,
+                  const float *x, const BLASINT *incx, const float *y,
+                  const BLASINT *incy, float *A, const BLASINT *lda);
+extern void dger_64_(const BLASINT *m, const BLASINT *n, const double *alpha,
+                  const double *x, const BLASINT *incx, const double *y,
+                  const BLASINT *incy, double *A, const BLASINT *lda);
+extern void strsv_64_ (const char *uplo, const char *trans, const char *diag, 
+                   const BLASINT *n, const float *A, const BLASINT *lda, float *x, 
+                   const BLASINT *incx);
+extern void strmm_64_(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const BLASINT *m, const BLASINT *n,
+                 const float *alpha, const float *A, const BLASINT *lda,
+                 float *B, const BLASINT *ldb);
+extern void dtrmm_64_(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const BLASINT *m, const BLASINT *n,
+                 const double *alpha, const double *A, const BLASINT *lda,
+                 double *B, const BLASINT *ldb);
+extern void strmv_64_(const char *uplo, const char *transA, const char *diag,
+                 const BLASINT *n, const float *A, const BLASINT *lda, float *X,
+                 const BLASINT *incX);
+extern void saxpy_64_(const BLASINT *n, const float *alpha, const float *X, const BLASINT *incX,
+		float *Y, const BLASINT *incy);
+extern void daxpy_64_(const BLASINT *n, const double *alpha, const double *X, const BLASINT *incX,
+		double *Y, const BLASINT *incy);
+extern BLASINT isamax_64_(const BLASINT *n, const float *X, const BLASINT *incX);
+extern BLASINT idamax_64_(const BLASINT *n, const double *X, const BLASINT *incX);
 /* for some reason, FLOATRET is not a float but a double in GOTOBLAS */
-extern double sdot_(const int *n, const float *x, const int *incx, const float *y, const int *incy);
-extern double ddot_(const int *n, const double *x, const int *incx, const double *y, const int *incy);
-extern void sswap_(const int *n, float *x, const int *incx, float *y, const int *incy);
-extern void dswap_(const int *n, double *x, const int *incx, double *y, const int *incy);
-
-#if (defined STARPU_MKL) || (defined STARPU_ARMPL)
-extern void spotrf_(const char*uplo, const int *n, float *a, const int *lda, int *info);
-extern void dpotrf_(const char*uplo, const int *n, double *a, const int *lda, int *info);
-#endif
-
-#endif
+extern double sdot_64_(const BLASINT *n, const float *x, const BLASINT *incx, const float *y, const BLASINT *incy);
+extern double ddot_64_(const BLASINT *n, const double *x, const BLASINT *incx, const double *y, const BLASINT *incy);
+extern void sswap_64_(const BLASINT *n, float *x, const BLASINT *incx, float *y, const BLASINT *incy);
+extern void dswap_64_(const BLASINT *n, double *x, const BLASINT *incx, double *y, const BLASINT *incy);
 
 #endif /* __BLAS_H__ */

+ 15 - 0
julia/src/blas.jl

@@ -1,3 +1,18 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
 @enum STARPU_BLAS begin
     STARPU_SAXPY
 end

+ 15 - 0
julia/src/blas_wrapper.c

@@ -1,3 +1,18 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
 #include <starpu.h>
 #include <blas.h>
 

+ 25 - 8
julia/src/compiler/c.jl

@@ -73,15 +73,16 @@ function transform_to_cpu_kernel(expr :: StarpuExprFunction)
     output = add_for_loop_declarations(expr)
     output = substitute_args(output)
     output = substitute_func_calls(output)
+    output = substitute_views(output)
     output = substitute_indexing(output)
     output = flatten_blocks(output)
 
     return output
 end
 
-function generate_c_struct_param_declaration(funcname)
-    scalar_parameters = CODELETS_SCALARS[funcname]
-    struct_params_name = CODELETS_PARAMS_STRUCT[funcname]
+function generate_c_struct_param_declaration(codelet_name)
+    scalar_parameters = CODELETS_SCALARS[codelet_name]
+    struct_params_name = CODELETS_PARAMS_STRUCT[codelet_name]
 
     output = "struct $struct_params_name {\n"
     for p in scalar_parameters
@@ -212,18 +213,18 @@ function substitute_args(expr :: StarpuExprFunction)
 
 
     new_args = [
-                    starpu_parse(:($buffer_arg_name :: Matrix{Nothing})),
-                    starpu_parse(:($cl_arg_name :: Vector{Nothing}))
-                ]
+        starpu_parse(:($buffer_arg_name :: Ptr{Ptr{Nothing}})),
+        starpu_parse(:($cl_arg_name :: Vector{Nothing}))
+    ]
     new_body = StarpuExprBlock([function_start_affectations..., new_body.exprs...])
 
     return StarpuExprFunction(expr.ret_type, expr.func, new_args, new_body)
 end
 
 func_substitution = Dict(
-    :width => :STARPU_MATRIX_GET_NY,
+    :width  => :STARPU_MATRIX_GET_NY,
     :height => :STARPU_MATRIX_GET_NX,
-
+    :ld     => :STARPU_MATRIX_GET_LD,
     :length => :STARPU_VECTOR_GET_NX
 )
 
@@ -243,6 +244,22 @@ function substitute_func_calls(expr :: StarpuExpr)
     return apply(func_to_apply, expr)
 end
 
+# Replace each `view(ref, indexes...)` call found in `expr` by a
+# StarpuExprAddress wrapping a StarpuExprRef on `ref`. An index given as a
+# StarpuExprInterval contributes only its `start` field; any other index is
+# kept as is. Nodes that are not `view` calls are returned untouched.
+function substitute_views(expr :: StarpuExpr)
+    # Keep only the first bound of an interval index.
+    first_bound(idx) = isa(idx, StarpuExprInterval) ? idx.start : idx
+
+    function rewrite_view(node :: StarpuExpr)
+        if isa(node, StarpuExprCall) && node.func == :view
+            target = node.args[1]
+            positions = map(first_bound, node.args[2:end])
+
+            return StarpuExprAddress(StarpuExprRef(target, positions))
+        end
+
+        return node
+    end
+
+    return apply(rewrite_view, expr)
+end
 
 function substitute_indexing(expr :: StarpuExpr)
 

+ 301 - 25
julia/src/compiler/cuda.jl

@@ -144,7 +144,278 @@ function add_device_to_interval_call(expr :: StarpuExpr)
     return apply(func_to_apply, expr)
 end
 
+# Translate every call to a STARPU_* BLAS wrapper found in `expr` into the
+# corresponding cuBLAS call.
+#
+# For each translated call:
+#   - string arguments selecting the operation, fill mode, side and diagonal
+#     type are replaced by the matching CUBLAS_* constants,
+#   - scalar arguments (alpha, beta, ...) are passed by address,
+#   - the handle returned by starpu_cublas_get_local_handle() is prepended to
+#     the argument list.
+# The call is replaced by a block which stores the cuBLAS status in a freshly
+# named variable, reports it through STARPU_CUBLAS_REPORT_ERROR when it is not
+# CUBLAS_STATUS_SUCCESS, and synchronizes the local CUDA stream.
+#
+# An error is raised when an argument does not have the expected kind or
+# value. Nodes that are not calls to a known STARPU_* wrapper are returned
+# unchanged.
+function translate_cublas(expr :: StarpuExpr)
+    # STARPU_BLAS => (CUBLAS, TRANS, FILLMODE, ALPHA, SIDE, DIAG)
+    # Each vector lists the 1-based positions, in the STARPU_* call, of the
+    # arguments of that category. Built once rather than for every visited node.
+    blas_to_cublas = Dict(:STARPU_SGEMM  => (:cublasSgemm, [1, 2], [], [6, 11], [], []),
+                          :STARPU_DGEMM  => (:cublasDgemm, [1, 2], [], [6, 11], [], []),
+                          :STARPU_SGEMV  => (:cublasSgemv, [1], [], [4,9], [], []),
+                          :STARPU_DGEMV  => (:cublasDgemv, [1], [], [4,9], [], []),
+                          :STARPU_SSCAL  => (:cublasSscal, [], [], [2], [], []),
+                          :STARPU_DSCAL  => (:cublasDscal, [], [], [2], [], []),
+                          :STARPU_STRSM  => (:cublasStrsm, [3], [2], [7], [1], [4]),
+                          :STARPU_DTRSM  => (:cublasDtrsm, [3], [2], [7], [1], [4]),
+                          :STARPU_SSYR   => (:cublasSsyr, [], [1], [3], [], []),
+                          :STARPU_SSYRK  => (:cublasSsyrk, [2], [1], [5,8], [], []),
+                          :STARPU_SGER   => (:cublasSger, [], [], [3], [], []),
+                          :STARPU_DGER   => (:cublasDger, [], [], [3], [], []),
+                          :STARPU_STRSV  => (:cublasStrsv, [2], [1], [], [], [3]),
+                          :STARPU_STRMM  => (:cublasStrmm, [3], [2], [7], [1], [4]),
+                          :STARPU_DTRMM  => (:cublasDtrmm, [3], [2], [7], [1], [4]),
+                          :STARPU_STRMV  => (:cublasStrmv, [2], [1], [], [], [3]),
+                          :STARPU_SAXPY  => (:cublasSaxpy, [], [], [2], [], []),
+                          :STARPU_DAXPY  => (:cublasDaxpy, [], [], [2], [], []),
+                          :STARPU_SSWAP  => (:cublasSswap, [], [], [], [], []),
+                          :STARPU_DSWAP  => (:cublasDswap, [], [], [], [], []))
+
+    # Accepted (upper-case) letters for each enum-like category.
+    op_symbols   = Dict("N" => :CUBLAS_OP_N, "T" => :CUBLAS_OP_T, "C" => :CUBLAS_OP_C)
+    fill_symbols = Dict("L" => :CUBLAS_FILL_MODE_LOWER, "U" => :CUBLAS_FILL_MODE_UPPER)
+    side_symbols = Dict("L" => :CUBLAS_SIDE_LEFT, "R" => :CUBLAS_SIDE_RIGHT)
+    diag_symbols = Dict("N" => :CUBLAS_DIAG_NON_UNIT, "U" => :CUBLAS_DIAG_UNIT)
+
+    # Replace, in `args`, each string argument listed in `positions` by the
+    # variable named after the matching entry of `symbols`. Letters are
+    # accepted in either case. `expected` is the human readable list of
+    # accepted values used in the error message.
+    function translate_enum_args!(args, positions, func, symbols, expected)
+        for i in positions
+            arg = args[i]
+
+            if !isa(arg, StarpuExprValue) || !isa(arg.value, String)
+                error("Argument $i of ", func, " must be a string, got: ", arg)
+            end
+
+            key = uppercase(arg.value)
+
+            if !haskey(symbols, key)
+                error("Unhandled value for argument $i of ", func, ": ", arg.value,
+                      ", expecting ", expected)
+            end
+
+            args[i] = StarpuExprVar(symbols[key])
+        end
+    end
+
+    function func_to_run(x :: StarpuExpr)
+        if !(isa(x, StarpuExprCall) && x.func in keys(blas_to_cublas))
+            return x
+        end
+
+        (cublas_func, trans_pos, fill_pos, scalar_pos, side_pos, diag_pos) = blas_to_cublas[x.func]
+
+        # Work on a copy so that the argument list of the visited call is
+        # not modified in place.
+        new_args = copy(x.args)
+
+        # cublasOperation_t parameters (e.g. StarpuExprValue("N"))
+        translate_enum_args!(new_args, trans_pos, x.func, op_symbols,
+                             "(\"N\", \"T\", or \"C\")")
+
+        # cublasFillMode_t parameters (e.g. StarpuExprValue("L"))
+        translate_enum_args!(new_args, fill_pos, x.func, fill_symbols,
+                             "(\"L\" or \"U\")")
+
+        # scalar parameters (alpha, beta, ...):  alpha -> &alpha
+        for i in scalar_pos
+            if !isa(new_args[i], StarpuExprVar)
+                error("Argument $i of ", x.func, " must be a variable")
+            end
+            var_name = new_args[i].name
+            new_args[i] = StarpuExprVar(Symbol("&$var_name"))
+        end
+
+        # cublasSideMode_t parameters (e.g. StarpuExprValue("L"))
+        translate_enum_args!(new_args, side_pos, x.func, side_symbols,
+                             "(\"L\" or \"R\")")
+
+        # cublasDiagType_t parameters (e.g. StarpuExprValue("N"))
+        translate_enum_args!(new_args, diag_pos, x.func, diag_symbols,
+                             "(\"N\" or \"U\")")
+
+        # The cuBLAS handle is passed as an extra leading argument.
+        call_args = [@parse(starpu_cublas_get_local_handle()), new_args...]
+
+        status_varname = "status"*rand_string()
+        status_var = StarpuExprVar(Symbol("cublasStatus_t "*status_varname))
+        call_expr = StarpuExprCall(cublas_func, call_args)
+
+        return StarpuExprBlock([StarpuExprAffect(status_var, call_expr),
+                                starpu_parse(Meta.parse("""if $status_varname != CUBLAS_STATUS_SUCCESS
+                                                              STARPU_CUBLAS_REPORT_ERROR($status_varname)
+                                                          end""")),
+                                @parse cudaStreamSynchronize(starpu_cuda_get_local_stream())])
+    end
+
+    return apply(func_to_run, expr)
+end
+
+# Return, as a Vector{StarpuExpr}, every StarpuExprAffect node met while
+# `apply` walks `cpu_instr`. The walked expression itself is left as is: the
+# visitor hands each node back unchanged.
+function get_all_assignments(cpu_instr)
+    found = StarpuExpr[]
+
+    apply(cpu_instr) do node :: StarpuExpr
+        isa(node, StarpuExprAffect) && push!(found, node)
+        return node
+    end
+
+    return found
+end
+
+# Return the variables of `cpu_instr` that are assigned the result of a call
+# to STARPU_MATRIX_GET_PTR or STARPU_VECTOR_GET_PTR.
+function get_all_buffer_vars(cpu_instr)
+    ptr_getters = (:STARPU_MATRIX_GET_PTR, :STARPU_VECTOR_GET_PTR)
+    buffers = StarpuExprTypedVar[]
+
+    for assignment in get_all_assignments(cpu_instr)
+        rhs = assignment.expr
+
+        if isa(rhs, StarpuExprCall) && rhs.func in ptr_getters
+            push!(buffers, assignment.var)
+        end
+    end
+
+    return buffers
+end
+
+# Return every assignment of `cpu_instr` whose left-hand side is an indexed
+# reference (StarpuExprRef) to one of the variables listed in `vars`.
+function get_all_buffer_stores(cpu_instr, vars)
+    buffer_names = map(v -> v.name, vars)
+    stores = StarpuExprAffect[]
+
+    function collect_store(node :: StarpuExpr)
+        is_buffer_store = isa(node, StarpuExprAffect) &&
+                          isa(node.var, StarpuExprRef) &&
+                          isa(node.var.ref, StarpuExprVar) &&
+                          node.var.ref.name in buffer_names
+
+        if is_buffer_store
+            push!(stores, node)
+        end
+
+        return node
+    end
+
+    apply(collect_store, cpu_instr)
+    return stores
+end
+
+# Return a list of `(instruction, ref)` pairs, one for each indexed reference
+# `ref` to a variable of `vars` met during a pre-order walk of `cpu_instr`.
+# `instruction` is the most recently visited instruction node at the time
+# `ref` is met. References are skipped when the node visited immediately
+# before them is a StarpuExprAddress, or when they are the left-hand side of
+# the current assignment.
+function get_all_buffer_refs(cpu_instr, vars)
+    found = []
+
+    # Node kinds considered as instructions.
+    InstrTy = Union{StarpuExprAffect,
+                    StarpuExprCall,
+                    StarpuExprCudaCall,
+                    StarpuExprFor,
+                    StarpuExprIf,
+                    StarpuExprIfElse,
+                    StarpuExprReturn,
+                    StarpuExprBreak,
+                    StarpuExprWhile}
+    # Calls to these functions do not start a new instruction.
+    expression_funcs = [:(+), :(-), :(*), :(/), :(%), :(<), :(<=), :(==), :(!=), :(>=), :(>), :sqrt]
+    buffer_names = map(v -> v.name, vars)
+
+    # State shared between successive visits.
+    current_instr = nothing
+    parent = nothing # node visited just before the current one
 
+    function func_to_run(x :: StarpuExpr)
+        is_expression_call = isa(x, StarpuExprCall) && x.func in expression_funcs
+        if isa(x, InstrTy) && !is_expression_call
+            current_instr = x
+        end
+
+        is_buffer_ref = isa(x, StarpuExprRef) && isa(x.ref, StarpuExprVar) && x.ref.name in buffer_names # var[...]
+        if is_buffer_ref &&
+            !isa(parent, StarpuExprAddress) && # filter &var[..]
+            !(isa(current_instr, StarpuExprAffect) && current_instr.var == x) # filter lhs ref
+            push!(found, (current_instr, x))
+        end
+
+        parent = x
+        return x
+    end
+
+    visit_preorder(func_to_run, cpu_instr)
+    return found
+end
+
+# Rewrite the element accesses made by `cpu_instr` on buffers obtained through
+# STARPU_MATRIX_GET_PTR / STARPU_VECTOR_GET_PTR so that each access goes
+# through a temporary variable and a cudaMemcpy call.
+#
+# Each load L:
+#     L: ... buffer[id]
+# becomes the block:
+#     Type varX
+#     cudaMemcpy(&varX, &buffer[id], sizeof(Type), cudaMemcpyDeviceToHost)
+#     L: ... varX
+#
+# Each store S:
+#     S: buffer[id] = expr
+# becomes the block:
+#     Type varX
+#     varX = expr
+#     cudaMemcpy(&buffer[id], &varX, sizeof(Type), cudaMemcpyHostToDevice)
+#
+# Returns the rewritten expression.
+function transform_cuda_device_loadstore(cpu_instr :: StarpuExprBlock)
+    # CUDA buffer pointers and their declared types, indexed by name.
+    buffer_vars = get_all_buffer_vars(cpu_instr)
+
+    buffer_types = Dict{Symbol, Type}()
+    for buf in buffer_vars
+        buffer_types[buf.name] = buf.typ
+    end
+
+    # Both lists are computed on the untouched input, before any rewriting.
+    stores = get_all_buffer_stores(cpu_instr, buffer_vars)
+    loads = get_all_buffer_refs(cpu_instr, buffer_vars)
+
+    # Build `cudaMemcpy(&dst, &src, sizeof(<C type of elem_type>), direction)`.
+    function memcpy_call(dst, src, elem_type, direction)
+        ctype = starpu_type_traduction(elem_type)
+
+        return StarpuExprCall(:cudaMemcpy,
+                              [StarpuExprAddress(dst),
+                               StarpuExprAddress(src),
+                               StarpuExprVar(Symbol("sizeof($ctype)")),
+                               StarpuExprVar(direction)])
+    end
+
+    for (instr, ref) in loads
+        tmp_name = "var"*rand_string()
+        elem_type = eltype(buffer_types[Symbol(ref.ref.name)])
+        tmp_var = StarpuExprVar(Symbol(tmp_name))
+
+        block = Any[StarpuExprTypedVar(Symbol(tmp_name), elem_type),
+                    memcpy_call(tmp_var, ref, elem_type, :cudaMemcpyDeviceToHost),
+                    substitute(instr, ref, StarpuExprVar(Symbol("$tmp_name")))]
+
+        cpu_instr = substitute(cpu_instr, instr, StarpuExprBlock(block))
+    end
+
+    for store in stores
+        tmp_name = "var"*rand_string()
+        elem_type = eltype(buffer_types[Symbol(store.var.ref.name)])
+        tmp_var = StarpuExprVar(Symbol(tmp_name))
+
+        block = Any[StarpuExprTypedVar(Symbol(tmp_name), elem_type),
+                    StarpuExprAffect(StarpuExprVar(Symbol("$tmp_name")), store.expr),
+                    memcpy_call(store.var, tmp_var, elem_type, :cudaMemcpyHostToDevice)]
+
+        cpu_instr = substitute(cpu_instr, store, StarpuExprBlock(block))
+    end
+
+    return cpu_instr
+end
 
 function transform_to_cuda_kernel(func :: StarpuExprFunction)
 
@@ -152,45 +423,50 @@ function transform_to_cuda_kernel(func :: StarpuExprFunction)
 
     init, indep, finish = extract_init_indep_finish(cpu_func.body)
 
-    if indep == nothing
-        error("No independant for loop has been found") # TODO can fail because extraction is not correct yet
-    end
+    cpu_instr = init
+    kernel = nothing
 
-    prekernel_instr, kernel_args, kernel_instr = analyse_sets(indep)
+    # Generate a CUDA kernel only if there is an independent loop (@parallel macro).
+    if (indep != nothing)
+        prekernel_instr, kernel_args, kernel_instr = analyse_sets(indep)
 
-    kernel_call = StarpuExprCudaCall(:cudaKernel, (@parse nblocks), (@parse THREADS_PER_BLOCK), StarpuExpr[])
-    prekernel_instr = vcat(init, prekernel_instr)
-    kernel_instr = vcat(kernel_instr, indep.body)
+        kernel_call = StarpuExprCudaCall(:cudaKernel, (@parse nblocks), (@parse THREADS_PER_BLOCK), StarpuExpr[])
+        cpu_instr = vcat(cpu_instr, prekernel_instr)
+        kernel_instr = vcat(kernel_instr, indep.body)
 
-    indep_for_def, indep_for_undef = analyse_variable_declarations(StarpuExprBlock(kernel_instr), kernel_args)
-    prekernel_def, prekernel_undef = analyse_variable_declarations(StarpuExprBlock(prekernel_instr), cpu_func.args)
+        indep_for_def, indep_for_undef = analyse_variable_declarations(StarpuExprBlock(kernel_instr), kernel_args)
+        prekernel_def, prekernel_undef = analyse_variable_declarations(StarpuExprBlock(cpu_instr), cpu_func.args)
 
-    for undef_var in indep_for_undef
+        for undef_var in indep_for_undef
 
-        found_var = find_variable(undef_var, prekernel_def)
+            found_var = find_variable(undef_var, prekernel_def)
 
-        if found_var == nothing # TODO : error then ?
-            continue
+            if found_var == nothing # TODO : error then ?
+                continue
+            end
+
+            push!(kernel_args, found_var)
         end
 
-        push!(kernel_args, found_var)
+        call_args = map((x -> StarpuExprVar(x.name)), kernel_args)
+        kernelname=Symbol("KERNEL_",func.func);
+        cuda_call = StarpuExprCudaCall(kernelname, (@parse nblocks), (@parse THREADS_PER_BLOCK), call_args)
+        push!(cpu_instr, cuda_call)
+        push!(cpu_instr, @parse cudaStreamSynchronize(starpu_cuda_get_local_stream()))
+        kernel = StarpuExprFunction(Nothing, kernelname, kernel_args, StarpuExprBlock(kernel_instr))
+        kernel = add_device_to_interval_call(kernel)
+        kernel = flatten_blocks(kernel)
     end
 
-    call_args = map((x -> StarpuExprVar(x.name)), kernel_args)
-    kernelname=Symbol("KERNEL_",func.func);
-    cuda_call = StarpuExprCudaCall(kernelname, (@parse nblocks), (@parse THREADS_PER_BLOCK), call_args)
-    push!(prekernel_instr, cuda_call)
-    push!(prekernel_instr, @parse cudaStreamSynchronize(starpu_cuda_get_local_stream()))
-    prekernel_instr = vcat(prekernel_instr, finish)
+    cpu_instr = vcat(cpu_instr, finish)
+    cpu_instr = StarpuExprBlock(cpu_instr)
+    cpu_instr = transform_cuda_device_loadstore(cpu_instr)
 
     prekernel_name = Symbol("CUDA_", func.func)
-    prekernel = StarpuExprFunction(Nothing, prekernel_name, cpu_func.args, StarpuExprBlock(prekernel_instr))
+    prekernel = StarpuExprFunction(Nothing, prekernel_name, cpu_func.args, cpu_instr)
+    prekernel = translate_cublas(prekernel)
     prekernel = flatten_blocks(prekernel)
 
-    kernel = StarpuExprFunction(Nothing, kernelname, kernel_args, StarpuExprBlock(kernel_instr))
-    kernel = add_device_to_interval_call(kernel)
-    kernel = flatten_blocks(kernel)
-    
     return prekernel, kernel
 end
 

+ 351 - 3
julia/src/compiler/expression_manipulation.jl

@@ -14,6 +14,30 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
 
+"""
+    Lenient comparison operator for structures and arrays.
+"""
+@generated function ≂(x, y)
+    # Inside a generated function, `x` and `y` are the *types* of the arguments;
+    # the returned expression is the body evaluated on the actual values.
+
+    # Different types (or type objects themselves): fall back to regular equality.
+    if x != y || x <: Type
+        return :(x == y)
+    end
+
+    # Arrays are compared element by element. This check must come before the
+    # field-based one: since Julia 1.11 `Array` has fields of its own, and
+    # comparing those would compare memory references instead of contents.
+    if x <: Array
+        return quote
+            if length(x) != length(y)
+                return false
+            end
+            for i in 1:length(x)
+                if !(x[i] ≂ y[i])
+                    return false
+                end
+            end
+            return true
+        end
+    end
+
+    # Structures are compared field by field: x.f1 ≂ y.f1 && x.f2 ≂ y.f2 && ...
+    fields = fieldnames(x)
+    if !isempty(fields)
+        return mapreduce(n -> :(x.$n ≂ y.$n), (a,b)->:($a && $b), fields)
+    end
+
+    # Field-less values of the same type: regular equality.
+    return :(x == y)
+end
 
 """
     Returns a new expression where every occurrence of expr_to_replace into expr
@@ -22,8 +46,7 @@
 function substitute(expr :: StarpuExpr, expr_to_replace :: StarpuExpr, new_expr :: StarpuExpr)
 
     function func_to_apply(x :: StarpuExpr)
-
-        if (x == expr_to_replace)
+        if (x ≂ expr_to_replace)
             return new_expr
         end
 
@@ -33,7 +56,6 @@ function substitute(expr :: StarpuExpr, expr_to_replace :: StarpuExpr, new_expr
     return apply(func_to_apply, expr)
 end
 
-
 """
     Returns an expression where "€" symbols  in expr were replaced
     by the following expression list.
@@ -125,3 +147,329 @@ import Base.all
 function all(cond :: Function, expr :: StarpuExpr)
     return !any(!cond, expr)
 end
+
+"""
+    Pre-order traversal of an assignment: applies func to the assignment
+    itself, then walks the assigned variable followed by the assigned
+    expression. Returns expr.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprAffect)
+    func(expr)
+    for child in (expr.var, expr.expr)
+        visit_preorder(func, child)
+    end
+    return expr
+end
+
+"""
+    Pre-order traversal of a block: applies func to the block itself, then
+    walks each expression of the block in order. Returns expr.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprBlock)
+    func(expr)
+    foreach(child -> visit_preorder(func, child), expr.exprs)
+    return expr
+end
+
+"""
+    Pre-order traversal of a function call: applies func to the call itself,
+    then walks each argument in order. Returns expr.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprCall)
+    func(expr)
+    foreach(arg -> visit_preorder(func, arg), expr.args)
+    return expr
+end
+
+"""
+    Pre-order traversal of a CUDA kernel call: applies func to the call
+    itself, then walks the nblocks and threads_per_block expressions, then
+    each argument in order. Returns expr.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprCudaCall)
+    func(expr)
+    # Recurse instead of calling func directly, so that nodes nested inside
+    # the launch configuration expressions are visited as well.
+    visit_preorder(func, expr.nblocks)
+    visit_preorder(func, expr.threads_per_block)
+    for a in expr.args
+        visit_preorder(func, a)
+    end
+    return expr
+end
+
+"""
+    Pre-order traversal of a field access: applies func to the access itself,
+    then walks the left-hand expression, then applies func to the field and
+    is_an_arrow members. Returns expr.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprField)
+    func(expr)
+    # Recurse into the left-hand side so that its sub-expressions are visited
+    # too (func is still applied to expr.left first).
+    visit_preorder(func, expr.left)
+    # NOTE(review): field and is_an_arrow look like plain (non-expression)
+    # members; func is applied to them as before -- confirm this is intended.
+    func(expr.field)
+    func(expr.is_an_arrow)
+    return expr
+end
+
+"""
+    Pre-order traversal of a for loop: applies func to the loop itself, then
+    walks the set declarations in order, the iteration set and finally the
+    loop body. Returns expr.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprFor)
+    func(expr)
+    foreach(decl -> visit_preorder(func, decl), expr.set_declarations)
+    for child in (expr.set, expr.body)
+        visit_preorder(func, child)
+    end
+    return expr
+end
+
+"""
+    Pre-order traversal of a function definition: applies func to the
+    definition itself, then walks each argument in order, then the function
+    body. Returns expr.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprFunction)
+    func(expr)
+    for a in expr.args
+        visit_preorder(func, a)
+    end
+    # The body belongs to expr (a previous version referenced an undefined
+    # variable here, which raised UndefVarError on every call).
+    visit_preorder(func, expr.body)
+    return expr
+end
+
+"""
+    Pre-order traversal of an if statement: applies func to the statement
+    itself, then walks the condition followed by the then branch.
+    Returns expr.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprIf)
+    func(expr)
+    for child in (expr.cond, expr.then_statement)
+        visit_preorder(func, child)
+    end
+    return expr
+end
+
+
+
+"""
+    Pre-order traversal of an if/else statement: applies func to the
+    statement itself, then walks the condition, the then branch and the else
+    branch, in that order. Returns expr.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprIfElse)
+    func(expr)
+    for child in (expr.cond, expr.then_statement, expr.else_statement)
+        visit_preorder(func, child)
+    end
+    return expr
+end
+
+"""
+    Pre-order traversal of an interval: applies func to the interval itself,
+    then walks its start, step and stop expressions, in that order.
+    Returns expr.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprInterval)
+    func(expr)
+    for bound in (expr.start, expr.step, expr.stop)
+        visit_preorder(func, bound)
+    end
+    return expr
+end
+
+"""
+    Pre-order traversal of an indexed reference: applies func to the
+    reference itself, then walks the referenced expression, then each index
+    in order. Returns expr.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprRef)
+    func(expr)
+    visit_preorder(func, expr.ref)
+    foreach(index -> visit_preorder(func, index), expr.indexes)
+    return expr
+end
+
+"""
+    Pre-order traversal of an address-of expression: applies func to the
+    expression itself, then walks the expression whose address is taken.
+    Returns expr.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprAddress)
+    func(expr)
+    visit_preorder(func, expr.ref)
+    return expr
+end
+
+"""
+    Pre-order traversal of a break statement: applies func to it; there is
+    nothing else to walk. Returns expr.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprBreak)
+    func(expr)
+    return expr
+end
+
+"""
+    Pre-order traversal of a return statement: applies func to the statement
+    itself, then walks the returned value. Returns expr.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprReturn)
+    func(expr)
+    visit_preorder(func, expr.value)
+    return expr
+end
+
+"""
+    Fallback pre-order traversal, used for every StarpuExpr that has no more
+    specific visit_preorder method: applies func to expr without walking any
+    member. Returns expr.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExpr)
+    func(expr)
+    return expr
+end
+
+"""
+    Pre-order traversal of a typed expression: applies func to the typed
+    expression itself, then walks the wrapped expression. Returns expr.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprTypedExpr)
+    func(expr)
+    visit_preorder(func, expr.expr)
+    return expr
+end
+
+"""
+    Pre-order traversal of a while loop: applies func to the loop itself,
+    then walks the condition followed by the loop body. Returns expr.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprWhile)
+    func(expr)
+    for child in (expr.cond, expr.body)
+        visit_preorder(func, child)
+    end
+    return expr
+end
+
+# function substitute_preorder(expr :: StarpuExprAffect, match :: StarpuExpr, replace :: StarpuExpr)
+#     if expr == match
+#         return replace
+#     end
+#     var = substitute_preorder(func, expr.var)
+#     expr = substitute_preorder(func, expr.expr)
+
+#     if var != expr.var || expr != expr.expr
+#         return StarpuExprAffect(var, expr)
+#     end
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprBlock, match :: StarpuExpr, replace :: StarpuExpr)
+#     if expr == match
+#         return replace
+#     end
+
+#     modified = false
+#     new_exprs = Vector{StarpuExpr}()
+#     for e in expr.exprs
+#         push!(new_exprs, substitute_preorder(func, e))
+#     end
+#     if new_exprs != expr.exprs
+#         return StarpuExprBlock(new_exprs)
+#     end
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprCall, match :: StarpuExpr, replace :: StarpuExpr)
+#     if expr == match
+#         return replace
+#     end
+
+#     new_args = Vector{StarpuExpr}()
+#     for a in expr.args
+#         push!(new_args, substitute_preorder(func, a))
+#     end
+#     if new_args != expr.args
+#         return StarpuExprCall(expr.func, new_args)
+#     end
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprCudaCall, match :: StarpuExpr, replace :: StarpuExpr)
+#     if expr == match
+#         return replace
+#     end
+
+#     new_args = Vector{StarpuExpr}()
+#     for a in expr.args
+#         push!(new_args, substitute_preorder(func, a))
+#     end
+#     if new_args != expr.args
+#         return new StarpuExprCudaCall(expr.ker_name, expr.nblocks, expr.threads_per_block, new_args)
+#     end
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprField, match :: StarpuExpr, replace :: StarpuExpr)
+#     if expr == match
+#         return replace
+#     end
+
+#     left = substitute_preorder(expr.left, match, replace)
+#     if left != expr.left
+#         return StarpuExprField(left, expr.field, expr.is_an_arrow)
+#     end
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprFor, match :: StarpuExpr, replace :: StarpuExpr)
+#     if expr == match
+#         return replace
+#     end
+
+#     new_set_declarations = Vector{StarpuExpr}()
+    
+#     for d in expr.set_declarations
+#         substitute_preorder(func, d)
+#     end
+#     substitute_preorder(expr.set, match :: StarpuExpr, replace :: StarpuExpr)
+#     substitute_preorder(func, expr.body)
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprFunction, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     for a in expr.args
+#         substitute_preorder(func, a)
+#     end
+#     substitute_preorder(e.body, match :: StarpuExpr, replace :: StarpuExpr)
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprIf, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     substitute_preorder(func, expr.cond)
+#     substitute_preorder(func, expr.then_statement)
+#     return expr
+# end
+
+
+
+# function substitute_preorder(expr :: StarpuExprIfElse, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     substitute_preorder(func, expr.cond)
+#     substitute_preorder(func, expr.then_statement)
+#     substitute_preorder(func, expr.else_statement)
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprInterval, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     substitute_preorder(func, expr.start)
+#     substitute_preorder(func, expr.step)
+#     substitute_preorder(func, expr.stop)
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprRef, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     substitute_preorder(func, expr.ref)
+#     for i in expr.indexes
+#         substitute_preorder(func, i)
+#     end
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprAddress, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     substitute_preorder(func, expr.ref)
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprBreak, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprReturn, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     substitute_preorder(func, expr.value)
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExpr, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprTypedExpr, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     substitute_preorder(func, expr.expr)
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprWhile, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     substitute_preorder(func, expr.cond)
+#     substitute_preorder(func, expr.body)
+#     return expr
+# end

+ 24 - 16
julia/src/compiler/expressions.jl

@@ -124,6 +124,9 @@ struct StarpuExprWhile <: StarpuExpr
     body :: StarpuExpr
 end
 
+# Address-of expression: wraps the expression whose address is taken.
+# It is printed as "&" followed by the wrapped expression.
+struct StarpuExprAddress <: StarpuExpr
+    ref :: StarpuExpr
+end
 
 function starpu_parse_affect(x :: Expr)
 
@@ -250,7 +253,7 @@ function starpu_parse_call(x :: Expr)
 end
 
 
-starpu_infix_operators = (:(+), :(*), :(-), :(/), :(<), :(>), :(<=), :(>=), :(%))
+starpu_infix_operators = (:(+), :(*), :(-), :(/), :(<), :(>), :(<=), :(>=), :(!=), :(%))
 
 
 function print_prefix(io :: IO, x :: StarpuExprCall ; indent = 0, restrict=false)
@@ -296,7 +299,6 @@ function apply(func :: Function, expr :: StarpuExprCall)
     return func(StarpuExprCall(expr.func, map((x -> apply(func, x)), expr.args)))
 end
 
-
 #======================================================
                 CUDA KERNEL CALL
 ======================================================#
@@ -734,8 +736,6 @@ function print(io :: IO, x :: StarpuExprRef ; indent = 0,restrict=false)
 
 end
 
-
-
 function apply(func :: Function, expr :: StarpuExprRef)
 
     ref = apply(func, expr.ref)
@@ -744,6 +744,16 @@ function apply(func :: Function, expr :: StarpuExprRef)
     return func(StarpuExprRef(ref, indexes))
 end
 
+# Prints an address-of expression: "&" immediately followed by the wrapped
+# expression.
+# NOTE(review): the restrict keyword is accepted but not forwarded to the
+# wrapped expression -- confirm this is intended.
+function print(io :: IO, x :: StarpuExprAddress ; indent = 0, restrict=false)
+    print(io, "&")
+    print(io, x.ref, indent = indent)
+end
+
+# Rebuilds the address-of expression around the transformed operand, then
+# hands the rebuilt node to func and returns its result.
+function apply(func :: Function, expr :: StarpuExprAddress)
+    transformed_operand = apply(func, expr.ref)
+    rebuilt = StarpuExprAddress(transformed_operand)
+    return func(rebuilt)
+end
+
 #======================================================
                 BREAK EXPRESSION
 ======================================================#
@@ -799,7 +809,7 @@ function apply(func :: Function, expr :: StarpuExpr)
     return func(expr)
 end
 
-print(io :: IO, x :: StarpuExprVar ; indent = 0) = print(io, x.name)
+# Prints a variable as its bare name; the indent and restrict keywords are
+# accepted but not used.
+function print(io :: IO, x :: StarpuExprVar ; indent = 0, restrict = false)
+    print(io, x.name)
+end
 
 function print(io :: IO, x :: StarpuExprValue ; indent = 0,restrict=false)
 
@@ -869,26 +879,24 @@ end
 
 function starpu_type_traduction(x)
     if x <: Array
-        return starpu_type_traduction_array(x)
+        return starpu_type_traduction(eltype(x)) * "*"
     end
 
     if x <: Ptr
-        return starpu_type_traduction(eltype(x)) * "*"
+        depth = 1
+        type = eltype(x)
+        while type <: Ptr
+            depth +=1
+            type = eltype(type)
+        end
+
+        return starpu_type_traduction(type) * "*"^depth
     end
 
     return starpu_type_traduction_dict[x]
 
 end
 
-function starpu_type_traduction_array(x :: Type{Array{T,N}})  where {T,N}
-    output = starpu_type_traduction(T)
-    for i in (1 : N)
-        output *= "*"
-    end
-
-    return output
-end
-
 function print(io :: IO, x :: StarpuExprTyped ; indent = 0,restrict=false)
 
     if (isa(x, StarpuExprTypedVar))

+ 18 - 17
julia/src/compiler/file_generation.jl

@@ -18,6 +18,8 @@ const cpu_kernel_file_start = "#include <stdio.h>
 #include <starpu.h>
 #include <math.h>
 
+#include \"blas.h\"
+
 static inline long long jlstarpu_max(long long a, long long b)
 {
 	return (a > b) ? a : b;
@@ -38,15 +40,16 @@ const cuda_kernel_file_start = "#include <stdio.h>
 #include <stdint.h>
 #include <starpu.h>
 #include <math.h>
+#include <starpu_cublas_v2.h>
 
 #define THREADS_PER_BLOCK 64
 
-static inline long long jlstarpu_max(long long a, long long b)
+__attribute__((unused)) static inline long long jlstarpu_max(long long a, long long b)
 {
 	return (a > b) ? a : b;
 }
 
-static inline long long jlstarpu_interval_size(long long start, long long step, long long stop)
+__attribute__((unused)) static inline long long jlstarpu_interval_size(long long start, long long step, long long stop)
 {
     if (stop >= start){
             return jlstarpu_max(0, (stop - start + 1) / step);
@@ -56,12 +59,12 @@ static inline long long jlstarpu_interval_size(long long start, long long step,
 }
 
 
-__device__ static inline long long jlstarpu_max__device(long long a, long long b)
+__attribute__((unused)) __device__ static inline long long jlstarpu_max__device(long long a, long long b)
 {
 	return (a > b) ? a : b;
 }
 
-__device__ static inline long long jlstarpu_interval_size__device(long long start, long long step, long long stop)
+__attribute__((unused)) __device__ static inline long long jlstarpu_interval_size__device(long long start, long long step, long long stop)
 {
 	if (stop >= start){
 		return jlstarpu_max__device(0, (stop - start + 1) / step);
@@ -70,7 +73,6 @@ __device__ static inline long long jlstarpu_interval_size__device(long long star
 	}
 }
 
-
 "
 
 """
@@ -105,14 +107,10 @@ macro codelet(x)
     cpu_name = name
     cuda_name = "CUDA_"*name
     dump(name)
-    parse_scalar_parameters(parsed, cpu_name, cuda_name)
+    parse_scalar_parameters(parsed, name)
     c_struct_param_decl = generate_c_struct_param_declaration(name)
     cpu_expr = transform_to_cpu_kernel(parsed)
 
-    if (starpu_target & STARPU_CUDA != 0)
-        prekernel, kernel = transform_to_cuda_kernel(parsed)
-    end
-
     generated_cpu_kernel_file_name=string("genc_",string(x.args[1].args[1].args[1]),".c")
     generated_cuda_kernel_file_name=string("gencuda_",string(x.args[1].args[1].args[1]),".cu")
 
@@ -126,11 +124,16 @@ macro codelet(x)
         CPU_CODELETS[name]=cpu_name
     end
 
-    if starpu_target & STARPU_CUDA!=0
+    if (starpu_target & STARPU_CUDA!=0) && STARPU_USE_CUDA == 1
         kernel_file = open(generated_cuda_kernel_file_name, "w")
         debug_print("generating ", generated_cuda_kernel_file_name)
         print(kernel_file, cuda_kernel_file_start)
-        print(kernel_file, "__global__ ", kernel)
+        prekernel, kernel = transform_to_cuda_kernel(parsed)
+
+        if kernel != nothing
+            print(kernel_file, "__global__ ", kernel)
+        end
+
         print(kernel_file, c_struct_param_decl)
         print(kernel_file, "\nextern \"C\" ", prekernel)
         close(kernel_file)
@@ -138,7 +141,7 @@ macro codelet(x)
     end
 end
 
-function parse_scalar_parameters(expr :: StarpuExprFunction, cpu_name::String, cuda_name::String)
+function parse_scalar_parameters(expr :: StarpuExprFunction, codelet_name)
     scalar_parameters = []
     for i in (1 : length(expr.args))
         type = expr.args[i].typ
@@ -147,8 +150,7 @@ function parse_scalar_parameters(expr :: StarpuExprFunction, cpu_name::String, c
         end
     end
 
-    CODELETS_SCALARS[cpu_name] = scalar_parameters
-    CODELETS_SCALARS[cuda_name] = scalar_parameters
+    CODELETS_SCALARS[codelet_name] = scalar_parameters
 
     # declare structure carrying scalar parameters
     struct_params_name = Symbol("params_", rand_string())
@@ -164,6 +166,5 @@ function parse_scalar_parameters(expr :: StarpuExprFunction, cpu_name::String, c
     eval(Meta.parse(add_to_dict_str))
 
     # save structure name
-    CODELETS_PARAMS_STRUCT[cpu_name] = struct_params_name
-    CODELETS_PARAMS_STRUCT[cuda_name] = struct_params_name
+    CODELETS_PARAMS_STRUCT[codelet_name] = struct_params_name
 end

+ 4 - 0
julia/src/data.jl

@@ -160,6 +160,10 @@ function starpu_data_release_on_node(handle :: StarpuDataHandle, node :: Int)
     starpu_data_release_on_node(handle.object, node)
 end
 
+"""
+    Forwards to the starpu_data_wont_use method taking the underlying object
+    wrapped by the given StarpuDataHandle, and returns its result.
+"""
+function starpu_data_wont_use(handle :: StarpuDataHandle)
+    raw_handle = handle.object
+    return starpu_data_wont_use(raw_handle)
+end
+
 function repl(x::Symbol)
     return x
 end

+ 13 - 11
julia/src/dynamic_compiler/Makefile.am

@@ -14,33 +14,35 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
 
-#LD=$(CC_OR_NVCC)
-LD=$(CC)
-AM_CPPFLAGS = -I$(abs_top_srcdir)/include/ -I$(abs_top_builddir)/src -I$(abs_top_builddir)/include
+LD=$(CC_OR_NVCC)
+AM_CPPFLAGS = -I$(abs_top_srcdir)/include/ -I$(abs_top_builddir)/src -I$(abs_top_builddir)/include \
+	 -I$(abs_top_srcdir)/julia/src/
+
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
 AM_CFLAGS += -fPIC -O3 -g -DSTRIDE=${STRIDE} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@
-#CUDA_CFLAGS = ${CFLAGS}
-
+CUDA_CFLAGS = $(STARPU_CUDA_CPPFLAGS) -Wno-deprecated-gpu-targets
+LDFLAGS = -L @STARPU_BUILD_DIR@/julia/src/.libs/ -lstarpujulia-1.3
 EXTERNLIB=extern_tasks.so
 GENERATEDLIB=generated_tasks.so
 
 C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
-#if STARPU_USE_CUDA
-#CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
-#else
+
+if STARPU_USE_CUDA
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+else
 CUDA_OBJECTS=
-#endif
+endif
 
 %.o: %.c
 	$(CC) -c $(AM_CPPFLAGS) $(AM_CFLAGS) $^ -o $@
 
 %.o: %.cu
-	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+	$(NVCC) -dc $(AM_CPPFLAGS) $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
 
 ${EXTERNLIB}: $(SOURCES_CPU)
 	$(CC) $(AM_CPPFLAGS) $(AM_CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
 
 ${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
-	$(LD) -shared $(LDFLAGS) $^ -o $@
+	$(LD) -shared $^ -o $@ $(LDFLAGS)
 

+ 7 - 6
julia/src/globals.jl

@@ -23,16 +23,10 @@ global starpu_target=STARPU_CPU
 global generated_cuda_kernel_file_name = "PRINT TO STDOUT"
 global generated_cpu_kernel_file_name = "PRINT TO STDOUT"
 
-export CPU_CODELETS
 global CPU_CODELETS=Dict{String,String}()
-
-export CUDA_CODELETS
 global CUDA_CODELETS=Dict{String,String}()
 
-export CODELETS_SCALARS
 global CODELETS_SCALARS=Dict{String,Any}()
-
-export CODELETS_PARAMS_STRUCT
 global CODELETS_PARAMS_STRUCT=Dict{String,Any}()
 
 global starpu_type_traduction_dict = Dict(
@@ -47,3 +41,10 @@ global starpu_type_traduction_dict = Dict(
 export starpu_type_traduction_dict
 
 global mutex = Threads.SpinLock()
+
+# Detect CUDA support: when STARPU_USE_CUDA has not been defined yet, define
+# it as 0 so that later `STARPU_USE_CUDA == 1` tests are always valid.
+# (An explicit definedness check is used rather than a bare try/catch, which
+# would also hide unrelated errors.)
+if !@isdefined(STARPU_USE_CUDA)
+    const STARPU_USE_CUDA = 0
+end

+ 4 - 1
julia/src/init.jl

@@ -26,8 +26,11 @@ function starpu_init()
         debug_print("Loading external codelet library")
         ff = Libdl.dlsym(starpu_tasks_library_handle,:starpu_find_function)
         dump(ff)
-        for k in keys(CUDA_CODELETS)
+        for k in keys(CPU_CODELETS)
             CPU_CODELETS[k]=unsafe_string(ccall(ff,Cstring, (Cstring,Cstring),Cstring_from_String(string(k)),Cstring_from_String("cpu")))
+            if STARPU_USE_CUDA == 1
+                CUDA_CODELETS[k]=unsafe_string(ccall(ff,Cstring, (Cstring,Cstring),Cstring_from_String(string(k)),Cstring_from_String("gpu")))
+            end
             print(k,">>>>",CPU_CODELETS[k],"\n")
         end
     else

+ 9 - 0
julia/src/openblas_ldflags.jl

@@ -0,0 +1,9 @@
+import LinearAlgebra.BLAS
+import Libdl
+
+
+# Full path of the BLAS shared library currently loaded by Julia (first loaded
+# library whose path contains Base.libblas_name).
+libpath = normpath(first(filter(x->occursin(Base.libblas_name,x), Libdl.dllist())))
+# Directory holding that library.
+libdir = dirname(libpath)
+# Library name without its leading "lib" prefix, as expected by -l.
+libname = Base.libblas_name[4:end]
+# The runtime search path (-rpath) must be a directory, not the library file.
+println("-Wl,-rpath,$libdir -L$libdir -l$libname")
+

+ 109 - 22
julia/src/task.jl

@@ -27,8 +27,8 @@ end
 global codelet_list = Vector{jl_starpu_codelet}()
 
 function starpu_codelet(;
-                        cpu_func :: Union{String, STARPU_BLAS} = "",
-                        cuda_func :: Union{String, STARPU_BLAS} = "",
+                        cpu_func :: Union{String, STARPU_BLAS, Cvoid} = "",
+                        cuda_func :: Union{String, STARPU_BLAS, Cvoid} = "",
                         opencl_func :: String = "",
                         modes = [],
                         perfmodel :: starpu_perfmodel,
@@ -42,7 +42,7 @@ function starpu_codelet(;
 
 
     if (where_to_execute == nothing)
-        real_where = ((cpu_func != "") * STARPU_CPU) | ((cuda_func != "") * STARPU_CUDA)
+        real_where = ((cpu_func != nothing) * STARPU_CPU) | ((cuda_func != nothing) * STARPU_CUDA)
     else
         real_where = where_to_execute
     end
@@ -63,7 +63,7 @@ function starpu_codelet(;
         output.cpu_func = cpu_blas_codelets[cpu_func]
         output.c_codelet.cpu_func = load_wrapper_function_pointer(output.cpu_func)
     else
-        output.c_codelet.cpu_func = load_starpu_function_pointer(cpu_func)
+        output.c_codelet.cpu_func = load_starpu_function_pointer(get(CPU_CODELETS, cpu_func, ""))
     end
 
     if typeof(cuda_func) == STARPU_BLAS
@@ -71,10 +71,10 @@ function starpu_codelet(;
         output.c_codelet.cuda_func = load_wrapper_function_pointer(output.cuda_func)
         output.c_codelet.cuda_flags[1] = STARPU_CUDA_ASYNC
     else
-        output.c_codelet.cuda_func = load_starpu_function_pointer(cuda_func)
+        output.c_codelet.cuda_func = load_starpu_function_pointer(get(CUDA_CODELETS, cuda_func, ""))
     end
 
-    output.c_codelet.opencl_func = load_starpu_function_pointer(opencl_func)
+    output.c_codelet.opencl_func = load_starpu_function_pointer("")
 
     # Codelets must not be garbage collected before starpu shutdown is called.
     lock(mutex)
@@ -104,9 +104,18 @@ task_list = Vector{jl_starpu_task}()
 
             Creates a new task which will run the specified codelet on handle buffers and cl_args data
         """
-function starpu_task(; cl :: Union{Cvoid, jl_starpu_codelet} = nothing, handles :: Vector{StarpuDataHandle} = StarpuDataHandle[], cl_arg = (),
-                     callback :: Union{Cvoid, Function} = nothing, callback_arg = nothing, tag :: Union{Cvoid, starpu_tag_t} = nothing,
-                     sequential_consistency = true, detach = 1)
+function starpu_task(;
+                     cl :: Union{Cvoid, jl_starpu_codelet} = nothing,
+                     handles :: Vector{StarpuDataHandle} = StarpuDataHandle[],
+                     cl_arg = (),
+                     callback :: Union{Cvoid, Function} = nothing,
+                     callback_arg = nothing,
+                     tag :: Union{Cvoid, starpu_tag_t} = nothing,
+                     tag_only :: Union{Cvoid, starpu_tag_t} = nothing,
+                     sequential_consistency = true,
+                     detach = 1,
+                     color :: Union{Cvoid, UInt32} = nothing,
+                     where :: Union{Cvoid, Int32} = nothing)
     if (cl == nothing)
         error("\"cl\" field can't be empty when creating a StarpuTask")
     end
@@ -114,15 +123,11 @@ function starpu_task(; cl :: Union{Cvoid, jl_starpu_codelet} = nothing, handles
     output = jl_starpu_task(cl, handles, map((x -> x.object), handles), false, nothing, Vector{Cint}(undef, 1), callback, callback_arg, starpu_task(zero))
 
     # handle scalar_parameters
-    codelet_name = cl.cpu_func
-    if isempty(codelet_name)
-        codelet_name = cl.cuda_func
-    end
-    if isempty(codelet_name)
-        codelet_name = cl.opencl_func
-    end
-    if isempty(codelet_name)
-        error("No function provided with codelet.")
+    # Name under which the codelet's scalar parameters were registered: the
+    # CPU function name when it is a non-empty string, otherwise the CUDA one.
+    # It stays empty when neither applies, in which case the lookup below
+    # finds nothing.
+    codelet_name = ""
+    if isa(cl.cpu_func, String) && cl.cpu_func != ""
+        codelet_name = cl.cpu_func
+    elseif isa(cl.cuda_func, String) && cl.cuda_func != ""
+        codelet_name = cl.cuda_func
+    end
     scalar_parameters = get(CODELETS_SCALARS, codelet_name, nothing)
     if scalar_parameters != nothing
@@ -163,6 +168,18 @@ function starpu_task(; cl :: Union{Cvoid, jl_starpu_codelet} = nothing, handles
         output.c_task.use_tag = 1
     end
 
+    if tag_only != nothing
+        output.c_task.tag_id = tag_only
+    end
+
+    if color != nothing
+        output.c_task.color = color
+    end
+
+    if where != nothing
+        output.c_task.where = where
+    end
+
     # Tasks must not be garbage collected before starpu_task_wait_for_all is called.
     # This is necessary in particular for tasks created inside callback functions.
     lock(mutex)
@@ -173,8 +190,8 @@ function starpu_task(; cl :: Union{Cvoid, jl_starpu_codelet} = nothing, handles
 end
 
 
-function create_param_struct_from_clarg(name, cl_arg)
-    struct_params_name = CODELETS_PARAMS_STRUCT[name]
+function create_param_struct_from_clarg(codelet_name, cl_arg)
+    struct_params_name = CODELETS_PARAMS_STRUCT[codelet_name]
 
     if struct_params_name == false
         error("structure name not found in CODELET_PARAMS_STRUCT")
@@ -236,6 +253,76 @@ function starpu_modes(x :: Symbol)
     end
 end
 
+# Caches of the codelets and performance models created on demand by
+# get_default_codelet and get_default_perfmodel, keyed by name.
+default_codelet = Dict{String, jl_starpu_codelet}()
+default_perfmodel = Dict{String, starpu_perfmodel}()
+
+"""
+    Returns the performance model cached under name in default_perfmodel.
+    When there is none yet, a STARPU_HISTORY_BASED model using name as its
+    symbol is created, cached and returned.
+"""
+function get_default_perfmodel(name)
+    return get!(default_perfmodel, name) do
+        starpu_perfmodel(
+            perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+            symbol = name
+        )
+    end
+end
+
+"""
+    Returns the codelet cached under codelet_name in default_codelet.
+    When there is none yet, a codelet is created with the given modes and
+    perfmodel, cached and returned; its cpu_func / cuda_func are set to
+    codelet_name only if that name is registered in CPU_CODELETS /
+    CUDA_CODELETS respectively, and to "" otherwise.
+    Note that perfmodel and modes are ignored when a cached codelet exists.
+"""
+function get_default_codelet(codelet_name, perfmodel, modes) :: jl_starpu_codelet
+    return get!(default_codelet, codelet_name) do
+        cpu_name  = haskey(CPU_CODELETS, codelet_name) ? codelet_name : ""
+        cuda_name = haskey(CUDA_CODELETS, codelet_name) ? codelet_name : ""
+        starpu_codelet(
+            cpu_func  = cpu_name,
+            cuda_func = cuda_name,
+            modes = modes,
+            perfmodel = perfmodel,
+        )
+    end
+end
+
+"""
+    Creates a task and submits it, returning the result of
+    starpu_task_submit.
+
+    Either cl or codelet_name must be given. Without cl, modes is mandatory
+    and the codelet is obtained from get_default_codelet(codelet_name, ...).
+    Without perfmodel, the one returned by get_default_perfmodel is used
+    (looked up under codelet_name, or "default" when no name is given).
+    The remaining keywords are forwarded unchanged to starpu_task.
+"""
+function starpu_task_insert(;
+                            codelet_name :: Union{Cvoid, String} = nothing,
+                            cl :: Union{Cvoid, jl_starpu_codelet} = nothing,
+                            perfmodel :: Union{starpu_perfmodel, Cvoid} = nothing,
+                            handles :: Vector{StarpuDataHandle} = StarpuDataHandle[],
+                            cl_arg = (),
+                            callback :: Union{Cvoid, Function} = nothing,
+                            callback_arg = nothing,
+                            tag :: Union{Cvoid, starpu_tag_t} = nothing,
+                            tag_only :: Union{Cvoid, starpu_tag_t} = nothing,
+                            sequential_consistency = true,
+                            detach = 1,
+                            where :: Union{Cvoid, Int32} = nothing,
+                            color :: Union{Cvoid, UInt32} = nothing,
+                            modes = nothing)
+    # Argument validation only matters when no codelet was supplied.
+    if cl == nothing
+        if codelet_name == nothing
+            error("At least one of the two parameters codelet_name or cl must be provided when calling starpu_task_insert.")
+        end
+        if modes == nothing
+            error("Modes must be defined when calling starpu_task_insert without a codelet.")
+        end
+    end
+
+    # Fall back to the cached default performance model.
+    if perfmodel == nothing
+        perfmodel_key = codelet_name == nothing ? "default" : codelet_name
+        perfmodel = get_default_perfmodel(perfmodel_key)
+    end
+
+    # Fall back to the cached default codelet.
+    if cl == nothing
+        cl = get_default_codelet(codelet_name, perfmodel, modes)
+    end
+
+    new_task = starpu_task(cl = cl, handles = handles, cl_arg = cl_arg,
+                           callback = callback, callback_arg = callback_arg,
+                           tag = tag, tag_only = tag_only,
+                           sequential_consistency = sequential_consistency,
+                           detach = detach, color = color, where = where)
+
+    return starpu_task_submit(new_task)
+end
+
 """
     Creates and submits an asynchronous task running cl Codelet function.
     Ex : @starpu_async_cl cl(handle1, handle2)
@@ -254,8 +341,8 @@ macro starpu_async_cl(expr, modes, cl_arg=(), color ::UInt32=0x00000000)
     )
     println(CPU_CODELETS[string(expr.args[1])])
     cl = starpu_codelet(
-        cpu_func = CPU_CODELETS[string(expr.args[1])],
-        # cuda_func = CUDA_CODELETS[string(expr.args[1])],
+        cpu_func  = string(expr.args[1]),
+        cuda_func = string(expr.args[1]),
         #opencl_func="ocl_matrix_mult",
         ### TODO: CORRECT !
         modes = map((x -> starpu_modes(x)),modes.args),

+ 13 - 5
julia/src/translate_headers.jl

@@ -19,8 +19,8 @@ using Clang.LibClang.LLVM_jll
 function starpu_translate_headers()
     debug_print("Translating StarPU headers...")
 
-    if !isdir((@__DIR__)*"/../gen")
-        mkdir((@__DIR__)*"/../gen")
+    if !isdir(joinpath(fstarpu_build_dir(), "julia/gen"))
+        mkdir(joinpath(fstarpu_build_dir(), "julia/gen"))
     end
 
     STARPU_BUILD_INCLUDE=joinpath(fstarpu_build_dir(), "include")
@@ -63,6 +63,7 @@ function starpu_translate_headers()
                                "starpu_data_set_default_sequential_consistency_flag",
                                "starpu_data_get_sequential_consistency_flag",
                                "starpu_data_set_sequential_consistency_flag",
+                               "starpu_data_wont_use",
                                "starpu_matrix_data_register",
                                "starpu_block_data_register",
                                "starpu_vector_data_register",
@@ -76,6 +77,7 @@ function starpu_translate_headers()
                                "starpu_task_submit",
                                "starpu_task_wait",
                                "starpu_task_wait_for_n_submitted",
+                               "starpu_tag_remove",
                                "starpu_tag_wait",
                                "starpu_tag_declare_deps_array",
                                "starpu_tag_notify_from_apps",
@@ -83,16 +85,22 @@ function starpu_translate_headers()
                                "starpu_task_declare_deps_array",
                                "starpu_iteration_push",
                                "starpu_iteration_pop",
+                               "starpu_worker_get_count",
+                               "starpu_cpu_worker_get_count",
+                               "starpu_cuda_worker_get_count",
+                               "starpu_opencl_worker_get_count",
+                               "starpu_mic_worker_get_count",
                                "STARPU_CPU",
                                "STARPU_CUDA",
                                "STARPU_CUDA_ASYNC",
                                "STARPU_OPENCL",
                                "STARPU_MAIN_RAM",
-                               "STARPU_NMAXBUFS"])
+                               "STARPU_NMAXBUFS",
+                               "STARPU_USE_CUDA"])
 
     wc = init(; headers = STARPU_HEADERS,
-              output_file = joinpath(@__DIR__, "../gen/libstarpu_api.jl"),
-              common_file = joinpath(@__DIR__, "../gen/libstarpu_common.jl"),
+              output_file = joinpath(fstarpu_build_dir(), "julia/gen/libstarpu_api.jl"),
+              common_file = joinpath(fstarpu_build_dir(), "julia/gen/libstarpu_common.jl"),
               clang_includes = vcat(LIBCLANG_INCLUDE, CLANG_INCLUDE),
               clang_args = clang_args,
               header_library = x->"starpu_wrapper_library_name",

+ 8 - 1
mpi/Makefile.am

@@ -16,7 +16,14 @@
 
 include $(top_srcdir)/starpu-subdirtests.mk
 
-SUBDIRS=src tests examples tools
+SUBDIRS=src tools
+
+if STARPU_BUILD_EXAMPLES
+SUBDIRS += examples
+endif
+if STARPU_BUILD_TESTS
+SUBDIRS += tests
+endif
 
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = libstarpumpi.pc starpumpi-1.0.pc starpumpi-1.1.pc starpumpi-1.2.pc starpumpi-1.3.pc

+ 5 - 23
mpi/examples/Makefile.am

@@ -21,6 +21,8 @@ CCLD=$(MPICC)
 FC=$(MPIFORT)
 FCLD=$(MPIFORT)
 
+noinst_PROGRAMS		=
+
 if STARPU_HAVE_WINDOWS
 LOADER_BIN		=
 else
@@ -28,6 +30,7 @@ loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 if !STARPU_SIMGRID
 LOADER			=	loader
 LOADER_BIN		=	$(abs_top_builddir)/mpi/examples/$(LOADER)
+noinst_PROGRAMS		+=	loader
 endif
 loader_SOURCES		=	../../tests/loader.c
 endif
@@ -108,7 +111,6 @@ AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(ST
 ###################
 # Stencil example #
 ###################
-if BUILD_EXAMPLES
 examplebin_PROGRAMS +=		\
 	stencil/stencil5
 starpu_mpi_EXAMPLES	+=	\
@@ -121,14 +123,11 @@ starpu_mpi_EXAMPLES	+=	\
 	stencil/stencil5_lb
 endif
 
-endif
-
 ##################
 # MPI LU example #
 ##################
 
-if BUILD_EXAMPLES
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 
 examplebin_PROGRAMS += 			\
 	mpi_lu/plu_example_float	\
@@ -210,14 +209,12 @@ mpi_lu_plu_outofcore_example_double_SOURCES =	\
 	mpi_lu/pdlu_implicit.c			\
 	../../examples/common/blas.c
 endif
-endif
 
 ########################
 # MPI Cholesky example #
 ########################
 
-if BUILD_EXAMPLES
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 examplebin_PROGRAMS +=		\
 	matrix_decomposition/mpi_cholesky			\
 	matrix_decomposition/mpi_cholesky_distributed
@@ -250,13 +247,11 @@ starpu_mpi_EXAMPLES +=				\
 	matrix_decomposition/mpi_cholesky			\
 	matrix_decomposition/mpi_cholesky_distributed
 endif
-endif
 
 ########################
 # MPI Matrix mult example #
 ########################
 
-if BUILD_EXAMPLES
 examplebin_PROGRAMS +=		\
 	matrix_mult/mm
 
@@ -270,14 +265,12 @@ if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES +=				\
 	matrix_mult/mm
 endif
-endif
 
 ##########################################
 # Native Fortran MPI Matrix mult example #
 ##########################################
 
 if STARPU_HAVE_MPIFORT
-if BUILD_EXAMPLES
 if !STARPU_SANITIZE
 examplebin_PROGRAMS +=		\
 	native_fortran/nf_mm	\
@@ -318,13 +311,11 @@ starpu_mpi_EXAMPLES +=				\
 endif
 endif
 endif
-endif
 
 ###################
 # complex example #
 ###################
 
-if BUILD_EXAMPLES
 examplebin_PROGRAMS +=			\
 	complex/mpi_complex
 
@@ -334,13 +325,11 @@ complex_mpi_complex_SOURCES =		\
 
 starpu_mpi_EXAMPLES	+=			\
 	complex/mpi_complex
-endif
 
 #########################
 # user_datatype example #
 #########################
 
-if BUILD_EXAMPLES
 examplebin_PROGRAMS +=				\
 	user_datatype/user_datatype		\
 	user_datatype/user_datatype2
@@ -358,13 +347,11 @@ starpu_mpi_EXAMPLES	+=			\
 	user_datatype/user_datatype2		\
 	user_datatype/user_datatype
 endif
-endif
 
 ###################
 # comm example #
 ###################
 
-if BUILD_EXAMPLES
 examplebin_PROGRAMS +=			\
 	comm/comm			\
 	comm/mix_comm
@@ -374,13 +361,11 @@ starpu_mpi_EXAMPLES	+=			\
 	comm/comm				\
 	comm/mix_comm
 endif
-endif
 
 ##################
 # filter example #
 ##################
 
-if BUILD_EXAMPLES
 examplebin_PROGRAMS +=			\
 	filters/filter
 
@@ -388,7 +373,6 @@ if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES	+=			\
 	filters/filter
 endif
-endif
 
 # Native Fortran example
 
@@ -401,7 +385,6 @@ native_fortran/fstarpu_mpi_mod.f90:
 	$(V_ln) $(LN_S) $(abs_top_srcdir)/mpi/include/$(notdir $@) $@
 
 if STARPU_HAVE_MPIFORT
-if BUILD_EXAMPLES
 if !STARPU_SANITIZE
 # - express the creation of .mod along .o
 fstarpu_mod.mod: native_fortran/fstarpu_mod.o
@@ -416,4 +399,3 @@ native_fortran/nf_mm_task_build.o: nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.
 native_fortran/nf_basic_ring.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
 endif
 endif
-endif

+ 2 - 1
mpi/examples/matrix_mult/mm.c

@@ -289,7 +289,8 @@ static struct starpu_codelet gemm_cl =
 {
 	.cpu_funcs = {cpu_mult}, /* cpu implementation(s) of the routine */
 	.nbuffers = 3, /* number of data handles referenced by this routine */
-	.modes = {STARPU_R, STARPU_R, STARPU_RW} /* access modes for each data handle */
+	.modes = {STARPU_R, STARPU_R, STARPU_RW}, /* access modes for each data handle */
+	.name = "gemm" /* to display task name in traces */
 };
 
 int main(int argc, char *argv[])

+ 1 - 0
mpi/src/starpu_mpi.c

@@ -431,6 +431,7 @@ void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t data, int new_r
 
 	/* Flush cache in all other nodes */
 	/* TODO: Ideally we'd transmit the knowledge of who owns it */
+	/* TODO: or at least remember that the previous owner has the data, that's an easy case to support */
 	starpu_mpi_cache_flush(comm, data);
 	return;
 }

+ 34 - 1
mpi/src/starpu_mpi_init.c

@@ -138,7 +138,38 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi, MPI_Comm
 	_starpu_mpi_do_initialize(argc_argv);
 #endif
 
-	return _mpi_backend._starpu_mpi_backend_progress_init(argc_argv);
+	int ret = _mpi_backend._starpu_mpi_backend_progress_init(argc_argv);
+
+	if (starpu_get_env_number_default("STARPU_DISPLAY_BINDINGS", 0))
+	{
+		int rank, size, i;
+		char hostname[65];
+
+		starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+		starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
+		gethostname(hostname, sizeof(hostname));
+
+		/* We make a barrier between each node calling hwloc-ps, to avoid mixing
+		 * outputs in stdout. */
+		for (i = 0; i < size; i++)
+		{
+			starpu_mpi_barrier(MPI_COMM_WORLD);
+			if (rank == i)
+			{
+				fprintf(stdout, "== Binding for rank %d on node %s ==\n", rank, hostname);
+				starpu_display_bindings();
+				fflush(stdout);
+			}
+		}
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+		if (rank == 0)
+		{
+			fprintf(stdout, "== End of bindings ==\n");
+			fflush(stdout);
+		}
+	}
+
+	return ret;
 }
 
 #ifdef STARPU_SIMGRID
@@ -219,6 +250,8 @@ int starpu_mpi_init_conf(int *argc, char ***argv, int initialize_mpi, MPI_Comm c
 			conf->reserve_ncpus++;
 	}
 
+	conf->will_use_mpi = 1;
+
 	int ret = starpu_init(conf);
 	if (ret < 0)
 		return ret;

+ 11 - 10
mpi/tests/Makefile.am

@@ -19,6 +19,8 @@ include $(top_srcdir)/starpu.mk
 CC=$(MPICC)
 CCLD=$(MPICC)
 
+noinst_PROGRAMS		=
+
 if STARPU_HAVE_WINDOWS
 LOADER_BIN		=
 else
@@ -26,6 +28,7 @@ loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 if !STARPU_SIMGRID
 LOADER			=	loader
 LOADER_BIN		=	$(abs_top_builddir)/mpi/tests/$(LOADER)
+noinst_PROGRAMS		+=	loader
 endif
 loader_SOURCES		=	../../tests/loader.c
 endif
@@ -93,8 +96,6 @@ AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(ST
 # Unit testcases       #
 ########################
 
-if BUILD_TESTS
-
 starpu_mpi_TESTS =
 
 starpu_mpi_TESTS +=				\
@@ -142,14 +143,15 @@ starpu_mpi_TESTS +=				\
 	user_defined_datatype			\
 	early_stuff				\
 	sendrecv_bench				\
-	burst
+	burst						\
+	display_bindings
 
 if !STARPU_USE_MPI_MPI
 starpu_mpi_TESTS +=				\
 	sendrecv_parallel_tasks_bench
 endif
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 starpu_mpi_TESTS +=				\
 	sendrecv_gemm_bench			\
 	burst_gemm
@@ -182,7 +184,7 @@ starpu_mpi_TESTS +=				\
 	starpu_redefine
 endif
 
-noinst_PROGRAMS =				\
+noinst_PROGRAMS +=				\
 	datatypes				\
 	pingpong				\
 	mpi_test				\
@@ -245,9 +247,10 @@ noinst_PROGRAMS =				\
 	sendrecv_bench				\
 	sendrecv_parallel_tasks_bench		\
 	burst					\
-	nothing
+	nothing							\
+	display_bindings
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 noinst_PROGRAMS +=				\
 	sendrecv_gemm_bench			\
 	burst_gemm
@@ -299,7 +302,7 @@ sendrecv_parallel_tasks_bench_SOURCES += abstract_sendrecv_bench.c
 burst_SOURCES = burst.c
 burst_SOURCES += burst_helper.c
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 sendrecv_gemm_bench_SOURCES = sendrecv_gemm_bench.c
 sendrecv_gemm_bench_SOURCES += bench_helper.c
 sendrecv_gemm_bench_SOURCES += gemm_helper.c
@@ -315,5 +318,3 @@ burst_gemm_SOURCES += ../../examples/common/blas.c
 
 burst_gemm_LDADD = $(STARPU_BLAS_LDFLAGS)
 endif
-
-endif

+ 1 - 1
mpi/tests/abstract_sendrecv_bench.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 2
mpi/tests/abstract_sendrecv_bench.h

@@ -1,7 +1,6 @@
-
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/tests/bench_helper.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/tests/bench_helper.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/tests/burst.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 59 - 3
mpi/tests/burst_gemm.c

@@ -29,6 +29,9 @@
 #include "gemm_helper.h"
 #include "burst_helper.h"
 
+static int gemm_warmup = 1;
+static int gemm_warmup_wait = 0;
+
 void parse_args(int argc, char **argv)
 {
 	int i;
@@ -62,10 +65,19 @@ void parse_args(int argc, char **argv)
 		{
 			burst_nb_requests = atoi(argv[++i]);
 		}
+		else if (strcmp(argv[i], "-no-gemm-warmup") == 0)
+		{
+			gemm_warmup = 0;
+		}
+		else if (strcmp(argv[i], "-gemm-warmup-wait") == 0)
+		{
+			/* All warmup GEMMs will start at the same moment */
+			gemm_warmup_wait = 1;
+		}
 		else if (strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0)
 		{
-			fprintf(stderr,"Usage: %s [-nblocks n] [-size size] [-check] [-nreqs nreqs]\n", argv[0]);
-			fprintf(stderr,"Currently selected: matrix size: %u - %u blocks - %d requests in each burst\n", matrix_dim, nslices, burst_nb_requests);
+			fprintf(stderr,"Usage: %s [-nblocks n] [-size size] [-check] [-nreqs nreqs] [-no-gemm-warmup] [-gemm-warmup-wait]\n", argv[0]);
+			fprintf(stderr,"Currently selected: matrix size: %u - %u blocks - %d requests in each burst - gemm warmup: %d -gemm-warmup-wait: %d\n", matrix_dim, nslices, burst_nb_requests, gemm_warmup, gemm_warmup_wait);
 			exit(EXIT_SUCCESS);
 		}
 		else
@@ -106,13 +118,30 @@ int main(int argc, char **argv)
 	if (gemm_init_data() == -ENODEV)
 		goto enodev;
 
+	/* GEMM warmup, to really load the BLAS library */
+	if (gemm_warmup)
+	{
+		if (gemm_warmup_wait)
+		{
+			starpu_task_wait_for_all();
+			starpu_pause();
+		}
+
+		if(gemm_submit_tasks() == -ENODEV)
+			goto enodev;
+
+		if (gemm_warmup_wait)
+		{
+			starpu_resume();
+		}
+	}
+
 	burst_init_data(mpi_rank);
 
 	/* Wait for everything and everybody: */
 	starpu_task_wait_for_all();
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 
-
 	FPRINTF(stderr, "** Burst warmup **\n");
 	burst_all(mpi_rank);
 
@@ -142,6 +171,33 @@ int main(int argc, char **argv)
 	starpu_task_wait_for_all();
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 
+	starpu_sleep(0.3); // sleep to easily distinguish different parts in traces
+
+	FPRINTF(stderr, "** Workers are computing, without communications **\n");
+	starpu_pause();
+	if(gemm_submit_tasks() == -ENODEV)
+		goto enodev;
+	starpu_resume();
+
+	/* Wait for everything and everybody: */
+	starpu_task_wait_for_all();
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	starpu_sleep(0.3); // sleep to easily distinguish different parts in traces
+
+	FPRINTF(stderr, "** Burst while workers are computing, but polling a moment between each task **\n");
+	starpu_pause();
+	gemm_add_polling_dependencies();
+	if(gemm_submit_tasks_with_tags(/* enable task tags */ 1) == -ENODEV)
+		goto enodev;
+	starpu_resume();
+
+	burst_all(mpi_rank);
+
+	/* Wait for everything and everybody: */
+	starpu_task_wait_for_all();
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
 enodev:
 	gemm_release();
 	burst_free_data(mpi_rank);

+ 1 - 1
mpi/tests/burst_helper.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/tests/burst_helper.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 44 - 0
mpi/tests/display_bindings.c

@@ -0,0 +1,44 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include <stdlib.h>
+#include "helper.h"
+
+#ifdef STARPU_HAVE_SETENV
+/* Check that StarPU-MPI initialization works when STARPU_DISPLAY_BINDINGS
+ * is set: the variable is forced to "1", then StarPU-MPI is initialized on
+ * top of an already initialized MPI and shut down again. */
+int main(int argc, char **argv)
+{
+	/* Must be set before starpu_mpi_init_conf() is called */
+	setenv("STARPU_DISPLAY_BINDINGS", "1", 1);
+
+	MPI_INIT_THREAD_real(&argc, &argv, MPI_THREAD_SERIALIZED);
+
+	/* MPI is already initialized above, hence initialize_mpi == 0 */
+	int init_status = starpu_mpi_init_conf(NULL, NULL, 0, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(init_status, "starpu_mpi_init_conf");
+
+	starpu_mpi_shutdown();
+	MPI_Finalize();
+
+	return EXIT_SUCCESS;
+}
+#else
+#warning setenv is not defined. Skipping test
+/* Without setenv() the environment variable cannot be forced: skip. */
+int main(void)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#endif

+ 52 - 4
mpi/tests/gemm_helper.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -199,6 +199,7 @@ static struct starpu_codelet cl_init_matrix_zero =
 	.color = 0x808000 // olive
 };
 
+/* Allocate and partition buffers */
 void gemm_alloc_data()
 {
 	starpu_malloc_flags((void **)&A, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
@@ -207,14 +208,13 @@ void gemm_alloc_data()
 	partition_mult_data();
 }
 
-
+/* Submit tasks to initialize matrices: fill them with zeros or random numbers */
 int gemm_init_data()
 {
 #ifndef STARPU_SIMGRID
 	int ret;
 	unsigned x, y;
 
-	// Initialize matrices:
 	for (x = 0; x < nslices; x++)
 	{
 		struct starpu_task *task = starpu_task_create();
@@ -237,11 +237,17 @@ int gemm_init_data()
 	return 0;
 }
 
-
+/* Submit tasks to compute the GEMM, without task tags.
+ * Thin wrapper: returns the value of gemm_submit_tasks_with_tags(0). */
 int gemm_submit_tasks()
 {
+	return gemm_submit_tasks_with_tags(/* by default, disable task tags */ 0);
+}
+
+int gemm_submit_tasks_with_tags(int with_tags)
+{
 	int ret;
 	unsigned x, y;
+	starpu_tag_t task_tag = 0;
 
 	for (x = 0; x < nslices; x++)
 	for (y = 0; y < nslices; y++)
@@ -253,6 +259,12 @@ int gemm_submit_tasks()
 		task->handles[2] = starpu_data_get_sub_data(C_handle, 2, x, y);
 		task->flops = 2ULL * (matrix_dim/nslices) * (matrix_dim/nslices) * matrix_dim;
 
+		if (with_tags)
+		{
+			task->use_tag = 1;
+			task->tag_id = ++task_tag;
+		}
+
 		ret = starpu_task_submit(task);
 		CHECK_TASK_SUBMIT(ret);
 		starpu_data_wont_use(starpu_data_get_sub_data(C_handle, 2, x, y));
@@ -261,6 +273,42 @@ int gemm_submit_tasks()
 	return 0;
 }
 
+/* Declare tag dependencies between the GEMM tasks so that workers regularly
+ * have to poll for work before eventually getting a task.
+ *
+ * Tags are numbered from 1 to nslices*nslices. With W workers, tags are
+ * grouped in runs of W+1:
+ * - W consecutive tags form a "column of tasks" that can run in parallel,
+ * - the next tag is a synchronization task that depends on the whole
+ *   previous column,
+ * - every tag of the following column depends on that synchronization task,
+ * - and so on...
+ *
+ * worker 0 |  1  |  4  |  5  |  8  |  9  |
+ * worker 1 |  2  |     |  6  |     | 10  |  ...
+ * worker 2 |  3  |     |  7  |     | 11  |
+ *
+ * This function has to be called before gemm_submit_tasks_with_tags(1).
+ */
+void gemm_add_polling_dependencies()
+{
+	const starpu_tag_t last_tag = (starpu_tag_t) nslices * (starpu_tag_t) nslices;
+	const unsigned nworkers = starpu_worker_get_count();
+	/* distance between two consecutive synchronization tags */
+	const starpu_tag_t stride = nworkers+1;
+	starpu_tag_t barrier_tag;
+
+	for (barrier_tag = stride; barrier_tag <= last_tag; barrier_tag += stride)
+	{
+		starpu_tag_t tag;
+
+		/* the synchronization tag waits for the nworkers tags right before it */
+		for (tag = barrier_tag - nworkers; tag < barrier_tag; tag++)
+			starpu_tag_declare_deps(barrier_tag, 1, tag);
+
+		/* the next column waits for the synchronization tag: while it is
+		 * not done, no task is available and workers can only poll */
+		const starpu_tag_t column_end = barrier_tag + nworkers + 1;
+		for (tag = barrier_tag+1; tag < column_end && tag <= last_tag; tag++)
+			starpu_tag_declare_deps(tag, 1, barrier_tag);
+	}
+}
+
 void gemm_release()
 {
 	starpu_data_unpartition(C_handle, STARPU_MAIN_RAM);

+ 3 - 1
mpi/tests/gemm_helper.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -29,5 +29,7 @@ void gemm_alloc_data();
 int gemm_init_data();
 int gemm_submit_tasks();
 void gemm_release();
+void gemm_add_polling_dependencies();
+int gemm_submit_tasks_with_tags(int with_tags);
 
 #endif /* __MPI_TESTS_GEMM_HELPER__ */

+ 1 - 1
mpi/tests/nothing.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/tests/sendrecv_parallel_tasks_bench.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 2 - 2
sc_hypervisor/examples/Makefile.am

@@ -26,7 +26,7 @@ noinst_PROGRAMS =				\
 	lp_test/lp_resize_test			\
 	hierarchical_ctxs/resize_hierarchical_ctxs
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 noinst_PROGRAMS +=				\
 	cholesky/cholesky_implicit
 
@@ -35,7 +35,7 @@ noinst_HEADERS = 				\
 	sched_ctx_utils/sched_ctx_utils.h
 endif
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 
 cholesky_cholesky_implicit_SOURCES =		\
 	cholesky/cholesky_implicit.c		\

+ 3 - 5
socl/examples/Makefile.am

@@ -25,11 +25,8 @@ if !STARPU_SIMGRID
 TESTS		=	$(SOCL_EXAMPLES)
 endif
 
-if STARPU_HAVE_WINDOWS
+noinst_PROGRAMS	=
 check_PROGRAMS	=	$(SOCL_EXAMPLES)
-else
-check_PROGRAMS	=	$(LOADER) $(SOCL_EXAMPLES)
-endif
 
 if !STARPU_HAVE_WINDOWS
 ## test loader program
@@ -37,6 +34,7 @@ LOADER			=	loader
 loader_CPPFLAGS 	=	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 LOADER_BIN		=	$(abs_top_builddir)/socl/examples/$(LOADER)
 loader_SOURCES		=	../../tests/loader.c
+noinst_PROGRAMS		+=	loader
 
 if STARPU_HAVE_AM111
 TESTS_ENVIRONMENT	=	LD_LIBRARY_PATH="@SOCL_OCL_LIB_OPENCL_DIR@:$(LD_LIBRARY_PATH)" OCL_ICD_VENDORS="$(abs_top_builddir)/socl/vendors/" top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
@@ -75,7 +73,7 @@ matmul_matmul_LDADD = -lm
 mansched_mansched_SOURCES = mansched/mansched.c
 
 #mandelbrot_mandelbrot_CPPFLAGS = $(AM_CPPFLAGS) $(AM_CFLAGS)
-#if HAVE_X11
+#if STARPU_HAVE_X11
 #mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
 #mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) $(X_LIBS) -lX11 $(X_EXTRA_LIBS)
 #endif

+ 138 - 3
src/common/utils.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,6 +24,7 @@
 #include <unistd.h>
 #endif
 #include <fcntl.h>
+#include <ctype.h>
+#include <limits.h>
 
 #if defined(_WIN32) && !defined(__CYGWIN__)
 #include <io.h>
@@ -526,7 +528,7 @@ void _starpu_gethostname(char *hostname, size_t size)
 
 	if (force_mpi_hostnames && force_mpi_hostnames[0])
 	{
-		char *host, *srv_hosts, *rsrv;
+		char *host, *srv_hosts;
 		srv_hosts = strdup(force_mpi_hostnames);
 		int rank;
 		if (starpu_mpi_world_rank)
@@ -541,8 +543,8 @@ void _starpu_gethostname(char *hostname, size_t size)
 
 		if (force_mpi_hostnames != NULL)
 		{
-			host = strtok_r(srv_hosts, " ", &rsrv);
-			while (rank-->0 && (host = strtok_r(NULL, " ", &rsrv)));
+			host = strtok(srv_hosts, " ");
+			while (rank-->0 && (host = strtok(NULL, " ")));
 			if(rank>=0)
 			{
 				_STARPU_MSG("Missing hostnames in STARPU_MPI_HOSTNAMES\n");
@@ -620,3 +622,136 @@ char *starpu_getenv(const char *str)
 #endif
 	return getenv(str);
 }
+
+/* Look str up, ignoring case, in the NULL-terminated array strings.
+ * Returns the index of the first matching entry, or -1 if none matches.
+ * Only entries with exactly the same length as str can match. */
+int _strings_ncmp(const char *strings[], const char *str)
+{
+	int idx;
+
+	for (idx = 0; strings[idx] != NULL; idx++)
+	{
+		const char *candidate = strings[idx];
+		size_t candidate_len = strlen(candidate);
+
+		if (strlen(str) != candidate_len)
+			continue;
+		if (strncasecmp(str, candidate, candidate_len) == 0)
+			return idx;
+	}
+	return -1;
+}
+
+/* Translate the value of environment variable str into its index within the
+ * NULL-terminated array strings (lookup done by _strings_ncmp()).
+ * Returns defvalue when the variable is not set. When the variable holds a
+ * value that is not in strings, the list of valid values is printed and
+ * STARPU_ABORT() is called. */
+int starpu_get_env_string_var_default(const char *str, const char *strings[], int defvalue)
+{
+	const char *strval = starpu_getenv(str);
+	int val;
+	int i;
+
+	if (strval == NULL)
+		return defvalue;
+
+	val = _strings_ncmp(strings, strval);
+	if (val >= 0)
+		return val;
+
+	/* Unknown value: tell the user what would have been accepted */
+	_STARPU_MSG("\n");
+	_STARPU_MSG("Invalid value '%s' for environment variable '%s'\n", strval, str);
+	_STARPU_MSG("Valid values are:\n");
+	for (i = 0; strings[i] != NULL; i++)
+		_STARPU_MSG("\t%s\n",strings[i]);
+	_STARPU_MSG("\n");
+	STARPU_ABORT();
+
+	return val;
+}
+
+/* Remove, in place, every character of str for which isspace() is true.
+ * The remaining characters keep their order and the result is
+ * NUL-terminated. Nothing is written when str contains no such character. */
+static void remove_spaces(char *str)
+{
+	char *dst = str;
+	const char *src;
+
+	for (src = str; *src != '\0'; src++)
+	{
+		/* <ctype.h> functions require a value representable as unsigned
+		 * char: passing a negative plain char is undefined behaviour */
+		if (isspace((unsigned char)*src))
+			continue;
+		if (dst != src)
+			*dst = *src;
+		dst++;
+	}
+	/* terminate the compacted string if something was removed */
+	if (dst != src)
+		*dst = '\0';
+}
+
+/* Read a size from environment variable str.
+ *
+ * The value is a decimal number, optionally followed by a unit suffix
+ * (case-insensitive): b (x1), k (x1024), m (x1024^2) or g (x1024^3).
+ * A number without suffix is multiplied by 1024. Whitespace anywhere in the
+ * value is ignored, and only the first character after the number is
+ * examined.
+ *
+ * Returns defval when the variable is unset or only contains whitespace,
+ * the resulting size otherwise. Unparsable values, unknown suffixes and
+ * sizes that do not fit in an int are reported through _STARPU_ERROR()
+ * (presumably fatal, as the code following its other uses assumes --
+ * confirm against its definition). */
+int starpu_get_env_size_default(const char *str, int defval)
+{
+	char *strval = starpu_getenv(str);
+	if (!strval)
+		return defval;
+
+	/* Work on a private copy: remove_spaces() modifies its argument */
+	char *value = strdup(strval);
+	if (value == NULL)
+		_STARPU_ERROR("memory allocation failed\n");
+	remove_spaces(value);
+	if (value[0] == '\0')
+	{
+		free(value);
+		return defval;
+	}
+
+	char *endptr = NULL;
+	int mult = 1024;
+	errno = 0;
+	long v = strtol(value, &endptr, 10);
+	if (errno != 0)
+		_STARPU_ERROR("could not parse environment variable '%s' with value '%s', strtol failed with error %s\n", str, value, strerror(errno));
+	/* strtol() consumed nothing: a lone suffix such as "M" is not a size */
+	if (endptr == value)
+		_STARPU_ERROR("could not parse environment variable '%s' with value '%s', no number found\n", str, value);
+	if (*endptr != '\0')
+	{
+		switch (*endptr)
+		{
+		case 'b':
+		case 'B': mult = 1; break;
+		case 'k':
+		case 'K': mult = 1024; break;
+		case 'm':
+		case 'M': mult = 1024*1024; break;
+		case 'g':
+		case 'G': mult = 1024*1024*1024; break;
+		default:
+			_STARPU_ERROR("could not parse environment variable '%s' with value '%s' size suffix invalid\n", str, value);
+		}
+	}
+	/* The result is an int: refuse values whose product would overflow
+	 * instead of returning a silently wrapped size */
+	if (v > INT_MAX / mult || v < INT_MIN / mult)
+		_STARPU_ERROR("could not parse environment variable '%s' with value '%s', size does not fit in an int\n", str, value);
+
+	int val = (int)(v * mult);
+	free(value);
+	return val;
+}
+
+/* Display bindings by running the external command "hwloc-ps -a -t -c"
+ * through system(). A non-zero status is reported with _STARPU_DISP().
+ * When StarPU is built without hwloc, only a notice is displayed. */
+void starpu_display_bindings(void)
+{
+#ifndef STARPU_HAVE_HWLOC
+	_STARPU_DISP("hwloc not available to display bindings.\n");
+#else
+	const int status = system("hwloc-ps -a -t -c");
+
+	if (status != 0)
+	{
+		_STARPU_DISP("hwloc-ps returned %d\n", status);
+		fflush(stderr);
+	}
+	fflush(stdout);
+#endif
+}

+ 14 - 9
src/core/perfmodel/perfmodel_history.c

@@ -344,7 +344,10 @@ static void dump_reg_model(FILE *f, struct starpu_perfmodel *model, int comb, in
 	double a = nan(""), b = nan(""), c = nan("");
 
 	if (model->type == STARPU_NL_REGRESSION_BASED)
-		_starpu_regression_non_linear_power(per_arch_model->list, &a, &b, &c);
+	{
+		if (_starpu_regression_non_linear_power(per_arch_model->list, &a, &b, &c) != 0)
+			_STARPU_DISP("Warning: could not compute a non-linear regression for model %s\n", model->symbol);
+	}
 
 	fprintf(f, "# a\t\tb\t\tc\n");
 	_starpu_write_double(f, "%-15e", a);
@@ -1491,6 +1494,8 @@ int starpu_perfmodel_load_file(const char *filename, struct starpu_perfmodel *mo
 	res = fclose(f);
 	STARPU_ASSERT(res == 0);
 
+	if (ret)
+		starpu_perfmodel_unload_model(model);
 	return ret;
 }
 
@@ -1885,20 +1890,20 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 				STARPU_HG_DISABLE_CHECKING(entry->nsample);
 				STARPU_HG_DISABLE_CHECKING(entry->mean);
 
-				/* Do not take the first measurement into account, it is very often quite bogus */
+				/* For history-based, do not take the first measurement into account, it is very often quite bogus */
 				/* TODO: it'd be good to use a better estimation heuristic, like the median, or latest n values, etc. */
-				//entry->mean = 0;
-				//entry->sum = 0;
-
-				//entry->deviation = 0.0;
-				//entry->sum2 = 0;
+				if (model->type != STARPU_HISTORY_BASED)
+				{
+					entry->sum = measured;
+					entry->sum2 = measured*measured;
+					entry->nsample = 1;
+					entry->mean = measured;
+				}
 
 				entry->size = _starpu_job_get_data_size(model, arch, impl, j);
 				entry->flops = j->task->flops;
 
 				entry->footprint = key;
-				//entry->nsample = 0;
-				//entry->nerror = 0;
 
 				insert_history_entry(entry, list, &per_arch_model->history);
 			}

+ 123 - 45
src/core/perfmodel/regression.c

@@ -20,7 +20,32 @@
 #define MAXREGITER	1000
 #define EPS 1.0e-10
 
-static double compute_b(double c, unsigned n, unsigned *x, double *y)
+/* For measurements close to C, we do not want to try to fit, since we are
+   fitting the distance to C, which won't actually really get smaller */
+#define C_RADIUS 1
+
+/*
+ * Smoothly ramp from 0 to 1 as x goes from 0 to 1 (two parabola halves that
+ * meet at level(0.5) = 0.5 with matching slope); x <= 0 gives 0, x >= 1 gives 1.
+ */
+static double level(double x)
+{
+	if (x <= 0.)
+		return 0.;
+	if (x >= 1.)
+		return 1.;
+	if (x < 0.5)
+		return 2*x*x;		/* leaves 0 with a zero slope */
+	return -2*x*x+4*x-1;		/* reaches 1 with a zero slope */
+}
+
+static double fixpop(unsigned pop, double c, double y)
+{
+	double rel = (y-c)/c;	/* distance of the measurement y from c, relative to c */
+	return level((rel - C_RADIUS) / C_RADIUS) * pop;	/* pop scaled by the ramp of the rescaled distance */
+}
+
+static double compute_b(double c, unsigned n, size_t *x, double *y, unsigned *pop)
 {
 	double b;
 
@@ -29,43 +54,55 @@ static double compute_b(double c, unsigned n, unsigned *x, double *y)
 	double sumx = 0.0;
 	double sumx2 = 0.0;
 	double sumy = 0.0;
+	double nn = 0;
 
 	unsigned i;
 	for (i = 0; i < n; i++)
 	{
 		double xi = log(x[i]);
 		double yi = log(y[i]-c);
+		double popi = fixpop(pop[i], c, y[i]);
+		if (popi <= 0)
+			continue;
 
-		sumxy += xi*yi;
-		sumx += xi;
-		sumx2 += xi*xi;
-		sumy += yi;
+		sumxy += xi*yi*popi;
+		sumx += xi*popi;
+		sumx2 += xi*xi*popi;
+		sumy += yi*popi;
+
+		nn += popi;
 	}
 
-	b = (n * sumxy - sumx * sumy) / (n*sumx2 - sumx*sumx);
+	b = (nn * sumxy - sumx * sumy) / (nn*sumx2 - sumx*sumx);
 
 	return b;
 }
 
-static double compute_a(double c, double b, unsigned n, unsigned *x, double *y)
+static double compute_a(double c, double b, unsigned n, size_t *x, double *y, unsigned *pop)
 {
 	double a;
 
 	/* X = log (x) , Y = log (y - c) */
 	double sumx = 0.0;
 	double sumy = 0.0;
+	double nn = 0;
 
 	unsigned i;
 	for (i = 0; i < n; i++)
 	{
 		double xi = log(x[i]);
 		double yi = log(y[i]-c);
+		double popi = fixpop(pop[i], c, y[i]);
+		if (popi <= 0)
+			continue;
+
+		sumx += xi*popi;
+		sumy += yi*popi;
 
-		sumx += xi;
-		sumy += yi;
+		nn += popi;
 	}
 
-	a = (sumy - b*sumx) / n;
+	a = (sumy - b*sumx) / nn;
 
 	return a;
 }
@@ -73,7 +110,7 @@ static double compute_a(double c, double b, unsigned n, unsigned *x, double *y)
 
 
 /* returns r */
-static double test_r(double c, unsigned n, unsigned *x, double *y)
+static double test_r(double c, unsigned n, size_t *x, double *y, unsigned *pop)
 {
 	double r;
 
@@ -85,20 +122,26 @@ static double test_r(double c, unsigned n, unsigned *x, double *y)
 	double sumx2 = 0.0;
 	double sumy = 0.0;
 	double sumy2 = 0.0;
+	double nn = 0;
 
 	unsigned i;
 	for (i = 0; i < n; i++)
 	{
 		double xi = log(x[i]);
 		double yi = log(y[i]-c);
+		double popi = fixpop(pop[i], c, y[i]);
+		if (popi <= 0)
+			continue;
 
 	//	printf("Xi = %e, Yi = %e\n", xi, yi);
 
-		sumxy += xi*yi;
-		sumx += xi;
-		sumx2 += xi*xi;
-		sumy += yi;
-		sumy2 += yi*yi;
+		sumxy += xi*yi*popi;
+		sumx += xi*popi;
+		sumx2 += xi*xi*popi;
+		sumy += yi*popi;
+		sumy2 += yi*yi*popi;
+
+		nn += popi;
 	}
 
 	//printf("sumxy %e\n", sumxy);
@@ -107,7 +150,7 @@ static double test_r(double c, unsigned n, unsigned *x, double *y)
 	//printf("sumy %e\n", sumy);
 	//printf("sumy2 %e\n", sumy2);
 
-	r = (n * sumxy - sumx * sumy) / sqrt( (n* sumx2 - sumx*sumx) * (n*sumy2 - sumy*sumy) );
+	r = (nn * sumxy - sumx * sumy) / sqrt( (nn* sumx2 - sumx*sumx) * (nn*sumy2 - sumy*sumy) );
 
 	return r;
 }
@@ -119,38 +162,52 @@ static unsigned find_list_size(struct starpu_perfmodel_history_list *list_histor
 	struct starpu_perfmodel_history_list *ptr = list_history;
 	while (ptr)
 	{
-		cnt++;
+		if (ptr->entry->nsample)
+			cnt++;
 		ptr = ptr->next;
 	}
 
 	return cnt;
 }
 
-static double find_list_min(double *y, unsigned n)
+static int compar(const void *_a, const void *_b)
 {
-	double min = DBL_MAX;
+	double a = *(double*) _a;
+	double b = *(double*) _b;
+	if (a < b)
+		return -1;
+	if (a > b)
+		return 1;
+	return 0;
+}
 
-	unsigned i;
-	for (i = 0; i < n; i++)
-	{
-		min = STARPU_MIN(min, y[i]);
-	}
+static double get_list_fourth(double *y, unsigned n)
+{
+	double sorted[n];
+
+	memcpy(sorted, y, n * sizeof(*sorted));
 
-	return min;
+	qsort(sorted, n, sizeof(*sorted), compar);
+
+	return sorted[n/3];
 }
 
-static void dump_list(unsigned *x, double *y, struct starpu_perfmodel_history_list *list_history)
+static void dump_list(size_t *x, double *y, unsigned *pop, struct starpu_perfmodel_history_list *list_history)
 {
 	struct starpu_perfmodel_history_list *ptr = list_history;
 	unsigned i = 0;
 
 	while (ptr)
 	{
-		x[i] = ptr->entry->size;
-		y[i] = ptr->entry->mean;
+		if (ptr->entry->nsample)
+		{
+			x[i] = ptr->entry->size;
+			y[i] = ptr->entry->mean;
+			pop[i] = ptr->entry->nsample;
+			i++;
+		}
 
 		ptr = ptr->next;
-		i++;
 	}
 }
 
@@ -159,52 +216,72 @@ static void dump_list(unsigned *x, double *y, struct starpu_perfmodel_history_li
  * 	return 0 if success, -1 otherwise
  * 	if success, a, b and c are modified
  * */
+
+/* See Appendix B of Cedric Augonnet's PhD thesis, "Scheduling Tasks over
+ * Multicore machines enhanced with Accelerators: a Runtime System's
+ * Perspective", for the rationale. */
 int _starpu_regression_non_linear_power(struct starpu_perfmodel_history_list *ptr, double *a, double *b, double *c)
 {
 	unsigned n = find_list_size(ptr);
-	STARPU_ASSERT(n);
+	if (!n)
+		return -1;
 
-	unsigned *x;
-	_STARPU_MALLOC(x, n*sizeof(unsigned));
+	size_t *x;
+	_STARPU_MALLOC(x, n*sizeof(size_t));
 
 	double *y;
 	_STARPU_MALLOC(y, n*sizeof(double));
 	STARPU_ASSERT(y);
 
-	dump_list(x, y, ptr);
+	unsigned *pop;
+	_STARPU_MALLOC(pop, n*sizeof(unsigned));
+	STARPU_ASSERT(y);
+
+	dump_list(x, y, pop, ptr);
 
 	double cmin = 0.0;
-	double cmax = find_list_min(y, n);
+	double cmax = get_list_fourth(y, n);
 
 	unsigned iter;
 
 	double err = 100000.0;
 
+/*
+	unsigned i;
+	for (i = 0; i < 100; i++)
+	{
+		double ci = cmin + (cmax-cmin)*i/100.;
+		fprintf(stderr,"%f: %f\n", ci, 1.0 - test_r(ci, n, x, y, pop));
+	}
+*/
+
+	/* Use dichotomy to find c that gives the best matching */
 	for (iter = 0; iter < MAXREGITER; iter++)
 	{
 		double c1, c2;
 		double r1, r2;
 
-		double radius = 0.01;
-
-		c1 = cmin + (0.5-radius)*(cmax - cmin);
-		c2 = cmin + (0.5+radius)*(cmax - cmin);
+		c1 = cmin + (0.33)*(cmax - cmin);
+		c2 = cmin + (0.67)*(cmax - cmin);
 
-		r1 = test_r(c1, n, x, y);
-		r2 = test_r(c2, n, x, y);
+		r1 = test_r(c1, n, x, y, pop);
+		r2 = test_r(c2, n, x, y, pop);
 
 		double err1, err2;
 		err1 = fabs(1.0 - r1);
 		err2 = fabs(1.0 - r2);
 
+		//fprintf(stderr,"%f - %f: %f - %f: %f - %f\n", cmin, c1, err1, c2, err2, cmax);
+
 		if (err1 < err2)
 		{
-			cmax = (cmin + cmax)/2;
+			/* 1 is better */
+			cmax = c2;
 		}
 		else
 		{
 			/* 2 is better */
-			cmin = (cmin + cmax)/2;
+			cmin = c1;
 		}
 
 		if (fabs(err - STARPU_MIN(err1, err2)) < EPS)
@@ -215,11 +292,12 @@ int _starpu_regression_non_linear_power(struct starpu_perfmodel_history_list *pt
 
 	*c = (cmin + cmax)/2;
 
-	*b = compute_b(*c, n, x, y);
-	*a = exp(compute_a(*c, *b, n, x, y));
+	*b = compute_b(*c, n, x, y, pop);
+	*a = exp(compute_a(*c, *b, n, x, y, pop));
 
 	free(x);
 	free(y);
+	free(pop);
 
 	return 0;
 }

+ 2 - 0
src/core/topology.c

@@ -1049,6 +1049,8 @@ static inline unsigned _starpu_get_next_bindid(struct _starpu_machine_config *co
 {
 	struct _starpu_machine_topology *topology = &config->topology;
 
+	STARPU_ASSERT_MSG(topology_is_initialized, "The StarPU core is not initialized yet, have you called starpu_init?");
+
 	unsigned current_preferred;
 	unsigned nhyperthreads = topology->nhwpus / topology->nhwcpus;
 	unsigned ncores = topology->nhwpus / nhyperthreads;

+ 0 - 1
src/core/topology.h

@@ -24,7 +24,6 @@
 #include <common/list.h>
 #include <common/fxt.h>
 
-/** TODO actually move this struct into this header */
 struct _starpu_machine_config;
 
 #ifndef STARPU_SIMGRID

+ 17 - 0
src/core/workers.c

@@ -1059,6 +1059,7 @@ int starpu_conf_init(struct starpu_conf *conf)
 
 	memset(conf, 0, sizeof(*conf));
 	conf->magic = 42;
+	conf->will_use_mpi = 0;
 	conf->sched_policy_name = starpu_getenv("STARPU_SCHED");
 	conf->sched_policy = NULL;
 	conf->global_sched_ctx_min_priority = starpu_get_env_number("STARPU_MIN_PRIO");
@@ -1143,6 +1144,9 @@ int starpu_conf_init(struct starpu_conf *conf)
 	/* 64MiB by default */
 	conf->trace_buffer_size = ((uint64_t) starpu_get_env_number_default("STARPU_TRACE_BUFFER_SIZE", 64)) << 20;
 
+	conf->driver_spinning_backoff_min = (unsigned) starpu_get_env_number_default("STARPU_BACKOFF_MIN", 1);
+	conf->driver_spinning_backoff_max = (unsigned) starpu_get_env_number_default("STARPU_BACKOFF_MAX", 32);
+
 	/* Do not start performance counter collection by default */
 	conf->start_perf_counter_collection = 0;
 	return 0;
@@ -1663,6 +1667,15 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
 	_starpu_catch_signals();
 
+	/* if MPI is enabled, binding display will be done later, after MPI initialization */
+	if (!_starpu_config.conf.will_use_mpi && starpu_get_env_number_default("STARPU_DISPLAY_BINDINGS", 0))
+	{
+		fprintf(stdout, "== Binding ==\n");
+		starpu_display_bindings();
+		fprintf(stdout, "== End of binding ==\n");
+		fflush(stdout);
+	}
+
 	return 0;
 }
 
@@ -1751,6 +1764,8 @@ void starpu_pause()
 {
 	STARPU_HG_DISABLE_CHECKING(_starpu_config.pause_depth);
 	_starpu_config.pause_depth += 1;
+
+	starpu_fxt_trace_user_event_string("starpu_pause");
 }
 
 void starpu_resume()
@@ -1762,6 +1777,8 @@ void starpu_resume()
 		STARPU_PTHREAD_COND_BROADCAST(&pause_cond);
 	}
 	STARPU_PTHREAD_MUTEX_UNLOCK(&pause_mutex);
+
+	starpu_fxt_trace_user_event_string("starpu_resume");
 }
 
 unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED, struct _starpu_worker *worker STARPU_ATTRIBUTE_UNUSED)

+ 16 - 2
src/core/workers.h

@@ -203,6 +203,10 @@ LIST_TYPE(_starpu_worker,
 
 	int enable_knob;
 	int bindid_requested;
+
+	/* Keep this last, to make sure to separate worker data in separate
+	  cache lines. */
+	char padding[STARPU_CACHELINE_SIZE];
 );
 
 struct _starpu_combined_worker
@@ -223,6 +227,10 @@ struct _starpu_combined_worker
 #ifdef STARPU_HAVE_HWLOC
 	hwloc_bitmap_t hwloc_cpu_set;
 #endif
+
+	/* Keep this last, to make sure to separate worker data in separate
+	  cache lines. */
+	char padding[STARPU_CACHELINE_SIZE];
 };
 
 /**
@@ -389,6 +397,9 @@ struct _starpu_machine_config
 	/** Memory node for MPI, if only one */
 	int mpi_nodeid;
 
+	/* Separate out previous variables from per-worker data. */
+	char padding1[STARPU_CACHELINE_SIZE];
+
 	/** Basic workers : each of this worker is running its own driver and
 	 * can be combined with other basic workers. */
 	struct _starpu_worker workers[STARPU_NMAXWORKERS];
@@ -397,6 +408,11 @@ struct _starpu_machine_config
 	 * that can run parallel tasks together. */
 	struct _starpu_combined_worker combined_workers[STARPU_NMAX_COMBINEDWORKERS];
 
+	starpu_pthread_mutex_t submitted_mutex;
+
+	/* Separate out previous mutex from the rest of the data. */
+	char padding2[STARPU_CACHELINE_SIZE];
+
 	/** Translation table from bindid to worker IDs */
 	struct
 	{
@@ -432,8 +448,6 @@ struct _starpu_machine_config
 
 	/** When >0, StarPU should stop performance counters collection. */
 	int perf_counter_pause_depth;
-
-	starpu_pthread_mutex_t submitted_mutex;
 };
 
 extern int _starpu_worker_parallel_blocks;

+ 5 - 0
src/datawizard/coherency.h

@@ -281,6 +281,11 @@ struct _starpu_data_state
 
 	int partition_automatic_disabled;
 
+	/** Application-provided coordinates. The maximum dimension (5) is
+	  * relatively arbitrary. */
+	unsigned dimensions;
+	int coordinates[5];
+
 	/** A generic pointer to data in the user land (could be anything and this
 	 * is not managed by StarPU) */
 	void *user_data;

+ 24 - 1
src/datawizard/interfaces/data_interface.c

@@ -1117,8 +1117,18 @@ int starpu_data_get_home_node(starpu_data_handle_t handle)
 	return handle->home_node;
 }
 
-void starpu_data_set_coordinates_array(starpu_data_handle_t handle STARPU_ATTRIBUTE_UNUSED, int dimensions STARPU_ATTRIBUTE_UNUSED, int dims[] STARPU_ATTRIBUTE_UNUSED)
+void starpu_data_set_coordinates_array(starpu_data_handle_t handle, unsigned dimensions, int dims[])
 {
+	unsigned i;
+	unsigned max_dimensions = sizeof(handle->coordinates)/sizeof(handle->coordinates[0]);
+
+	if (dimensions > max_dimensions)
+		dimensions = max_dimensions;
+
+	handle->dimensions = dimensions;
+	for (i = 0; i < dimensions; i++)
+		handle->coordinates[i] = dims[i];
+
 	_STARPU_TRACE_DATA_COORDINATES(handle, dimensions, dims);
 }
 
@@ -1135,3 +1145,16 @@ void starpu_data_set_coordinates(starpu_data_handle_t handle, unsigned dimension
 
 	starpu_data_set_coordinates_array(handle, dimensions, dims);
 }
+
+unsigned starpu_data_get_coordinates_array(starpu_data_handle_t handle, unsigned dimensions, int dims[])
+{
+	/* Never hand back more coordinates than the handle's recorded dimension count */
+	unsigned count = dimensions > handle->dimensions ? handle->dimensions : dimensions;
+	unsigned d;
+
+	/* Copy the first count coordinates into the caller's array */
+	for (d = 0; d < count; d++)
+		dims[d] = handle->coordinates[d];
+
+	return count;
+}

+ 3 - 5
src/drivers/driver_common/driver_common.c

@@ -28,8 +28,6 @@
 #include <core/debug.h>
 #include <core/task.h>
 
-#define BACKOFF_MAX 32  /* TODO : use parameter to define them */
-#define BACKOFF_MIN 1
 
 void _starpu_driver_start_job(struct _starpu_worker *worker, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, int rank, int profiling)
 {
@@ -374,7 +372,7 @@ static void _starpu_exponential_backoff(struct _starpu_worker *worker)
 {
 	int delay = worker->spinning_backoff;
 
-	if (worker->spinning_backoff < BACKOFF_MAX)
+	if (worker->spinning_backoff < worker->config->conf.driver_spinning_backoff_max)
 		worker->spinning_backoff<<=1;
 
 	while(delay--)
@@ -504,7 +502,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 	{
 		_starpu_worker_set_status_sleeping(workerid);
 	}
-	worker->spinning_backoff = BACKOFF_MIN;
+	worker->spinning_backoff = worker->config->conf.driver_spinning_backoff_min;
 
 	_starpu_worker_leave_sched_op(worker);
 	STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
@@ -703,7 +701,7 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 	}
 
 	_starpu_worker_set_status_wakeup(workerid);
-	worker->spinning_backoff = BACKOFF_MIN;
+	worker->spinning_backoff = worker->config->conf.driver_spinning_backoff_min;
 #endif /* !STARPU_SIMGRID */
 
 	_starpu_worker_leave_sched_op(&workers[0]);

+ 20 - 16
src/profiling/profiling.c

@@ -29,6 +29,8 @@
 #include <papi.h>
 #endif
 
+/* TODO: move to worker structure */
+
 static struct starpu_profiling_worker_info worker_info[STARPU_NMAXWORKERS];
 /* TODO: rather use rwlock */
 static starpu_pthread_mutex_t worker_info_mutex[STARPU_NMAXWORKERS];
@@ -325,26 +327,28 @@ void _starpu_worker_stop_sleeping(int workerid)
 
 		STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]);
 
-		STARPU_ASSERT(worker_registered_sleeping_start[workerid] == 1);
-		sleeping_start = &sleeping_start_date[workerid];
-
-                /* Perhaps that profiling was enabled while the worker was
-                 * already blocked, so we don't measure (end - start), but
-                 * (end - max(start,worker_start)) where worker_start is the
-                 * date of the previous profiling info reset on the worker */
-		struct timespec *worker_start = &worker_info[workerid].start_time;
-		if (starpu_timespec_cmp(sleeping_start, worker_start, <))
+		if (worker_registered_sleeping_start[workerid] == 1)
 		{
-			/* sleeping_start < worker_start */
-			sleeping_start = worker_start;
-		}
+			sleeping_start = &sleeping_start_date[workerid];
+
+			/* Profiling may have been enabled while the worker was
+			 * already blocked, so we don't measure (end - start), but
+			 * (end - max(start,worker_start)) where worker_start is the
+			 * date of the previous profiling info reset on the worker */
+			struct timespec *worker_start = &worker_info[workerid].start_time;
+			if (starpu_timespec_cmp(sleeping_start, worker_start, <))
+			{
+				/* sleeping_start < worker_start */
+				sleeping_start = worker_start;
+			}
 
-		struct timespec sleeping_time;
-		starpu_timespec_sub(&sleep_end_time, sleeping_start, &sleeping_time);
+			struct timespec sleeping_time;
+			starpu_timespec_sub(&sleep_end_time, sleeping_start, &sleeping_time);
 
-		starpu_timespec_accumulate(&worker_info[workerid].sleeping_time, &sleeping_time);
+			starpu_timespec_accumulate(&worker_info[workerid].sleeping_time, &sleeping_time);
 
-		worker_registered_sleeping_start[workerid] = 0;
+			worker_registered_sleeping_start[workerid] = 0;
+		}
 
 		STARPU_PTHREAD_MUTEX_UNLOCK(&worker_info_mutex[workerid]);
 

+ 17 - 11
src/sched_policies/component_heft.c

@@ -77,10 +77,13 @@ static int heft_progress_one(struct starpu_sched_component *component)
 		/* Estimated transfer+task termination for each child */
 		double estimated_ends_with_task[component->nchildren * ntasks];
 
-		/* Minimum transfer+task termination on all children */
-		double min_exp_end_with_task[ntasks];
-		/* Maximum transfer+task termination on all children */
-		double max_exp_end_with_task[ntasks];
+		/* estimated energy */
+		double local_energy[component->nchildren * ntasks];
+
+		/* Minimum transfer+task termination of the NTASKS tasks over all workers */
+		double min_exp_end_of_task[ntasks];
+		/* Maximum termination of the already-scheduled tasks over all workers */
+		double max_exp_end_of_workers;
 
 		unsigned suitable_components[component->nchildren * ntasks];
 
@@ -100,20 +103,23 @@ static int heft_progress_one(struct starpu_sched_component *component)
 					estimated_lengths + offset,
 					estimated_transfer_length + offset,
 					estimated_ends_with_task + offset,
-					&min_exp_end_with_task[n], &max_exp_end_with_task[n],
+					&min_exp_end_of_task[n], &max_exp_end_of_workers,
 							  suitable_components + offset, nsuitable_components[n]);
+			
+			/* Compute the energy, if provided*/
+			starpu_mct_compute_energy(component, tasks[n], local_energy + offset, suitable_components + offset, nsuitable_components[n]);
 		}
 
+		/* best_task is the task that will finish first among the ntasks, while best_benefit is its expected termination time */
 		int best_task = 0;
-		double max_benefit = 0;
+		double best_benefit = min_exp_end_of_task[0];
 
 		/* Find the task with the earliest expected termination */
-		for (n = 0; n < ntasks; n++)
+		for (n = 1; n < ntasks; n++)
 		{
-			double benefit = max_exp_end_with_task[n] - min_exp_end_with_task[n];
-			if (max_benefit < benefit)
+			if (best_benefit > min_exp_end_of_task[n])
 			{
-				max_benefit = benefit;
+				best_benefit =  min_exp_end_of_task[n];
 				best_task = n;
 			}
 		}
@@ -129,7 +135,7 @@ static int heft_progress_one(struct starpu_sched_component *component)
 
 		unsigned offset = component->nchildren * best_task;
 
-		int best_icomponent = starpu_mct_get_best_component(d, tasks[best_task], estimated_lengths + offset, estimated_transfer_length + offset, estimated_ends_with_task + offset, min_exp_end_with_task[best_task], max_exp_end_with_task[best_task], suitable_components + offset, nsuitable_components[best_task]);
+		int best_icomponent = starpu_mct_get_best_component(d, tasks[best_task], estimated_lengths + offset, estimated_transfer_length + offset, estimated_ends_with_task + offset, local_energy + offset, min_exp_end_of_task[best_task], max_exp_end_of_workers, suitable_components + offset, nsuitable_components[best_task]);
 
 		STARPU_ASSERT(best_icomponent != -1);
 		best_component = component->children[best_icomponent];

+ 28 - 12
src/sched_policies/component_heteroprio.c

@@ -106,10 +106,13 @@ static int heteroprio_progress_accel(struct starpu_sched_component *component, s
 	/* Estimated transfer+task termination for each child */
 	double estimated_ends_with_task[component->nchildren];
 
-	/* Minimum transfer+task termination on all children */
-	double min_exp_end_with_task;
-	/* Maximum transfer+task termination on all children */
-	double max_exp_end_with_task;
+	/* provided local energy */
+	double local_energy[component->nchildren];
+
+	/* Minimum transfer+task termination of the task over all workers */
+	double min_exp_end_of_task;
+	/* Maximum termination of the already-scheduled tasks over all workers */
+	double max_exp_end_of_workers;
 
 	unsigned suitable_components[component->nchildren];
 	unsigned nsuitable_components;
@@ -155,16 +158,21 @@ static int heteroprio_progress_accel(struct starpu_sched_component *component, s
 			estimated_lengths,
 			estimated_transfer_length,
 			estimated_ends_with_task,
-			&min_exp_end_with_task, &max_exp_end_with_task,
+			&min_exp_end_of_task, &max_exp_end_of_workers,
 			suitable_components, nsuitable_components);
 
+	/* Compute the energy, if provided*/
+	starpu_mct_compute_energy(component, task, local_energy, suitable_components, nsuitable_components);
+
 	/* And now find out which worker suits best for this task,
 	 * including data transfer */
+
 	int best_icomponent = starpu_mct_get_best_component(d, task,
 			estimated_lengths,
 			estimated_transfer_length,
 			estimated_ends_with_task,
-			min_exp_end_with_task, max_exp_end_with_task,
+                        local_energy,
+			min_exp_end_of_task, max_exp_end_of_workers,
 			suitable_components, nsuitable_components);
 
 	if (best_icomponent == -1)
@@ -236,10 +244,13 @@ static int heteroprio_progress_noaccel(struct starpu_sched_component *component,
 	/* Estimated transfer+task termination for each child */
 	double estimated_ends_with_task[component->nchildren];
 
-	/* Minimum transfer+task termination on all children */
-	double min_exp_end_with_task;
-	/* Maximum transfer+task termination on all children */
-	double max_exp_end_with_task;
+	/* estimated energy */
+	double local_energy[component->nchildren];
+
+	/* Minimum transfer+task termination of the task over all workers */
+	double min_exp_end_of_task;
+	/* Maximum termination of the already-scheduled tasks over all workers */
+	double max_exp_end_of_workers;
 
 	unsigned suitable_components[component->nchildren];
 	unsigned nsuitable_components;
@@ -264,16 +275,21 @@ static int heteroprio_progress_noaccel(struct starpu_sched_component *component,
 			estimated_lengths,
 			estimated_transfer_length,
 			estimated_ends_with_task,
-			&min_exp_end_with_task, &max_exp_end_with_task,
+			&min_exp_end_of_task, &max_exp_end_of_workers,
 			suitable_components, nsuitable_components);
 
+	/* Compute the energy, if provided*/
+	starpu_mct_compute_energy(component, task, local_energy, suitable_components, nsuitable_components);
+	
 	/* And now find out which worker suits best for this task,
 	 * including data transfer */
+
 	int best_icomponent = starpu_mct_get_best_component(d, task,
 			estimated_lengths,
 			estimated_transfer_length,
 			estimated_ends_with_task,
-			min_exp_end_with_task, max_exp_end_with_task,
+                        local_energy,
+			min_exp_end_of_task, max_exp_end_of_workers,
 			suitable_components, nsuitable_components);
 
 	/* If no best component is found, it means that the perfmodel of

+ 13 - 7
src/sched_policies/component_mct.c

@@ -35,10 +35,13 @@ static int mct_push_task(struct starpu_sched_component * component, struct starp
 	/* Estimated transfer+task termination for each child */
 	double estimated_ends_with_task[component->nchildren];
 
-	/* Minimum transfer+task termination on all children */
-	double min_exp_end_with_task;
-	/* Maximum transfer+task termination on all children */
-	double max_exp_end_with_task;
+	/* estimated energy */
+	double local_energy[component->nchildren];
+
+	/* Minimum transfer+task termination of the task over all workers */
+	double min_exp_end_of_task;
+	/* Maximum termination of the already-scheduled tasks over all workers */
+	double max_exp_end_of_workers;
 
 	unsigned suitable_components[component->nchildren];
 	unsigned nsuitable_components;
@@ -58,12 +61,14 @@ static int mct_push_task(struct starpu_sched_component * component, struct starp
 	   make scheduling decisions at the same time */
 	STARPU_COMPONENT_MUTEX_LOCK(&d->scheduling_mutex);
 
-
 	starpu_mct_compute_expected_times(component, task, estimated_lengths, estimated_transfer_length,
-					  estimated_ends_with_task, &min_exp_end_with_task, &max_exp_end_with_task, suitable_components, nsuitable_components);
+					  estimated_ends_with_task, &min_exp_end_of_task, &max_exp_end_of_workers, suitable_components, nsuitable_components);
+
+	/* Compute the energy, if provided*/
+	starpu_mct_compute_energy(component, task, local_energy, suitable_components, nsuitable_components);
 
 	int best_icomponent = starpu_mct_get_best_component(d, task, estimated_lengths, estimated_transfer_length,
-					  estimated_ends_with_task, min_exp_end_with_task, max_exp_end_with_task, suitable_components, nsuitable_components);
+							    estimated_ends_with_task, local_energy, min_exp_end_of_task, max_exp_end_of_workers, suitable_components, nsuitable_components);
 
 	/* If no best component is found, it means that the perfmodel of
 	 * the task had been purged since it has been pushed on the mct component.
@@ -105,6 +110,7 @@ static void mct_component_deinit_data(struct starpu_sched_component * component)
 
 int starpu_sched_component_is_mct(struct starpu_sched_component * component)
 {
+
 	return component->push_task == mct_push_task;
 }
 

+ 0 - 0
src/sched_policies/deque_modeling_policy_data_aware.c


Some files were not shown because too many files changed in this diff