Преглед изворни кода

Merge branch 'master' into ft_checkpoint

Nathalie Furmento пре 6 година
родитељ
комит
68fea1ee9b
100 измењених фајлова са 3051 додато и 1046 уклоњено
  1. 1 0
      AUTHORS
  2. 9 0
      ChangeLog
  3. 5 5
      Makefile.am
  4. 78 63
      configure.ac
  5. 1 1
      contrib/ci.inria.fr/job-0-tarball.sh
  6. 3 2
      contrib/ci.inria.fr/job-1-check.sh
  7. 19 5
      doc/doxygen/Makefile.am
  8. 1 0
      doc/doxygen/chapters/470_simgrid.doxy
  9. 24 0
      doc/doxygen/chapters/501_environment_variables.doxy
  10. 20 1
      doc/doxygen/chapters/510_configure_options.doxy
  11. 34 0
      doc/doxygen/dev/starpu_check_include.sh
  12. 14 6
      doc/doxygen_dev/Makefile.am
  13. 15 17
      examples/Makefile.am
  14. 2 0
      examples/mlr/mlr.c
  15. 4 6
      examples/stencil/Makefile.am
  16. 16 0
      include/starpu.h
  17. 9 1
      include/starpu_data.h
  18. 26 2
      include/starpu_helper.h
  19. 11 0
      include/starpu_util.h
  20. 5 1
      julia/Makefile.am
  21. 9 9
      julia/Manifest.toml
  22. 16 4
      julia/examples/Makefile.am
  23. 41 31
      julia/examples/axpy/axpy.jl
  24. 7 6
      julia/examples/callback/callback.jl
  25. 15 0
      julia/examples/check_deps/check_deps.jl
  26. 20 0
      julia/examples/cholesky/cholesky.sh
  27. 52 0
      julia/examples/cholesky/cholesky_codelets.jl
  28. 154 0
      julia/examples/cholesky/cholesky_common.jl
  29. 71 0
      julia/examples/cholesky/cholesky_implicit.jl
  30. 79 0
      julia/examples/cholesky/cholesky_native.jl
  31. 93 0
      julia/examples/cholesky/cholesky_tag.jl
  32. 3 3
      julia/examples/dependency/end_dep.jl
  33. 3 3
      julia/examples/dependency/tag_dep.jl
  34. 3 3
      julia/examples/dependency/task_dep.jl
  35. 7 1
      julia/examples/execute.sh.in
  36. 144 0
      julia/examples/gemm/gemm.jl
  37. 22 0
      julia/examples/gemm/gemm.sh
  38. 146 0
      julia/examples/gemm/gemm_bare.jl
  39. 56 0
      julia/examples/gemm/gemm_native.jl
  40. 1 1
      julia/examples/mandelbrot/cpu_mandelbrot.c
  41. 4 1
      julia/examples/mandelbrot/mandelbrot.jl
  42. 1 1
      julia/examples/mult/cpu_mult.c
  43. 15 28
      julia/examples/mult/mult.jl
  44. 2 2
      julia/examples/task_insert_color/task_insert_color.jl
  45. 1 1
      julia/examples/variable/variable.jl
  46. 21 17
      julia/examples/vector_scal/vector_scal.jl
  47. 1 1
      julia/src/Makefile.am
  48. 12 3
      julia/src/StarPU.jl
  49. 78 400
      julia/src/blas.c
  50. 116 134
      julia/src/blas.h
  51. 15 0
      julia/src/blas.jl
  52. 15 0
      julia/src/blas_wrapper.c
  53. 25 8
      julia/src/compiler/c.jl
  54. 301 25
      julia/src/compiler/cuda.jl
  55. 351 3
      julia/src/compiler/expression_manipulation.jl
  56. 24 16
      julia/src/compiler/expressions.jl
  57. 18 17
      julia/src/compiler/file_generation.jl
  58. 4 0
      julia/src/data.jl
  59. 13 11
      julia/src/dynamic_compiler/Makefile.am
  60. 7 6
      julia/src/globals.jl
  61. 4 1
      julia/src/init.jl
  62. 9 0
      julia/src/openblas_ldflags.jl
  63. 109 22
      julia/src/task.jl
  64. 13 5
      julia/src/translate_headers.jl
  65. 8 1
      mpi/Makefile.am
  66. 5 23
      mpi/examples/Makefile.am
  67. 2 1
      mpi/examples/matrix_mult/mm.c
  68. 1 0
      mpi/src/starpu_mpi.c
  69. 34 1
      mpi/src/starpu_mpi_init.c
  70. 11 10
      mpi/tests/Makefile.am
  71. 1 1
      mpi/tests/abstract_sendrecv_bench.c
  72. 1 2
      mpi/tests/abstract_sendrecv_bench.h
  73. 1 1
      mpi/tests/bench_helper.c
  74. 1 1
      mpi/tests/bench_helper.h
  75. 1 1
      mpi/tests/burst.c
  76. 59 3
      mpi/tests/burst_gemm.c
  77. 1 1
      mpi/tests/burst_helper.c
  78. 1 1
      mpi/tests/burst_helper.h
  79. 44 0
      mpi/tests/display_bindings.c
  80. 52 4
      mpi/tests/gemm_helper.c
  81. 3 1
      mpi/tests/gemm_helper.h
  82. 1 1
      mpi/tests/nothing.c
  83. 1 1
      mpi/tests/sendrecv_parallel_tasks_bench.c
  84. 2 2
      sc_hypervisor/examples/Makefile.am
  85. 3 5
      socl/examples/Makefile.am
  86. 138 3
      src/common/utils.c
  87. 14 9
      src/core/perfmodel/perfmodel_history.c
  88. 123 45
      src/core/perfmodel/regression.c
  89. 2 0
      src/core/topology.c
  90. 0 1
      src/core/topology.h
  91. 17 0
      src/core/workers.c
  92. 16 2
      src/core/workers.h
  93. 5 0
      src/datawizard/coherency.h
  94. 24 1
      src/datawizard/interfaces/data_interface.c
  95. 3 5
      src/drivers/driver_common/driver_common.c
  96. 20 16
      src/profiling/profiling.c
  97. 17 11
      src/sched_policies/component_heft.c
  98. 28 12
      src/sched_policies/component_heteroprio.c
  99. 13 7
      src/sched_policies/component_mct.c
  100. 0 0
      src/sched_policies/deque_modeling_policy_data_aware.c

+ 1 - 0
AUTHORS

@@ -12,6 +12,7 @@ Danjean Vincent, University Grenoble Alpes, <Vincent.Danjean@ens-lyon.org>
 Denis Alexandre, Inria, <alexandre.denis@inria.fr>
 Denis Alexandre, Inria, <alexandre.denis@inria.fr>
 Eyraud-Dubois Lionel, Inria, <lionel.eyraud-dubois@inria.fr>
 Eyraud-Dubois Lionel, Inria, <lionel.eyraud-dubois@inria.fr>
 Furmento Nathalie, CNRS, <nathalie.furmento@labri.fr>
 Furmento Nathalie, CNRS, <nathalie.furmento@labri.fr>
+Guermouche Amina, Télécom SudParis, <amina.guermouche@inria.fr>
 Guilbaud Adrien, Inria, <adrien.guilbaud@inria.fr>
 Guilbaud Adrien, Inria, <adrien.guilbaud@inria.fr>
 Henry Sylvain, Université de Bordeaux, <sylvain.henry@inria.fr>
 Henry Sylvain, Université de Bordeaux, <sylvain.henry@inria.fr>
 Hugo Andra, Université de Bordeaux/Inria, <andra.hugo@inria.fr>
 Hugo Andra, Université de Bordeaux/Inria, <andra.hugo@inria.fr>

+ 9 - 0
ChangeLog

@@ -56,6 +56,15 @@ Small features:
   * Add STARPU_LIMIT_BANDWIDTH environment variable.
   * Add STARPU_LIMIT_BANDWIDTH environment variable.
   * Add field starpu_conf::precedence_over_environment_variables to ignore
   * Add field starpu_conf::precedence_over_environment_variables to ignore
     environment variables when parameters are set directly in starpu_conf
     environment variables when parameters are set directly in starpu_conf
+  * Add starpu_data_get_coordinates_array
+  * New STARPU_BACKOFF_MIN and STARPU_BACKOFF_MAX environment variables to set
+    the exponential backoff limits of the number of cycles to pause while
+    drivers are spinning.
+  * Add STARPU_DISPLAY_BINDINGS environment variable and
+    starpu_display_bindings() function to display all bindings on the machine by
+    calling hwloc-ps
+Small changes:
+  * New configure option --disable-build-doc-pdf
 
 
 StarPU 1.3.3 (git revision 11afc5b007fe1ab1c729b55b47a5a98ef7f3cfad)
 StarPU 1.3.3 (git revision 11afc5b007fe1ab1c729b55b47a5a98ef7f3cfad)
 ====================================================================
 ====================================================================

+ 5 - 5
Makefile.am

@@ -27,7 +27,7 @@ SUBDIRS += src
 
 
 SUBDIRS += tools
 SUBDIRS += tools
 
 
-if BUILD_TESTS
+if STARPU_BUILD_TESTS
 SUBDIRS += tests
 SUBDIRS += tests
 endif
 endif
 
 
@@ -37,19 +37,19 @@ if STARPU_USE_MPI
 SUBDIRS += mpi
 SUBDIRS += mpi
 endif
 endif
 
 
-if BUILD_EXAMPLES
+if STARPU_BUILD_EXAMPLES
 SUBDIRS += examples
 SUBDIRS += examples
 endif
 endif
 
 
-if BUILD_SOCL
+if STARPU_BUILD_SOCL
 SUBDIRS += socl
 SUBDIRS += socl
 endif
 endif
 
 
-if BUILD_STARPUFFT
+if STARPU_BUILD_STARPUFFT
 SUBDIRS += starpufft
 SUBDIRS += starpufft
 endif
 endif
 
 
-if BUILD_STARPURM
+if STARPU_BUILD_STARPURM
 SUBDIRS += starpurm
 SUBDIRS += starpurm
 endif
 endif
 
 

Разлика између датотеке није приказан због своје велике величине
+ 78 - 63
configure.ac


+ 1 - 1
contrib/ci.inria.fr/job-0-tarball.sh

@@ -21,7 +21,7 @@ export LD_LIBRARY_PATH=/home/ci/usr/local/lib:$LD_LIBRARY_PATH
 ./autogen.sh
 ./autogen.sh
 if test -d build ; then chmod -R 777 build && rm -rf build ; fi
 if test -d build ; then chmod -R 777 build && rm -rf build ; fi
 mkdir build && cd build
 mkdir build && cd build
-../configure
+../configure --enable-build-doc-pdf
 make V=1
 make V=1
 make dist
 make dist
 cp *gz ..
 cp *gz ..

+ 3 - 2
contrib/ci.inria.fr/job-1-check.sh

@@ -63,12 +63,13 @@ fi
 export CC=gcc
 export CC=gcc
 
 
 CONFIGURE_OPTIONS="--enable-debug --enable-verbose --enable-mpi-check --disable-build-doc"
 CONFIGURE_OPTIONS="--enable-debug --enable-verbose --enable-mpi-check --disable-build-doc"
+CONFIGURE_CHECK=""
 day=$(date +%u)
 day=$(date +%u)
 if test $day -le 5
 if test $day -le 5
 then
 then
     CONFIGURE_CHECK="--enable-quick-check"
     CONFIGURE_CHECK="--enable-quick-check"
-else
-    CONFIGURE_CHECK="--enable-long-check"
+#else
+    # we do a normal check, a long check takes too long on VM nodes
 fi
 fi
 ../configure $CONFIGURE_OPTIONS $CONFIGURE_CHECK  $STARPU_CONFIGURE_OPTIONS
 ../configure $CONFIGURE_OPTIONS $CONFIGURE_CHECK  $STARPU_CONFIGURE_OPTIONS
 
 

+ 19 - 5
doc/doxygen/Makefile.am

@@ -29,10 +29,15 @@ txtdir   = $(docdir)/manual
 
 
 EXTRA_DIST =
 EXTRA_DIST =
 
 
-if BUILD_DOC
+if STARPU_BUILD_DOC
+if STARPU_BUILD_DOC_PDF
 all: $(DOX_HTML_DIR) $(DOX_PDF)
 all: $(DOX_HTML_DIR) $(DOX_PDF)
 EXTRA_DIST += $(DOX_HTML_DIR) $(DOX_PDF)
 EXTRA_DIST += $(DOX_HTML_DIR) $(DOX_PDF)
 txt_DATA = $(DOX_PDF)
 txt_DATA = $(DOX_PDF)
+else
+all: $(DOX_HTML_DIR)
+EXTRA_DIST += $(DOX_HTML_DIR)
+endif
 DOX_HTML_SRCDIR=$(DOX_HTML_DIR)
 DOX_HTML_SRCDIR=$(DOX_HTML_DIR)
 install-exec-hook:
 install-exec-hook:
 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html
 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html
@@ -40,9 +45,8 @@ install-exec-hook:
 uninstall-hook:
 uninstall-hook:
 	rm -rf $(DESTDIR)$(docdir)/manual/html
 	rm -rf $(DESTDIR)$(docdir)/manual/html
 else
 else
-if AVAILABLE_DOC
-EXTRA_DIST += $(top_srcdir)/doc/doxygen/html $(top_srcdir)/doc/doxygen/starpu.pdf
-txt_DATA = $(top_srcdir)/doc/doxygen/starpu.pdf
+if STARPU_AVAILABLE_DOC
+EXTRA_DIST += $(top_srcdir)/doc/doxygen/html
 DOX_HTML_SRCDIR=$(top_srcdir)/doc/doxygen/html
 DOX_HTML_SRCDIR=$(top_srcdir)/doc/doxygen/html
 install-exec-hook:
 install-exec-hook:
 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html
 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html
@@ -50,6 +54,10 @@ install-exec-hook:
 uninstall-hook:
 uninstall-hook:
 	rm -rf $(DESTDIR)$(docdir)/manual/html
 	rm -rf $(DESTDIR)$(docdir)/manual/html
 endif
 endif
+if STARPU_AVAILABLE_DOC_PDF
+EXTRA_DIST += $(top_srcdir)/doc/doxygen/starpu.pdf
+txt_DATA = $(top_srcdir)/doc/doxygen/starpu.pdf
+endif
 endif
 endif
 
 
 chapters =	\
 chapters =	\
@@ -136,7 +144,7 @@ images = 	\
 	chapters/images/tasks_size_overhead.png \
 	chapters/images/tasks_size_overhead.png \
 	chapters/images/temanejo.png
 	chapters/images/temanejo.png
 
 
-if BUILD_DOC
+if STARPU_BUILD_DOC
 starpu_config.h: $(top_srcdir)/include/starpu_config.h.in
 starpu_config.h: $(top_srcdir)/include/starpu_config.h.in
 	@$(SED) 's/#undef \(.*\)/#define \1 1/' $< > $@
 	@$(SED) 's/#undef \(.*\)/#define \1 1/' $< > $@
 
 
@@ -200,7 +208,9 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/include/starpu_expert.h		\
 	$(top_srcdir)/include/starpu_expert.h		\
 	$(top_srcdir)/include/starpu_fxt.h		\
 	$(top_srcdir)/include/starpu_fxt.h		\
 	$(top_srcdir)/include/starpu_hash.h		\
 	$(top_srcdir)/include/starpu_hash.h		\
+	$(top_srcdir)/include/starpu_helper.h		\
 	$(top_srcdir)/include/starpu_mic.h		\
 	$(top_srcdir)/include/starpu_mic.h		\
+	$(top_srcdir)/include/starpu_mpi_ms.h		\
 	$(top_srcdir)/include/starpu_mod.f90		\
 	$(top_srcdir)/include/starpu_mod.f90		\
 	$(top_srcdir)/include/starpu_opencl.h		\
 	$(top_srcdir)/include/starpu_opencl.h		\
 	$(top_srcdir)/include/starpu_openmp.h		\
 	$(top_srcdir)/include/starpu_openmp.h		\
@@ -227,6 +237,8 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/include/starpu_util.h		\
 	$(top_srcdir)/include/starpu_util.h		\
 	$(top_srcdir)/include/starpu_worker.h		\
 	$(top_srcdir)/include/starpu_worker.h		\
 	$(top_srcdir)/include/fstarpu_mod.f90		\
 	$(top_srcdir)/include/fstarpu_mod.f90		\
+	$(top_srcdir)/include/schedulers/starpu_heteroprio.h	\
+	$(top_srcdir)/starpufft/include/starpufft.h 	\
 	$(top_srcdir)/mpi/include/starpu_mpi.h 		\
 	$(top_srcdir)/mpi/include/starpu_mpi.h 		\
 	$(top_srcdir)/mpi/include/starpu_mpi_lb.h	\
 	$(top_srcdir)/mpi/include/starpu_mpi_lb.h	\
 	$(top_srcdir)/mpi/include/fstarpu_mpi_mod.f90		\
 	$(top_srcdir)/mpi/include/fstarpu_mpi_mod.f90		\
@@ -253,6 +265,8 @@ $(DOX_TAG): $(dox_inputs)
 	@$(SED) -i '/\\begin{titlepage}/,$$d' $(DOX_LATEX_DIR)/refman.tex
 	@$(SED) -i '/\\begin{titlepage}/,$$d' $(DOX_LATEX_DIR)/refman.tex
 	@cat $(top_srcdir)/doc/doxygen/refman.tex >> $(DOX_LATEX_DIR)/refman.tex
 	@cat $(top_srcdir)/doc/doxygen/refman.tex >> $(DOX_LATEX_DIR)/refman.tex
 
 
+$(DOX_HTML_DIR): $(DOX_TAG)
+
 $(DOX_PDF): $(DOX_TAG) refman.tex
 $(DOX_PDF): $(DOX_TAG) refman.tex
 	@cp $(top_srcdir)/doc/doxygen/chapters/version.sty $(DOX_LATEX_DIR)
 	@cp $(top_srcdir)/doc/doxygen/chapters/version.sty $(DOX_LATEX_DIR)
 	@cp $(top_srcdir)/doc/doxygen/chapters/images/*pdf $(DOX_LATEX_DIR)
 	@cp $(top_srcdir)/doc/doxygen/chapters/images/*pdf $(DOX_LATEX_DIR)

+ 1 - 0
doc/doxygen/chapters/470_simgrid.doxy

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by

+ 24 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -2,6 +2,7 @@
  *
  *
  * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2016       Uppsala University
  * Copyright (C) 2016       Uppsala University
+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -405,6 +406,20 @@ and friends.  The default is Enabled.
 This permits to test the performance effect of memory pinning.
 This permits to test the performance effect of memory pinning.
 </dd>
 </dd>
 
 
+<dt>STARPU_BACKOFF_MIN</dt>
+<dd>
+\anchor STARPU_BACKOFF_MIN
+\addindex __env__STARPU_BACKOFF_MIN
+Set minimum exponential backoff of number of cycles to pause when spinning. Default value is 1.
+</dd>
+
+<dt>STARPU_BACKOFF_MAX</dt>
+<dd>
+\anchor STARPU_BACKOFF_MAX
+\addindex __env__STARPU_BACKOFF_MAX
+Set maximum exponential backoff of number of cycles to pause when spinning. Default value is 32.
+</dd>
+
 <dt>STARPU_MIC_SINK_PROGRAM_NAME</dt>
 <dt>STARPU_MIC_SINK_PROGRAM_NAME</dt>
 <dd>
 <dd>
 \anchor STARPU_MIC_SINK_PROGRAM_NAME
 \anchor STARPU_MIC_SINK_PROGRAM_NAME
@@ -1351,6 +1366,15 @@ application has crashed. Setting this variable to a value other than 1
 will disable this behaviour. This should be done on JVM systems which
 will disable this behaviour. This should be done on JVM systems which
 may use these signals for their own needs.
 may use these signals for their own needs.
 The flag can also be set through the field starpu_conf::catch_signals.
 The flag can also be set through the field starpu_conf::catch_signals.
+</dd>
+
+<dt>STARPU_DISPLAY_BINDINGS</dt>
+<dd>
+\anchor STARPU_DISPLAY_BINDINGS
+\addindex __env__STARPU_DISPLAY_BINDINGS
+Display the binding of all processes and threads running on the machine. If MPI is enabled, display the binding of each node.<br>
+Users can manually display the binding by calling starpu_display_bindings().
+</dd>
 </dl>
 </dl>
 
 
 \section ConfiguringTheHypervisor Configuring The Hypervisor
 \section ConfiguringTheHypervisor Configuring The Hypervisor

+ 20 - 1
doc/doxygen/chapters/510_configure_options.doxy

@@ -115,7 +115,19 @@ Specify <c>hwloc</c> should not be used by StarPU.
 \addindex __configure__--disable-build-doc
 \addindex __configure__--disable-build-doc
 Disable the creation of the documentation. This should be done on a
 Disable the creation of the documentation. This should be done on a
 machine which does not have the tools <c>doxygen</c> and <c>latex</c>
 machine which does not have the tools <c>doxygen</c> and <c>latex</c>
-(plus the packages <c>latex-xcolor</c> and <c>texlive-latex-extra</c>).
+(plus the packages <c>latex-xcolor</c> and
+<c>texlive-latex-extra</c>).
+</dd>
+
+<dt>--enable-build-doc-pdf</dt>
+<dd>
+\anchor enable-build-doc-pdf
+\addindex __configure__--enable-build-doc-pdf
+By default, only the HTML documentation is generated. Use this option
+to also enable the generation of the PDF documentation. This should be
+done on a machine which does have the tools <c>doxygen</c> and <c>latex</c>
+(plus the packages <c>latex-xcolor</c> and
+<c>texlive-latex-extra</c>).
 </dd>
 </dd>
 
 
 <dt>--disable-icc</dt>
 <dt>--disable-icc</dt>
@@ -370,6 +382,13 @@ used by StarPU data structures.
 Disable the build of libstarpumpi. By default, it is enabled when MPI is found.
 Disable the build of libstarpumpi. By default, it is enabled when MPI is found.
 </dd>
 </dd>
 
 
+<dt>--enable-mpi</dt>
+<dd>
+\anchor enable-mpi
+\addindex __configure__--enable-mpi
+Enable the build of libstarpumpi. This is necessary when using Simgrid+MPI.
+</dd>
+
 <dt>--with-mpicc=<c>path</c></dt>
 <dt>--with-mpicc=<c>path</c></dt>
 <dd>
 <dd>
 \anchor with-mpicc
 \anchor with-mpicc

+ 34 - 0
doc/doxygen/dev/starpu_check_include.sh

@@ -0,0 +1,34 @@
+#!/bin/bash
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+dir=$(dirname $0)
+
+cd $dir/../../../
+for d in $(find . -name include -not -wholename "*/build/*")
+do
+    for f in $(find $d -name "*h")
+    do
+	for i in doxygen-config.cfg.in Makefile.am
+	do
+	    x=`grep $f $dir/../$i`
+	    if test -z "$x"
+	    then
+		echo $f missing in $i
+	    fi
+	done
+    done
+done

+ 14 - 6
doc/doxygen_dev/Makefile.am

@@ -29,10 +29,15 @@ txtdir   = $(docdir)/manual
 
 
 EXTRA_DIST =
 EXTRA_DIST =
 
 
-if BUILD_DOC
+if STARPU_BUILD_DOC
+if STARPU_BUILD_DOC_PDF
 all: $(DOX_HTML_DIR) $(DOX_PDF)
 all: $(DOX_HTML_DIR) $(DOX_PDF)
 EXTRA_DIST += $(DOX_HTML_DIR) $(DOX_PDF)
 EXTRA_DIST += $(DOX_HTML_DIR) $(DOX_PDF)
 txt_DATA = $(DOX_PDF)
 txt_DATA = $(DOX_PDF)
+else
+all: $(DOX_HTML_DIR)
+EXTRA_DIST += $(DOX_HTML_DIR)
+endif
 DOX_HTML_SRCDIR=$(DOX_HTML_DIR)
 DOX_HTML_SRCDIR=$(DOX_HTML_DIR)
 install-exec-hook:
 install-exec-hook:
 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html_dev
 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html_dev
@@ -40,9 +45,8 @@ install-exec-hook:
 uninstall-hook:
 uninstall-hook:
 	rm -rf $(DESTDIR)$(docdir)/manual/html_dev
 	rm -rf $(DESTDIR)$(docdir)/manual/html_dev
 else
 else
-if AVAILABLE_DOC
-EXTRA_DIST += $(top_srcdir)/doc/doxygen_dev/html_dev $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
-txt_DATA = $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
+if STARPU_AVAILABLE_DOC
+EXTRA_DIST += $(top_srcdir)/doc/doxygen_dev/html_dev
 DOX_HTML_SRCDIR=$(top_srcdir)/doc/doxygen_dev/html_dev
 DOX_HTML_SRCDIR=$(top_srcdir)/doc/doxygen_dev/html_dev
 install-exec-hook:
 install-exec-hook:
 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html_dev
 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html_dev
@@ -50,6 +54,10 @@ install-exec-hook:
 uninstall-hook:
 uninstall-hook:
 	rm -rf $(DESTDIR)$(docdir)/manual/html_dev
 	rm -rf $(DESTDIR)$(docdir)/manual/html_dev
 endif
 endif
+if STARPU_AVAILABLE_DOC_PDF
+EXTRA_DIST += $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
+txt_DATA = $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
+endif
 endif
 endif
 
 
 chapters =	\
 chapters =	\
@@ -58,7 +66,7 @@ chapters =	\
 
 
 images =
 images =
 
 
-if BUILD_DOC
+if STARPU_BUILD_DOC
 config.h: $(top_srcdir)/src/common/config.h.in
 config.h: $(top_srcdir)/src/common/config.h.in
 	@$(SED) 's/#undef \(.*\)/#define \1 1/' $< > $@
 	@$(SED) 's/#undef \(.*\)/#define \1 1/' $< > $@
 	@$(SED) -i '1s/^/\/\*\* \@file \*\/\n/' $@
 	@$(SED) -i '1s/^/\/\*\* \@file \*\/\n/' $@
@@ -191,7 +199,7 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/src/core/drivers.h	\
 	$(top_srcdir)/src/core/drivers.h	\
 	$(top_srcdir)/src/core/workers.h
 	$(top_srcdir)/src/core/workers.h
 
 
-$(DOX_HTML_DIR): $(DOX_TAG) refman.tex
+$(DOX_HTML_DIR): $(DOX_TAG)
 	@$(MKDIR_P) $(DOX_HTML_DIR)
 	@$(MKDIR_P) $(DOX_HTML_DIR)
 
 
 $(DOX_TAG): $(dox_inputs)
 $(DOX_TAG): $(dox_inputs)

+ 15 - 17
examples/Makefile.am

@@ -153,16 +153,13 @@ SHELL_TESTS =
 if !STARPU_USE_MPI_MASTER_SLAVE
 if !STARPU_USE_MPI_MASTER_SLAVE
 SHELL_TESTS			+=	scheduler/schedulers.sh
 SHELL_TESTS			+=	scheduler/schedulers.sh
 SHELL_TESTS			+=	scheduler/schedulers_context.sh
 SHELL_TESTS			+=	scheduler/schedulers_context.sh
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 SHELL_TESTS			+=	mult/sgemm.sh
 SHELL_TESTS			+=	mult/sgemm.sh
 endif
 endif
 endif
 endif
 
 
-if STARPU_HAVE_WINDOWS
 check_PROGRAMS		=	$(STARPU_EXAMPLES)
 check_PROGRAMS		=	$(STARPU_EXAMPLES)
-else
-check_PROGRAMS		=	$(LOADER) $(STARPU_EXAMPLES)
-endif
+noinst_PROGRAMS		=
 
 
 if !STARPU_HAVE_WINDOWS
 if !STARPU_HAVE_WINDOWS
 ## test loader program
 ## test loader program
@@ -171,6 +168,7 @@ LOADER			=	loader
 loader_CPPFLAGS 	=	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 loader_CPPFLAGS 	=	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 LOADER_BIN		=	$(abs_top_builddir)/examples/$(LOADER)
 LOADER_BIN		=	$(abs_top_builddir)/examples/$(LOADER)
 loader_SOURCES		=	../tests/loader.c
 loader_SOURCES		=	../tests/loader.c
+noinst_PROGRAMS		+=	loader
 else
 else
 LOADER			=
 LOADER			=
 LOADER_BIN		=	$(top_builddir)/examples/loader-cross.sh
 LOADER_BIN		=	$(top_builddir)/examples/loader-cross.sh
@@ -304,7 +302,7 @@ endif
 endif
 endif
 endif
 endif
 
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 STARPU_EXAMPLES +=				\
 STARPU_EXAMPLES +=				\
 	mult/sgemm 				\
 	mult/sgemm 				\
 	mult/dgemm				\
 	mult/dgemm				\
@@ -344,7 +342,7 @@ endif
 
 
 if !STARPU_SIMGRID
 if !STARPU_SIMGRID
 
 
-if MKL_BLAS_LIB
+if STARPU_MKL_BLAS_LIB
 STARPU_EXAMPLES +=				\
 STARPU_EXAMPLES +=				\
 	lu/lu_example_complex_float		\
 	lu/lu_example_complex_float		\
 	lu/lu_example_complex_double		\
 	lu/lu_example_complex_double		\
@@ -646,7 +644,7 @@ endif
 # AXPY example #
 # AXPY example #
 ################
 ################
 
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 axpy_axpy_SOURCES =				\
 axpy_axpy_SOURCES =				\
 	axpy/axpy.c				\
 	axpy/axpy.c				\
 	common/blas.c
 	common/blas.c
@@ -665,7 +663,7 @@ endif
 # Mult example #
 # Mult example #
 ################
 ################
 
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 
 
 mult_sgemm_SOURCES = 				\
 mult_sgemm_SOURCES = 				\
 	mult/sgemm.c				\
 	mult/sgemm.c				\
@@ -687,7 +685,7 @@ endif
 # Cholesky example #
 # Cholesky example #
 ####################
 ####################
 
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 
 
 cholesky_cholesky_tag_SOURCES =			\
 cholesky_cholesky_tag_SOURCES =			\
 	cholesky/cholesky_tag.c			\
 	cholesky/cholesky_tag.c			\
@@ -742,7 +740,7 @@ endif
 # LU example #
 # LU example #
 ##############
 ##############
 
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 
 
 lu_lu_example_float_SOURCES =			\
 lu_lu_example_float_SOURCES =			\
 	lu/lu_example_float.c			\
 	lu/lu_example_float.c			\
@@ -784,7 +782,7 @@ lu_lu_implicit_example_double_SOURCES =		\
 lu_lu_implicit_example_double_LDADD =		\
 lu_lu_implicit_example_double_LDADD =		\
 	$(STARPU_BLAS_LDFLAGS)
 	$(STARPU_BLAS_LDFLAGS)
 
 
-if MKL_BLAS_LIB
+if STARPU_MKL_BLAS_LIB
 lu_lu_example_complex_float_SOURCES =		\
 lu_lu_example_complex_float_SOURCES =		\
 	lu/lu_example_complex_float.c		\
 	lu/lu_example_complex_float.c		\
 	lu/clu.c				\
 	lu/clu.c				\
@@ -837,7 +835,7 @@ endif
 # Heat example #
 # Heat example #
 ################
 ################
 
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 
 
 heat_heat_SOURCES =				\
 heat_heat_SOURCES =				\
 	heat/heat.c				\
 	heat/heat.c				\
@@ -861,7 +859,7 @@ endif
 # CG example #
 # CG example #
 ##############
 ##############
 
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 
 
 cg_cg_SOURCES =					\
 cg_cg_SOURCES =					\
 	cg/cg.c					\
 	cg/cg.c					\
@@ -1013,7 +1011,7 @@ examplebin_PROGRAMS +=				\
 	mandelbrot/mandelbrot
 	mandelbrot/mandelbrot
 
 
 mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
 mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
-if HAVE_X11
+if STARPU_HAVE_X11
 mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
 mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
 mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) $(X_LIBS) -lX11 $(X_EXTRA_LIBS)
 mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) $(X_LIBS) -lX11 $(X_EXTRA_LIBS)
 endif
 endif
@@ -1062,7 +1060,7 @@ endif
 # OpenGL interoperability #
 # OpenGL interoperability #
 ###########################
 ###########################
 
 
-if HAVE_OPENGL
+if STARPU_HAVE_OPENGL
 examplebin_PROGRAMS +=				\
 examplebin_PROGRAMS +=				\
 	gl_interop/gl_interop			\
 	gl_interop/gl_interop			\
 	gl_interop/gl_interop_idle
 	gl_interop/gl_interop_idle
@@ -1084,7 +1082,7 @@ endif
 # pipeline example #
 # pipeline example #
 ####################
 ####################
 
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 pipeline_pipeline_SOURCES	=	\
 pipeline_pipeline_SOURCES	=	\
 	pipeline/pipeline.c		\
 	pipeline/pipeline.c		\
 	common/blas.c
 	common/blas.c

+ 2 - 0
examples/mlr/mlr.c

@@ -110,7 +110,9 @@ static struct starpu_perfmodel cl_model_init =
    template.
    template.
  */
  */
 
 
+/* M^2 * N^1 * K^0 */
 static unsigned combi1 [3]		= {	2,	1,	0 };
 static unsigned combi1 [3]		= {	2,	1,	0 };
+/* M^0 * N^3 * K^1 */
 static unsigned combi2 [3]		= {	0,	3,	1 };
 static unsigned combi2 [3]		= {	0,	3,	1 };
 
 
 static unsigned *combinations[] = { combi1, combi2 };
 static unsigned *combinations[] = { combi1, combi2 };

+ 4 - 6
examples/stencil/Makefile.am

@@ -20,7 +20,7 @@ LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) @LIBS@ $(FXT_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
 AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@ $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@ $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
 
-if USE_MPI
+if STARPU_USE_MPI
 LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 AM_CPPFLAGS += -I$(top_srcdir)/mpi/include
 AM_CPPFLAGS += -I$(top_srcdir)/mpi/include
 if STARPU_SIMGRID
 if STARPU_SIMGRID
@@ -56,14 +56,11 @@ endif
 # What to install and what to check #
 # What to install and what to check #
 #####################################
 #####################################
 
 
-if STARPU_HAVE_WINDOWS
 check_PROGRAMS	=	$(STARPU_EXAMPLES)
 check_PROGRAMS	=	$(STARPU_EXAMPLES)
-else
-check_PROGRAMS	=	$(LOADER) $(STARPU_EXAMPLES)
-endif
+noinst_PROGRAMS	=
 
 
 if !STARPU_SIMGRID
 if !STARPU_SIMGRID
-if USE_MPI
+if STARPU_USE_MPI
 if STARPU_MPI_CHECK
 if STARPU_MPI_CHECK
 TESTS		=	$(STARPU_EXAMPLES)
 TESTS		=	$(STARPU_EXAMPLES)
 endif
 endif
@@ -79,6 +76,7 @@ LOADER			=	loader
 loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 LOADER_BIN		=	./$(LOADER)
 LOADER_BIN		=	./$(LOADER)
 loader_SOURCES		=	../../tests/loader.c
 loader_SOURCES		=	../../tests/loader.c
+noinst_PROGRAMS		+=	loader
 else
 else
 LOADER			=
 LOADER			=
 LOADER_BIN		=	$(top_builddir)/examples/stencil/loader-cross.sh
 LOADER_BIN		=	$(top_builddir)/examples/stencil/loader-cross.sh

+ 16 - 0
include/starpu.h

@@ -111,6 +111,12 @@ struct starpu_conf
 	int magic;
 	int magic;
 
 
 	/**
 	/**
+	   @private
+	   Tell starpu_init() if MPI will be initialized later.
+	*/
+	int will_use_mpi;
+
+	/**
 	   Name of the scheduling policy. This can also be specified
 	   Name of the scheduling policy. This can also be specified
 	   with the environment variable \ref STARPU_SCHED. (default =
 	   with the environment variable \ref STARPU_SCHED. (default =
 	   <c>NULL</c>).
 	   <c>NULL</c>).
@@ -441,6 +447,16 @@ struct starpu_conf
 	   performance counters after initialization
 	   performance counters after initialization
 	 */
 	 */
 	unsigned start_perf_counter_collection;
 	unsigned start_perf_counter_collection;
+
+	/**
+	   Minimum spinning backoff of drivers. Default value: \c 1
+	 */
+	unsigned driver_spinning_backoff_min;
+
+	/**
+	   Maximum spinning backoff of drivers. Default value: \c 32
+	 */
+	unsigned driver_spinning_backoff_max;
 };
 };
 
 
 /**
 /**

+ 9 - 1
include/starpu_data.h

@@ -123,7 +123,7 @@ void starpu_data_set_name(starpu_data_handle_t handle, const char *name);
    tools. \p dimensions is the size of the \p dims array. This can be
    tools. \p dimensions is the size of the \p dims array. This can be
    for instance the tile coordinates within a big matrix.
    for instance the tile coordinates within a big matrix.
 */
 */
-void starpu_data_set_coordinates_array(starpu_data_handle_t handle, int dimensions, int dims[]);
+void starpu_data_set_coordinates_array(starpu_data_handle_t handle, unsigned dimensions, int dims[]);
 
 
 /**
 /**
    Set the coordinates of the data, to be shown in various profiling
    Set the coordinates of the data, to be shown in various profiling
@@ -133,6 +133,14 @@ void starpu_data_set_coordinates_array(starpu_data_handle_t handle, int dimensio
 void starpu_data_set_coordinates(starpu_data_handle_t handle, unsigned dimensions, ...);
 void starpu_data_set_coordinates(starpu_data_handle_t handle, unsigned dimensions, ...);
 
 
 /**
 /**
+   Get the coordinates of the data, as set by a previous call to
+   starpu_data_set_coordinates_array() or starpu_data_set_coordinates().
+   \p dimensions is the size of the \p dims array.
+   This returns the actual number of returned coordinates.
+*/
+unsigned starpu_data_get_coordinates_array(starpu_data_handle_t handle, unsigned dimensions, int dims[]);
+
+/**
    Unregister a data \p handle from StarPU. If the data was
    Unregister a data \p handle from StarPU. If the data was
    automatically allocated by StarPU because the home node was -1, all
    automatically allocated by StarPU because the home node was -1, all
    automatically allocated buffers are freed. Otherwise, a valid copy
    automatically allocated buffers are freed. Otherwise, a valid copy

+ 26 - 2
include/starpu_helper.h

@@ -50,6 +50,20 @@ extern int _starpu_silent;
 char *starpu_getenv(const char *str);
 char *starpu_getenv(const char *str);
 
 
 /**
 /**
+   If the environment variable \c str is defined and its value is contained in the array \c strings, return the array position.
+   Raise an error if the environment variable \c str is defined with a value not in \c strings.
+   Return \c defvalue if the environment variable \c str is not defined.
+ */
+int starpu_get_env_string_var_default(const char *str, const char *strings[], int defvalue);
+
+/**
+   If the environment variable \c str is defined with a well-defined size value, return the value as a size in bytes. Expected size qualifiers are b, B, k, K, m, M, g, G. The default qualifier is K.
+   If the environment variable \c str is not defined or is empty, return \c defval.
+   Raise an error if the value of the environment variable \c str is not well-defined.
+ */
+int starpu_get_env_size_default(const char *str, int defval);
+
+/**
    Return the integer value of the environment variable named \p str.
    Return the integer value of the environment variable named \p str.
    Return 0 otherwise (the variable does not exist or has a
    Return 0 otherwise (the variable does not exist or has a
    non-integer value).
    non-integer value).
@@ -66,7 +80,8 @@ static __starpu_inline int starpu_get_env_number(const char *str)
 		char *pcheck;
 		char *pcheck;
 
 
 		val = strtol(strval, &pcheck, 10);
 		val = strtol(strval, &pcheck, 10);
-		if (*pcheck) {
+		if (*pcheck)
+		{
 			fprintf(stderr,"The %s environment variable must contain an integer\n", str);
 			fprintf(stderr,"The %s environment variable must contain an integer\n", str);
 			STARPU_ABORT();
 			STARPU_ABORT();
 		}
 		}
@@ -103,7 +118,8 @@ static __starpu_inline float starpu_get_env_float_default(const char *str, float
 		char *pcheck;
 		char *pcheck;
 
 
 		val = strtof(strval, &pcheck);
 		val = strtof(strval, &pcheck);
-		if (*pcheck) {
+		if (*pcheck)
+		{
 			fprintf(stderr,"The %s environment variable must contain a float\n", str);
 			fprintf(stderr,"The %s environment variable must contain a float\n", str);
 			STARPU_ABORT();
 			STARPU_ABORT();
 		}
 		}
@@ -166,6 +182,14 @@ double starpu_timing_now(void);
 */
 */
 int starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_handle, int asynchronous, void (*callback_func)(void*), void *callback_arg);
 int starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_handle, int asynchronous, void (*callback_func)(void*), void *callback_arg);
 
 
+/**
+   Call hwloc-ps to display the binding of each process and thread running on
+   the machine.<br>
+   Use the environment variable \ref STARPU_DISPLAY_BINDINGS to automatically
+   call this function at the beginning of the execution of StarPU.
+*/
+void starpu_display_bindings(void);
+
 /** @} */
 /** @} */
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus

+ 11 - 0
include/starpu_util.h

@@ -598,6 +598,17 @@ STARPU_ATOMIC_SOMETHING64(or, old | value)
 #define STARPU_WMB() STARPU_SYNCHRONIZE()
 #define STARPU_WMB() STARPU_SYNCHRONIZE()
 #endif
 #endif
 
 
+/* Cache line size assumed for the target architecture, selected from the
+ * compiler's predefined architecture macros.
+ * NOTE(review): the unit is presumably bytes -- confirm against users. */
+#if defined(__i386__) || defined(__x86_64__)
+#define STARPU_CACHELINE_SIZE 64
+#elif defined(__ppc__) || defined(__ppc64__) || defined(__ia64__)
+#define STARPU_CACHELINE_SIZE 128
+#elif defined(__s390__) || defined(__s390x__)
+#define STARPU_CACHELINE_SIZE 256
+#else
+/* Conservative default */
+#define STARPU_CACHELINE_SIZE 1024
+#endif
+
 #ifdef _WIN32
 #ifdef _WIN32
 /* Try to fetch the system definition of timespec */
 /* Try to fetch the system definition of timespec */
 #include <sys/types.h>
 #include <sys/types.h>

+ 5 - 1
julia/Makefile.am

@@ -15,7 +15,11 @@
 #
 #
 include $(top_srcdir)/starpu.mk
 include $(top_srcdir)/starpu.mk
 
 
-SUBDIRS = src examples
+SUBDIRS = src
+
+if STARPU_BUILD_EXAMPLES
+SUBDIRS += examples
+endif
 
 
 EXTRA_DIST = README
 EXTRA_DIST = README
 
 

+ 9 - 9
julia/Manifest.toml

@@ -7,28 +7,28 @@ uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
 deps = ["Libdl", "Random", "Test"]
 deps = ["Libdl", "Random", "Test"]
 git-tree-sha1 = "6f457df38ae2ba239d5e43b80493bb907de826b2"
 git-tree-sha1 = "6f457df38ae2ba239d5e43b80493bb907de826b2"
 repo-rev = "655e9862947d17423f2fb91ea1014e1cb73c1be1"
 repo-rev = "655e9862947d17423f2fb91ea1014e1cb73c1be1"
-repo-url = "https://github.com/analytech-solutions/CBinding.jl.git"
+repo-url = "https://github.com/analytech-solutions/CBinding.jl"
 uuid = "d43a6710-96b8-4a2d-833c-c424785e5374"
 uuid = "d43a6710-96b8-4a2d-833c-c424785e5374"
 version = "0.8.1"
 version = "0.8.1"
 
 
 [[CEnum]]
 [[CEnum]]
-git-tree-sha1 = "62847acab40e6855a9b5905ccb99c2b5cf6b3ebb"
+git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
 uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
 uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
-version = "0.2.0"
+version = "0.4.1"
 
 
 [[Clang]]
 [[Clang]]
 deps = ["CEnum", "DataStructures", "LLVM_jll", "Libdl"]
 deps = ["CEnum", "DataStructures", "LLVM_jll", "Libdl"]
-git-tree-sha1 = "45013227beea038ecc17e8c07cd7c7b05ed26067"
-repo-rev = "master"
-repo-url = "https://github.com/phuchant/Clang.jl.git"
+git-tree-sha1 = "2142a3a54faa28f08edb7b16bde2d3d32b1f3785"
+repo-rev = "29ad279"
+repo-url = "https://github.com/phuchant/Clang.jl"
 uuid = "40e3b903-d033-50b4-a0cc-940c62c95e31"
 uuid = "40e3b903-d033-50b4-a0cc-940c62c95e31"
-version = "0.11.0"
+version = "0.11.1"
 
 
 [[DataStructures]]
 [[DataStructures]]
 deps = ["InteractiveUtils", "OrderedCollections"]
 deps = ["InteractiveUtils", "OrderedCollections"]
-git-tree-sha1 = "6166ecfaf2b8bbf2b68d791bc1d54501f345d314"
+git-tree-sha1 = "af6d9c86e191c917c2276fbede1137e8ea20157f"
 uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
-version = "0.17.15"
+version = "0.17.17"
 
 
 [[Dates]]
 [[Dates]]
 deps = ["Printf"]
 deps = ["Printf"]

+ 16 - 4
julia/examples/Makefile.am

@@ -15,6 +15,8 @@
 #
 #
 include $(top_srcdir)/starpu.mk
 include $(top_srcdir)/starpu.mk
 
 
+noinst_PROGRAMS		=
+
 if STARPU_HAVE_WINDOWS
 if STARPU_HAVE_WINDOWS
 LOADER_BIN		=
 LOADER_BIN		=
 else
 else
@@ -22,6 +24,7 @@ loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 if !STARPU_SIMGRID
 if !STARPU_SIMGRID
 LOADER			=	loader
 LOADER			=	loader
 LOADER_BIN		=	$(abs_top_builddir)/julia/examples/$(LOADER)
 LOADER_BIN		=	$(abs_top_builddir)/julia/examples/$(LOADER)
+noinst_PROGRAMS		+=	loader
 endif
 endif
 loader_SOURCES		=	../../tests/loader.c
 loader_SOURCES		=	../../tests/loader.c
 endif
 endif
@@ -44,12 +47,21 @@ EXTRA_DIST =					\
 	callback/callback.sh			\
 	callback/callback.sh			\
 	check_deps/check_deps.jl		\
 	check_deps/check_deps.jl		\
 	check_deps/check_deps.sh		\
 	check_deps/check_deps.sh		\
+	cholesky/cholesky_codelets.jl		\
+	cholesky/cholesky_common.jl		\
+	cholesky/cholesky_native.jl		\
+	cholesky/cholesky_implicit.jl		\
+	cholesky/cholesky_tag.jl		\
+	cholesky/cholesky.sh			\
 	dependency/end_dep.jl			\
 	dependency/end_dep.jl			\
 	dependency/end_dep.sh			\
 	dependency/end_dep.sh			\
 	dependency/tag_dep.jl			\
 	dependency/tag_dep.jl			\
 	dependency/tag_dep.sh			\
 	dependency/tag_dep.sh			\
 	dependency/task_dep.sh			\
 	dependency/task_dep.sh			\
 	dependency/task_dep.jl			\
 	dependency/task_dep.jl			\
+	gemm/gemm.jl				\
+	gemm/gemm_native.jl			\
+	gemm/gemm.sh				\
 	mandelbrot/mandelbrot_native.jl		\
 	mandelbrot/mandelbrot_native.jl		\
 	mandelbrot/mandelbrot.jl		\
 	mandelbrot/mandelbrot.jl		\
 	mandelbrot/mandelbrot.sh		\
 	mandelbrot/mandelbrot.sh		\
@@ -92,11 +104,9 @@ check_PROGRAMS = $(LOADER) $(starpu_julia_EXAMPLES)
 SHELL_TESTS	=
 SHELL_TESTS	=
 STARPU_JULIA_EXAMPLES	=
 STARPU_JULIA_EXAMPLES	=
 
 
-if BUILD_EXAMPLES
 examplebin_PROGRAMS 	+=	$(STARPU_JULIA_EXAMPLES)
 examplebin_PROGRAMS 	+=	$(STARPU_JULIA_EXAMPLES)
 
 
 TESTS			=	$(SHELL_TESTS) $(STARPU_JULIA_EXAMPLES)
 TESTS			=	$(SHELL_TESTS) $(STARPU_JULIA_EXAMPLES)
-endif
 
 
 ######################
 ######################
 #      Examples      #
 #      Examples      #
@@ -127,6 +137,8 @@ SHELL_TESTS			+=	dependency/tag_dep.sh
 SHELL_TESTS			+=	dependency/task_dep.sh
 SHELL_TESTS			+=	dependency/task_dep.sh
 SHELL_TESTS			+=	dependency/end_dep.sh
 SHELL_TESTS			+=	dependency/end_dep.sh
 
 
-if !NO_BLAS_LIB
-SHELL_TESTS			+= axpy/axpy.sh
+if !STARPU_NO_BLAS_LIB
+SHELL_TESTS			+=	axpy/axpy.sh
+SHELL_TESTS			+=	cholesky/cholesky.sh
+SHELL_TESTS			+=	gemm/gemm.sh
 endif
 endif

+ 41 - 31
julia/examples/axpy/axpy.jl

@@ -14,7 +14,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
 #
 using StarPU
 using StarPU
-
+using Printf
 const EPSILON = 1e-6
 const EPSILON = 1e-6
 
 
 function check(alpha, X, Y)
 function check(alpha, X, Y)
@@ -26,36 +26,27 @@ function check(alpha, X, Y)
     end
     end
 end
 end
 
 
-function main()
-    N = 16 * 1024 * 1024
-    NBLOCKS = 8
-    alpha = 3.41
-
-    starpu_init()
-    starpu_cublas_init()
+@target STARPU_CPU+STARPU_CUDA
+@codelet function axpy(X :: Vector{Float32}, Y :: Vector{Float32}, alpha ::Float32) :: Nothing
+    STARPU_SAXPY(length(X), alpha, X, 1, Y, 1)
+    return
+end
 
 
+function axpy(N, NBLOCKS, alpha, display = true)
     X = Array(fill(1.0f0, N))
     X = Array(fill(1.0f0, N))
     Y = Array(fill(4.0f0, N))
     Y = Array(fill(4.0f0, N))
 
 
     starpu_memory_pin(X)
     starpu_memory_pin(X)
     starpu_memory_pin(Y)
     starpu_memory_pin(Y)
 
 
-    println("BEFORE x[0] = ", X[1])
-    println("BEFORE y[0] = ", Y[1])
-
     block_filter = starpu_data_filter(STARPU_VECTOR_FILTER_BLOCK, NBLOCKS)
     block_filter = starpu_data_filter(STARPU_VECTOR_FILTER_BLOCK, NBLOCKS)
 
 
-    perfmodel = starpu_perfmodel(
-        perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
-        symbol = "history_perf"
-    )
+    if display
+        println("BEFORE x[0] = ", X[1])
+        println("BEFORE y[0] = ", Y[1])
+    end
 
 
-    cl = starpu_codelet(
-        cpu_func = STARPU_SAXPY,
-        cuda_func = STARPU_SAXPY,
-        modes = [STARPU_R, STARPU_RW],
-        perfmodel = perfmodel
-    )
+    t_start = time_ns()
 
 
     @starpu_block let
     @starpu_block let
         hX,hY = starpu_data_register(X, Y)
         hX,hY = starpu_data_register(X, Y)
@@ -63,26 +54,45 @@ function main()
         starpu_data_partition(hX, block_filter)
         starpu_data_partition(hX, block_filter)
         starpu_data_partition(hY, block_filter)
         starpu_data_partition(hY, block_filter)
 
 
-        t_start = time_ns()
-
         for b in 1:NBLOCKS
         for b in 1:NBLOCKS
-            task = starpu_task(cl = cl, handles = [hX[b],hY[b]], cl_arg=(Float32(alpha),),
-                               tag=starpu_tag_t(b))
-            starpu_task_submit(task)
+            starpu_task_insert(codelet_name = "axpy",
+                               handles = [hX[b], hY[b]],
+                               cl_arg = (Float32(alpha),),
+                               tag = starpu_tag_t(b),
+                               modes = [STARPU_R, STARPU_RW])
         end
         end
+
         starpu_task_wait_for_all()
         starpu_task_wait_for_all()
+    end
 
 
-        t_end = time_ns()
-        timing = (t_end - t_start) / 1000
+    t_end = time_ns()
 
 
-        println("timing -> ", timing, " us ", 3*N*4/timing, "MB/s")
+    timing = (t_end-t_start)/1000
 
 
+    if display
+        @printf("timing -> %d us %.2f MB/s\n", timing, 3*N*4/timing)
+        println("AFTER y[0] = ", Y[1], " (ALPHA=", alpha, ")")
     end
     end
 
 
-    println("AFTER y[0] = ", Y[1], " (ALPHA=", alpha, ")")
-
     check(alpha, X, Y)
     check(alpha, X, Y)
 
 
+    starpu_memory_unpin(X)
+    starpu_memory_unpin(Y)
+end
+
+function main()
+    N = 16 * 1024 * 1024
+    NBLOCKS = 8
+    alpha = 3.41
+
+    starpu_init()
+    starpu_cublas_init()
+
+    # warmup
+    axpy(10, 1, alpha, false)
+
+    axpy(N, NBLOCKS, alpha)
+
     starpu_shutdown()
     starpu_shutdown()
 end
 end
 
 

+ 7 - 6
julia/examples/callback/callback.jl

@@ -37,9 +37,7 @@ function variable_with_starpu(val ::Ref{Int32})
     )
     )
 
 
     cl = starpu_codelet(
     cl = starpu_codelet(
-        cpu_func = CPU_CODELETS["variable"],
-        # cuda_func = CUDA_CODELETS["matrix_mult"],
-        #opencl_func="ocl_matrix_mult",
+        cpu_func = "variable",
         modes = [STARPU_RW],
         modes = [STARPU_RW],
         perfmodel = perfmodel
         perfmodel = perfmodel
     )
     )
@@ -47,8 +45,11 @@ function variable_with_starpu(val ::Ref{Int32})
     @starpu_block let
     @starpu_block let
 	hVal = starpu_data_register(val)
 	hVal = starpu_data_register(val)
 
 
-        task = starpu_task(cl = cl, handles = [hVal], callback=callback, callback_arg=(cl, [hVal]))
-        starpu_task_submit(task)
+        starpu_task_insert(codelet_name = "variable",
+                           cl = cl,
+                           handles = [hVal],
+                           callback = callback,
+                           callback_arg = (cl, [hVal]))
 
 
         starpu_task_wait_for_all()
         starpu_task_wait_for_all()
     end
     end
@@ -63,7 +64,7 @@ function display()
     if v[] == 42
     if v[] == 42
         println("result is correct")
         println("result is correct")
     else
     else
-        println("result is incorret")
+        error("result is incorret")
     end
     end
 end
 end
 
 

+ 15 - 0
julia/examples/check_deps/check_deps.jl

@@ -1,3 +1,18 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
 import Pkg
 import Pkg
 
 
 try
 try

+ 20 - 0
julia/examples/cholesky/cholesky.sh

@@ -0,0 +1,20 @@
+#!/bin/bash
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+# Stop at the first failing example: this script is run as a test, and without
+# this only the exit status of the last example would be reported.
+set -e
+
+"$(dirname "$0")"/../execute.sh cholesky/cholesky_native.jl -quickcheck
+"$(dirname "$0")"/../execute.sh cholesky/cholesky_implicit.jl -quickcheck
+"$(dirname "$0")"/../execute.sh cholesky/cholesky_tag.jl -quickcheck

+ 52 - 0
julia/examples/cholesky/cholesky_codelets.jl

@@ -0,0 +1,52 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+# History-based performance models, one per Cholesky codelet defined below.
+chol_model11 = starpu_perfmodel(
+    perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+    symbol = "chol_model11"
+)
+
+chol_model21 = starpu_perfmodel(
+    perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+    symbol = "chol_model21"
+)
+
+chol_model22 = starpu_perfmodel(
+    perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+    symbol = "chol_model22"
+)
+
+# Codelets for the u11, u21 and u22 kernels: each uses the same kernel name for
+# its CPU and CUDA implementation and lists the access mode of every handle.
+cl_11 = starpu_codelet(
+    cpu_func = "u11",
+    cuda_func = "u11",
+    modes = [STARPU_RW],
+    color = 0xffff00,
+    perfmodel = chol_model11
+)
+cl_21 = starpu_codelet(
+    cpu_func = "u21",
+    cuda_func = "u21",
+    modes = [STARPU_R, STARPU_RW],
+    color = 0x8080ff,
+    perfmodel = chol_model21
+)
+cl_22 = starpu_codelet(
+    cpu_func = "u22",
+    cuda_func = "u22",
+    modes = [STARPU_R, STARPU_R, STARPU_RW],
+    color = 0x00ff00,
+    perfmodel = chol_model22
+)

+ 154 - 0
julia/examples/cholesky/cholesky_common.jl

@@ -0,0 +1,154 @@
+# Standard kernels for the Cholesky factorization
+# U22 is the gemm update
+# U21 is the trsm update
+# U11 is the cholesky factorization
+
+# U11 kernel: in-place Cholesky factorization of the tile sub11.
+# For each column z: take the square root of the diagonal entry, scale the
+# entries below it by its inverse (SSCAL), then apply a rank-1 update with
+# alpha = -1 to the trailing submatrix (SSYR on the "L" triangle).
+@target STARPU_CPU+STARPU_CUDA
+@codelet function u11(sub11 :: Matrix{Float32}) :: Nothing
+    nx :: Int32 = width(sub11)
+    ld :: Int32 = ld(sub11)
+
+    for z in 0:nx-1
+        lambda11 :: Float32 = sqrt(sub11[z+1,z+1])
+        sub11[z+1,z+1] = lambda11
+
+        # Rows z+2..nx of column z+1, i.e. the part of the column below the diagonal.
+        alpha ::Float32 = 1.0f0 / lambda11
+        X :: Vector{Float32} = view(sub11, z+2:z+2+(nx-z-2), z+1)
+        STARPU_SSCAL(nx-z-1, alpha, X, 1)
+
+        # Trailing (nx-z-1) x (nx-z-1) submatrix updated with the scaled column.
+        alpha = -1.0f0
+        A :: Matrix{Float32} = view(sub11, z+2:z+2+(nx-z-2), z+2:z+2+(nx-z-2))
+	STARPU_SSYR("L", nx-z-1, alpha, X, 1, A, ld)
+    end
+    return
+end
+
+# U21 kernel: triangular solve (STRSM) that updates sub21 using the tile sub11,
+# with alpha = 1 and the ("R", "L", "T", "N") side/uplo/trans/diag options.
+@target STARPU_CPU+STARPU_CUDA
+@codelet function u21(sub11 :: Matrix{Float32},
+                      sub21 :: Matrix{Float32}) :: Nothing
+    ld11 :: Int32 = ld(sub11)
+    ld21 :: Int32 = ld(sub21)
+    nx21 :: Int32 = width(sub21)
+    ny21 :: Int32 = height(sub21)
+    alpha :: Float32 = 1.0f0
+    STARPU_STRSM("R", "L", "T", "N", nx21, ny21, alpha, sub11, ld11, sub21, ld21)
+    return
+end
+
+# U22 kernel: GEMM update, center <- center - left * right^T
+# (assuming STARPU_SGEMM follows the standard BLAS SGEMM semantics).
+# Each operand is passed together with its own leading dimension.
+@target STARPU_CPU+STARPU_CUDA
+@codelet function u22(left   :: Matrix{Float32},
+                      right  :: Matrix{Float32},
+                      center :: Matrix{Float32}) :: Nothing
+    dx :: Int32 = width(center)
+    dy :: Int32 = height(center)
+    dz :: Int32 = width(left)
+    ld21 :: Int32 = ld(left)
+    # ld12 belongs to `right` and ld22 to `center`; they used to be swapped,
+    # which only worked because all tiles shared the same leading dimension.
+    ld12 :: Int32 = ld(right)
+    ld22 :: Int32 = ld(center)
+    alpha :: Float32 = -1.0f0
+    beta :: Float32 = 1.0f0
+    STARPU_SGEMM("N", "T", dy, dx, dz, alpha, left, ld21, right, ld12, beta, center, ld22)
+    return
+end
+
+# Tag of the U11 task of step k: kind marker 1 in the top bits, k in the low bits.
+@inline function tag11(k)
+    kind = UInt64(1) << 60
+    return starpu_tag_t(kind | UInt64(k))
+end
+
+# Tag of a U21 task: kind marker 3 in the top bits, k shifted by 32, j in the low bits.
+@inline function tag21(k, j)
+    kind = UInt64(3) << 60
+    step = UInt64(k) << 32
+    return starpu_tag_t(kind | step | UInt64(j))
+end
+
+# Tag of a U22 task: kind marker 4 in the top bits, then k (<<32), i (<<16) and j.
+@inline function tag22(k, i, j)
+    kind = UInt64(4) << 60
+    step = UInt64(k) << 32
+    row = UInt64(i) << 16
+    return starpu_tag_t(kind | step | row | UInt64(j))
+end
+
+# Verify a factorization result: clear the strict upper triangle of `mat`
+# (in place), rebuild the lower triangle of mat * mat' with syrk!, and compare
+# it entry by entry with the matrix generated by main(). Raises an error on the
+# first entry whose relative error exceeds 1e-4.
+function check(mat::Matrix{Float32})
+    n = size(mat, 1)
+
+    # Only the lower triangle holds the factor; zero everything above the diagonal.
+    for i in 1:n, j in i+1:n
+        mat[i, j] = 0.0f0
+    end
+
+    rebuilt ::Matrix{Float32} = zeros(Float32, n, n)
+    syrk!('L', 'N', 1.0f0, mat, 0.0f0, rebuilt)
+
+    # syrk! only fills the 'L' triangle, so compare the entries with j <= i.
+    for i in 1:n, j in 1:i
+        diag_shift = (i == j) ? 1.0f0*n : 0.0f0
+        orig = (1.0f0/(1.0f0+(i-1)+(j-1))) + diag_shift
+        got = rebuilt[i,j]
+        err = abs(got - orig) / orig
+        if err > 0.0001
+            expected = orig
+            error("[$i, $j] -> $got != $expected (err $err)")
+        end
+    end
+
+    println(stderr, "Verification successful !")
+end
+
+# Remove every tag built by tag11/tag21/tag22 for an nblocks x nblocks tiling.
+function clean_tags(nblocks)
+    for step in 1:nblocks
+        starpu_tag_remove(tag11(step))
+
+        for row in step+1:nblocks
+            starpu_tag_remove(tag21(step, row))
+
+            # Columns step+1 .. row: the lower-triangular part only.
+            for col in step+1:row
+                starpu_tag_remove(tag22(step, row, col))
+            end
+        end
+    end
+end
+
+# Build a size_p x size_p test matrix, factorize it with cholesky() using
+# nblocks blocks per dimension, and print "size<TAB>ms<TAB>GFlops".
+# verify:  run check() on the result.
+# verbose: display the matrix before and after the factorization.
+function main(size_p :: Int, nblocks :: Int; verify = false, verbose = false)
+    mat :: Matrix{Float32} = zeros(Float32, size_p, size_p)
+
+    # Create a simple symmetric positive definite matrix:
+    # Hilbert matrix h(i,j) = 1/(i+j+1) (0-based indices), plus size_p on the diagonal.
+
+    for i in 1:size_p
+        for j in 1:size_p
+            mat[i, j] = 1.0f0 / (1.0f0+(i-1)+(j-1)) + ((i == j) ? 1.0f0*size_p : 0.0f0)
+        end
+    end
+
+    if verbose
+        display(mat)
+    end
+
+    starpu_memory_pin(mat)
+
+    # Only the call to cholesky() is timed.
+    t_start = time_ns()
+
+    cholesky(mat, size_p, nblocks)
+
+    t_end = time_ns()
+
+    starpu_memory_unpin(mat)
+
+    # Flop count taken as size_p^3 / 3; elapsed time converted from ns to ms.
+    flop = (1.0*size_p*size_p*size_p)/3.0
+    time_ms = (t_end-t_start) / 1e6
+    gflops = flop/(time_ms*1000)/1000
+    println("$size_p\t$time_ms\t$gflops")
+
+    clean_tags(nblocks)
+
+    if verbose
+        display(mat)
+    end
+
+    if verify
+        check(mat)
+    end
+end

+ 71 - 0
julia/examples/cholesky/cholesky_implicit.jl

@@ -0,0 +1,71 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+using LinearAlgebra.BLAS
+
+include("cholesky_common.jl")
+
+# Tiled Cholesky factorization of `mat`, split into nblocks x nblocks tiles.
+# Tasks are inserted in program order and no tag dependency is declared here;
+# tags are only attached to the tasks (tag_only). `size` is not used in the body.
+function cholesky(mat :: Matrix{Float32}, size, nblocks)
+    # Defines the perfmodels and the cl_11 / cl_21 / cl_22 codelets used below.
+    include("cholesky_codelets.jl")
+
+    horiz = starpu_data_filter(STARPU_MATRIX_FILTER_BLOCK, nblocks)
+    vert = starpu_data_filter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nblocks)
+
+    @starpu_block let
+        h_mat = starpu_data_register(mat)
+        starpu_data_map_filters(h_mat, horiz, vert)
+
+        for k in 1:nblocks
+
+            starpu_iteration_push(k)
+
+            # Factorize the diagonal tile of step k.
+            starpu_task_insert(cl = cl_11, handles = [h_mat[k, k]], tag_only = tag11(k))
+
+            # Update the tiles below the diagonal tile (column k).
+            for m in k+1:nblocks
+                starpu_task_insert(cl = cl_21, handles = [h_mat[k, k], h_mat[m, k]], tag_only = tag21(m, k))
+            end
+            starpu_data_wont_use(h_mat[k, k])
+
+            # Update the lower-triangular part (n <= m) of the trailing tiles.
+            for m in k+1:nblocks
+                for n in k+1:nblocks
+                    if n <= m
+                        starpu_task_insert(cl = cl_22, handles = [h_mat[m, k], h_mat[n, k], h_mat[m, n]], tag_only= tag22(k, m, n))
+                    end
+                end
+                starpu_data_wont_use(h_mat[m, k])
+            end
+
+            starpu_iteration_pop()
+        end
+
+        starpu_task_wait_for_all()
+    end
+end
+
+# Driver: a single verified run with "-quickcheck", otherwise a size sweep.
+starpu_init()
+starpu_cublas_init()
+
+println("# size\tms\tGFlops")
+
+if isempty(ARGS) || first(ARGS) != "-quickcheck"
+    for n in 1024:1024:15360
+        main(n, 16)
+    end
+else
+    main(1024, 8, verify = true)
+end
+
+starpu_shutdown()

+ 79 - 0
julia/examples/cholesky/cholesky_native.jl

@@ -0,0 +1,79 @@
+using LinearAlgebra
+
+# Verify a factorization result: clear the strict lower triangle of `mat`
+# (in place), rebuild the lower triangle of mat' * mat with BLAS.syrk!, and
+# compare it entry by entry with the matrix generated by main(). Raises an
+# error on the first entry whose relative error exceeds 1e-4.
+function check(mat::Matrix{Float32})
+    n = size(mat, 1)
+
+    # Only the upper triangle holds the factor; zero everything below the diagonal.
+    for i in 1:n, j in 1:i-1
+        mat[i, j] = 0.0f0
+    end
+
+    rebuilt ::Matrix{Float32} = zeros(Float32, n, n)
+    BLAS.syrk!('L', 'T', 1.0f0, mat, 0.0f0, rebuilt)
+
+    # syrk! only fills the 'L' triangle, so compare the entries with j <= i.
+    for i in 1:n, j in 1:i
+        diag_shift = (i == j) ? 1.0f0*n : 0.0f0
+        orig = (1.0f0/(1.0f0+(i-1)+(j-1))) + diag_shift
+        got = rebuilt[i,j]
+        err = abs(got - orig) / orig
+        if err > 0.0001
+            expected = orig
+            error("[$i, $j] -> $got != $expected (err $err)")
+        end
+    end
+
+    println(stderr, "Verification successful !")
+end
+
+# Build a size_p x size_p test matrix, factorize it in place with
+# LinearAlgebra's cholesky!, and print "size<TAB>ms<TAB>GFlops".
+# verify:  run check() on the result.
+# verbose: display the matrix before and after the factorization.
+function main(size_p :: Int; verify = false, verbose = false)
+    mat = zeros(Float32, size_p, size_p)
+    # Create a simple symmetric positive definite matrix:
+    # Hilbert matrix h(i,j) = 1/(i+j+1) (0-based indices), plus size_p on the diagonal.
+
+    for i in 1:size_p
+        for j in 1:size_p
+            mat[i, j] = 1.0f0 / (1.0f0+(i-1)+(j-1)) + ((i == j) ? 1.0f0*size_p : 0.0f0)
+        end
+    end
+
+    if verbose
+        display(mat)
+    end
+
+    # Only the call to cholesky! is timed.
+    t_start = time_ns()
+
+    cholesky!(mat)
+
+    t_end = time_ns()
+
+    # Flop count taken as size_p^3 / 3; elapsed time converted from ns to ms.
+    flop = (1.0*size_p*size_p*size_p)/3.0
+    time_ms = (t_end-t_start) / 1e6
+    gflops = flop/(time_ms*1000)/1000
+    println("$size_p\t$time_ms\t$gflops")
+
+    if verbose
+        display(mat)
+    end
+
+    if verify
+        check(mat)
+    end
+end
+
+# Driver: a single verified run with "-quickcheck", otherwise a size sweep.
+println("# size\tms\tGFlops")
+
+if isempty(ARGS) || first(ARGS) != "-quickcheck"
+    for n in 1024:1024:15360
+        main(n)
+    end
+else
+    main(1024, verify = true)
+end
+

+ 93 - 0
julia/examples/cholesky/cholesky_tag.jl

@@ -0,0 +1,93 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+using LinearAlgebra.BLAS
+
+include("cholesky_common.jl")
+
+# Tiled Cholesky factorization of `mat`, split into nblocks x nblocks tiles,
+# with dependencies declared explicitly through tags: sequential consistency
+# is disabled on the matrix handle. `size` is not used in the body.
+function cholesky(mat :: Matrix{Float32}, size, nblocks)
+    # Defines the perfmodels and the cl_11 / cl_21 / cl_22 codelets used below.
+    include("cholesky_codelets.jl")
+
+    horiz = starpu_data_filter(STARPU_MATRIX_FILTER_BLOCK, nblocks)
+    vert = starpu_data_filter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nblocks)
+
+    @starpu_block let
+        h_mat = starpu_data_register(mat)
+        starpu_data_set_sequential_consistency_flag(h_mat, 0)
+        starpu_data_map_filters(h_mat, horiz, vert)
+
+        # The first U11 task is created here but only submitted after the loop,
+        # presumably so that the whole graph is declared before execution starts.
+        entry_task = starpu_task(cl = cl_11,
+                                 handles = [h_mat[1, 1]],
+                                 tag = tag11(1))
+
+        for k in 1:nblocks
+
+            starpu_iteration_push(k)
+
+            # The U11 task of step 1 is entry_task; later steps depend on the
+            # U22 update of their diagonal tile from the previous step.
+            if k > 1
+                # enforce dependencies...
+                starpu_tag_declare_deps(tag11(k), tag22(k-1, k, k))
+                starpu_task_insert(cl = cl_11,
+                                   handles = [h_mat[k, k]],
+                                   tag = tag11(k))
+            end
+
+            for m in k+1:nblocks
+                # enforce dependencies...
+                if k > 1
+                    starpu_tag_declare_deps(tag21(k, m), tag11(k), tag22(k-1, m, k))
+                else
+                    starpu_tag_declare_deps(tag21(k, m), tag11(k))
+                end
+
+                starpu_task_insert(cl = cl_21, handles = [h_mat[k, k], h_mat[m, k]], tag = tag21(k, m))
+
+                # Lower-triangular part only (n <= m).
+                for n in k+1:nblocks
+                    if n <= m
+                        # enforce dependencies...
+                        if k > 1
+                            starpu_tag_declare_deps(tag22(k, m, n), tag22(k-1, m, n), tag21(k, n), tag21(k, m))
+                        else
+                            starpu_tag_declare_deps(tag22(k, m, n), tag21(k, n), tag21(k, m))
+                        end
+
+                        starpu_task_insert(cl = cl_22, handles = [h_mat[m, k], h_mat[n, k], h_mat[m, n]], tag = tag22(k, m, n))
+                    end
+                end
+            end
+
+            starpu_iteration_pop()
+        end
+
+        # Submit the entry task, then wait on the tag of the last U11 task.
+        starpu_task_submit(entry_task)
+        starpu_tag_wait(tag11(nblocks))
+    end
+end
+
+# Driver: a single verified run with "-quickcheck", otherwise a size sweep.
+starpu_init()
+starpu_cublas_init()
+
+println("# size\tms\tGFlops")
+
+if isempty(ARGS) || first(ARGS) != "-quickcheck"
+    for n in 1024:1024:15360
+        main(n, 16)
+    end
+else
+    main(1024, 8, verify = true)
+end
+
+starpu_shutdown()

+ 3 - 3
julia/examples/dependency/end_dep.jl

@@ -53,16 +53,16 @@ function main()
         )
         )
 
 
         clA = starpu_codelet(
         clA = starpu_codelet(
-            cpu_func = CPU_CODELETS["codeletA"],
+            cpu_func = "codeletA",
             perfmodel = perfmodel
             perfmodel = perfmodel
         )
         )
         clB = starpu_codelet(
         clB = starpu_codelet(
-            cpu_func = CPU_CODELETS["codeletB"],
+            cpu_func = "codeletB",
             modes = [STARPU_RW],
             modes = [STARPU_RW],
             perfmodel = perfmodel
             perfmodel = perfmodel
         )
         )
         clC = starpu_codelet(
         clC = starpu_codelet(
-            cpu_func = CPU_CODELETS["codeletC"],
+            cpu_func = "codeletC",
             modes = [STARPU_RW],
             modes = [STARPU_RW],
             perfmodel = perfmodel
             perfmodel = perfmodel
         )
         )

+ 3 - 3
julia/examples/dependency/tag_dep.jl

@@ -75,17 +75,17 @@ function main()
     )
     )
 
 
         clA = starpu_codelet(
         clA = starpu_codelet(
-            cpu_func = CPU_CODELETS["codeletA"],
+            cpu_func = "codeletA",
             modes = [STARPU_RW],
             modes = [STARPU_RW],
             perfmodel = perfmodel
             perfmodel = perfmodel
         )
         )
         clB = starpu_codelet(
         clB = starpu_codelet(
-            cpu_func = CPU_CODELETS["codeletB"],
+            cpu_func = "codeletB",
             modes = [STARPU_RW],
             modes = [STARPU_RW],
             perfmodel = perfmodel
             perfmodel = perfmodel
         )
         )
         clC = starpu_codelet(
         clC = starpu_codelet(
-            cpu_func = CPU_CODELETS["codeletC"],
+            cpu_func = "codeletC",
             modes = [STARPU_RW],
             modes = [STARPU_RW],
             perfmodel = perfmodel
             perfmodel = perfmodel
         )
         )

+ 3 - 3
julia/examples/dependency/task_dep.jl

@@ -43,17 +43,17 @@ function main()
         )
         )
 
 
         clA = starpu_codelet(
         clA = starpu_codelet(
-            cpu_func = CPU_CODELETS["codeletA"],
+            cpu_func = "codeletA",
             modes = [STARPU_RW],
             modes = [STARPU_RW],
             perfmodel = perfmodel
             perfmodel = perfmodel
         )
         )
         clB = starpu_codelet(
         clB = starpu_codelet(
-            cpu_func = CPU_CODELETS["codeletB"],
+            cpu_func = "codeletB",
             modes = [STARPU_RW],
             modes = [STARPU_RW],
             perfmodel = perfmodel
             perfmodel = perfmodel
         )
         )
         clC = starpu_codelet(
         clC = starpu_codelet(
-            cpu_func = CPU_CODELETS["codeletC"],
+            cpu_func = "codeletC",
             modes = [STARPU_RW],
             modes = [STARPU_RW],
             perfmodel = perfmodel
             perfmodel = perfmodel
         )
         )

+ 7 - 1
julia/examples/execute.sh.in

@@ -19,11 +19,17 @@ set -x
 export JULIA_LOAD_PATH=@STARPU_SRC_DIR@/julia:$JULIA_LOAD_PATH
 export JULIA_LOAD_PATH=@STARPU_SRC_DIR@/julia:$JULIA_LOAD_PATH
 export STARPU_BUILD_DIR=@STARPU_BUILD_DIR@
 export STARPU_BUILD_DIR=@STARPU_BUILD_DIR@
 export STARPU_SRC_DIR=@STARPU_SRC_DIR@
 export STARPU_SRC_DIR=@STARPU_SRC_DIR@
-export STARPU_JULIA_LIB=@STARPU_BUILD_DIR@/julia/src/.libs/libstarpujulia-1.3.so
+export STARPU_JULIA_LIB=@STARPU_BUILD_DIR@/julia/src/.libs/libstarpujulia-1.3
 export STARPU_JULIA_BUILD=@STARPU_BUILD_DIR@/julia
 export STARPU_JULIA_BUILD=@STARPU_BUILD_DIR@/julia
+export LD_LIBRARY_PATH=@STARPU_BUILD_DIR@/julia/src/.libs/:$LD_LIBRARY_PATH
 export JULIA_NUM_THREADS=8
 export JULIA_NUM_THREADS=8
+export STARPU_NOPENCL=0
+export STARPU_SCHED=dmda
+
 srcdir=@STARPU_SRC_DIR@/julia/examples
 srcdir=@STARPU_SRC_DIR@/julia/examples
 
 
+rm -f genc*.c gencuda*.cu genc*.o
+
 if test "$1" == "-calllib"
 if test "$1" == "-calllib"
 then
 then
     shift
     shift

+ 144 - 0
julia/examples/gemm/gemm.jl

@@ -0,0 +1,144 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+using LinearAlgebra.BLAS
+
+# Single-precision matrix-multiply codelet for the "gemm" example.
+# The sizes and leading dimensions of the three blocks are read with
+# height/width/ld and forwarded to STARPU_SGEMM with no transposition ("N", "N").
+# NOTE(review): presumably computes C = alpha*A*B + beta*C (usual SGEMM
+# semantics); STARPU_SGEMM is defined outside this file -- confirm.
+@target STARPU_CPU+STARPU_CUDA
+@codelet function gemm(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32) :: Nothing
+
+    M :: Int32 = height(A)
+    N :: Int32 = width(B)
+    K :: Int32 = width(A)
+    lda :: Int32 = ld(A)
+    ldb :: Int32 = ld(B)
+    ldc :: Int32 = ld(C)
+    STARPU_SGEMM("N", "N", M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)
+
+    return
+end
+
+# Run the "gemm" codelet on A, B and C split into blocks, and time it.
+#
+# B is partitioned with the vertical-block filter into `nslicesx` parts, A with
+# the block filter into `nslicesy` parts, and C with both filters, so that task
+# (taskx, tasky) reads hA[tasky] and hB[taskx] and updates hC[taskx, tasky].
+# The full set of tasks is submitted 10 times; hC is accessed in STARPU_RW mode,
+# so every run updates C in place.
+#
+# Returns the duration of the fastest run, in nanoseconds (time_ns() units).
+function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32, nslicesx, nslicesy)
+    # Declared before the @starpu_block so that it is still visible at `return`.
+    tmin=0
+    vert = starpu_data_filter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesx)
+    horiz = starpu_data_filter(STARPU_MATRIX_FILTER_BLOCK, nslicesy)
+    @starpu_block let
+        hA,hB,hC = starpu_data_register(A, B, C)
+        starpu_data_partition(hB, vert)
+        starpu_data_partition(hA, horiz)
+        starpu_data_map_filters(hC, vert, horiz)
+
+        for i in (1 : 10 )
+            t=time_ns()
+            @starpu_sync_tasks begin
+                for taskx in (1 : nslicesx)
+                    for tasky in (1 : nslicesy)
+                        starpu_task_insert(codelet_name = "gemm",
+                                           handles = [hA[tasky], hB[taskx], hC[taskx, tasky]],
+                                           cl_arg = (alpha, beta),
+                                           modes = [STARPU_R, STARPU_R, STARPU_RW])
+                    end
+                end
+            end
+            t=time_ns()-t
+            # tmin == 0 means "no measurement yet".
+            if (tmin==0 || tmin>t)
+                tmin=t
+            end
+        end
+    end
+    return tmin
+end
+
+
+# Element-wise relative comparison of two single-precision matrices.
+#
+# Walks A column by column and stops at the first entry whose absolute
+# difference with B exceeds `eps` times the larger of the two magnitudes; that
+# entry is printed and `false` is returned. Returns `true` when no entry
+# differs that much. Only the size of A is used for the loop bounds.
+function approximately_equals(
+    A :: Matrix{Cfloat},
+    B :: Matrix{Cfloat},
+    eps = 1e-2
+)
+    nrows, ncols = size(A)
+
+    for j in 1:ncols, i in 1:nrows
+        a = A[i,j]
+        b = B[i,j]
+        tolerance = eps * max(abs(b), abs(a))
+        if abs(a - b) > tolerance
+            println("A[$i,$j] : $(A[i,j]), B[$i,$j] : $(B[i,j])")
+            return false
+        end
+    end
+
+    return true
+end
+
+# Verify the StarPU result C against a reference computed with BLAS gemm!.
+#
+# `expected` is updated in place: gemm! is applied to it 10 times, the same
+# number of runs that multiply_with_starpu accumulates into C, so it should be
+# passed holding the initial contents of C. Every entry of C is then compared
+# with the reference.
+#
+# Raises an error on the first entry whose relative error exceeds 1e-4.
+function check(expected, A, B, C, alpha, beta)
+    for i in 1 : 10
+        gemm!('N', 'N', alpha, A, B, beta, expected)
+    end
+
+    tolerance = 0.0001
+    height,width = size(C)
+    for i in 1:height
+        for j in 1:width
+            got = C[i, j]
+            want = expected[i, j]
+
+            # Scale by |want|: dividing by a negative reference made the error
+            # negative (never reported) and a zero reference produced NaN.
+            delta = abs(want - got)
+            if delta > tolerance * abs(want)
+                err = delta / abs(want)
+                error("[$i,$j] -> $got != $want (err $err)")
+            end
+        end
+    end
+end
+
+# Benchmark the StarPU GEMM for square matrices of dimension
+# start_dim : step_dim : stop_dim.
+#
+# For each dimension, A and B are filled with random single-precision values
+# and C with zeros; the three buffers are pinned with starpu_memory_pin while
+# the product is measured. A line "<dim> <gflops>" is written both to `io` and
+# to standard output, then the result is validated with check().
+function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
+    for dim in (start_dim : step_dim : stop_dim)
+        A = Array(rand(Cfloat, dim, dim))
+        B = Array(rand(Cfloat, dim, dim))
+        C = zeros(Float32, dim, dim)
+        # Snapshot of the initial C, used as the starting point of the reference.
+        C_ref = copy(C)
+        starpu_memory_pin(A)
+        starpu_memory_pin(B)
+        starpu_memory_pin(C)
+        alpha = 4.0f0
+        beta = 2.0f0
+        mt =  multiply_with_starpu(A, B, C, alpha, beta, nslicesx, nslicesy)
+        # 2*dim^3 floating-point operations; mt is the fastest run in nanoseconds.
+        gflop = 2 * dim * dim * dim * 1.e-9
+        gflops = gflop / (mt * 1.e-9)
+        println(io,"$dim $gflops")
+        println("$dim $gflops")
+        starpu_memory_unpin(A)
+        starpu_memory_unpin(B)
+        starpu_memory_unpin(C)
+        check(C_ref, A, B, C, alpha, beta)
+    end
+end
+
+# Output file for the "<dim> <gflops>" lines: first command-line argument,
+# or x.dat when none is given.
+filename = isempty(ARGS) ? "x.dat" : ARGS[1]
+
+starpu_init()
+starpu_cublas_init()
+# Square grid of blocks: nblock_x * nblock_y is at least the worker count.
+nblock_x = Int32(ceil(sqrt(starpu_worker_get_count())))
+nblock_y = nblock_x
+# The do-block form closes the file even when check() raises an error.
+open(filename,"w") do io
+    compute_times(io,64,512,4096,nblock_x,nblock_y)
+end
+
+starpu_shutdown()
+

+ 22 - 0
julia/examples/gemm/gemm.sh

@@ -0,0 +1,22 @@
+#!/bin/bash
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+# Directory holding execute.sh; quoted so that paths containing spaces work.
+examples_dir="$(dirname "$0")/.."
+
+# Baseline: plain Julia BLAS gemm!, without StarPU.
+"$examples_dir"/execute.sh gemm/gemm_native.jl
+
+# StarPU version. OMP_NUM_THREADS=1 is exported only from here on, so it does
+# not affect the baseline run above.
+export OMP_NUM_THREADS=1
+"$examples_dir"/execute.sh gemm/gemm.jl
+

+ 146 - 0
julia/examples/gemm/gemm_bare.jl

@@ -0,0 +1,146 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+using LinearAlgebra.BLAS
+
+# Single-precision matrix-multiply codelet for the "gemm_bare" example.
+# The sizes and leading dimensions of the three blocks are read with
+# height/width/ld and forwarded to STARPU_SGEMM with no transposition ("N", "N").
+# NOTE(review): presumably computes C = alpha*A*B + beta*C (usual SGEMM
+# semantics); STARPU_SGEMM is defined outside this file -- confirm.
+@target STARPU_CPU+STARPU_CUDA
+@codelet function gemm(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32) :: Nothing
+
+    M :: Int32 = height(A)
+    N :: Int32 = width(B)
+    K :: Int32 = width(A)
+    lda :: Int32 = ld(A)
+    ldb :: Int32 = ld(B)
+    ldc :: Int32 = ld(C)
+    STARPU_SGEMM("N", "N", M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)
+
+    return
+end
+
+# Time the "gemm" codelet on the whole, unpartitioned matrices.
+#
+# A, B and C are registered with StarPU and one task is built by hand from an
+# explicit perfmodel and codelet rather than through starpu_task_insert. The
+# task is submitted 10 times, waiting for completion after each submission,
+# and the data handles are unregistered before returning.
+# nslicesx and nslicesy are accepted but not used by this variant.
+#
+# Returns the duration of the fastest run, in nanoseconds (time_ns() units).
+function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32, nslicesx, nslicesy)
+    hA,hB,hC = starpu_data_register(A, B, C)
+    perfmodel = starpu_perfmodel(
+        perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+        symbol = "gemm"
+    )
+    cl = starpu_codelet(
+        cpu_func  = "gemm",
+        cuda_func = "",
+        modes =[STARPU_R,STARPU_R,STARPU_RW],
+        perfmodel = perfmodel,
+    )
+    task = starpu_task(cl = cl, handles =[hA,hB,hC], cl_arg = (alpha,beta), callback = nothing,
+                       callback_arg = nothing, tag = nothing, tag_only = nothing,
+                       sequential_consistency = true,
+                       detach = 1, color = nothing, where = nothing)
+
+    tmin=0
+    for i in (1 : 10 )
+        t=time_ns()
+        # NOTE(review): the underlying C task is submitted directly, bypassing
+        # the Julia-level starpu_task_submit(task) wrapper, and the same task
+        # object is reused on every iteration -- confirm this is intended.
+        starpu_task_submit(Ref(task.c_task))
+        starpu_task_wait_for_all()
+        t=time_ns()-t
+        # tmin == 0 means "no measurement yet".
+        if (tmin==0 || tmin>t)
+            tmin=t
+        end
+    end
+    starpu_data_unregister(hA)
+    starpu_data_unregister(hB)
+    starpu_data_unregister(hC)
+    return tmin
+end
+
+
+# Element-wise relative comparison of two single-precision matrices.
+#
+# Walks A column by column and stops at the first entry whose absolute
+# difference with B exceeds `eps` times the larger of the two magnitudes; that
+# entry is printed and `false` is returned. Returns `true` when no entry
+# differs that much. Only the size of A is used for the loop bounds.
+function approximately_equals(
+    A :: Matrix{Cfloat},
+    B :: Matrix{Cfloat},
+    eps = 1e-2
+)
+    nrows, ncols = size(A)
+
+    for j in 1:ncols, i in 1:nrows
+        a = A[i,j]
+        b = B[i,j]
+        tolerance = eps * max(abs(b), abs(a))
+        if abs(a - b) > tolerance
+            println("A[$i,$j] : $(A[i,j]), B[$i,$j] : $(B[i,j])")
+            return false
+        end
+    end
+
+    return true
+end
+
+# Verify the StarPU result C against a reference computed with BLAS gemm!.
+#
+# `expected` is updated in place: gemm! is applied to it 10 times, the same
+# number of runs that multiply_with_starpu accumulates into C, so it should be
+# passed holding the initial contents of C. Every entry of C is then compared
+# with the reference.
+#
+# Raises an error on the first entry whose relative error exceeds 1e-4.
+function check(expected, A, B, C, alpha, beta)
+    for i in 1 : 10
+        gemm!('N', 'N', alpha, A, B, beta, expected)
+    end
+
+    tolerance = 0.0001
+    height,width = size(C)
+    for i in 1:height
+        for j in 1:width
+            got = C[i, j]
+            want = expected[i, j]
+
+            # Scale by |want|: dividing by a negative reference made the error
+            # negative (never reported) and a zero reference produced NaN.
+            delta = abs(want - got)
+            if delta > tolerance * abs(want)
+                err = delta / abs(want)
+                error("[$i,$j] -> $got != $want (err $err)")
+            end
+        end
+    end
+end
+
+# Benchmark the unpartitioned StarPU GEMM for square matrices of dimension
+# start_dim : step_dim : stop_dim.
+#
+# For each dimension, A and B are filled with random single-precision values
+# and C with zeros; the three buffers are pinned with starpu_memory_pin while
+# the product is measured. A line "<dim> <gflops>" is written both to `io` and
+# to standard output.
+function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
+    for dim in (start_dim : step_dim : stop_dim)
+        A = Array(rand(Cfloat, dim, dim))
+        B = Array(rand(Cfloat, dim, dim))
+        C = zeros(Float32, dim, dim)
+        C_ref = copy(C)
+        starpu_memory_pin(A)
+        starpu_memory_pin(B)
+        starpu_memory_pin(C)
+        alpha = 4.0f0
+        beta = 2.0f0
+        mt =  multiply_with_starpu(A, B, C, alpha, beta, nslicesx, nslicesy)
+        # 2*dim^3 floating-point operations; mt is the fastest run in nanoseconds.
+        gflop = 2 * dim * dim * dim * 1.e-9
+        gflops = gflop / (mt * 1.e-9)
+        println(io,"$dim $gflops")
+        println("$dim $gflops")
+        starpu_memory_unpin(A)
+        starpu_memory_unpin(B)
+        starpu_memory_unpin(C)
+        # NOTE(review): result validation is disabled in this variant; C_ref is
+        # only needed by the call below -- confirm whether it should be enabled.
+        #check(C_ref, A, B, C, alpha, beta)
+    end
+end
+
+# Output file for the "<dim> <gflops>" lines: first command-line argument,
+# or x.dat when none is given.
+filename = isempty(ARGS) ? "x.dat" : ARGS[1]
+
+starpu_init()
+starpu_cublas_init()
+# The do-block form closes the file even when the benchmark raises an error.
+# Both slice counts are 1: this variant works on the whole matrices.
+open(filename,"w") do io
+    compute_times(io,64,512,4096,1,1)
+end
+
+starpu_shutdown()
+

+ 56 - 0
julia/examples/gemm/gemm_native.jl

@@ -0,0 +1,56 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using LinearAlgebra.BLAS
+
+# Time BLAS gemm! (no transposition) without going through StarPU.
+#
+# The product is computed 10 times in place on C and the shortest duration,
+# in nanoseconds as measured with time_ns(), is returned.
+function gemm_without_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32)
+    best = 0
+    for _ in 1:10
+        started = time_ns()
+        gemm!('N', 'N', alpha, A, B, beta, C)
+        elapsed = time_ns() - started
+        # best == 0 means "no measurement yet"; otherwise keep the smaller one.
+        if best != 0 && best <= elapsed
+            continue
+        end
+        best = elapsed
+    end
+    return best
+end
+
+
+# Benchmark plain BLAS gemm! for square matrices of dimension
+# start_dim : step_dim : stop_dim.
+#
+# For each dimension, A and B are filled with random single-precision values
+# and C with zeros. A line "<dim> <gflops>" is written both to `io` and to
+# standard output.
+function compute_times(io,start_dim, step_dim, stop_dim)
+    for dim in (start_dim : step_dim : stop_dim)
+        A = Array(rand(Cfloat, dim, dim))
+        B = Array(rand(Cfloat, dim, dim))
+        C = zeros(Float32, dim, dim)
+        alpha = 4.0f0
+        beta = 2.0f0
+        mt =  gemm_without_starpu(A, B, C, alpha, beta)
+        # 2*dim^3 floating-point operations; mt is the fastest run in nanoseconds.
+        gflop = 2 * dim * dim * dim * 1.e-9
+        gflops = gflop / (mt * 1.e-9)
+        println(io,"$dim $gflops")
+        println("$dim $gflops")
+    end
+end
+
+# Output file for the "<dim> <gflops>" lines: first command-line argument,
+# or x.dat when none is given.
+filename = isempty(ARGS) ? "x.dat" : ARGS[1]
+
+# The do-block form closes the file even when the benchmark raises an error.
+open(filename,"w") do io
+    compute_times(io,64,512,4096)
+end
+

+ 1 - 1
julia/examples/mandelbrot/cpu_mandelbrot.c

@@ -71,7 +71,7 @@ void cpu_mandelbrot(void *descr[], void *cl_arg)
 }
 }
 
 
 char* CPU = "cpu_mandelbrot";
 char* CPU = "cpu_mandelbrot";
-char* GPU = "gpu_mandelbrot";
+char* GPU = "";
 extern char *starpu_find_function(char *name, char *device)
 extern char *starpu_find_function(char *name, char *device)
 {
 {
 	if (!strcmp(device,"gpu")) return GPU;
 	if (!strcmp(device,"gpu")) return GPU;

+ 4 - 1
julia/examples/mandelbrot/mandelbrot.jl

@@ -70,7 +70,10 @@ function mandelbrot_with_starpu(A ::Matrix{Int64}, cr ::Float64, ci ::Float64, d
 	starpu_data_partition(hA,horiz)
 	starpu_data_partition(hA,horiz)
 
 
 	@starpu_sync_tasks for taskx in (1 : nslicesx)
 	@starpu_sync_tasks for taskx in (1 : nslicesx)
-                @starpu_async_cl mandelbrot(hA[taskx]) [STARPU_W] (cr, ci, Int64((taskx-1)*dim/nslicesx), dim)
+            starpu_task_insert(codelet_name = "mandelbrot",
+                               handles = [hA[taskx]],
+                               modes = [STARPU_W],
+                               cl_arg = (cr, ci, Int64((taskx-1)*dim/nslicesx), dim))
 	end
 	end
     end
     end
 end
 end

+ 1 - 1
julia/examples/mult/cpu_mult.c

@@ -93,7 +93,7 @@ void cpu_mult(void *descr[], void *cl_arg)
 }
 }
 
 
 char* CPU = "cpu_mult";
 char* CPU = "cpu_mult";
-char* GPU = "gpu_mult";
+char* GPU = "";
 extern char *starpu_find_function(char *name, char *device)
 extern char *starpu_find_function(char *name, char *device)
 {
 {
 	if (!strcmp(device,"gpu")) return GPU;
 	if (!strcmp(device,"gpu")) return GPU;

+ 15 - 28
julia/examples/mult/mult.jl

@@ -82,27 +82,16 @@ function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: M
         starpu_data_partition(hA, horiz)
         starpu_data_partition(hA, horiz)
         starpu_data_map_filters(hC, vert, horiz)
         starpu_data_map_filters(hC, vert, horiz)
         tmin=0
         tmin=0
-        perfmodel = starpu_perfmodel(
-            perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
-            symbol = "history_perf"
-        )
-        cl = starpu_codelet(
-            cpu_func = CPU_CODELETS["matrix_mult"],
-            # cuda_func = CUDA_CODELETS["matrix_mult"],
-            #opencl_func="ocl_matrix_mult",
-            modes = [STARPU_R, STARPU_R, STARPU_W],
-            perfmodel = perfmodel
-        )
 
 
         for i in (1 : 10 )
         for i in (1 : 10 )
             t=time_ns()
             t=time_ns()
             @starpu_sync_tasks begin
             @starpu_sync_tasks begin
                 for taskx in (1 : nslicesx)
                 for taskx in (1 : nslicesx)
                     for tasky in (1 : nslicesy)
                     for tasky in (1 : nslicesy)
-                        handles = [hA[tasky], hB[taskx], hC[taskx, tasky]]
-                        task = starpu_task(cl = cl, handles = handles, cl_arg=(Int32(stride),))
-                        starpu_task_submit(task)
-                        #@starpu_async_cl matrix_mult(hA[tasky], hB[taskx], hC[taskx, tasky])
+                        starpu_task_insert(codelet_name = "matrix_mult",
+                                           modes = [STARPU_R, STARPU_R, STARPU_W],
+                                           handles = [hA[tasky], hB[taskx], hC[taskx, tasky]],
+                                           cl_arg = (Int32(stride),))
                     end
                     end
                 end
                 end
             end
             end
@@ -116,23 +105,20 @@ function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: M
 end
 end
 
 
 
 
-function approximately_equals(
-    A :: Matrix{Cfloat},
-    B :: Matrix{Cfloat},
-    eps = 1e-2
-)
-    (height, width) = size(A)
+function check(A, B, C)
+    expected = A * B
+    height,width = size(C)
+    for i in 1:height
+        for j in 1:width
+            got = C[i, j]
+            exp = expected[i, j]
 
 
-    for j in (1 : width)
-        for i in (1 : height)
-            if (abs(A[i,j] - B[i,j]) > eps * max(abs(B[i,j]), abs(A[i,j])))
-                println("A[$i,$j] : $(A[i,j]), B[$i,$j] : $(B[i,j])")
-                return false
+            err = abs(exp - got) / exp
+            if err > 0.0001
+                error("[$i] -> $got != $exp (err $err)")
             end
             end
         end
         end
     end
     end
-
-    return true
 end
 end
 
 
 function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy, stride)
 function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy, stride)
@@ -145,6 +131,7 @@ function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy, str
         size=dim*dim*4*3/1024/1024
         size=dim*dim*4*3/1024/1024
         println(io,"$size $flops")
         println(io,"$size $flops")
         println("$size $flops")
         println("$size $flops")
+        check(A, B, C)
     end
     end
 end
 end
 
 

+ 2 - 2
julia/examples/task_insert_color/task_insert_color.jl

@@ -35,13 +35,13 @@ function task_insert_color_with_starpu(val ::Ref{Int32})
         )
         )
 
 
         cl1 = starpu_codelet(
         cl1 = starpu_codelet(
-            cpu_func = CPU_CODELETS["task_insert_color"],
+            cpu_func = "task_insert_color",
             modes = [STARPU_RW],
             modes = [STARPU_RW],
             perfmodel = perfmodel
             perfmodel = perfmodel
         )
         )
 
 
         cl2 = starpu_codelet(
         cl2 = starpu_codelet(
-            cpu_func = CPU_CODELETS["task_insert_color"],
+            cpu_func = "task_insert_color",
             modes = [STARPU_RW],
             modes = [STARPU_RW],
             perfmodel = perfmodel,
             perfmodel = perfmodel,
             color = 0x0000FF
             color = 0x0000FF

+ 1 - 1
julia/examples/variable/variable.jl

@@ -44,7 +44,7 @@ function display(niter)
     if foo[] == niter
     if foo[] == niter
         println("result is correct")
         println("result is correct")
     else
     else
-        println("result is incorret")
+        error("result is incorret")
     end
     end
 end
 end
 
 

+ 21 - 17
julia/examples/vector_scal/vector_scal.jl

@@ -36,28 +36,15 @@ function vector_scal_with_starpu(v :: Vector{Float32}, m :: Int32, k :: Float32,
     @starpu_block let
     @starpu_block let
         hV = starpu_data_register(v)
         hV = starpu_data_register(v)
         tmin=0
         tmin=0
-        perfmodel = starpu_perfmodel(
-            perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
-            symbol = "history_perf"
-        )
-        cl = starpu_codelet(
-            cpu_func = CPU_CODELETS["vector_scal"],
-            # cuda_func = CUDA_CODELETS["vector_scal"],
-            #opencl_func="ocl_matrix_mult",
-            modes = [STARPU_RW],
-            perfmodel = perfmodel
-        )
 
 
         for i in (1 : 1)
         for i in (1 : 1)
             t=time_ns()
             t=time_ns()
             @starpu_sync_tasks begin
             @starpu_sync_tasks begin
-                handles = [hV]
-                task = starpu_task(cl = cl, handles = handles, cl_arg=(m, k, l))
-                starpu_task_submit(task)
+                starpu_task_insert(codelet_name = "vector_scal",
+                                   modes = [STARPU_RW],
+                                   handles = [hV],
+                                   cl_arg=(m, k, l))
             end
             end
-            # @starpu_sync_tasks for task in (1:1)
-            #     @starpu_async_cl vector_scal(hV, STARPU_RW, [m, k, l])
-            # end
             t=time_ns()-t
             t=time_ns()-t
             if (tmin==0 || tmin>t)
             if (tmin==0 || tmin>t)
                 tmin=t
                 tmin=t
@@ -67,9 +54,24 @@ function vector_scal_with_starpu(v :: Vector{Float32}, m :: Int32, k :: Float32,
     return tmin
     return tmin
 end
 end
 
 
+# Verify the scaled vector `res` against `ref .* m .+ (k+l)`, computed from
+# the untouched copy `ref` of the input.
+#
+# Raises an error on the first element whose relative error exceeds 1e-4.
+function check(ref, res, m, k, l)
+    expected = ref .* m .+ (k+l)
+
+    tolerance = 0.0001
+    for i in 1:length(expected)
+        got = res[i]
+        want = expected[i]
+
+        # Scale by |want|: dividing by a negative reference made the error
+        # negative (never reported) and a zero reference produced NaN.
+        delta = abs(want - got)
+        if delta > tolerance * abs(want)
+            err = delta / abs(want)
+            error("[$i] -> $got != $want (err $err)")
+        end
+    end
+end
+
 function compute_times(io,start_dim, step_dim, stop_dim)
 function compute_times(io,start_dim, step_dim, stop_dim)
     for size in (start_dim : step_dim : stop_dim)
     for size in (start_dim : step_dim : stop_dim)
         V = Array(rand(Cfloat, size))
         V = Array(rand(Cfloat, size))
+        V_ref = copy(V)
         starpu_memory_pin(V)
         starpu_memory_pin(V)
 
 
         m :: Int32 = 10
         m :: Int32 = 10
@@ -85,6 +87,8 @@ function compute_times(io,start_dim, step_dim, stop_dim)
         println("OUTPUT ", V[1:10])
         println("OUTPUT ", V[1:10])
         println(io,"$size $mt")
         println(io,"$size $mt")
         println("$size $mt")
         println("$size $mt")
+
+        check(V_ref, V, m, k, l)
     end
     end
 end
 end
 
 

+ 1 - 1
julia/src/Makefile.am

@@ -19,7 +19,7 @@ include $(top_srcdir)/starpu-notests.mk
 CLEANFILES = *.gcno *.gcda
 CLEANFILES = *.gcno *.gcda
 
 
 AM_CFLAGS = $(GLOBAL_AM_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(HWLOC_CFLAGS) $(FXT_CFLAGS) -fPIC
 AM_CFLAGS = $(GLOBAL_AM_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(HWLOC_CFLAGS) $(FXT_CFLAGS) -fPIC
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS)
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ `@JULIA@ $(top_srcdir)/julia/src/openblas_ldflags.jl`
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_builddir)/src -I$(top_srcdir)/src -I$(top_srcdir)/julia/src
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_builddir)/src -I$(top_srcdir)/src -I$(top_srcdir)/julia/src
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) -no-undefined
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) -no-undefined
 
 

+ 12 - 3
julia/src/StarPU.jl

@@ -26,12 +26,13 @@ const starpu_wrapper_library_name=fstarpu_task_library_name()
 
 
 include("translate_headers.jl")
 include("translate_headers.jl")
 
 
-if !isfile((@__DIR__)*"/../gen/libstarpu_common.jl") || !isfile((@__DIR__)*"/../gen/libstarpu_api.jl")
+# Regenerate the bindings when they are missing or older than translate_headers.jl.
+# @__DIR__ (not @__FILE__) is required: joining a file name onto the path of
+# this source file gives a path that does not exist, whose mtime is 0, so the
+# staleness test could never be true.
+if !isfile(joinpath(fstarpu_build_dir(), "julia/gen/libstarpu_common.jl")) || !isfile(joinpath(fstarpu_build_dir(), "julia/gen/libstarpu_api.jl")) ||
+    mtime(joinpath(@__DIR__, "translate_headers.jl")) > mtime(joinpath(fstarpu_build_dir(), "julia/gen/libstarpu_api.jl"))
     starpu_translate_headers()
     starpu_translate_headers()
 end
 end
 
 
-include("../gen/libstarpu_common.jl")
-include("../gen/libstarpu_api.jl")
+include(joinpath(fstarpu_build_dir(), "julia/gen/libstarpu_common.jl"))
+include(joinpath(fstarpu_build_dir(), "julia/gen/libstarpu_api.jl"))
 include("globals.jl")
 include("globals.jl")
 
 
 include("compiler/include.jl")
 include("compiler/include.jl")
@@ -85,6 +86,8 @@ export starpu_data_get_sub_data
 export starpu_data_partition
 export starpu_data_partition
 export starpu_data_unpartition
 export starpu_data_unpartition
 export starpu_data_map_filters
 export starpu_data_map_filters
+export starpu_data_wont_use
+export starpu_task_insert
 export starpu_task_wait_for_all
 export starpu_task_wait_for_all
 export starpu_task_submit
 export starpu_task_submit
 export starpu_task_end_dep_add
 export starpu_task_end_dep_add
@@ -93,6 +96,7 @@ export starpu_task_declare_deps
 export starpu_task_declare_end_deps
 export starpu_task_declare_end_deps
 export starpu_task_wait_for_n_submitted
 export starpu_task_wait_for_n_submitted
 export starpu_task_destroy
 export starpu_task_destroy
+export starpu_tag_remove
 export starpu_tag_wait
 export starpu_tag_wait
 export starpu_tag_notify_from_apps
 export starpu_tag_notify_from_apps
 export starpu_iteration_pop
 export starpu_iteration_pop
@@ -108,5 +112,10 @@ export starpu_data_get_default_sequential_consistency_flag
 export starpu_data_set_default_sequential_consistency_flag
 export starpu_data_set_default_sequential_consistency_flag
 export starpu_data_get_sequential_consistency_flag
 export starpu_data_get_sequential_consistency_flag
 export starpu_data_set_sequential_consistency_flag
 export starpu_data_set_sequential_consistency_flag
+export starpu_worker_get_count
+export starpu_cpu_worker_get_count
+export starpu_cuda_worker_get_count
+export starpu_opencl_worker_get_count
+export starpu_mic_worker_get_count
 
 
 end
 end

+ 78 - 400
julia/src/blas.c

@@ -17,500 +17,178 @@
 #include <ctype.h>
 #include <ctype.h>
 #include <stdio.h>
 #include <stdio.h>
 
 
-#include <starpu.h>
 #include "blas.h"
 #include "blas.h"
 
 
-/*
-    This file contains BLAS wrappers for the different BLAS implementations
-  (e.g. REFBLAS, ATLAS, GOTOBLAS ...). We assume a Fortran orientation as most
-  libraries do not supply C-based ordering.
- */
-
-#ifdef STARPU_ATLAS
-
-inline void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, 
-			float alpha, const float *A, int lda, const float *B, int ldb, 
-			float beta, float *C, int ldc)
-{
-	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
-	enum CBLAS_TRANSPOSE tb = (toupper(transb[0]) == 'N')?CblasNoTrans:CblasTrans;
-
-	cblas_sgemm(CblasColMajor, ta, tb,
-			M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);				
-}
-
-inline void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
-			double alpha, double *A, int lda, double *B, int ldb, 
-			double beta, double *C, int ldc)
-{
-	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
-	enum CBLAS_TRANSPOSE tb = (toupper(transb[0]) == 'N')?CblasNoTrans:CblasTrans;
-
-	cblas_dgemm(CblasColMajor, ta, tb,
-			M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);				
-}
-
-inline void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda, float *X, int incX, float beta, float *Y, int incY)
-{
-	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
-
-	cblas_sgemv(CblasColMajor, ta, M, N, alpha, A, lda,
-					X, incX, beta, Y, incY);
-}
-
-inline void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda, double *X, int incX, double beta, double *Y, int incY)
-{
-	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
-
-	cblas_dgemv(CblasColMajor, ta, M, N, alpha, A, lda,
-					X, incX, beta, Y, incY);
-}
-
-inline float STARPU_SASUM(int N, float *X, int incX)
-{
-	return cblas_sasum(N, X, incX);
-}
-
-inline double STARPU_DASUM(int N, double *X, int incX)
-{
-	return cblas_dasum(N, X, incX);
-}
-
-void STARPU_SSCAL(int N, float alpha, float *X, int incX)
-{
-	cblas_sscal(N, alpha, X, incX);
-}
-
-void STARPU_DSCAL(int N, double alpha, double *X, int incX)
-{
-	cblas_dscal(N, alpha, X, incX);
-}
-
-void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
-                   const char *diag, const int m, const int n,
-                   const float alpha, const float *A, const int lda,
-                   float *B, const int ldb)
+inline void STARPU_SGEMM(char *transa, char *transb, BLASINT M, BLASINT N, BLASINT K, 
+			float alpha, const float *A, BLASINT lda, const float *B, BLASINT ldb, 
+			float beta, float *C, BLASINT ldc)
 {
 {
-	enum CBLAS_SIDE side_ = (toupper(side[0]) == 'L')?CblasLeft:CblasRight;
-	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
-	enum CBLAS_TRANSPOSE transa_ = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
-	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
-
-	cblas_strsm(CblasColMajor, side_, uplo_, transa_, diag_, m, n, alpha, A, lda, B, ldb);
-}
-
-void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
-                   const char *diag, const int m, const int n,
-                   const double alpha, const double *A, const int lda,
-                   double *B, const int ldb)
-{
-	enum CBLAS_SIDE side_ = (toupper(side[0]) == 'L')?CblasLeft:CblasRight;
-	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
-	enum CBLAS_TRANSPOSE transa_ = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
-	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
-
-	cblas_dtrsm(CblasColMajor, side_, uplo_, transa_, diag_, m, n, alpha, A, lda, B, ldb);
-}
-
-void STARPU_SSYR (const char *uplo, const int n, const float alpha,
-                  const float *x, const int incx, float *A, const int lda)
-{
-	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
-
-	cblas_ssyr(CblasColMajor, uplo_, n, alpha, x, incx, A, lda); 
-}
-
-void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
-                   const int k, const float alpha, const float *A,
-                   const int lda, const float beta, float *C,
-                   const int ldc)
-{
-	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
-	enum CBLAS_TRANSPOSE trans_ = (toupper(trans[0]) == 'N')?CblasNoTrans:CblasTrans;
-	
-	cblas_ssyrk(CblasColMajor, uplo_, trans_, n, k, alpha, A, lda, beta, C, ldc); 
-}
-
-void STARPU_SGER(const int m, const int n, const float alpha,
-                  const float *x, const int incx, const float *y,
-                  const int incy, float *A, const int lda)
-{
-	cblas_sger(CblasColMajor, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void STARPU_DGER(const int m, const int n, const double alpha,
-                  const double *x, const int incx, const double *y,
-                  const int incy, double *A, const int lda)
-{
-	cblas_dger(CblasColMajor, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
-                   const int n, const float *A, const int lda, float *x, 
-                   const int incx)
-{
-	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
-	enum CBLAS_TRANSPOSE trans_ = (toupper(trans[0]) == 'N')?CblasNoTrans:CblasTrans;
-	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
-
-	cblas_strsv(CblasColMajor, uplo_, trans_, diag_, n, A, lda, x, incx);
-}
-
-void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int m, const int n,
-                 const float alpha, const float *A, const int lda,
-                 float *B, const int ldb)
-{
-	enum CBLAS_SIDE side_ = (toupper(side[0]) == 'L')?CblasLeft:CblasRight;
-	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
-	enum CBLAS_TRANSPOSE transA_ = (toupper(transA[0]) == 'N')?CblasNoTrans:CblasTrans;
-	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
-
-	cblas_strmm(CblasColMajor, side_, uplo_, transA_, diag_, m, n, alpha, A, lda, B, ldb);
-}
-
-void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int m, const int n,
-                 const double alpha, const double *A, const int lda,
-                 double *B, const int ldb)
-{
-	enum CBLAS_SIDE side_ = (toupper(side[0]) == 'L')?CblasLeft:CblasRight;
-	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
-	enum CBLAS_TRANSPOSE transA_ = (toupper(transA[0]) == 'N')?CblasNoTrans:CblasTrans;
-	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
-
-	cblas_dtrmm(CblasColMajor, side_, uplo_, transA_, diag_, m, n, alpha, A, lda, B, ldb);
-}
-
-void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
-                 const int n, const float *A, const int lda, float *X,
-                 const int incX)
-{
-	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
-	enum CBLAS_TRANSPOSE transA_ = (toupper(transA[0]) == 'N')?CblasNoTrans:CblasTrans;
-	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
-
-	cblas_strmv(CblasColMajor, uplo_, transA_, diag_, n, A, lda, X, incX);
-}
-
-void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
-{
-	cblas_saxpy(n, alpha, X, incX, Y, incY);
-}
-
-void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
-{
-	cblas_daxpy(n, alpha, X, incX, Y, incY);
-}
-
-int STARPU_ISAMAX (const int n, float *X, const int incX)
-{
-    int retVal;
-    retVal = cblas_isamax(n, X, incX);
-    return retVal;
-}
-
-int STARPU_IDAMAX (const int n, double *X, const int incX)
-{
-    int retVal;
-    retVal = cblas_idamax(n, X, incX);
-    return retVal;
-}
-
-float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
-{
-	return cblas_sdot(n, x, incx, y, incy);
-}
-
-double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy)
-{
-	return cblas_ddot(n, x, incx, y, incy);
-}
-
-void STARPU_SSWAP(const int n, float *x, const int incx, float *y, const int incy)
-{
-	cblas_sswap(n, x, incx, y, incy);
-}
-
-void STARPU_DSWAP(const int n, double *x, const int incx, double *y, const int incy)
-{
-	cblas_dswap(n, x, incx, y, incy);
-}
-
-#elif defined(STARPU_GOTO) || defined(STARPU_OPENBLAS) || defined(STARPU_SYSTEM_BLAS) || defined(STARPU_MKL) || defined(STARPU_ARMPL)
-
-inline void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, 
-			float alpha, const float *A, int lda, const float *B, int ldb, 
-			float beta, float *C, int ldc)
-{
-	sgemm_(transa, transb, &M, &N, &K, &alpha,
+	sgemm_64_(transa, transb, &M, &N, &K, &alpha,
 			 A, &lda, B, &ldb,
 			 A, &lda, B, &ldb,
 			 &beta, C, &ldc);	
 			 &beta, C, &ldc);	
 }
 }
 
 
-inline void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
-			double alpha, double *A, int lda, double *B, int ldb, 
-			double beta, double *C, int ldc)
+inline void STARPU_DGEMM(char *transa, char *transb, BLASINT M, BLASINT N, BLASINT K, 
+			double alpha, double *A, BLASINT lda, double *B, BLASINT ldb, 
+			double beta, double *C, BLASINT ldc)
 {
 {
-	dgemm_(transa, transb, &M, &N, &K, &alpha,
+	dgemm_64_(transa, transb, &M, &N, &K, &alpha,
 			 A, &lda, B, &ldb,
 			 A, &lda, B, &ldb,
 			 &beta, C, &ldc);	
 			 &beta, C, &ldc);	
 }
 }
 
 
 
 
-inline void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
-		float *X, int incX, float beta, float *Y, int incY)
+inline void STARPU_SGEMV(char *transa, BLASINT M, BLASINT N, float alpha, float *A, BLASINT lda,
+		float *X, BLASINT incX, float beta, float *Y, BLASINT incY)
 {
 {
-	sgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+	sgemv_64_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
 }
 }
 
 
-inline void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
-		double *X, int incX, double beta, double *Y, int incY)
+inline void STARPU_DGEMV(char *transa, BLASINT M, BLASINT N, double alpha, double *A, BLASINT lda,
+		double *X, BLASINT incX, double beta, double *Y, BLASINT incY)
 {
 {
-	dgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+	dgemv_64_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
 }
 }
 
 
-inline float STARPU_SASUM(int N, float *X, int incX)
+inline float STARPU_SASUM(BLASINT N, float *X, BLASINT incX)
 {
 {
-	return sasum_(&N, X, &incX);
+	return sasum_64_(&N, X, &incX);
 }
 }
 
 
-inline double STARPU_DASUM(int N, double *X, int incX)
+inline double STARPU_DASUM(BLASINT N, double *X, BLASINT incX)
 {
 {
-	return dasum_(&N, X, &incX);
+	return dasum_64_(&N, X, &incX);
 }
 }
 
 
-void STARPU_SSCAL(int N, float alpha, float *X, int incX)
+void STARPU_SSCAL(BLASINT N, float alpha, float *X, BLASINT incX)
 {
 {
-	sscal_(&N, &alpha, X, &incX);
+	sscal_64_(&N, &alpha, X, &incX);
 }
 }
 
 
-void STARPU_DSCAL(int N, double alpha, double *X, int incX)
+void STARPU_DSCAL(BLASINT N, double alpha, double *X, BLASINT incX)
 {
 {
-	dscal_(&N, &alpha, X, &incX);
+	dscal_64_(&N, &alpha, X, &incX);
 }
 }
 
 
 void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
 void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
-                   const char *diag, const int m, const int n,
-                   const float alpha, const float *A, const int lda,
-                   float *B, const int ldb)
+                   const char *diag, const BLASINT m, const BLASINT n,
+                   const float alpha, const float *A, const BLASINT lda,
+                   float *B, const BLASINT ldb)
 {
 {
-	strsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+	strsm_64_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 }
 }
 
 
 void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
 void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
-                   const char *diag, const int m, const int n,
-                   const double alpha, const double *A, const int lda,
-                   double *B, const int ldb)
+                   const char *diag, const BLASINT m, const BLASINT n,
+                   const double alpha, const double *A, const BLASINT lda,
+                   double *B, const BLASINT ldb)
 {
 {
-	dtrsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+	dtrsm_64_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 }
 }
 
 
-void STARPU_SSYR (const char *uplo, const int n, const float alpha,
-                  const float *x, const int incx, float *A, const int lda)
+void STARPU_SSYR (const char *uplo, const BLASINT n, const float alpha,
+                  const float *x, const BLASINT incx, float *A, const BLASINT lda)
 {
 {
-	ssyr_(uplo, &n, &alpha, x, &incx, A, &lda); 
+	ssyr_64_(uplo, &n, &alpha, x, &incx, A, &lda); 
 }
 }
 
 
-void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
-                   const int k, const float alpha, const float *A,
-                   const int lda, const float beta, float *C,
-                   const int ldc)
+void STARPU_SSYRK (const char *uplo, const char *trans, const BLASINT n,
+                   const BLASINT k, const float alpha, const float *A,
+                   const BLASINT lda, const float beta, float *C,
+                   const BLASINT ldc)
 {
 {
-	ssyrk_(uplo, trans, &n, &k, &alpha, A, &lda, &beta, C, &ldc); 
+	ssyrk_64_(uplo, trans, &n, &k, &alpha, A, &lda, &beta, C, &ldc); 
 }
 }
 
 
-void STARPU_SGER(const int m, const int n, const float alpha,
-                  const float *x, const int incx, const float *y,
-                  const int incy, float *A, const int lda)
+void STARPU_SGER(const BLASINT m, const BLASINT n, const float alpha,
+                  const float *x, const BLASINT incx, const float *y,
+                  const BLASINT incy, float *A, const BLASINT lda)
 {
 {
-	sger_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
+	sger_64_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
 }
 }
 
 
-void STARPU_DGER(const int m, const int n, const double alpha,
-                  const double *x, const int incx, const double *y,
-                  const int incy, double *A, const int lda)
+void STARPU_DGER(const BLASINT m, const BLASINT n, const double alpha,
+                  const double *x, const BLASINT incx, const double *y,
+                  const BLASINT incy, double *A, const BLASINT lda)
 {
 {
-	dger_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
+	dger_64_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
 }
 }
 
 
 void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
 void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
-                   const int n, const float *A, const int lda, float *x, 
-                   const int incx)
+                   const BLASINT n, const float *A, const BLASINT lda, float *x, 
+                   const BLASINT incx)
 {
 {
-	strsv_(uplo, trans, diag, &n, A, &lda, x, &incx);
+	strsv_64_(uplo, trans, diag, &n, A, &lda, x, &incx);
 }
 }
 
 
 void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
 void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int m, const int n,
-                 const float alpha, const float *A, const int lda,
-                 float *B, const int ldb)
+                 const char *diag, const BLASINT m, const BLASINT n,
+                 const float alpha, const float *A, const BLASINT lda,
+                 float *B, const BLASINT ldb)
 {
 {
-	strmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+	strmm_64_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 }
 }
 
 
 void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
 void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int m, const int n,
-                 const double alpha, const double *A, const int lda,
-                 double *B, const int ldb)
+                 const char *diag, const BLASINT m, const BLASINT n,
+                 const double alpha, const double *A, const BLASINT lda,
+                 double *B, const BLASINT ldb)
 {
 {
-	dtrmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+	dtrmm_64_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 }
 }
 
 
 void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
 void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
-                 const int n, const float *A, const int lda, float *X,
-                 const int incX)
+                 const BLASINT n, const float *A, const BLASINT lda, float *X,
+                 const BLASINT incX)
 {
 {
-	strmv_(uplo, transA, diag, &n, A, &lda, X, &incX);
+	strmv_64_(uplo, transA, diag, &n, A, &lda, X, &incX);
 }
 }
 
 
-void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
+void STARPU_SAXPY(const BLASINT n, const float alpha, float *X, const BLASINT incX, float *Y, const BLASINT incY)
 {
 {
-	saxpy_(&n, &alpha, X, &incX, Y, &incY);
+	saxpy_64_(&n, &alpha, X, &incX, Y, &incY);
 }
 }
 
 
-void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
+void STARPU_DAXPY(const BLASINT n, const double alpha, double *X, const BLASINT incX, double *Y, const BLASINT incY)
 {
 {
-	daxpy_(&n, &alpha, X, &incX, Y, &incY);
+	daxpy_64_(&n, &alpha, X, &incX, Y, &incY);
 }
 }
 
 
-int STARPU_ISAMAX (const int n, float *X, const int incX)
+BLASINT STARPU_ISAMAX (const BLASINT n, float *X, const BLASINT incX)
 {
 {
-    int retVal;
-    retVal = isamax_ (&n, X, &incX);
+    BLASINT retVal;
+    retVal = isamax_64_ (&n, X, &incX);
     return retVal;
     return retVal;
 }
 }
 
 
-int STARPU_IDAMAX (const int n, double *X, const int incX)
+BLASINT STARPU_IDAMAX (const BLASINT n, double *X, const BLASINT incX)
 {
 {
-    int retVal;
-    retVal = idamax_ (&n, X, &incX);
+    BLASINT retVal;
+    retVal = idamax_64_ (&n, X, &incX);
     return retVal;
     return retVal;
 }
 }
 
 
-float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
+float STARPU_SDOT(const BLASINT n, const float *x, const BLASINT incx, const float *y, const BLASINT incy)
 {
 {
 	float retVal = 0;
 	float retVal = 0;
 
 
 	/* GOTOBLAS will return a FLOATRET which is a double, not a float */
 	/* GOTOBLAS will return a FLOATRET which is a double, not a float */
-	retVal = (float)sdot_(&n, x, &incx, y, &incy);
+	retVal = (float)sdot_64_(&n, x, &incx, y, &incy);
 
 
 	return retVal;
 	return retVal;
 }
 }
 
 
-double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy)
-{
-	return ddot_(&n, x, &incx, y, &incy);
-}
-
-void STARPU_SSWAP(const int n, float *X, const int incX, float *Y, const int incY)
-{
-	sswap_(&n, X, &incX, Y, &incY);
-}
-
-void STARPU_DSWAP(const int n, double *X, const int incX, double *Y, const int incY)
+double STARPU_DDOT(const BLASINT n, const double *x, const BLASINT incx, const double *y, const BLASINT incy)
 {
 {
-	dswap_(&n, X, &incX, Y, &incY);
+	return ddot_64_(&n, x, &incx, y, &incy);
 }
 }
 
 
-#if defined(STARPU_MKL) || defined(STARPU_ARMPL)
-void STARPU_SPOTRF(const char*uplo, const int n, float *a, const int lda)
+void STARPU_SSWAP(const BLASINT n, float *X, const BLASINT incX, float *Y, const BLASINT incY)
 {
 {
-	int info = 0;
-	spotrf_(uplo, &n, a, &lda, &info);
+	sswap_64_(&n, X, &incX, Y, &incY);
 }
 }
 
 
-void STARPU_DPOTRF(const char*uplo, const int n, double *a, const int lda)
+void STARPU_DSWAP(const BLASINT n, double *X, const BLASINT incX, double *Y, const BLASINT incY)
 {
 {
-	int info = 0;
-	dpotrf_(uplo, &n, a, &lda, &info);
+	dswap_64_(&n, X, &incX, Y, &incY);
 }
 }
-#endif
-
-#elif defined(STARPU_SIMGRID)
-inline void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, 
-			float alpha, const float *A, int lda, const float *B, int ldb, 
-			float beta, float *C, int ldc) { }
-
-inline void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
-			double alpha, double *A, int lda, double *B, int ldb, 
-			double beta, double *C, int ldc) { }
-
-inline void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
-		float *X, int incX, float beta, float *Y, int incY) { }
-
-inline void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
-		double *X, int incX, double beta, double *Y, int incY) { }
-
-inline float STARPU_SASUM(int N, float *X, int incX) { return 0.; }
-
-inline double STARPU_DASUM(int N, double *X, int incX) { return 0.; }
-
-void STARPU_SSCAL(int N, float alpha, float *X, int incX) { }
-
-void STARPU_DSCAL(int N, double alpha, double *X, int incX) { }
-
-void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
-                   const char *diag, const int m, const int n,
-                   const float alpha, const float *A, const int lda,
-                   float *B, const int ldb) { }
-
-void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
-                   const char *diag, const int m, const int n,
-                   const double alpha, const double *A, const int lda,
-                   double *B, const int ldb) { }
-
-void STARPU_SSYR (const char *uplo, const int n, const float alpha,
-                  const float *x, const int incx, float *A, const int lda) { }
-
-void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
-                   const int k, const float alpha, const float *A,
-                   const int lda, const float beta, float *C,
-                   const int ldc) { }
-
-void STARPU_SGER(const int m, const int n, const float alpha,
-                  const float *x, const int incx, const float *y,
-                  const int incy, float *A, const int lda) { }
-
-void STARPU_DGER(const int m, const int n, const double alpha,
-                  const double *x, const int incx, const double *y,
-                  const int incy, double *A, const int lda) { }
-
-void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
-                   const int n, const float *A, const int lda, float *x, 
-                   const int incx) { }
-
-void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int m, const int n,
-                 const float alpha, const float *A, const int lda,
-                 float *B, const int ldb) { }
-
-void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int m, const int n,
-                 const double alpha, const double *A, const int lda,
-                 double *B, const int ldb) { }
-
-void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
-                 const int n, const float *A, const int lda, float *X,
-                 const int incX) { }
-
-void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY) { }
-
-void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY) { }
-
-int STARPU_ISAMAX (const int n, float *X, const int incX) { return 0; }
-
-int STARPU_IDAMAX (const int n, double *X, const int incX) { return 0; }
-
-float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy) { return 0.; }
-
-double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy) { return 0.; }
-
-void STARPU_SSWAP(const int n, float *X, const int incX, float *Y, const int incY) { }
-
-void STARPU_DSWAP(const int n, double *X, const int incX, double *Y, const int incY) { }
-
-void STARPU_SPOTRF(const char*uplo, const int n, float *a, const int lda) { }
-
-void STARPU_DPOTRF(const char*uplo, const int n, double *a, const int lda) { }
-#endif

+ 116 - 134
julia/src/blas.h

@@ -17,150 +17,132 @@
 #ifndef __BLAS_H__
 #ifndef __BLAS_H__
 #define __BLAS_H__
 #define __BLAS_H__
 
 
-#include <starpu.h>
+#include <stdint.h>
 
 
-#if defined(STARPU_ATLAS) || defined(STARPU_HAVE_CBLAS_H)
-#include <cblas.h>
-#endif
+#define BLASINT int64_t
 
 
-void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, float alpha, const float *A, int lda, 
-		const float *B, int ldb, float beta, float *C, int ldc);
-void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, double alpha, double *A, int lda, 
-		double *B, int ldb, double beta, double *C, int ldc);
-void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
-		float *X, int incX, float beta, float *Y, int incY);
-void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
-		double *X, int incX, double beta, double *Y, int incY);
-float STARPU_SASUM(int N, float *X, int incX);
-double STARPU_DASUM(int N, double *X, int incX);
-void STARPU_SSCAL(int N, float alpha, float *X, int incX);
-void STARPU_DSCAL(int N, double alpha, double *X, int incX);
+void STARPU_SGEMM(char *transa, char *transb, BLASINT M, BLASINT N, BLASINT K, float alpha, const float *A, BLASINT lda, 
+		const float *B, BLASINT ldb, float beta, float *C, BLASINT ldc);
+void STARPU_DGEMM(char *transa, char *transb, BLASINT M, BLASINT N, BLASINT K, double alpha, double *A, BLASINT lda, 
+		double *B, BLASINT ldb, double beta, double *C, BLASINT ldc);
+void STARPU_SGEMV(char *transa, BLASINT M, BLASINT N, float alpha, float *A, BLASINT lda,
+		float *X, BLASINT incX, float beta, float *Y, BLASINT incY);
+void STARPU_DGEMV(char *transa, BLASINT M, BLASINT N, double alpha, double *A, BLASINT lda,
+		double *X, BLASINT incX, double beta, double *Y, BLASINT incY);
+float STARPU_SASUM(BLASINT N, float *X, BLASINT incX);
+double STARPU_DASUM(BLASINT N, double *X, BLASINT incX);
+void STARPU_SSCAL(BLASINT N, float alpha, float *X, BLASINT incX);
+void STARPU_DSCAL(BLASINT N, double alpha, double *X, BLASINT incX);
 void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
 void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
-                   const char *diag, const int m, const int n,
-                   const float alpha, const float *A, const int lda,
-                   float *B, const int ldb);
+                   const char *diag, const BLASINT m, const BLASINT n,
+                   const float alpha, const float *A, const BLASINT lda,
+                   float *B, const BLASINT ldb);
 void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
 void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
-                   const char *diag, const int m, const int n,
-                   const double alpha, const double *A, const int lda,
-                   double *B, const int ldb);
-void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
-			double alpha, double *A, int lda, double *B, int ldb, 
-			double beta, double *C, int ldc);
-void STARPU_SSYR (const char *uplo, const int n, const float alpha,
-                  const float *x, const int incx, float *A, const int lda);
-void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
-                   const int k, const float alpha, const float *A,
-                   const int lda, const float beta, float *C,
-                   const int ldc);
-void STARPU_SGER (const int m, const int n, const float alpha,
-                  const float *x, const int incx, const float *y,
-                  const int incy, float *A, const int lda);
-void STARPU_DGER(const int m, const int n, const double alpha,
-                  const double *x, const int incx, const double *y,
-                  const int incy, double *A, const int lda);
+                   const char *diag, const BLASINT m, const BLASINT n,
+                   const double alpha, const double *A, const BLASINT lda,
+                   double *B, const BLASINT ldb);
+void STARPU_SSYR (const char *uplo, const BLASINT n, const float alpha,
+                  const float *x, const BLASINT incx, float *A, const BLASINT lda);
+void STARPU_SSYRK (const char *uplo, const char *trans, const BLASINT n,
+                   const BLASINT k, const float alpha, const float *A,
+                   const BLASINT lda, const float beta, float *C,
+                   const BLASINT ldc);
+void STARPU_SGER (const BLASINT m, const BLASINT n, const float alpha,
+                  const float *x, const BLASINT incx, const float *y,
+                  const BLASINT incy, float *A, const BLASINT lda);
+void STARPU_DGER(const BLASINT m, const BLASINT n, const double alpha,
+                  const double *x, const BLASINT incx, const double *y,
+                  const BLASINT incy, double *A, const BLASINT lda);
 void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
 void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
-                   const int n, const float *A, const int lda, float *x, 
-                   const int incx);
+                   const BLASINT n, const float *A, const BLASINT lda, float *x, 
+                   const BLASINT incx);
 void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
 void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int m, const int n,
-                 const float alpha, const float *A, const int lda,
-                 float *B, const int ldb);
+                 const char *diag, const BLASINT m, const BLASINT n,
+                 const float alpha, const float *A, const BLASINT lda,
+                 float *B, const BLASINT ldb);
 void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
 void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int m, const int n,
-                 const double alpha, const double *A, const int lda,
-                 double *B, const int ldb);
+                 const char *diag, const BLASINT m, const BLASINT n,
+                 const double alpha, const double *A, const BLASINT lda,
+                 double *B, const BLASINT ldb);
 void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
 void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
-                 const int n, const float *A, const int lda, float *X,
-                 const int incX);
-void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incy);
-void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY);
-int STARPU_ISAMAX (const int n, float *X, const int incX);
-int STARPU_IDAMAX (const int n, double *X, const int incX);
-float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy);
-double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy);
-void STARPU_SSWAP(const int n, float *x, const int incx, float *y, const int incy);
-void STARPU_DSWAP(const int n, double *x, const int incx, double *y, const int incy);
+                 const BLASINT n, const float *A, const BLASINT lda, float *X,
+                 const BLASINT incX);
+void STARPU_SAXPY(const BLASINT n, const float alpha, float *X, const BLASINT incX, float *Y, const BLASINT incy);
+void STARPU_DAXPY(const BLASINT n, const double alpha, double *X, const BLASINT incX, double *Y, const BLASINT incY);
+BLASINT STARPU_ISAMAX (const BLASINT n, float *X, const BLASINT incX);
+BLASINT STARPU_IDAMAX (const BLASINT n, double *X, const BLASINT incX);
+float STARPU_SDOT(const BLASINT n, const float *x, const BLASINT incx, const float *y, const BLASINT incy);
+double STARPU_DDOT(const BLASINT n, const double *x, const BLASINT incx, const double *y, const BLASINT incy);
+void STARPU_SSWAP(const BLASINT n, float *x, const BLASINT incx, float *y, const BLASINT incy);
+void STARPU_DSWAP(const BLASINT n, double *x, const BLASINT incx, double *y, const BLASINT incy);
 
 
-#if defined(STARPU_MKL) || defined(STARPU_ARMPL)
-void STARPU_SPOTRF(const char*uplo, const int n, float *a, const int lda);
-void STARPU_DPOTRF(const char*uplo, const int n, double *a, const int lda);
-#endif
 
 
-#if defined(STARPU_GOTO) || defined(STARPU_OPENBLAS) || defined(STARPU_SYSTEM_BLAS) || defined(STARPU_MKL) || defined(STARPU_ARMPL)
-
-extern void sgemm_ (const char *transa, const char *transb, const int *m,
-                   const int *n, const int *k, const float *alpha, 
-                   const float *A, const int *lda, const float *B, 
-                   const int *ldb, const float *beta, float *C, 
-                   const int *ldc);
-extern void dgemm_ (const char *transa, const char *transb, const int *m,
-                   const int *n, const int *k, const double *alpha, 
-                   const double *A, const int *lda, const double *B, 
-                   const int *ldb, const double *beta, double *C, 
-                   const int *ldc);
-extern void sgemv_(const char *trans, const int *m, const int *n, const float *alpha,
-                   const float *a, const int *lda, const float *x, const int *incx, 
-                   const float *beta, float *y, const int *incy);
-extern void dgemv_(const char *trans, const int *m, const int *n, const double *alpha,
-                   const double *a, const int *lda, const double *x, const int *incx,
-                   const double *beta, double *y, const int *incy);
-extern void ssyr_ (const char *uplo, const int *n, const float *alpha,
-                  const float *x, const int *incx, float *A, const int *lda);
-extern void ssyrk_ (const char *uplo, const char *trans, const int *n,
-                   const int *k, const float *alpha, const float *A,
-                   const int *lda, const float *beta, float *C,
-                   const int *ldc);
-extern void strsm_ (const char *side, const char *uplo, const char *transa, 
-                   const char *diag, const int *m, const int *n,
-                   const float *alpha, const float *A, const int *lda,
-                   float *B, const int *ldb);
-extern void dtrsm_ (const char *side, const char *uplo, const char *transa, 
-                   const char *diag, const int *m, const int *n,
-                   const double *alpha, const double *A, const int *lda,
-                   double *B, const int *ldb);
-extern double sasum_ (const int *n, const float *x, const int *incx);
-extern double dasum_ (const int *n, const double *x, const int *incx);
-extern void sscal_ (const int *n, const float *alpha, float *x,
-                   const int *incx);
-extern void dscal_ (const int *n, const double *alpha, double *x,
-                   const int *incx);
-extern void sger_(const int *m, const int *n, const float *alpha,
-                  const float *x, const int *incx, const float *y,
-                  const int *incy, float *A, const int *lda);
-extern void dger_(const int *m, const int *n, const double *alpha,
-                  const double *x, const int *incx, const double *y,
-                  const int *incy, double *A, const int *lda);
-extern void strsv_ (const char *uplo, const char *trans, const char *diag, 
-                   const int *n, const float *A, const int *lda, float *x, 
-                   const int *incx);
-extern void strmm_(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int *m, const int *n,
-                 const float *alpha, const float *A, const int *lda,
-                 float *B, const int *ldb);
-extern void dtrmm_(const char *side, const char *uplo, const char *transA,
-                 const char *diag, const int *m, const int *n,
-                 const double *alpha, const double *A, const int *lda,
-                 double *B, const int *ldb);
-extern void strmv_(const char *uplo, const char *transA, const char *diag,
-                 const int *n, const float *A, const int *lda, float *X,
-                 const int *incX);
-extern void saxpy_(const int *n, const float *alpha, const float *X, const int *incX,
-		float *Y, const int *incy);
-extern void daxpy_(const int *n, const double *alpha, const double *X, const int *incX,
-		double *Y, const int *incy);
-extern int isamax_(const int *n, const float *X, const int *incX);
-extern int idamax_(const int *n, const double *X, const int *incX);
+extern void sgemm_64_ (const char *transa, const char *transb, const BLASINT *m,
+                   const BLASINT *n, const BLASINT *k, const float *alpha, 
+                   const float *A, const BLASINT *lda, const float *B, 
+                   const BLASINT *ldb, const float *beta, float *C, 
+                   const BLASINT *ldc);
+extern void dgemm_64_ (const char *transa, const char *transb, const BLASINT *m,
+                   const BLASINT *n, const BLASINT *k, const double *alpha, 
+                   const double *A, const BLASINT *lda, const double *B, 
+                   const BLASINT *ldb, const double *beta, double *C, 
+                   const BLASINT *ldc);
+extern void sgemv_64_(const char *trans, const BLASINT *m, const BLASINT *n, const float *alpha,
+                   const float *a, const BLASINT *lda, const float *x, const BLASINT *incx, 
+                   const float *beta, float *y, const BLASINT *incy);
+extern void dgemv_64_(const char *trans, const BLASINT *m, const BLASINT *n, const double *alpha,
+                   const double *a, const BLASINT *lda, const double *x, const BLASINT *incx,
+                   const double *beta, double *y, const BLASINT *incy);
+extern void ssyr_64_ (const char *uplo, const BLASINT *n, const float *alpha,
+                  const float *x, const BLASINT *incx, float *A, const BLASINT *lda);
+extern void ssyrk_64_ (const char *uplo, const char *trans, const BLASINT *n,
+                   const BLASINT *k, const float *alpha, const float *A,
+                   const BLASINT *lda, const float *beta, float *C,
+                   const BLASINT *ldc);
+extern void strsm_64_ (const char *side, const char *uplo, const char *transa, 
+                   const char *diag, const BLASINT *m, const BLASINT *n,
+                   const float *alpha, const float *A, const BLASINT *lda,
+                   float *B, const BLASINT *ldb);
+extern void dtrsm_64_ (const char *side, const char *uplo, const char *transa, 
+                   const char *diag, const BLASINT *m, const BLASINT *n,
+                   const double *alpha, const double *A, const BLASINT *lda,
+                   double *B, const BLASINT *ldb);
+extern double sasum_64_ (const BLASINT *n, const float *x, const BLASINT *incx);
+extern double dasum_64_ (const BLASINT *n, const double *x, const BLASINT *incx);
+extern void sscal_64_ (const BLASINT *n, const float *alpha, float *x,
+                   const BLASINT *incx);
+extern void dscal_64_ (const BLASINT *n, const double *alpha, double *x,
+                   const BLASINT *incx);
+extern void sger_64_(const BLASINT *m, const BLASINT *n, const float *alpha,
+                  const float *x, const BLASINT *incx, const float *y,
+                  const BLASINT *incy, float *A, const BLASINT *lda);
+extern void dger_64_(const BLASINT *m, const BLASINT *n, const double *alpha,
+                  const double *x, const BLASINT *incx, const double *y,
+                  const BLASINT *incy, double *A, const BLASINT *lda);
+extern void strsv_64_ (const char *uplo, const char *trans, const char *diag, 
+                   const BLASINT *n, const float *A, const BLASINT *lda, float *x, 
+                   const BLASINT *incx);
+extern void strmm_64_(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const BLASINT *m, const BLASINT *n,
+                 const float *alpha, const float *A, const BLASINT *lda,
+                 float *B, const BLASINT *ldb);
+extern void dtrmm_64_(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const BLASINT *m, const BLASINT *n,
+                 const double *alpha, const double *A, const BLASINT *lda,
+                 double *B, const BLASINT *ldb);
+extern void strmv_64_(const char *uplo, const char *transA, const char *diag,
+                 const BLASINT *n, const float *A, const BLASINT *lda, float *X,
+                 const BLASINT *incX);
+extern void saxpy_64_(const BLASINT *n, const float *alpha, const float *X, const BLASINT *incX,
+		float *Y, const BLASINT *incy);
+extern void daxpy_64_(const BLASINT *n, const double *alpha, const double *X, const BLASINT *incX,
+		double *Y, const BLASINT *incy);
+extern BLASINT isamax_64_(const BLASINT *n, const float *X, const BLASINT *incX);
+extern BLASINT idamax_64_(const BLASINT *n, const double *X, const BLASINT *incX);
 /* for some reason, FLOATRET is not a float but a double in GOTOBLAS */
 /* for some reason, FLOATRET is not a float but a double in GOTOBLAS */
-extern double sdot_(const int *n, const float *x, const int *incx, const float *y, const int *incy);
-extern double ddot_(const int *n, const double *x, const int *incx, const double *y, const int *incy);
-extern void sswap_(const int *n, float *x, const int *incx, float *y, const int *incy);
-extern void dswap_(const int *n, double *x, const int *incx, double *y, const int *incy);
-
-#if (defined STARPU_MKL) || (defined STARPU_ARMPL)
-extern void spotrf_(const char*uplo, const int *n, float *a, const int *lda, int *info);
-extern void dpotrf_(const char*uplo, const int *n, double *a, const int *lda, int *info);
-#endif
-
-#endif
+extern double sdot_64_(const BLASINT *n, const float *x, const BLASINT *incx, const float *y, const BLASINT *incy);
+extern double ddot_64_(const BLASINT *n, const double *x, const BLASINT *incx, const double *y, const BLASINT *incy);
+extern void sswap_64_(const BLASINT *n, float *x, const BLASINT *incx, float *y, const BLASINT *incy);
+extern void dswap_64_(const BLASINT *n, double *x, const BLASINT *incx, double *y, const BLASINT *incy);
 
 
 #endif /* __BLAS_H__ */
 #endif /* __BLAS_H__ */

+ 15 - 0
julia/src/blas.jl

@@ -1,3 +1,18 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
 @enum STARPU_BLAS begin
 @enum STARPU_BLAS begin
     STARPU_SAXPY
     STARPU_SAXPY
 end
 end

+ 15 - 0
julia/src/blas_wrapper.c

@@ -1,3 +1,18 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
 #include <starpu.h>
 #include <starpu.h>
 #include <blas.h>
 #include <blas.h>
 
 

+ 25 - 8
julia/src/compiler/c.jl

@@ -73,15 +73,16 @@ function transform_to_cpu_kernel(expr :: StarpuExprFunction)
     output = add_for_loop_declarations(expr)
     output = add_for_loop_declarations(expr)
     output = substitute_args(output)
     output = substitute_args(output)
     output = substitute_func_calls(output)
     output = substitute_func_calls(output)
+    output = substitute_views(output)
     output = substitute_indexing(output)
     output = substitute_indexing(output)
     output = flatten_blocks(output)
     output = flatten_blocks(output)
 
 
     return output
     return output
 end
 end
 
 
-function generate_c_struct_param_declaration(funcname)
-    scalar_parameters = CODELETS_SCALARS[funcname]
-    struct_params_name = CODELETS_PARAMS_STRUCT[funcname]
+function generate_c_struct_param_declaration(codelet_name)
+    scalar_parameters = CODELETS_SCALARS[codelet_name]
+    struct_params_name = CODELETS_PARAMS_STRUCT[codelet_name]
 
 
     output = "struct $struct_params_name {\n"
     output = "struct $struct_params_name {\n"
     for p in scalar_parameters
     for p in scalar_parameters
@@ -212,18 +213,18 @@ function substitute_args(expr :: StarpuExprFunction)
 
 
 
 
     new_args = [
     new_args = [
-                    starpu_parse(:($buffer_arg_name :: Matrix{Nothing})),
-                    starpu_parse(:($cl_arg_name :: Vector{Nothing}))
-                ]
+        starpu_parse(:($buffer_arg_name :: Ptr{Ptr{Nothing}})),
+        starpu_parse(:($cl_arg_name :: Vector{Nothing}))
+    ]
     new_body = StarpuExprBlock([function_start_affectations..., new_body.exprs...])
     new_body = StarpuExprBlock([function_start_affectations..., new_body.exprs...])
 
 
     return StarpuExprFunction(expr.ret_type, expr.func, new_args, new_body)
     return StarpuExprFunction(expr.ret_type, expr.func, new_args, new_body)
 end
 end
 
 
 func_substitution = Dict(
 func_substitution = Dict(
-    :width => :STARPU_MATRIX_GET_NY,
+    :width  => :STARPU_MATRIX_GET_NY,
     :height => :STARPU_MATRIX_GET_NX,
     :height => :STARPU_MATRIX_GET_NX,
-
+    :ld     => :STARPU_MATRIX_GET_LD,
     :length => :STARPU_VECTOR_GET_NX
     :length => :STARPU_VECTOR_GET_NX
 )
 )
 
 
@@ -243,6 +244,22 @@ function substitute_func_calls(expr :: StarpuExpr)
     return apply(func_to_apply, expr)
     return apply(func_to_apply, expr)
 end
 end
 
 
+# Rewrite every `view(ref, idx...)` call found in `expr` into
+# `StarpuExprAddress(StarpuExprRef(ref, idx...))`. An index that is a
+# StarpuExprInterval is replaced by its `start`; any other index is kept
+# unchanged. Nodes that are not a call to `view` are returned as they are.
+function substitute_views(expr :: StarpuExpr)
+    # Only the first element of an interval is needed to take an address.
+    first_index(idx) = isa(idx, StarpuExprInterval) ? idx.start : idx
+
+    function rewrite_view(node :: StarpuExpr)
+        if isa(node, StarpuExprCall) && node.func == :view
+            target = node.args[1]
+            offsets = map(first_index, node.args[2:end])
+            return StarpuExprAddress(StarpuExprRef(target, offsets))
+        end
+
+        return node
+    end
+
+    return apply(rewrite_view, expr)
+end
 
 
 function substitute_indexing(expr :: StarpuExpr)
 function substitute_indexing(expr :: StarpuExpr)
 
 

+ 301 - 25
julia/src/compiler/cuda.jl

@@ -144,7 +144,278 @@ function add_device_to_interval_call(expr :: StarpuExpr)
     return apply(func_to_apply, expr)
     return apply(func_to_apply, expr)
 end
 end
 
 
+# Rewrite every StarPU BLAS wrapper call (STARPU_SGEMM, STARPU_DAXPY, ...)
+# found in `expr` into the matching cuBLAS call.
+#
+# For each translated call:
+#   - string flags ("N", "L", ...) become the corresponding CUBLAS_* constant
+#     (lower-case letters are accepted as well);
+#   - scalar arguments (alpha, beta) must be variables and are passed by
+#     address (`alpha` -> `&alpha`);
+#   - `starpu_cublas_get_local_handle()` is prepended to the argument list;
+#   - the call result is stored in a fresh `cublasStatus_t` variable, checked
+#     against CUBLAS_STATUS_SUCCESS, and followed by a synchronization of the
+#     local CUDA stream.
+#
+# Raises an error when a flag argument is not a string literal, holds an
+# unsupported letter, or when a scalar argument is not a variable.
+# The arguments of the visited call node are copied, not modified in place.
+function translate_cublas(expr :: StarpuExpr)
+    # STARPU_BLAS => (CUBLAS, TRANS, FILLMODE, ALPHA, SIDE, DIAG)
+    # Each list holds the 1-based positions, within the STARPU_* call, of the
+    # arguments of that kind. Built once instead of once per visited node.
+    blas_to_cublas = Dict(:STARPU_SGEMM  => (:cublasSgemm, [1, 2], [], [6, 11], [], []),
+                          :STARPU_DGEMM  => (:cublasDgemm, [1, 2], [], [6, 11], [], []),
+                          :STARPU_SGEMV  => (:cublasSgemv, [1], [], [4,9], [], []),
+                          :STARPU_DGEMV  => (:cublasDgemv, [1], [], [4,9], [], []),
+                          :STARPU_SSCAL  => (:cublasSscal, [], [], [2], [], []),
+                          :STARPU_DSCAL  => (:cublasDscal, [], [], [2], [], []),
+                          :STARPU_STRSM  => (:cublasStrsm, [3], [2], [7], [1], [4]),
+                          :STARPU_DTRSM  => (:cublasDtrsm, [3], [2], [7], [1], [4]),
+                          :STARPU_SSYR   => (:cublasSsyr, [], [1], [3], [], []),
+                          :STARPU_SSYRK  => (:cublasSsyrk, [2], [1], [5,8], [], []),
+                          :STARPU_SGER   => (:cublasSger, [], [], [3], [], []),
+                          :STARPU_DGER   => (:cublasDger, [], [], [3], [], []),
+                          :STARPU_STRSV  => (:cublasStrsv, [2], [1], [], [], [3]),
+                          :STARPU_STRMM  => (:cublasStrmm, [3], [2], [7], [1], [4]),
+                          :STARPU_DTRMM  => (:cublasDtrmm, [3], [2], [7], [1], [4]),
+                          :STARPU_STRMV  => (:cublasStrmv, [2], [1], [], [], [3]),
+                          :STARPU_SAXPY  => (:cublasSaxpy, [], [], [2], [], []),
+                          :STARPU_DAXPY  => (:cublasDaxpy, [], [], [2], [], []),
+                          :STARPU_SSWAP  => (:cublasSswap, [], [], [], [], []),
+                          :STARPU_DSWAP  => (:cublasDswap, [], [], [], [], []))
+
+    # Upper-case flag letter => CUBLAS constant, one table per flag kind.
+    trans_values = Dict("N" => :CUBLAS_OP_N, "T" => :CUBLAS_OP_T, "C" => :CUBLAS_OP_C)
+    fill_values  = Dict("L" => :CUBLAS_FILL_MODE_LOWER, "U" => :CUBLAS_FILL_MODE_UPPER)
+    side_values  = Dict("L" => :CUBLAS_SIDE_LEFT, "R" => :CUBLAS_SIDE_RIGHT)
+    diag_values  = Dict("N" => :CUBLAS_DIAG_NON_UNIT, "U" => :CUBLAS_DIAG_UNIT)
+
+    function func_to_run(x :: StarpuExpr)
+        if !(isa(x, StarpuExprCall) && haskey(blas_to_cublas, x.func))
+            return x
+        end
+
+        (cublas_func, trans_pos, fill_pos, scalar_pos, side_pos, diag_pos) = blas_to_cublas[x.func]
+
+        # Work on a copy so that the visited node is left untouched.
+        new_args = copy(x.args)
+
+        # Return the string held by argument `i`, or abort if it is not a
+        # string literal. `show_arg` appends the offending argument to the
+        # message (only the side-mode check reports it).
+        function string_argument(i, show_arg)
+            arg = new_args[i]
+            if isa(arg, StarpuExprValue) && isa(arg.value, String)
+                return arg.value
+            end
+            if show_arg
+                error("Argument $i of ", x.func, " must be a string, got: ", arg)
+            end
+            error("Argument $i of ", x.func, " must be a string")
+        end
+
+        # Replace the string flags located at `positions` by the CUBLAS
+        # constant found in `choices`; `expected` is only used for reporting.
+        function translate_flags!(positions, choices, expected, show_arg = false)
+            for i in positions
+                value = string_argument(i, show_arg)
+                constant = get(choices, uppercase(value), nothing)
+                if constant === nothing
+                    error("Unhandled value for argument $i of ", x.func, ": ", value,
+                          ", expecting ", expected)
+                end
+                new_args[i] = StarpuExprVar(constant)
+            end
+        end
+
+        # cublasOperation_t parameters (e.g. StarpuExprValue("N"))
+        translate_flags!(trans_pos, trans_values, "(\"N\", \"T\", or \"C\")")
+
+        # cublasFillMode_t parameters (e.g. StarpuExprValue("L"))
+        translate_flags!(fill_pos, fill_values, "(\"L\" or \"U\")")
+
+        # scalar parameters (alpha, beta, ...):  alpha -> &alpha
+        for i in scalar_pos
+            if !isa(new_args[i], StarpuExprVar)
+                error("Argument $i of ", x.func, " must be a variable")
+            end
+            var_name = new_args[i].name
+            new_args[i] = StarpuExprVar(Symbol("&$var_name"))
+        end
+
+        # cublasSideMode_t parameters (e.g. StarpuExprValue("L"))
+        translate_flags!(side_pos, side_values, "(\"L\" or \"R\")", true)
+
+        # cublasDiagType_t parameters (e.g. StarpuExprValue("N"))
+        translate_flags!(diag_pos, diag_values, "(\"N\" or \"U\")")
+
+        # cuBLAS calls take the handle as their first argument.
+        call_args = [@parse(starpu_cublas_get_local_handle()), new_args...]
+
+        # The random suffix keeps the status variables of several calls apart.
+        status_varname = "status"*rand_string()
+        status_var = StarpuExprVar(Symbol("cublasStatus_t "*status_varname))
+        call_expr = StarpuExprCall(cublas_func, call_args)
+
+        return StarpuExprBlock([StarpuExprAffect(status_var, call_expr),
+                                starpu_parse(Meta.parse("""if $status_varname != CUBLAS_STATUS_SUCCESS
+                                                              STARPU_CUBLAS_REPORT_ERROR($status_varname)
+                                                          end""")),
+                                @parse cudaStreamSynchronize(starpu_cuda_get_local_stream())])
+    end
+
+    return apply(func_to_run, expr)
+end
+
+# Gather every StarpuExprAffect node that `apply` hands to the callback while
+# traversing `cpu_instr`, in the order they are encountered. The traversed
+# expression itself is not modified.
+function get_all_assignments(cpu_instr)
+    assignments = StarpuExpr[]
+
+    # Identity transformation, used only for its side effect on `assignments`.
+    function record_assignment(node :: StarpuExpr)
+        isa(node, StarpuExprAffect) && push!(assignments, node)
+        return node
+    end
+
+    apply(record_assignment, cpu_instr)
+
+    return assignments
+end
+
+# Return the left-hand side variables of every assignment of `cpu_instr`
+# whose right-hand side is a call to STARPU_MATRIX_GET_PTR or
+# STARPU_VECTOR_GET_PTR, i.e. the variables holding a buffer pointer.
+function get_all_buffer_vars(cpu_instr)
+    pointer_getters = [:STARPU_MATRIX_GET_PTR, :STARPU_VECTOR_GET_PTR]
+    buffer_vars = StarpuExprTypedVar[]
+
+    for assignment in get_all_assignments(cpu_instr)
+        rhs = assignment.expr
+        if !isa(rhs, StarpuExprCall) || !(rhs.func in pointer_getters)
+            continue
+        end
+
+        push!(buffer_vars, assignment.var)
+    end
+
+    return buffer_vars
+end
+
+# Return every assignment of `cpu_instr` that writes to an element of one of
+# the buffers listed in `vars`, i.e. whose left-hand side is `name[...]` with
+# `name` being the name of an element of `vars`.
+function get_all_buffer_stores(cpu_instr, vars)
+    stores = StarpuExprAffect[]
+
+    # True when `node` is an assignment to an indexed buffer variable.
+    function is_buffer_store(node)
+        isa(node, StarpuExprAffect) || return false
+        lhs = node.var
+        (isa(lhs, StarpuExprRef) && isa(lhs.ref, StarpuExprVar)) || return false
+        return lhs.ref.name in map(v -> v.name, vars)
+    end
+
+    function record_store(node :: StarpuExpr)
+        if is_buffer_store(node)
+            push!(stores, node)
+        end
+
+        return node
+    end
+
+    apply(record_store, cpu_instr)
+    return stores
+end
+
+# Collect the reads of buffer elements found in `cpu_instr`.
+#
+# Returns a vector of `(instruction, ref)` tuples, where `ref` is a
+# StarpuExprRef whose base is a StarpuExprVar named like one of `vars`, and
+# `instruction` is the value of `current_instr` when `ref` was visited.
+# A reference is skipped when the node visited just before it is a
+# StarpuExprAddress, or when it is the left-hand side of the current
+# assignment.
+function get_all_buffer_refs(cpu_instr, vars)
+    ret = []
+
+    # Last visited node considered to be an instruction (see InstrTy below).
+    current_instr = nothing
+    InstrTy = Union{StarpuExprAffect,
+                    StarpuExprCall,
+                    StarpuExprCudaCall,
+                    StarpuExprFor,
+                    StarpuExprIf,
+                    StarpuExprIfElse,
+                    StarpuExprReturn,
+                    StarpuExprBreak,
+                    StarpuExprWhile}
+    # NOTE(review): this holds the previously visited node, which is the real
+    # parent only if visit_preorder reaches a child right after its parent;
+    # confirm against visit_preorder.
+    parent = nothing
 
 
+    function func_to_run(x :: StarpuExpr)
+        # Calls to the listed arithmetic/comparison operators and to sqrt are
+        # not treated as instructions, so they do not update current_instr.
+        if isa(x, InstrTy) && !(isa(x, StarpuExprCall) && x.func in [:(+), :(-), :(*), :(/), :(%), :(<), :(<=), :(==), :(!=), :(>=), :(>), :sqrt])
+            current_instr = x
+        end
+
+        if isa(x, StarpuExprRef) && isa(x.ref, StarpuExprVar) && x.ref.name in map(x -> x.name, vars) && # var[...]
+            !isa(parent, StarpuExprAddress) && # filter &var[..]
+            !(isa(current_instr, StarpuExprAffect) && current_instr.var == x) # filter lhs ref
+            push!(ret, (current_instr, x))
+        end
+
+        parent = x
+        return x
+    end
+
+    visit_preorder(func_to_run, cpu_instr)
+    return ret
+end
+
+# Rewrite the element accesses made to CUDA buffers in `cpu_instr` so that
+# they go through a temporary variable and an explicit cudaMemcpy.
+#
+# Each load L:
+#     L: ... buffer[id]
+# becomes the block:
+#     Type varX
+#     cudaMemcpy(&varX, &buffer[id], sizeof(Type), cudaMemcpyDeviceToHost)
+#     L: ... varX
+#
+# Each store S:
+#     S: buffer[id] = expr
+# becomes the block:
+#     Type varX
+#     varX = expr
+#     cudaMemcpy(&buffer[id], &varX, sizeof(Type), cudaMemcpyHostToDevice)
+#
+# Loads and stores are both collected from the input before any rewriting;
+# loads are rewritten first, then stores. Returns the rewritten expression.
+function transform_cuda_device_loadstore(cpu_instr :: StarpuExprBlock)
+    # Buffer pointers, and their declared type indexed by variable name
+    buffer_vars = get_all_buffer_vars(cpu_instr)
+    buffer_types = Dict{Symbol, Type}(v.name => v.typ for v in buffer_vars)
+
+    # Accesses to rewrite
+    stores = get_all_buffer_stores(cpu_instr, buffer_vars)
+    loads = get_all_buffer_refs(cpu_instr, buffer_vars)
+
+    # Pick a fresh temporary for an element of buffer `buffer_name`; returns
+    # its symbol, its declaration and the `sizeof(...)` expression of its type.
+    function host_temporary(buffer_name)
+        tmp_name = "var"*rand_string()
+        element_type = eltype(buffer_types[Symbol(buffer_name)])
+        ctype = starpu_type_traduction(element_type)
+        declaration = StarpuExprTypedVar(Symbol(tmp_name), element_type)
+        return (Symbol(tmp_name), declaration, StarpuExprVar(Symbol("sizeof($ctype)")))
+    end
+
+    rewritten = cpu_instr
+
+    for (instr, ref) in loads
+        (tmp, declaration, byte_count) = host_temporary(ref.ref.name)
+        device_to_host = StarpuExprCall(:cudaMemcpy,
+                                        [StarpuExprAddress(StarpuExprVar(tmp)),
+                                         StarpuExprAddress(ref),
+                                         byte_count,
+                                         StarpuExprVar(:cudaMemcpyDeviceToHost)])
+        block = Any[declaration,
+                    device_to_host,
+                    substitute(instr, ref, StarpuExprVar(tmp))]
+
+        rewritten = substitute(rewritten, instr, StarpuExprBlock(block))
+    end
+
+    for store in stores
+        (tmp, declaration, byte_count) = host_temporary(store.var.ref.name)
+        host_to_device = StarpuExprCall(:cudaMemcpy,
+                                        [StarpuExprAddress(store.var),
+                                         StarpuExprAddress(StarpuExprVar(tmp)),
+                                         byte_count,
+                                         StarpuExprVar(:cudaMemcpyHostToDevice)])
+        block = Any[declaration,
+                    StarpuExprAffect(StarpuExprVar(tmp), store.expr),
+                    host_to_device]
+
+        rewritten = substitute(rewritten, store, StarpuExprBlock(block))
+    end
+
+    return rewritten
+end
 
 
 function transform_to_cuda_kernel(func :: StarpuExprFunction)
 function transform_to_cuda_kernel(func :: StarpuExprFunction)
 
 
@@ -152,45 +423,50 @@ function transform_to_cuda_kernel(func :: StarpuExprFunction)
 
 
     init, indep, finish = extract_init_indep_finish(cpu_func.body)
     init, indep, finish = extract_init_indep_finish(cpu_func.body)
 
 
-    if indep == nothing
-        error("No independant for loop has been found") # TODO can fail because extraction is not correct yet
-    end
+    cpu_instr = init
+    kernel = nothing
 
 
-    prekernel_instr, kernel_args, kernel_instr = analyse_sets(indep)
+    # Generate a CUDA kernel only if there is an independent loop (@parallel macro).
+    if (indep != nothing)
+        prekernel_instr, kernel_args, kernel_instr = analyse_sets(indep)
 
 
-    kernel_call = StarpuExprCudaCall(:cudaKernel, (@parse nblocks), (@parse THREADS_PER_BLOCK), StarpuExpr[])
-    prekernel_instr = vcat(init, prekernel_instr)
-    kernel_instr = vcat(kernel_instr, indep.body)
+        kernel_call = StarpuExprCudaCall(:cudaKernel, (@parse nblocks), (@parse THREADS_PER_BLOCK), StarpuExpr[])
+        cpu_instr = vcat(cpu_instr, prekernel_instr)
+        kernel_instr = vcat(kernel_instr, indep.body)
 
 
-    indep_for_def, indep_for_undef = analyse_variable_declarations(StarpuExprBlock(kernel_instr), kernel_args)
-    prekernel_def, prekernel_undef = analyse_variable_declarations(StarpuExprBlock(prekernel_instr), cpu_func.args)
+        indep_for_def, indep_for_undef = analyse_variable_declarations(StarpuExprBlock(kernel_instr), kernel_args)
+        prekernel_def, prekernel_undef = analyse_variable_declarations(StarpuExprBlock(cpu_instr), cpu_func.args)
 
 
-    for undef_var in indep_for_undef
+        for undef_var in indep_for_undef
 
 
-        found_var = find_variable(undef_var, prekernel_def)
+            found_var = find_variable(undef_var, prekernel_def)
 
 
-        if found_var == nothing # TODO : error then ?
-            continue
+            if found_var == nothing # TODO : error then ?
+                continue
+            end
+
+            push!(kernel_args, found_var)
         end
         end
 
 
-        push!(kernel_args, found_var)
+        call_args = map((x -> StarpuExprVar(x.name)), kernel_args)
+        kernelname=Symbol("KERNEL_",func.func);
+        cuda_call = StarpuExprCudaCall(kernelname, (@parse nblocks), (@parse THREADS_PER_BLOCK), call_args)
+        push!(cpu_instr, cuda_call)
+        push!(cpu_instr, @parse cudaStreamSynchronize(starpu_cuda_get_local_stream()))
+        kernel = StarpuExprFunction(Nothing, kernelname, kernel_args, StarpuExprBlock(kernel_instr))
+        kernel = add_device_to_interval_call(kernel)
+        kernel = flatten_blocks(kernel)
     end
     end
 
 
-    call_args = map((x -> StarpuExprVar(x.name)), kernel_args)
-    kernelname=Symbol("KERNEL_",func.func);
-    cuda_call = StarpuExprCudaCall(kernelname, (@parse nblocks), (@parse THREADS_PER_BLOCK), call_args)
-    push!(prekernel_instr, cuda_call)
-    push!(prekernel_instr, @parse cudaStreamSynchronize(starpu_cuda_get_local_stream()))
-    prekernel_instr = vcat(prekernel_instr, finish)
+    cpu_instr = vcat(cpu_instr, finish)
+    cpu_instr = StarpuExprBlock(cpu_instr)
+    cpu_instr = transform_cuda_device_loadstore(cpu_instr)
 
 
     prekernel_name = Symbol("CUDA_", func.func)
     prekernel_name = Symbol("CUDA_", func.func)
-    prekernel = StarpuExprFunction(Nothing, prekernel_name, cpu_func.args, StarpuExprBlock(prekernel_instr))
+    prekernel = StarpuExprFunction(Nothing, prekernel_name, cpu_func.args, cpu_instr)
+    prekernel = translate_cublas(prekernel)
     prekernel = flatten_blocks(prekernel)
     prekernel = flatten_blocks(prekernel)
 
 
-    kernel = StarpuExprFunction(Nothing, kernelname, kernel_args, StarpuExprBlock(kernel_instr))
-    kernel = add_device_to_interval_call(kernel)
-    kernel = flatten_blocks(kernel)
-    
     return prekernel, kernel
     return prekernel, kernel
 end
 end
 
 

+ 351 - 3
julia/src/compiler/expression_manipulation.jl

@@ -14,6 +14,30 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
 #
 
 
+"""
+    Lenient comparison operator for structures and arrays.
+"""
+@generated function ≂(x, y)
+    # Generated function: in this body `x` and `y` are the argument *types*;
+    # each branch returns the expression that is run on the actual values.
+    # Different types, or `x` being a subtype of `Type`: plain `==`.
+    if x != y || x <: Type
+        :(x == y)
+    # Types with fields: compare field by field with `≂`, joined by `&&`.
+    # This branch is tested before the Array branch below.
+    elseif !isempty(fieldnames(x))
+        mapreduce(n -> :(x.$n ≂ y.$n), (a,b)->:($a && $b), fieldnames(x))
+    # Arrays: same length and element-wise `≂`.
+    elseif x <: Array
+        quote
+            if length(x) != length(y)
+                return false
+            end
+            for i in 1:length(x)
+                if !(x[i] ≂ y[i])
+                    return false
+                end
+            end
+            return true
+        end
+    # Anything else (no fields, not an Array): plain `==`.
+    else
+        :(x == y)
+    end
+end
 
 
 """
 """
     Returns a new expression where every occurrence of expr_to_replace into expr
     Returns a new expression where every occurrence of expr_to_replace into expr
@@ -22,8 +46,7 @@
 function substitute(expr :: StarpuExpr, expr_to_replace :: StarpuExpr, new_expr :: StarpuExpr)
 function substitute(expr :: StarpuExpr, expr_to_replace :: StarpuExpr, new_expr :: StarpuExpr)
 
 
     function func_to_apply(x :: StarpuExpr)
     function func_to_apply(x :: StarpuExpr)
-
-        if (x == expr_to_replace)
+        if (x ≂ expr_to_replace)
             return new_expr
             return new_expr
         end
         end
 
 
@@ -33,7 +56,6 @@ function substitute(expr :: StarpuExpr, expr_to_replace :: StarpuExpr, new_expr
     return apply(func_to_apply, expr)
     return apply(func_to_apply, expr)
 end
 end
 
 
-
 """
 """
     Returns an expression where "€" symbols  in expr were replaced
     Returns an expression where "€" symbols  in expr were replaced
     by the following expression list.
     by the following expression list.
@@ -125,3 +147,329 @@ import Base.all
 function all(cond :: Function, expr :: StarpuExpr)
 function all(cond :: Function, expr :: StarpuExpr)
     return !any(!cond, expr)
     return !any(!cond, expr)
 end
 end
+
+"""
+    Pre-order visit of an assignment: calls func on the node itself, then
+    walks the assigned variable, then the assigned expression.
+    Returns expr unchanged.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprAffect)
+    func(expr)
+    for child in (expr.var, expr.expr)
+        visit_preorder(func, child)
+    end
+    return expr
+end
+
+"""
+    Pre-order visit of a block: calls func on the block, then walks each
+    contained expression in order. Returns expr unchanged.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprBlock)
+    func(expr)
+    foreach(stmt -> visit_preorder(func, stmt), expr.exprs)
+    return expr
+end
+
+"""
+    Pre-order visit of a function call: calls func on the call node, then
+    walks each argument in order. Returns expr unchanged.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprCall)
+    func(expr)
+    foreach(arg -> visit_preorder(func, arg), expr.args)
+    return expr
+end
+
+"""
+    Pre-order visit of a CUDA kernel launch: calls func on the launch node,
+    then walks the block count, the threads-per-block expression and each
+    kernel argument. Returns expr unchanged.
+
+    The launch configuration is walked recursively (like the arguments), so
+    nodes nested inside nblocks / threads_per_block are also visited.
+    NOTE(review): assumes both fields hold StarpuExpr values — confirm
+    against the struct definition.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprCudaCall)
+    func(expr)
+    visit_preorder(func, expr.nblocks)
+    visit_preorder(func, expr.threads_per_block)
+    for a in expr.args
+        visit_preorder(func, a)
+    end
+    return expr
+end
+
+# Pre-order visit of a field access: calls func on the node, then calls func
+# directly (no recursion) on its left, field and is_an_arrow members.
+# Returns expr unchanged.
+# NOTE(review): unlike the sibling methods, `left` is not walked with
+# visit_preorder, and func also receives `field` / `is_an_arrow`, which are
+# presumably not StarpuExpr nodes — confirm this is intended.
+function visit_preorder(func :: Function, expr :: StarpuExprField)
+    func(expr)
+    func(expr.left)
+    func(expr.field)
+    func(expr.is_an_arrow)
+    return expr
+end
+
+"""
+    Pre-order visit of a for loop: calls func on the loop node, then walks
+    the set declarations, the iteration set and finally the body.
+    Returns expr unchanged.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprFor)
+    func(expr)
+    foreach(decl -> visit_preorder(func, decl), expr.set_declarations)
+    visit_preorder(func, expr.set)
+    visit_preorder(func, expr.body)
+    return expr
+end
+
+"""
+    Pre-order visit of a function definition: calls func on the function
+    node, then walks each argument and finally the body.
+    Returns expr unchanged.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprFunction)
+    func(expr)
+    for a in expr.args
+        visit_preorder(func, a)
+    end
+    # The body belongs to `expr`; the previous `e.body` referenced an
+    # undefined variable and threw as soon as this method was reached.
+    visit_preorder(func, expr.body)
+    return expr
+end
+
+"""
+    Pre-order visit of an if statement without else: calls func on the
+    node, then walks the condition and the then-branch.
+    Returns expr unchanged.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprIf)
+    func(expr)
+    for child in (expr.cond, expr.then_statement)
+        visit_preorder(func, child)
+    end
+    return expr
+end
+
+
+
+"""
+    Pre-order visit of an if/else statement: calls func on the node, then
+    walks the condition, the then-branch and the else-branch.
+    Returns expr unchanged.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprIfElse)
+    func(expr)
+    for child in (expr.cond, expr.then_statement, expr.else_statement)
+        visit_preorder(func, child)
+    end
+    return expr
+end
+
+"""
+    Pre-order visit of an interval: calls func on the node, then walks
+    start, step and stop in that order. Returns expr unchanged.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprInterval)
+    func(expr)
+    for bound in (expr.start, expr.step, expr.stop)
+        visit_preorder(func, bound)
+    end
+    return expr
+end
+
+"""
+    Pre-order visit of an indexed reference: calls func on the node, then
+    walks the referenced expression followed by each index.
+    Returns expr unchanged.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprRef)
+    func(expr)
+    visit_preorder(func, expr.ref)
+    foreach(idx -> visit_preorder(func, idx), expr.indexes)
+    return expr
+end
+
+"""
+    Pre-order visit of an address-of node: calls func on the node, then
+    walks the expression whose address is taken. Returns expr unchanged.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprAddress)
+    func(expr)
+    target = expr.ref
+    visit_preorder(func, target)
+    return expr
+end
+
+"""
+    Pre-order visit of a break statement: only calls func on the node,
+    nothing else is walked. Returns expr unchanged.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprBreak)
+    func(expr)
+    expr
+end
+
+"""
+    Pre-order visit of a return statement: calls func on the node, then
+    walks the returned value. Returns expr unchanged.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprReturn)
+    func(expr)
+    returned = expr.value
+    visit_preorder(func, returned)
+    return expr
+end
+
+"""
+    Fallback pre-order visit for any StarpuExpr without a more specific
+    method: calls func on the node and walks nothing else.
+    Returns expr unchanged.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExpr)
+    func(expr)
+    expr
+end
+
+"""
+    Pre-order visit of a typed expression: calls func on the node, then
+    walks the wrapped expression. Returns expr unchanged.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprTypedExpr)
+    func(expr)
+    inner = expr.expr
+    visit_preorder(func, inner)
+    return expr
+end
+
+"""
+    Pre-order visit of a while loop: calls func on the node, then walks
+    the condition followed by the body. Returns expr unchanged.
+"""
+function visit_preorder(func :: Function, expr :: StarpuExprWhile)
+    func(expr)
+    for child in (expr.cond, expr.body)
+        visit_preorder(func, child)
+    end
+    return expr
+end
+
+# function substitute_preorder(expr :: StarpuExprAffect, match :: StarpuExpr, replace :: StarpuExpr)
+#     if expr == match
+#         return replace
+#     end
+#     var = substitute_preorder(func, expr.var)
+#     expr = substitute_preorder(func, expr.expr)
+
+#     if var != expr.var || expr != expr.expr
+#         return StarpuExprAffect(var, expr)
+#     end
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprBlock, match :: StarpuExpr, replace :: StarpuExpr)
+#     if expr == match
+#         return replace
+#     end
+
+#     modified = false
+#     new_exprs = Vector{StarpuExpr}()
+#     for e in expr.exprs
+#         push!(new_exprs, substitute_preorder(func, e))
+#     end
+#     if new_exprs != expr.exprs
+#         return StarpuExprBlock(new_exprs)
+#     end
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprCall, match :: StarpuExpr, replace :: StarpuExpr)
+#     if expr == match
+#         return replace
+#     end
+
+#     new_args = Vector{StarpuExpr}()
+#     for a in expr.args
+#         push!(new_args, substitute_preorder(func, a))
+#     end
+#     if new_args != expr.args
+#         return StarpuExprCall(expr.func, new_args)
+#     end
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprCudaCall, match :: StarpuExpr, replace :: StarpuExpr)
+#     if expr == match
+#         return replace
+#     end
+
+#     new_args = Vector{StarpuExpr}()
+#     for a in expr.args
+#         push!(new_args, substitute_preorder(func, a))
+#     end
+#     if new_args != expr.args
+#         return new StarpuExprCudaCall(expr.ker_name, expr.nblocks, expr.threads_per_block, new_args)
+#     end
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprField, match :: StarpuExpr, replace :: StarpuExpr)
+#     if expr == match
+#         return replace
+#     end
+
+#     left = substitute_preorder(expr.left, match, replace)
+#     if left != expr.left
+#         return StarpuExprField(left, expr.field, expr.is_an_arrow)
+#     end
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprFor, match :: StarpuExpr, replace :: StarpuExpr)
+#     if expr == match
+#         return replace
+#     end
+
+#     new_set_declarations = Vector{StarpuExpr}()
+    
+#     for d in expr.set_declarations
+#         substitute_preorder(func, d)
+#     end
+#     substitute_preorder(expr.set, match :: StarpuExpr, replace :: StarpuExpr)
+#     substitute_preorder(func, expr.body)
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprFunction, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     for a in expr.args
+#         substitute_preorder(func, a)
+#     end
+#     substitute_preorder(e.body, match :: StarpuExpr, replace :: StarpuExpr)
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprIf, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     substitute_preorder(func, expr.cond)
+#     substitute_preorder(func, expr.then_statement)
+#     return expr
+# end
+
+
+
+# function substitute_preorder(expr :: StarpuExprIfElse, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     substitute_preorder(func, expr.cond)
+#     substitute_preorder(func, expr.then_statement)
+#     substitute_preorder(func, expr.else_statement)
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprInterval, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     substitute_preorder(func, expr.start)
+#     substitute_preorder(func, expr.step)
+#     substitute_preorder(func, expr.stop)
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprRef, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     substitute_preorder(func, expr.ref)
+#     for i in expr.indexes
+#         substitute_preorder(func, i)
+#     end
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprAddress, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     substitute_preorder(func, expr.ref)
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprBreak, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprReturn, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     substitute_preorder(func, expr.value)
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExpr, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprTypedExpr, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     substitute_preorder(func, expr.expr)
+#     return expr
+# end
+
+# function substitute_preorder(expr :: StarpuExprWhile, match :: StarpuExpr, replace :: StarpuExpr)
+#         if expr == match
+#         return replace
+#     end
+
+#     substitute_preorder(func, expr.cond)
+#     substitute_preorder(func, expr.body)
+#     return expr
+# end

+ 24 - 16
julia/src/compiler/expressions.jl

@@ -124,6 +124,9 @@ struct StarpuExprWhile <: StarpuExpr
     body :: StarpuExpr
     body :: StarpuExpr
 end
 end
 
 
+# Address-of expression node: wraps the expression whose address is taken.
+# Its print method emits "&" followed by the wrapped expression.
+struct StarpuExprAddress <: StarpuExpr
+    ref :: StarpuExpr
+end
 
 
 function starpu_parse_affect(x :: Expr)
 function starpu_parse_affect(x :: Expr)
 
 
@@ -250,7 +253,7 @@ function starpu_parse_call(x :: Expr)
 end
 end
 
 
 
 
-starpu_infix_operators = (:(+), :(*), :(-), :(/), :(<), :(>), :(<=), :(>=), :(%))
+starpu_infix_operators = (:(+), :(*), :(-), :(/), :(<), :(>), :(<=), :(>=), :(!=), :(%))
 
 
 
 
 function print_prefix(io :: IO, x :: StarpuExprCall ; indent = 0, restrict=false)
 function print_prefix(io :: IO, x :: StarpuExprCall ; indent = 0, restrict=false)
@@ -296,7 +299,6 @@ function apply(func :: Function, expr :: StarpuExprCall)
     return func(StarpuExprCall(expr.func, map((x -> apply(func, x)), expr.args)))
     return func(StarpuExprCall(expr.func, map((x -> apply(func, x)), expr.args)))
 end
 end
 
 
-
 #======================================================
 #======================================================
                 CUDA KERNEL CALL
                 CUDA KERNEL CALL
 ======================================================#
 ======================================================#
@@ -734,8 +736,6 @@ function print(io :: IO, x :: StarpuExprRef ; indent = 0,restrict=false)
 
 
 end
 end
 
 
-
-
 function apply(func :: Function, expr :: StarpuExprRef)
 function apply(func :: Function, expr :: StarpuExprRef)
 
 
     ref = apply(func, expr.ref)
     ref = apply(func, expr.ref)
@@ -744,6 +744,16 @@ function apply(func :: Function, expr :: StarpuExprRef)
     return func(StarpuExprRef(ref, indexes))
     return func(StarpuExprRef(ref, indexes))
 end
 end
 
 
+# Prints an address-of expression as "&" immediately followed by the wrapped
+# expression. `indent` is forwarded to the inner print; `restrict` is accepted
+# but not used by this method.
+function print(io :: IO, x :: StarpuExprAddress ; indent = 0, restrict=false)
+    print(io, "&")
+    print(io, x.ref, indent = indent)
+end
+
+# Rebuilds an address-of node bottom-up: the wrapped expression is
+# transformed first, then func is applied to the new StarpuExprAddress.
+function apply(func :: Function, expr :: StarpuExprAddress)
+    return func(StarpuExprAddress(apply(func, expr.ref)))
+end
+
 #======================================================
 #======================================================
                 BREAK EXPRESSION
                 BREAK EXPRESSION
 ======================================================#
 ======================================================#
@@ -799,7 +809,7 @@ function apply(func :: Function, expr :: StarpuExpr)
     return func(expr)
     return func(expr)
 end
 end
 
 
-print(io :: IO, x :: StarpuExprVar ; indent = 0) = print(io, x.name)
+print(io :: IO, x :: StarpuExprVar ; indent = 0, restrict = false) = print(io, x.name)
 
 
 function print(io :: IO, x :: StarpuExprValue ; indent = 0,restrict=false)
 function print(io :: IO, x :: StarpuExprValue ; indent = 0,restrict=false)
 
 
@@ -869,26 +879,24 @@ end
 
 
 function starpu_type_traduction(x)
 function starpu_type_traduction(x)
     if x <: Array
     if x <: Array
-        return starpu_type_traduction_array(x)
+        return starpu_type_traduction(eltype(x)) * "*"
     end
     end
 
 
     if x <: Ptr
     if x <: Ptr
-        return starpu_type_traduction(eltype(x)) * "*"
+        depth = 1
+        type = eltype(x)
+        while type <: Ptr
+            depth +=1
+            type = eltype(type)
+        end
+
+        return starpu_type_traduction(type) * "*"^depth
     end
     end
 
 
     return starpu_type_traduction_dict[x]
     return starpu_type_traduction_dict[x]
 
 
 end
 end
 
 
-function starpu_type_traduction_array(x :: Type{Array{T,N}})  where {T,N}
-    output = starpu_type_traduction(T)
-    for i in (1 : N)
-        output *= "*"
-    end
-
-    return output
-end
-
 function print(io :: IO, x :: StarpuExprTyped ; indent = 0,restrict=false)
 function print(io :: IO, x :: StarpuExprTyped ; indent = 0,restrict=false)
 
 
     if (isa(x, StarpuExprTypedVar))
     if (isa(x, StarpuExprTypedVar))

+ 18 - 17
julia/src/compiler/file_generation.jl

@@ -18,6 +18,8 @@ const cpu_kernel_file_start = "#include <stdio.h>
 #include <starpu.h>
 #include <starpu.h>
 #include <math.h>
 #include <math.h>
 
 
+#include \"blas.h\"
+
 static inline long long jlstarpu_max(long long a, long long b)
 static inline long long jlstarpu_max(long long a, long long b)
 {
 {
 	return (a > b) ? a : b;
 	return (a > b) ? a : b;
@@ -38,15 +40,16 @@ const cuda_kernel_file_start = "#include <stdio.h>
 #include <stdint.h>
 #include <stdint.h>
 #include <starpu.h>
 #include <starpu.h>
 #include <math.h>
 #include <math.h>
+#include <starpu_cublas_v2.h>
 
 
 #define THREADS_PER_BLOCK 64
 #define THREADS_PER_BLOCK 64
 
 
-static inline long long jlstarpu_max(long long a, long long b)
+__attribute__((unused)) static inline long long jlstarpu_max(long long a, long long b)
 {
 {
 	return (a > b) ? a : b;
 	return (a > b) ? a : b;
 }
 }
 
 
-static inline long long jlstarpu_interval_size(long long start, long long step, long long stop)
+__attribute__((unused)) static inline long long jlstarpu_interval_size(long long start, long long step, long long stop)
 {
 {
     if (stop >= start){
     if (stop >= start){
             return jlstarpu_max(0, (stop - start + 1) / step);
             return jlstarpu_max(0, (stop - start + 1) / step);
@@ -56,12 +59,12 @@ static inline long long jlstarpu_interval_size(long long start, long long step,
 }
 }
 
 
 
 
-__device__ static inline long long jlstarpu_max__device(long long a, long long b)
+__attribute__((unused)) __device__ static inline long long jlstarpu_max__device(long long a, long long b)
 {
 {
 	return (a > b) ? a : b;
 	return (a > b) ? a : b;
 }
 }
 
 
-__device__ static inline long long jlstarpu_interval_size__device(long long start, long long step, long long stop)
+__attribute__((unused)) __device__ static inline long long jlstarpu_interval_size__device(long long start, long long step, long long stop)
 {
 {
 	if (stop >= start){
 	if (stop >= start){
 		return jlstarpu_max__device(0, (stop - start + 1) / step);
 		return jlstarpu_max__device(0, (stop - start + 1) / step);
@@ -70,7 +73,6 @@ __device__ static inline long long jlstarpu_interval_size__device(long long star
 	}
 	}
 }
 }
 
 
-
 "
 "
 
 
 """
 """
@@ -105,14 +107,10 @@ macro codelet(x)
     cpu_name = name
     cpu_name = name
     cuda_name = "CUDA_"*name
     cuda_name = "CUDA_"*name
     dump(name)
     dump(name)
-    parse_scalar_parameters(parsed, cpu_name, cuda_name)
+    parse_scalar_parameters(parsed, name)
     c_struct_param_decl = generate_c_struct_param_declaration(name)
     c_struct_param_decl = generate_c_struct_param_declaration(name)
     cpu_expr = transform_to_cpu_kernel(parsed)
     cpu_expr = transform_to_cpu_kernel(parsed)
 
 
-    if (starpu_target & STARPU_CUDA != 0)
-        prekernel, kernel = transform_to_cuda_kernel(parsed)
-    end
-
     generated_cpu_kernel_file_name=string("genc_",string(x.args[1].args[1].args[1]),".c")
     generated_cpu_kernel_file_name=string("genc_",string(x.args[1].args[1].args[1]),".c")
     generated_cuda_kernel_file_name=string("gencuda_",string(x.args[1].args[1].args[1]),".cu")
     generated_cuda_kernel_file_name=string("gencuda_",string(x.args[1].args[1].args[1]),".cu")
 
 
@@ -126,11 +124,16 @@ macro codelet(x)
         CPU_CODELETS[name]=cpu_name
         CPU_CODELETS[name]=cpu_name
     end
     end
 
 
-    if starpu_target & STARPU_CUDA!=0
+    if (starpu_target & STARPU_CUDA!=0) && STARPU_USE_CUDA == 1
         kernel_file = open(generated_cuda_kernel_file_name, "w")
         kernel_file = open(generated_cuda_kernel_file_name, "w")
         debug_print("generating ", generated_cuda_kernel_file_name)
         debug_print("generating ", generated_cuda_kernel_file_name)
         print(kernel_file, cuda_kernel_file_start)
         print(kernel_file, cuda_kernel_file_start)
-        print(kernel_file, "__global__ ", kernel)
+        prekernel, kernel = transform_to_cuda_kernel(parsed)
+
+        if kernel != nothing
+            print(kernel_file, "__global__ ", kernel)
+        end
+
         print(kernel_file, c_struct_param_decl)
         print(kernel_file, c_struct_param_decl)
         print(kernel_file, "\nextern \"C\" ", prekernel)
         print(kernel_file, "\nextern \"C\" ", prekernel)
         close(kernel_file)
         close(kernel_file)
@@ -138,7 +141,7 @@ macro codelet(x)
     end
     end
 end
 end
 
 
-function parse_scalar_parameters(expr :: StarpuExprFunction, cpu_name::String, cuda_name::String)
+function parse_scalar_parameters(expr :: StarpuExprFunction, codelet_name)
     scalar_parameters = []
     scalar_parameters = []
     for i in (1 : length(expr.args))
     for i in (1 : length(expr.args))
         type = expr.args[i].typ
         type = expr.args[i].typ
@@ -147,8 +150,7 @@ function parse_scalar_parameters(expr :: StarpuExprFunction, cpu_name::String, c
         end
         end
     end
     end
 
 
-    CODELETS_SCALARS[cpu_name] = scalar_parameters
-    CODELETS_SCALARS[cuda_name] = scalar_parameters
+    CODELETS_SCALARS[codelet_name] = scalar_parameters
 
 
     # declare structure carrying scalar parameters
     # declare structure carrying scalar parameters
     struct_params_name = Symbol("params_", rand_string())
     struct_params_name = Symbol("params_", rand_string())
@@ -164,6 +166,5 @@ function parse_scalar_parameters(expr :: StarpuExprFunction, cpu_name::String, c
     eval(Meta.parse(add_to_dict_str))
     eval(Meta.parse(add_to_dict_str))
 
 
     # save structure name
     # save structure name
-    CODELETS_PARAMS_STRUCT[cpu_name] = struct_params_name
-    CODELETS_PARAMS_STRUCT[cuda_name] = struct_params_name
+    CODELETS_PARAMS_STRUCT[codelet_name] = struct_params_name
 end
 end

+ 4 - 0
julia/src/data.jl

@@ -160,6 +160,10 @@ function starpu_data_release_on_node(handle :: StarpuDataHandle, node :: Int)
     starpu_data_release_on_node(handle.object, node)
     starpu_data_release_on_node(handle.object, node)
 end
 end
 
 
+# Convenience method taking the Julia-side handle wrapper: forwards to the
+# starpu_data_wont_use method that accepts the wrapped `object`.
+function starpu_data_wont_use(handle :: StarpuDataHandle)
+    return starpu_data_wont_use(handle.object)
+end
+
 function repl(x::Symbol)
 function repl(x::Symbol)
     return x
     return x
 end
 end

+ 13 - 11
julia/src/dynamic_compiler/Makefile.am

@@ -14,33 +14,35 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
 #
 
 
-#LD=$(CC_OR_NVCC)
-LD=$(CC)
-AM_CPPFLAGS = -I$(abs_top_srcdir)/include/ -I$(abs_top_builddir)/src -I$(abs_top_builddir)/include
+LD=$(CC_OR_NVCC)
+AM_CPPFLAGS = -I$(abs_top_srcdir)/include/ -I$(abs_top_builddir)/src -I$(abs_top_builddir)/include \
+	 -I$(abs_top_srcdir)/julia/src/
+
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
 AM_CFLAGS += -fPIC -O3 -g -DSTRIDE=${STRIDE} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
 AM_CFLAGS += -fPIC -O3 -g -DSTRIDE=${STRIDE} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@
-#CUDA_CFLAGS = ${CFLAGS}
-
+CUDA_CFLAGS = $(STARPU_CUDA_CPPFLAGS) -Wno-deprecated-gpu-targets
+LDFLAGS = -L @STARPU_BUILD_DIR@/julia/src/.libs/ -lstarpujulia-1.3
 EXTERNLIB=extern_tasks.so
 EXTERNLIB=extern_tasks.so
 GENERATEDLIB=generated_tasks.so
 GENERATEDLIB=generated_tasks.so
 
 
 C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
 C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
-#if STARPU_USE_CUDA
-#CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
-#else
+
+if STARPU_USE_CUDA
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+else
 CUDA_OBJECTS=
 CUDA_OBJECTS=
-#endif
+endif
 
 
 %.o: %.c
 %.o: %.c
 	$(CC) -c $(AM_CPPFLAGS) $(AM_CFLAGS) $^ -o $@
 	$(CC) -c $(AM_CPPFLAGS) $(AM_CFLAGS) $^ -o $@
 
 
 %.o: %.cu
 %.o: %.cu
-	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+	$(NVCC) -dc $(AM_CPPFLAGS) $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
 
 
 ${EXTERNLIB}: $(SOURCES_CPU)
 ${EXTERNLIB}: $(SOURCES_CPU)
 	$(CC) $(AM_CPPFLAGS) $(AM_CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
 	$(CC) $(AM_CPPFLAGS) $(AM_CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
 
 
 ${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
 ${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
-	$(LD) -shared $(LDFLAGS) $^ -o $@
+	$(LD) -shared $^ -o $@ $(LDFLAGS)
 
 

+ 7 - 6
julia/src/globals.jl

@@ -23,16 +23,10 @@ global starpu_target=STARPU_CPU
 global generated_cuda_kernel_file_name = "PRINT TO STDOUT"
 global generated_cuda_kernel_file_name = "PRINT TO STDOUT"
 global generated_cpu_kernel_file_name = "PRINT TO STDOUT"
 global generated_cpu_kernel_file_name = "PRINT TO STDOUT"
 
 
-export CPU_CODELETS
 global CPU_CODELETS=Dict{String,String}()
 global CPU_CODELETS=Dict{String,String}()
-
-export CUDA_CODELETS
 global CUDA_CODELETS=Dict{String,String}()
 global CUDA_CODELETS=Dict{String,String}()
 
 
-export CODELETS_SCALARS
 global CODELETS_SCALARS=Dict{String,Any}()
 global CODELETS_SCALARS=Dict{String,Any}()
-
-export CODELETS_PARAMS_STRUCT
 global CODELETS_PARAMS_STRUCT=Dict{String,Any}()
 global CODELETS_PARAMS_STRUCT=Dict{String,Any}()
 
 
 global starpu_type_traduction_dict = Dict(
 global starpu_type_traduction_dict = Dict(
@@ -47,3 +41,10 @@ global starpu_type_traduction_dict = Dict(
 export starpu_type_traduction_dict
 export starpu_type_traduction_dict
 
 
 global mutex = Threads.SpinLock()
 global mutex = Threads.SpinLock()
+
+# Detect CUDA support.
+# The try body only evaluates STARPU_USE_CUDA; if that throws (presumably
+# because the constant has not been defined elsewhere — TODO confirm where it
+# is set), the catch branch defines it as the constant 0, i.e. CUDA disabled.
+try
+    STARPU_USE_CUDA == 1
+catch
+   global  const STARPU_USE_CUDA = 0
+end

+ 4 - 1
julia/src/init.jl

@@ -26,8 +26,11 @@ function starpu_init()
         debug_print("Loading external codelet library")
         debug_print("Loading external codelet library")
         ff = Libdl.dlsym(starpu_tasks_library_handle,:starpu_find_function)
         ff = Libdl.dlsym(starpu_tasks_library_handle,:starpu_find_function)
         dump(ff)
         dump(ff)
-        for k in keys(CUDA_CODELETS)
+        for k in keys(CPU_CODELETS)
             CPU_CODELETS[k]=unsafe_string(ccall(ff,Cstring, (Cstring,Cstring),Cstring_from_String(string(k)),Cstring_from_String("cpu")))
             CPU_CODELETS[k]=unsafe_string(ccall(ff,Cstring, (Cstring,Cstring),Cstring_from_String(string(k)),Cstring_from_String("cpu")))
+            if STARPU_USE_CUDA == 1
+                CUDA_CODELETS[k]=unsafe_string(ccall(ff,Cstring, (Cstring,Cstring),Cstring_from_String(string(k)),Cstring_from_String("gpu")))
+            end
             print(k,">>>>",CPU_CODELETS[k],"\n")
             print(k,">>>>",CPU_CODELETS[k],"\n")
         end
         end
     else
     else

+ 9 - 0
julia/src/openblas_ldflags.jl

@@ -0,0 +1,9 @@
+import LinearAlgebra.BLAS
+import Libdl
+
+
+# Print the linker flags needed to link against the BLAS library that is
+# already loaded in this Julia process.
+# Search the loaded shared libraries once for the BLAS library name.
+blas_libs = filter(x->occursin(Base.libblas_name,x), Libdl.dllist())
+if isempty(blas_libs)
+    # Fail with a clear message instead of a BoundsError on blas_libs[1].
+    error("no loaded shared library matches $(Base.libblas_name)")
+end
+# Full path of the library file, and the directory that contains it.
+libpath = normpath(blas_libs[1])
+libdir = normpath(joinpath(splitpath(blas_libs[1])[1:end-1]...))
+# Drop the first three characters of the name (the "lib" prefix) for -l.
+libname = Base.libblas_name[4:end]
+# -rpath takes a directory to search at run time, so pass libdir rather than
+# the path of the library file itself.
+println("-Wl,-rpath,$libdir -L$libdir -l$libname")
+

+ 109 - 22
julia/src/task.jl

@@ -27,8 +27,8 @@ end
 global codelet_list = Vector{jl_starpu_codelet}()
 global codelet_list = Vector{jl_starpu_codelet}()
 
 
 function starpu_codelet(;
 function starpu_codelet(;
-                        cpu_func :: Union{String, STARPU_BLAS} = "",
-                        cuda_func :: Union{String, STARPU_BLAS} = "",
+                        cpu_func :: Union{String, STARPU_BLAS, Cvoid} = "",
+                        cuda_func :: Union{String, STARPU_BLAS, Cvoid} = "",
                         opencl_func :: String = "",
                         opencl_func :: String = "",
                         modes = [],
                         modes = [],
                         perfmodel :: starpu_perfmodel,
                         perfmodel :: starpu_perfmodel,
@@ -42,7 +42,7 @@ function starpu_codelet(;
 
 
 
 
     if (where_to_execute == nothing)
     if (where_to_execute == nothing)
-        real_where = ((cpu_func != "") * STARPU_CPU) | ((cuda_func != "") * STARPU_CUDA)
+        real_where = ((cpu_func != nothing) * STARPU_CPU) | ((cuda_func != nothing) * STARPU_CUDA)
     else
     else
         real_where = where_to_execute
         real_where = where_to_execute
     end
     end
@@ -63,7 +63,7 @@ function starpu_codelet(;
         output.cpu_func = cpu_blas_codelets[cpu_func]
         output.cpu_func = cpu_blas_codelets[cpu_func]
         output.c_codelet.cpu_func = load_wrapper_function_pointer(output.cpu_func)
         output.c_codelet.cpu_func = load_wrapper_function_pointer(output.cpu_func)
     else
     else
-        output.c_codelet.cpu_func = load_starpu_function_pointer(cpu_func)
+        output.c_codelet.cpu_func = load_starpu_function_pointer(get(CPU_CODELETS, cpu_func, ""))
     end
     end
 
 
     if typeof(cuda_func) == STARPU_BLAS
     if typeof(cuda_func) == STARPU_BLAS
@@ -71,10 +71,10 @@ function starpu_codelet(;
         output.c_codelet.cuda_func = load_wrapper_function_pointer(output.cuda_func)
         output.c_codelet.cuda_func = load_wrapper_function_pointer(output.cuda_func)
         output.c_codelet.cuda_flags[1] = STARPU_CUDA_ASYNC
         output.c_codelet.cuda_flags[1] = STARPU_CUDA_ASYNC
     else
     else
-        output.c_codelet.cuda_func = load_starpu_function_pointer(cuda_func)
+        output.c_codelet.cuda_func = load_starpu_function_pointer(get(CUDA_CODELETS, cuda_func, ""))
     end
     end
 
 
-    output.c_codelet.opencl_func = load_starpu_function_pointer(opencl_func)
+    output.c_codelet.opencl_func = load_starpu_function_pointer("")
 
 
     # Codelets must not be garbage collected before starpu shutdown is called.
     # Codelets must not be garbage collected before starpu shutdown is called.
     lock(mutex)
     lock(mutex)
@@ -104,9 +104,18 @@ task_list = Vector{jl_starpu_task}()
 
 
             Creates a new task which will run the specified codelet on handle buffers and cl_args data
             Creates a new task which will run the specified codelet on handle buffers and cl_args data
         """
         """
-function starpu_task(; cl :: Union{Cvoid, jl_starpu_codelet} = nothing, handles :: Vector{StarpuDataHandle} = StarpuDataHandle[], cl_arg = (),
-                     callback :: Union{Cvoid, Function} = nothing, callback_arg = nothing, tag :: Union{Cvoid, starpu_tag_t} = nothing,
-                     sequential_consistency = true, detach = 1)
+function starpu_task(;
+                     cl :: Union{Cvoid, jl_starpu_codelet} = nothing,
+                     handles :: Vector{StarpuDataHandle} = StarpuDataHandle[],
+                     cl_arg = (),
+                     callback :: Union{Cvoid, Function} = nothing,
+                     callback_arg = nothing,
+                     tag :: Union{Cvoid, starpu_tag_t} = nothing,
+                     tag_only :: Union{Cvoid, starpu_tag_t} = nothing,
+                     sequential_consistency = true,
+                     detach = 1,
+                     color :: Union{Cvoid, UInt32} = nothing,
+                     where :: Union{Cvoid, Int32} = nothing)
     if (cl == nothing)
     if (cl == nothing)
         error("\"cl\" field can't be empty when creating a StarpuTask")
         error("\"cl\" field can't be empty when creating a StarpuTask")
     end
     end
@@ -114,15 +123,11 @@ function starpu_task(; cl :: Union{Cvoid, jl_starpu_codelet} = nothing, handles
     output = jl_starpu_task(cl, handles, map((x -> x.object), handles), false, nothing, Vector{Cint}(undef, 1), callback, callback_arg, starpu_task(zero))
     output = jl_starpu_task(cl, handles, map((x -> x.object), handles), false, nothing, Vector{Cint}(undef, 1), callback, callback_arg, starpu_task(zero))
 
 
     # handle scalar_parameters
     # handle scalar_parameters
-    codelet_name = cl.cpu_func
-    if isempty(codelet_name)
-        codelet_name = cl.cuda_func
-    end
-    if isempty(codelet_name)
-        codelet_name = cl.opencl_func
-    end
-    if isempty(codelet_name)
-        error("No function provided with codelet.")
+    codelet_name = ""  # stays empty unless the codelet names a CPU or CUDA function by String
+    if isa(cl.cpu_func, String) && cl.cpu_func != ""
+        codelet_name = cl.cpu_func
+    elseif isa(cl.cuda_func, String) && cl.cuda_func != ""
+        codelet_name = cl.cuda_func
     end
     end
     scalar_parameters = get(CODELETS_SCALARS, codelet_name, nothing)
     scalar_parameters = get(CODELETS_SCALARS, codelet_name, nothing)
     if scalar_parameters != nothing
     if scalar_parameters != nothing
@@ -163,6 +168,18 @@ function starpu_task(; cl :: Union{Cvoid, jl_starpu_codelet} = nothing, handles
         output.c_task.use_tag = 1
         output.c_task.use_tag = 1
     end
     end
 
 
+    if tag_only != nothing
+        output.c_task.tag_id = tag_only
+    end
+
+    if color != nothing
+        output.c_task.color = color
+    end
+
+    if where != nothing
+        output.c_task.where = where
+    end
+
     # Tasks must not be garbage collected before starpu_task_wait_for_all is called.
     # Tasks must not be garbage collected before starpu_task_wait_for_all is called.
     # This is necessary in particular for tasks created inside callback functions.
     # This is necessary in particular for tasks created inside callback functions.
     lock(mutex)
     lock(mutex)
@@ -173,8 +190,8 @@ function starpu_task(; cl :: Union{Cvoid, jl_starpu_codelet} = nothing, handles
 end
 end
 
 
 
 
-function create_param_struct_from_clarg(name, cl_arg)
-    struct_params_name = CODELETS_PARAMS_STRUCT[name]
+function create_param_struct_from_clarg(codelet_name, cl_arg)
+    struct_params_name = CODELETS_PARAMS_STRUCT[codelet_name]
 
 
     if struct_params_name == false
     if struct_params_name == false
         error("structure name not found in CODELET_PARAMS_STRUCT")
         error("structure name not found in CODELET_PARAMS_STRUCT")
@@ -236,6 +253,76 @@ function starpu_modes(x :: Symbol)
     end
     end
 end
 end
 
 
+default_codelet = Dict{String, jl_starpu_codelet}()
+default_perfmodel = Dict{String, starpu_perfmodel}()
+
+"""
+    Returns the performance model cached under `name` in `default_perfmodel`.
+    When no entry exists yet, a STARPU_HISTORY_BASED model whose symbol is
+    `name` is created, stored in the cache and returned.
+"""
+function get_default_perfmodel(name)
+    # Reuse the model created by an earlier call with the same name.
+    haskey(default_perfmodel, name) && return default_perfmodel[name]
+
+    model = starpu_perfmodel(
+        perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+        symbol = name
+    )
+    default_perfmodel[name] = model
+    return model
+end
+
+"""
+    Returns the codelet cached under `codelet_name` in `default_codelet`.
+    When no entry exists yet, a codelet is built with starpu_codelet: its
+    cpu_func (resp. cuda_func) is `codelet_name` if that name is a key of
+    CPU_CODELETS (resp. CUDA_CODELETS) and "" otherwise. The new codelet is
+    stored in the cache and returned.
+    Note: on a cache hit, `perfmodel` and `modes` are not used.
+"""
+function get_default_codelet(codelet_name, perfmodel, modes) :: jl_starpu_codelet
+    haskey(default_codelet, codelet_name) && return default_codelet[codelet_name]
+
+    # Only reference the implementations that were actually registered.
+    cpu_name  = haskey(CPU_CODELETS, codelet_name) ? codelet_name : ""
+    cuda_name = haskey(CUDA_CODELETS, codelet_name) ? codelet_name : ""
+
+    new_codelet = starpu_codelet(
+        cpu_func  = cpu_name,
+        cuda_func = cuda_name,
+        modes = modes,
+        perfmodel = perfmodel,
+    )
+    default_codelet[codelet_name] = new_codelet
+    return new_codelet
+end
+
+"""
+    Creates a task with starpu_task and submits it with starpu_task_submit.
+    The codelet is either given directly through `cl`, or obtained from
+    get_default_codelet using `codelet_name`, `perfmodel` and `modes`.
+    When `perfmodel` is not given, get_default_perfmodel is called with
+    `codelet_name` (or "default" when no name is given).
+    Raises an error when neither `cl` nor `codelet_name` is provided, or when
+    `cl` is not provided and `modes` is missing.
+    All other keyword arguments are forwarded unchanged to starpu_task.
+"""
+function starpu_task_insert(;
+                            codelet_name :: Union{Cvoid, String} = nothing,
+                            cl :: Union{Cvoid, jl_starpu_codelet} = nothing,
+                            perfmodel :: Union{starpu_perfmodel, Cvoid} = nothing,
+                            handles :: Vector{StarpuDataHandle} = StarpuDataHandle[],
+                            cl_arg = (),
+                            callback :: Union{Cvoid, Function} = nothing,
+                            callback_arg = nothing,
+                            tag :: Union{Cvoid, starpu_tag_t} = nothing,
+                            tag_only :: Union{Cvoid, starpu_tag_t} = nothing,
+                            sequential_consistency = true,
+                            detach = 1,
+                            where :: Union{Cvoid, Int32} = nothing,
+                            color :: Union{Cvoid, UInt32} = nothing,
+                            modes = nothing)
+    # Without an explicit codelet, both a name and the access modes are needed to build one.
+    if cl === nothing
+        if codelet_name === nothing
+            error("At least one of the two parameters codelet_name or cl must be provided when calling starpu_task_insert.")
+        end
+        if modes === nothing
+            error("Modes must be defined when calling starpu_task_insert without a codelet.")
+        end
+    end
+
+    if perfmodel === nothing
+        model_name = codelet_name === nothing ? "default" : codelet_name
+        perfmodel = get_default_perfmodel(model_name)
+    end
+
+    if cl === nothing
+        cl = get_default_codelet(codelet_name, perfmodel, modes)
+    end
+
+    task = starpu_task(cl = cl,
+                       handles = handles,
+                       cl_arg = cl_arg,
+                       callback = callback,
+                       callback_arg = callback_arg,
+                       tag = tag,
+                       tag_only = tag_only,
+                       sequential_consistency = sequential_consistency,
+                       detach = detach,
+                       color = color,
+                       where = where)
+
+    return starpu_task_submit(task)
+end
+
 """
 """
     Creates and submits an asynchronous task running cl Codelet function.
     Creates and submits an asynchronous task running cl Codelet function.
     Ex : @starpu_async_cl cl(handle1, handle2)
     Ex : @starpu_async_cl cl(handle1, handle2)
@@ -254,8 +341,8 @@ macro starpu_async_cl(expr, modes, cl_arg=(), color ::UInt32=0x00000000)
     )
     )
     println(CPU_CODELETS[string(expr.args[1])])
     println(CPU_CODELETS[string(expr.args[1])])
     cl = starpu_codelet(
     cl = starpu_codelet(
-        cpu_func = CPU_CODELETS[string(expr.args[1])],
-        # cuda_func = CUDA_CODELETS[string(expr.args[1])],
+        cpu_func  = string(expr.args[1]),
+        cuda_func = string(expr.args[1]),
         #opencl_func="ocl_matrix_mult",
         #opencl_func="ocl_matrix_mult",
         ### TODO: CORRECT !
         ### TODO: CORRECT !
         modes = map((x -> starpu_modes(x)),modes.args),
         modes = map((x -> starpu_modes(x)),modes.args),

+ 13 - 5
julia/src/translate_headers.jl

@@ -19,8 +19,8 @@ using Clang.LibClang.LLVM_jll
 function starpu_translate_headers()
 function starpu_translate_headers()
     debug_print("Translating StarPU headers...")
     debug_print("Translating StarPU headers...")
 
 
-    if !isdir((@__DIR__)*"/../gen")
-        mkdir((@__DIR__)*"/../gen")
+    if !isdir(joinpath(fstarpu_build_dir(), "julia/gen"))
+        mkdir(joinpath(fstarpu_build_dir(), "julia/gen"))
     end
     end
 
 
     STARPU_BUILD_INCLUDE=joinpath(fstarpu_build_dir(), "include")
     STARPU_BUILD_INCLUDE=joinpath(fstarpu_build_dir(), "include")
@@ -63,6 +63,7 @@ function starpu_translate_headers()
                                "starpu_data_set_default_sequential_consistency_flag",
                                "starpu_data_set_default_sequential_consistency_flag",
                                "starpu_data_get_sequential_consistency_flag",
                                "starpu_data_get_sequential_consistency_flag",
                                "starpu_data_set_sequential_consistency_flag",
                                "starpu_data_set_sequential_consistency_flag",
+                               "starpu_data_wont_use",
                                "starpu_matrix_data_register",
                                "starpu_matrix_data_register",
                                "starpu_block_data_register",
                                "starpu_block_data_register",
                                "starpu_vector_data_register",
                                "starpu_vector_data_register",
@@ -76,6 +77,7 @@ function starpu_translate_headers()
                                "starpu_task_submit",
                                "starpu_task_submit",
                                "starpu_task_wait",
                                "starpu_task_wait",
                                "starpu_task_wait_for_n_submitted",
                                "starpu_task_wait_for_n_submitted",
+                               "starpu_tag_remove",
                                "starpu_tag_wait",
                                "starpu_tag_wait",
                                "starpu_tag_declare_deps_array",
                                "starpu_tag_declare_deps_array",
                                "starpu_tag_notify_from_apps",
                                "starpu_tag_notify_from_apps",
@@ -83,16 +85,22 @@ function starpu_translate_headers()
                                "starpu_task_declare_deps_array",
                                "starpu_task_declare_deps_array",
                                "starpu_iteration_push",
                                "starpu_iteration_push",
                                "starpu_iteration_pop",
                                "starpu_iteration_pop",
+                               "starpu_worker_get_count",
+                               "starpu_cpu_worker_get_count",
+                               "starpu_cuda_worker_get_count",
+                               "starpu_opencl_worker_get_count",
+                               "starpu_mic_worker_get_count",
                                "STARPU_CPU",
                                "STARPU_CPU",
                                "STARPU_CUDA",
                                "STARPU_CUDA",
                                "STARPU_CUDA_ASYNC",
                                "STARPU_CUDA_ASYNC",
                                "STARPU_OPENCL",
                                "STARPU_OPENCL",
                                "STARPU_MAIN_RAM",
                                "STARPU_MAIN_RAM",
-                               "STARPU_NMAXBUFS"])
+                               "STARPU_NMAXBUFS",
+                               "STARPU_USE_CUDA"])
 
 
     wc = init(; headers = STARPU_HEADERS,
     wc = init(; headers = STARPU_HEADERS,
-              output_file = joinpath(@__DIR__, "../gen/libstarpu_api.jl"),
-              common_file = joinpath(@__DIR__, "../gen/libstarpu_common.jl"),
+              output_file = joinpath(fstarpu_build_dir(), "julia/gen/libstarpu_api.jl"),
+              common_file = joinpath(fstarpu_build_dir(), "julia/gen/libstarpu_common.jl"),
               clang_includes = vcat(LIBCLANG_INCLUDE, CLANG_INCLUDE),
               clang_includes = vcat(LIBCLANG_INCLUDE, CLANG_INCLUDE),
               clang_args = clang_args,
               clang_args = clang_args,
               header_library = x->"starpu_wrapper_library_name",
               header_library = x->"starpu_wrapper_library_name",

+ 8 - 1
mpi/Makefile.am

@@ -16,7 +16,14 @@
 
 
 include $(top_srcdir)/starpu-subdirtests.mk
 include $(top_srcdir)/starpu-subdirtests.mk
 
 
-SUBDIRS=src tests examples tools
+SUBDIRS=src tools
+
+if STARPU_BUILD_EXAMPLES
+SUBDIRS += examples
+endif
+if STARPU_BUILD_TESTS
+SUBDIRS += tests
+endif
 
 
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = libstarpumpi.pc starpumpi-1.0.pc starpumpi-1.1.pc starpumpi-1.2.pc starpumpi-1.3.pc
 pkgconfig_DATA = libstarpumpi.pc starpumpi-1.0.pc starpumpi-1.1.pc starpumpi-1.2.pc starpumpi-1.3.pc

+ 5 - 23
mpi/examples/Makefile.am

@@ -21,6 +21,8 @@ CCLD=$(MPICC)
 FC=$(MPIFORT)
 FC=$(MPIFORT)
 FCLD=$(MPIFORT)
 FCLD=$(MPIFORT)
 
 
+noinst_PROGRAMS		=
+
 if STARPU_HAVE_WINDOWS
 if STARPU_HAVE_WINDOWS
 LOADER_BIN		=
 LOADER_BIN		=
 else
 else
@@ -28,6 +30,7 @@ loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 if !STARPU_SIMGRID
 if !STARPU_SIMGRID
 LOADER			=	loader
 LOADER			=	loader
 LOADER_BIN		=	$(abs_top_builddir)/mpi/examples/$(LOADER)
 LOADER_BIN		=	$(abs_top_builddir)/mpi/examples/$(LOADER)
+noinst_PROGRAMS		+=	loader
 endif
 endif
 loader_SOURCES		=	../../tests/loader.c
 loader_SOURCES		=	../../tests/loader.c
 endif
 endif
@@ -108,7 +111,6 @@ AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(ST
 ###################
 ###################
 # Stencil example #
 # Stencil example #
 ###################
 ###################
-if BUILD_EXAMPLES
 examplebin_PROGRAMS +=		\
 examplebin_PROGRAMS +=		\
 	stencil/stencil5
 	stencil/stencil5
 starpu_mpi_EXAMPLES	+=	\
 starpu_mpi_EXAMPLES	+=	\
@@ -121,14 +123,11 @@ starpu_mpi_EXAMPLES	+=	\
 	stencil/stencil5_lb
 	stencil/stencil5_lb
 endif
 endif
 
 
-endif
-
 ##################
 ##################
 # MPI LU example #
 # MPI LU example #
 ##################
 ##################
 
 
-if BUILD_EXAMPLES
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 
 
 examplebin_PROGRAMS += 			\
 examplebin_PROGRAMS += 			\
 	mpi_lu/plu_example_float	\
 	mpi_lu/plu_example_float	\
@@ -210,14 +209,12 @@ mpi_lu_plu_outofcore_example_double_SOURCES =	\
 	mpi_lu/pdlu_implicit.c			\
 	mpi_lu/pdlu_implicit.c			\
 	../../examples/common/blas.c
 	../../examples/common/blas.c
 endif
 endif
-endif
 
 
 ########################
 ########################
 # MPI Cholesky example #
 # MPI Cholesky example #
 ########################
 ########################
 
 
-if BUILD_EXAMPLES
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 examplebin_PROGRAMS +=		\
 examplebin_PROGRAMS +=		\
 	matrix_decomposition/mpi_cholesky			\
 	matrix_decomposition/mpi_cholesky			\
 	matrix_decomposition/mpi_cholesky_distributed
 	matrix_decomposition/mpi_cholesky_distributed
@@ -250,13 +247,11 @@ starpu_mpi_EXAMPLES +=				\
 	matrix_decomposition/mpi_cholesky			\
 	matrix_decomposition/mpi_cholesky			\
 	matrix_decomposition/mpi_cholesky_distributed
 	matrix_decomposition/mpi_cholesky_distributed
 endif
 endif
-endif
 
 
 ########################
 ########################
 # MPI Matrix mult example #
 # MPI Matrix mult example #
 ########################
 ########################
 
 
-if BUILD_EXAMPLES
 examplebin_PROGRAMS +=		\
 examplebin_PROGRAMS +=		\
 	matrix_mult/mm
 	matrix_mult/mm
 
 
@@ -270,14 +265,12 @@ if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES +=				\
 starpu_mpi_EXAMPLES +=				\
 	matrix_mult/mm
 	matrix_mult/mm
 endif
 endif
-endif
 
 
 ##########################################
 ##########################################
 # Native Fortran MPI Matrix mult example #
 # Native Fortran MPI Matrix mult example #
 ##########################################
 ##########################################
 
 
 if STARPU_HAVE_MPIFORT
 if STARPU_HAVE_MPIFORT
-if BUILD_EXAMPLES
 if !STARPU_SANITIZE
 if !STARPU_SANITIZE
 examplebin_PROGRAMS +=		\
 examplebin_PROGRAMS +=		\
 	native_fortran/nf_mm	\
 	native_fortran/nf_mm	\
@@ -318,13 +311,11 @@ starpu_mpi_EXAMPLES +=				\
 endif
 endif
 endif
 endif
 endif
 endif
-endif
 
 
 ###################
 ###################
 # complex example #
 # complex example #
 ###################
 ###################
 
 
-if BUILD_EXAMPLES
 examplebin_PROGRAMS +=			\
 examplebin_PROGRAMS +=			\
 	complex/mpi_complex
 	complex/mpi_complex
 
 
@@ -334,13 +325,11 @@ complex_mpi_complex_SOURCES =		\
 
 
 starpu_mpi_EXAMPLES	+=			\
 starpu_mpi_EXAMPLES	+=			\
 	complex/mpi_complex
 	complex/mpi_complex
-endif
 
 
 #########################
 #########################
 # user_datatype example #
 # user_datatype example #
 #########################
 #########################
 
 
-if BUILD_EXAMPLES
 examplebin_PROGRAMS +=				\
 examplebin_PROGRAMS +=				\
 	user_datatype/user_datatype		\
 	user_datatype/user_datatype		\
 	user_datatype/user_datatype2
 	user_datatype/user_datatype2
@@ -358,13 +347,11 @@ starpu_mpi_EXAMPLES	+=			\
 	user_datatype/user_datatype2		\
 	user_datatype/user_datatype2		\
 	user_datatype/user_datatype
 	user_datatype/user_datatype
 endif
 endif
-endif
 
 
 ###################
 ###################
 # comm example #
 # comm example #
 ###################
 ###################
 
 
-if BUILD_EXAMPLES
 examplebin_PROGRAMS +=			\
 examplebin_PROGRAMS +=			\
 	comm/comm			\
 	comm/comm			\
 	comm/mix_comm
 	comm/mix_comm
@@ -374,13 +361,11 @@ starpu_mpi_EXAMPLES	+=			\
 	comm/comm				\
 	comm/comm				\
 	comm/mix_comm
 	comm/mix_comm
 endif
 endif
-endif
 
 
 ##################
 ##################
 # filter example #
 # filter example #
 ##################
 ##################
 
 
-if BUILD_EXAMPLES
 examplebin_PROGRAMS +=			\
 examplebin_PROGRAMS +=			\
 	filters/filter
 	filters/filter
 
 
@@ -388,7 +373,6 @@ if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES	+=			\
 starpu_mpi_EXAMPLES	+=			\
 	filters/filter
 	filters/filter
 endif
 endif
-endif
 
 
 # Native Fortran example
 # Native Fortran example
 
 
@@ -401,7 +385,6 @@ native_fortran/fstarpu_mpi_mod.f90:
 	$(V_ln) $(LN_S) $(abs_top_srcdir)/mpi/include/$(notdir $@) $@
 	$(V_ln) $(LN_S) $(abs_top_srcdir)/mpi/include/$(notdir $@) $@
 
 
 if STARPU_HAVE_MPIFORT
 if STARPU_HAVE_MPIFORT
-if BUILD_EXAMPLES
 if !STARPU_SANITIZE
 if !STARPU_SANITIZE
 # - express the creation of .mod along .o
 # - express the creation of .mod along .o
 fstarpu_mod.mod: native_fortran/fstarpu_mod.o
 fstarpu_mod.mod: native_fortran/fstarpu_mod.o
@@ -416,4 +399,3 @@ native_fortran/nf_mm_task_build.o: nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.
 native_fortran/nf_basic_ring.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
 native_fortran/nf_basic_ring.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
 endif
 endif
 endif
 endif
-endif

+ 2 - 1
mpi/examples/matrix_mult/mm.c

@@ -289,7 +289,8 @@ static struct starpu_codelet gemm_cl =
 {
 {
 	.cpu_funcs = {cpu_mult}, /* cpu implementation(s) of the routine */
 	.cpu_funcs = {cpu_mult}, /* cpu implementation(s) of the routine */
 	.nbuffers = 3, /* number of data handles referenced by this routine */
 	.nbuffers = 3, /* number of data handles referenced by this routine */
-	.modes = {STARPU_R, STARPU_R, STARPU_RW} /* access modes for each data handle */
+	.modes = {STARPU_R, STARPU_R, STARPU_RW}, /* access modes for each data handle */
+	.name = "gemm" /* to display task name in traces */
 };
 };
 
 
 int main(int argc, char *argv[])
 int main(int argc, char *argv[])

+ 1 - 0
mpi/src/starpu_mpi.c

@@ -431,6 +431,7 @@ void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t data, int new_r
 
 
 	/* Flush cache in all other nodes */
 	/* Flush cache in all other nodes */
 	/* TODO: Ideally we'd transmit the knowledge of who owns it */
 	/* TODO: Ideally we'd transmit the knowledge of who owns it */
+	/* TODO: or at least remember that the previous owner has the data, that's an easy case to support */
 	starpu_mpi_cache_flush(comm, data);
 	starpu_mpi_cache_flush(comm, data);
 	return;
 	return;
 }
 }

+ 34 - 1
mpi/src/starpu_mpi_init.c

@@ -138,7 +138,38 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi, MPI_Comm
 	_starpu_mpi_do_initialize(argc_argv);
 	_starpu_mpi_do_initialize(argc_argv);
 #endif
 #endif
 
 
-	return _mpi_backend._starpu_mpi_backend_progress_init(argc_argv);
+	int ret = _mpi_backend._starpu_mpi_backend_progress_init(argc_argv);
+
+	if (starpu_get_env_number_default("STARPU_DISPLAY_BINDINGS", 0))
+	{
+		int rank, size, i;
+		char hostname[65];
+
+		starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+		starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
+		gethostname(hostname, sizeof(hostname));
+
+		/* We make a barrier between each node calling hwloc-ps, to avoid mixing
+		 * outputs in stdout. */
+		for (i = 0; i < size; i++)
+		{
+			starpu_mpi_barrier(MPI_COMM_WORLD);
+			if (rank == i)
+			{
+				fprintf(stdout, "== Binding for rank %d on node %s ==\n", rank, hostname);
+				starpu_display_bindings();
+				fflush(stdout);
+			}
+		}
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+		if (rank == 0)
+		{
+			fprintf(stdout, "== End of bindings ==\n");
+			fflush(stdout);
+		}
+	}
+
+	return ret;
 }
 }
 
 
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
@@ -219,6 +250,8 @@ int starpu_mpi_init_conf(int *argc, char ***argv, int initialize_mpi, MPI_Comm c
 			conf->reserve_ncpus++;
 			conf->reserve_ncpus++;
 	}
 	}
 
 
+	conf->will_use_mpi = 1;
+
 	int ret = starpu_init(conf);
 	int ret = starpu_init(conf);
 	if (ret < 0)
 	if (ret < 0)
 		return ret;
 		return ret;

+ 11 - 10
mpi/tests/Makefile.am

@@ -19,6 +19,8 @@ include $(top_srcdir)/starpu.mk
 CC=$(MPICC)
 CC=$(MPICC)
 CCLD=$(MPICC)
 CCLD=$(MPICC)
 
 
+noinst_PROGRAMS		=
+
 if STARPU_HAVE_WINDOWS
 if STARPU_HAVE_WINDOWS
 LOADER_BIN		=
 LOADER_BIN		=
 else
 else
@@ -26,6 +28,7 @@ loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 if !STARPU_SIMGRID
 if !STARPU_SIMGRID
 LOADER			=	loader
 LOADER			=	loader
 LOADER_BIN		=	$(abs_top_builddir)/mpi/tests/$(LOADER)
 LOADER_BIN		=	$(abs_top_builddir)/mpi/tests/$(LOADER)
+noinst_PROGRAMS		+=	loader
 endif
 endif
 loader_SOURCES		=	../../tests/loader.c
 loader_SOURCES		=	../../tests/loader.c
 endif
 endif
@@ -93,8 +96,6 @@ AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(ST
 # Unit testcases       #
 # Unit testcases       #
 ########################
 ########################
 
 
-if BUILD_TESTS
-
 starpu_mpi_TESTS =
 starpu_mpi_TESTS =
 
 
 starpu_mpi_TESTS +=				\
 starpu_mpi_TESTS +=				\
@@ -142,14 +143,15 @@ starpu_mpi_TESTS +=				\
 	user_defined_datatype			\
 	user_defined_datatype			\
 	early_stuff				\
 	early_stuff				\
 	sendrecv_bench				\
 	sendrecv_bench				\
-	burst
+	burst						\
+	display_bindings
 
 
 if !STARPU_USE_MPI_MPI
 if !STARPU_USE_MPI_MPI
 starpu_mpi_TESTS +=				\
 starpu_mpi_TESTS +=				\
 	sendrecv_parallel_tasks_bench
 	sendrecv_parallel_tasks_bench
 endif
 endif
 
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 starpu_mpi_TESTS +=				\
 starpu_mpi_TESTS +=				\
 	sendrecv_gemm_bench			\
 	sendrecv_gemm_bench			\
 	burst_gemm
 	burst_gemm
@@ -182,7 +184,7 @@ starpu_mpi_TESTS +=				\
 	starpu_redefine
 	starpu_redefine
 endif
 endif
 
 
-noinst_PROGRAMS =				\
+noinst_PROGRAMS +=				\
 	datatypes				\
 	datatypes				\
 	pingpong				\
 	pingpong				\
 	mpi_test				\
 	mpi_test				\
@@ -245,9 +247,10 @@ noinst_PROGRAMS =				\
 	sendrecv_bench				\
 	sendrecv_bench				\
 	sendrecv_parallel_tasks_bench		\
 	sendrecv_parallel_tasks_bench		\
 	burst					\
 	burst					\
-	nothing
+	nothing							\
+	display_bindings
 
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 noinst_PROGRAMS +=				\
 noinst_PROGRAMS +=				\
 	sendrecv_gemm_bench			\
 	sendrecv_gemm_bench			\
 	burst_gemm
 	burst_gemm
@@ -299,7 +302,7 @@ sendrecv_parallel_tasks_bench_SOURCES += abstract_sendrecv_bench.c
 burst_SOURCES = burst.c
 burst_SOURCES = burst.c
 burst_SOURCES += burst_helper.c
 burst_SOURCES += burst_helper.c
 
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 sendrecv_gemm_bench_SOURCES = sendrecv_gemm_bench.c
 sendrecv_gemm_bench_SOURCES = sendrecv_gemm_bench.c
 sendrecv_gemm_bench_SOURCES += bench_helper.c
 sendrecv_gemm_bench_SOURCES += bench_helper.c
 sendrecv_gemm_bench_SOURCES += gemm_helper.c
 sendrecv_gemm_bench_SOURCES += gemm_helper.c
@@ -315,5 +318,3 @@ burst_gemm_SOURCES += ../../examples/common/blas.c
 
 
 burst_gemm_LDADD = $(STARPU_BLAS_LDFLAGS)
 burst_gemm_LDADD = $(STARPU_BLAS_LDFLAGS)
 endif
 endif
-
-endif

+ 1 - 1
mpi/tests/abstract_sendrecv_bench.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 2
mpi/tests/abstract_sendrecv_bench.h

@@ -1,7 +1,6 @@
-
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/tests/bench_helper.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/tests/bench_helper.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/tests/burst.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by

+ 59 - 3
mpi/tests/burst_gemm.c

@@ -29,6 +29,9 @@
 #include "gemm_helper.h"
 #include "gemm_helper.h"
 #include "burst_helper.h"
 #include "burst_helper.h"
 
 
+static int gemm_warmup = 1;
+static int gemm_warmup_wait = 0;
+
 void parse_args(int argc, char **argv)
 void parse_args(int argc, char **argv)
 {
 {
 	int i;
 	int i;
@@ -62,10 +65,19 @@ void parse_args(int argc, char **argv)
 		{
 		{
 			burst_nb_requests = atoi(argv[++i]);
 			burst_nb_requests = atoi(argv[++i]);
 		}
 		}
+		else if (strcmp(argv[i], "-no-gemm-warmup") == 0)
+		{
+			gemm_warmup = 0;
+		}
+		else if (strcmp(argv[i], "-gemm-warmup-wait") == 0)
+		{
+			/* All warmup GEMMs will start at the same moment */
+			gemm_warmup_wait = 1;
+		}
 		else if (strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0)
 		else if (strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0)
 		{
 		{
-			fprintf(stderr,"Usage: %s [-nblocks n] [-size size] [-check] [-nreqs nreqs]\n", argv[0]);
-			fprintf(stderr,"Currently selected: matrix size: %u - %u blocks - %d requests in each burst\n", matrix_dim, nslices, burst_nb_requests);
+			fprintf(stderr,"Usage: %s [-nblocks n] [-size size] [-check] [-nreqs nreqs] [-no-gemm-warmup] [-gemm-warmup-wait]\n", argv[0]);
+			fprintf(stderr,"Currently selected: matrix size: %u - %u blocks - %d requests in each burst - gemm warmup: %d -gemm-warmup-wait: %d\n", matrix_dim, nslices, burst_nb_requests, gemm_warmup, gemm_warmup_wait);
 			exit(EXIT_SUCCESS);
 			exit(EXIT_SUCCESS);
 		}
 		}
 		else
 		else
@@ -106,13 +118,30 @@ int main(int argc, char **argv)
 	if (gemm_init_data() == -ENODEV)
 	if (gemm_init_data() == -ENODEV)
 		goto enodev;
 		goto enodev;
 
 
+	/* GEMM warmup, to really load the BLAS library */
+	if (gemm_warmup)
+	{
+		if (gemm_warmup_wait)
+		{
+			starpu_task_wait_for_all();
+			starpu_pause();
+		}
+
+		if(gemm_submit_tasks() == -ENODEV)
+			goto enodev;
+
+		if (gemm_warmup_wait)
+		{
+			starpu_resume();
+		}
+	}
+
 	burst_init_data(mpi_rank);
 	burst_init_data(mpi_rank);
 
 
 	/* Wait for everything and everybody: */
 	/* Wait for everything and everybody: */
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 
 
-
 	FPRINTF(stderr, "** Burst warmup **\n");
 	FPRINTF(stderr, "** Burst warmup **\n");
 	burst_all(mpi_rank);
 	burst_all(mpi_rank);
 
 
@@ -142,6 +171,33 @@ int main(int argc, char **argv)
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 
 
+	starpu_sleep(0.3); // sleep to easily distinguish different parts in traces
+
+	FPRINTF(stderr, "** Workers are computing, without communications **\n");
+	starpu_pause();
+	if(gemm_submit_tasks() == -ENODEV)
+		goto enodev;
+	starpu_resume();
+
+	/* Wait for everything and everybody: */
+	starpu_task_wait_for_all();
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	starpu_sleep(0.3); // sleep to easily distinguish different parts in traces
+
+	FPRINTF(stderr, "** Burst while workers are computing, but polling a moment between each task **\n");
+	starpu_pause();
+	gemm_add_polling_dependencies();
+	if(gemm_submit_tasks_with_tags(/* enable task tags */ 1) == -ENODEV)
+		goto enodev;
+	starpu_resume();
+
+	burst_all(mpi_rank);
+
+	/* Wait for everything and everybody: */
+	starpu_task_wait_for_all();
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
 enodev:
 enodev:
 	gemm_release();
 	gemm_release();
 	burst_free_data(mpi_rank);
 	burst_free_data(mpi_rank);

+ 1 - 1
mpi/tests/burst_helper.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/tests/burst_helper.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by

+ 44 - 0
mpi/tests/display_bindings.c

@@ -0,0 +1,44 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include <stdlib.h>
+#include "helper.h"
+
+#if !defined(STARPU_HAVE_SETENV)
+#warning setenv is not defined. Skipping test
+int main(void)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+int main(int argc, char **argv)
+{
+	/* Request the display of bindings; exported (overwriting any previous
+	 * value) before StarPU gets initialized */
+	setenv("STARPU_DISPLAY_BINDINGS", "1", 1);
+
+	MPI_INIT_THREAD_real(&argc, &argv, MPI_THREAD_SERIALIZED);
+
+	/* MPI was already initialized just above, hence the 0
+	 * NOTE(review): presumably the third parameter is initialize_mpi -- confirm */
+	int ret = starpu_mpi_init_conf(NULL, NULL, 0, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	/* Nothing else to do: shut everything down right away */
+	starpu_mpi_shutdown();
+	MPI_Finalize();
+
+	return EXIT_SUCCESS;
+}
+#endif

+ 52 - 4
mpi/tests/gemm_helper.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -199,6 +199,7 @@ static struct starpu_codelet cl_init_matrix_zero =
 	.color = 0x808000 // olive
 	.color = 0x808000 // olive
 };
 };
 
 
+/* Allocate and partition buffers */
 void gemm_alloc_data()
 void gemm_alloc_data()
 {
 {
 	starpu_malloc_flags((void **)&A, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
 	starpu_malloc_flags((void **)&A, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
@@ -207,14 +208,13 @@ void gemm_alloc_data()
 	partition_mult_data();
 	partition_mult_data();
 }
 }
 
 
-
+/* Submit tasks to initialize matrices: fill them with zeros or random numbers */
 int gemm_init_data()
 int gemm_init_data()
 {
 {
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
 	int ret;
 	int ret;
 	unsigned x, y;
 	unsigned x, y;
 
 
-	// Initialize matrices:
 	for (x = 0; x < nslices; x++)
 	for (x = 0; x < nslices; x++)
 	{
 	{
 		struct starpu_task *task = starpu_task_create();
 		struct starpu_task *task = starpu_task_create();
@@ -237,11 +237,17 @@ int gemm_init_data()
 	return 0;
 	return 0;
 }
 }
 
 
-
+/* Submit tasks to compute the GEMM */
 int gemm_submit_tasks()
 int gemm_submit_tasks()
 {
 {
+	return gemm_submit_tasks_with_tags(/* by default, disable task tags */ 0);
+}
+
+int gemm_submit_tasks_with_tags(int with_tags)
+{
 	int ret;
 	int ret;
 	unsigned x, y;
 	unsigned x, y;
+	starpu_tag_t task_tag = 0;
 
 
 	for (x = 0; x < nslices; x++)
 	for (x = 0; x < nslices; x++)
 	for (y = 0; y < nslices; y++)
 	for (y = 0; y < nslices; y++)
@@ -253,6 +259,12 @@ int gemm_submit_tasks()
 		task->handles[2] = starpu_data_get_sub_data(C_handle, 2, x, y);
 		task->handles[2] = starpu_data_get_sub_data(C_handle, 2, x, y);
 		task->flops = 2ULL * (matrix_dim/nslices) * (matrix_dim/nslices) * matrix_dim;
 		task->flops = 2ULL * (matrix_dim/nslices) * (matrix_dim/nslices) * matrix_dim;
 
 
+		if (with_tags)
+		{
+			task->use_tag = 1;
+			task->tag_id = ++task_tag;
+		}
+
 		ret = starpu_task_submit(task);
 		ret = starpu_task_submit(task);
 		CHECK_TASK_SUBMIT(ret);
 		CHECK_TASK_SUBMIT(ret);
 		starpu_data_wont_use(starpu_data_get_sub_data(C_handle, 2, x, y));
 		starpu_data_wont_use(starpu_data_get_sub_data(C_handle, 2, x, y));
@@ -261,6 +273,42 @@ int gemm_submit_tasks()
 	return 0;
 	return 0;
 }
 }
 
 
+/* Add dependencies between GEMM tasks, to see the impact of polling workers
+ * which eventually get a task.
+ * The resulting dependency graph has the following shape:
+ * - as many GEMMs as there are workers are executed in parallel on all
+ *   workers ("a column of tasks")
+ * - then a single GEMM (the "synchro" task) waits for all tasks of the
+ *   previous column, and is executed on a worker
+ * - the next column of tasks waits for that single GEMM
+ * - and so on...
+ *
+ * worker 0 |  1  |  4  |  5  |  8  |  9  |
+ * worker 1 |  2  |     |  6  |     | 10  |  ...
+ * worker 2 |  3  |     |  7  |     | 11  |
+ *
+ * Tags are numbered from 1 to nslices*nslices.
+ * This function has to be called before gemm_submit_tasks_with_tags(1).
+ */
+void gemm_add_polling_dependencies()
+{
+	const starpu_tag_t nb_tasks = (starpu_tag_t) nslices * (starpu_tag_t) nslices;
+	const unsigned nb_workers = starpu_worker_get_count();
+	starpu_tag_t synchro_tag = nb_workers+1;
+
+	/* one synchro task every nb_workers+1 tags */
+	while (synchro_tag <= nb_tasks)
+	{
+		starpu_tag_t tag;
+
+		/* the synchro task depends on every task of the previous column */
+		for (tag = synchro_tag - nb_workers; tag < synchro_tag; tag++)
+			starpu_tag_declare_deps(synchro_tag, 1, tag);
+
+		/* every task of the next column depends on the synchro task:
+		 * meanwhile, the other workers can only poll for new tasks */
+		for (tag = synchro_tag+1; tag <= nb_tasks && tag < (synchro_tag + nb_workers + 1); tag++)
+			starpu_tag_declare_deps(tag, 1, synchro_tag);
+
+		synchro_tag += (nb_workers+1);
+	}
+}
+
 void gemm_release()
 void gemm_release()
 {
 {
 	starpu_data_unpartition(C_handle, STARPU_MAIN_RAM);
 	starpu_data_unpartition(C_handle, STARPU_MAIN_RAM);

+ 3 - 1
mpi/tests/gemm_helper.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -29,5 +29,7 @@ void gemm_alloc_data();
 int gemm_init_data();
 int gemm_init_data();
 int gemm_submit_tasks();
 int gemm_submit_tasks();
 void gemm_release();
 void gemm_release();
+void gemm_add_polling_dependencies();
+int gemm_submit_tasks_with_tags(int with_tags);
 
 
 #endif /* __MPI_TESTS_GEMM_HELPER__ */
 #endif /* __MPI_TESTS_GEMM_HELPER__ */

+ 1 - 1
mpi/tests/nothing.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/tests/sendrecv_parallel_tasks_bench.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by

+ 2 - 2
sc_hypervisor/examples/Makefile.am

@@ -26,7 +26,7 @@ noinst_PROGRAMS =				\
 	lp_test/lp_resize_test			\
 	lp_test/lp_resize_test			\
 	hierarchical_ctxs/resize_hierarchical_ctxs
 	hierarchical_ctxs/resize_hierarchical_ctxs
 
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 noinst_PROGRAMS +=				\
 noinst_PROGRAMS +=				\
 	cholesky/cholesky_implicit
 	cholesky/cholesky_implicit
 
 
@@ -35,7 +35,7 @@ noinst_HEADERS = 				\
 	sched_ctx_utils/sched_ctx_utils.h
 	sched_ctx_utils/sched_ctx_utils.h
 endif
 endif
 
 
-if !NO_BLAS_LIB
+if !STARPU_NO_BLAS_LIB
 
 
 cholesky_cholesky_implicit_SOURCES =		\
 cholesky_cholesky_implicit_SOURCES =		\
 	cholesky/cholesky_implicit.c		\
 	cholesky/cholesky_implicit.c		\

+ 3 - 5
socl/examples/Makefile.am

@@ -25,11 +25,8 @@ if !STARPU_SIMGRID
 TESTS		=	$(SOCL_EXAMPLES)
 TESTS		=	$(SOCL_EXAMPLES)
 endif
 endif
 
 
-if STARPU_HAVE_WINDOWS
+noinst_PROGRAMS	=
 check_PROGRAMS	=	$(SOCL_EXAMPLES)
 check_PROGRAMS	=	$(SOCL_EXAMPLES)
-else
-check_PROGRAMS	=	$(LOADER) $(SOCL_EXAMPLES)
-endif
 
 
 if !STARPU_HAVE_WINDOWS
 if !STARPU_HAVE_WINDOWS
 ## test loader program
 ## test loader program
@@ -37,6 +34,7 @@ LOADER			=	loader
 loader_CPPFLAGS 	=	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 loader_CPPFLAGS 	=	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 LOADER_BIN		=	$(abs_top_builddir)/socl/examples/$(LOADER)
 LOADER_BIN		=	$(abs_top_builddir)/socl/examples/$(LOADER)
 loader_SOURCES		=	../../tests/loader.c
 loader_SOURCES		=	../../tests/loader.c
+noinst_PROGRAMS		+=	loader
 
 
 if STARPU_HAVE_AM111
 if STARPU_HAVE_AM111
 TESTS_ENVIRONMENT	=	LD_LIBRARY_PATH="@SOCL_OCL_LIB_OPENCL_DIR@:$(LD_LIBRARY_PATH)" OCL_ICD_VENDORS="$(abs_top_builddir)/socl/vendors/" top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
 TESTS_ENVIRONMENT	=	LD_LIBRARY_PATH="@SOCL_OCL_LIB_OPENCL_DIR@:$(LD_LIBRARY_PATH)" OCL_ICD_VENDORS="$(abs_top_builddir)/socl/vendors/" top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
@@ -75,7 +73,7 @@ matmul_matmul_LDADD = -lm
 mansched_mansched_SOURCES = mansched/mansched.c
 mansched_mansched_SOURCES = mansched/mansched.c
 
 
 #mandelbrot_mandelbrot_CPPFLAGS = $(AM_CPPFLAGS) $(AM_CFLAGS)
 #mandelbrot_mandelbrot_CPPFLAGS = $(AM_CPPFLAGS) $(AM_CFLAGS)
-#if HAVE_X11
+#if STARPU_HAVE_X11
 #mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
 #mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
 #mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) $(X_LIBS) -lX11 $(X_EXTRA_LIBS)
 #mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) $(X_LIBS) -lX11 $(X_EXTRA_LIBS)
 #endif
 #endif

+ 138 - 3
src/common/utils.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,6 +24,7 @@
 #include <unistd.h>
 #include <unistd.h>
 #endif
 #endif
 #include <fcntl.h>
 #include <fcntl.h>
+#include <ctype.h>
 
 
 #if defined(_WIN32) && !defined(__CYGWIN__)
 #if defined(_WIN32) && !defined(__CYGWIN__)
 #include <io.h>
 #include <io.h>
@@ -526,7 +528,7 @@ void _starpu_gethostname(char *hostname, size_t size)
 
 
 	if (force_mpi_hostnames && force_mpi_hostnames[0])
 	if (force_mpi_hostnames && force_mpi_hostnames[0])
 	{
 	{
-		char *host, *srv_hosts, *rsrv;
+		char *host, *srv_hosts;
 		srv_hosts = strdup(force_mpi_hostnames);
 		srv_hosts = strdup(force_mpi_hostnames);
 		int rank;
 		int rank;
 		if (starpu_mpi_world_rank)
 		if (starpu_mpi_world_rank)
@@ -541,8 +543,8 @@ void _starpu_gethostname(char *hostname, size_t size)
 
 
 		if (force_mpi_hostnames != NULL)
 		if (force_mpi_hostnames != NULL)
 		{
 		{
-			host = strtok_r(srv_hosts, " ", &rsrv);
-			while (rank-->0 && (host = strtok_r(NULL, " ", &rsrv)));
+			host = strtok(srv_hosts, " ");
+			while (rank-->0 && (host = strtok(NULL, " ")));
 			if(rank>=0)
 			if(rank>=0)
 			{
 			{
 				_STARPU_MSG("Missing hostnames in STARPU_MPI_HOSTNAMES\n");
 				_STARPU_MSG("Missing hostnames in STARPU_MPI_HOSTNAMES\n");
@@ -620,3 +622,136 @@ char *starpu_getenv(const char *str)
 #endif
 #endif
 	return getenv(str);
 	return getenv(str);
 }
 }
+
+/* Look for str in the NULL-terminated array strings, ignoring case.
+ * Return the index of the first matching entry, or -1 if there is none. */
+int _strings_ncmp(const char *strings[], const char *str)
+{
+	int idx;
+
+	for (idx = 0; strings[idx] != NULL; idx++)
+	{
+		const char *candidate = strings[idx];
+		const size_t len = strlen(candidate);
+
+		/* a match needs the very same length... */
+		if (strlen(str) != len)
+			continue;
+		/* ...and the same characters, case aside */
+		if (strncasecmp(str, candidate, len) == 0)
+			return idx;
+	}
+	return -1;
+}
+
+/* Return the index, in the NULL-terminated array strings, of the value of
+ * environment variable str (compared ignoring case), or defvalue when the
+ * variable is not set.
+ * A value which is not listed in strings is reported along with the list of
+ * valid values, and STARPU_ABORT() is called. */
+int starpu_get_env_string_var_default(const char *str, const char *strings[], int defvalue)
+{
+	const char *strval = starpu_getenv(str);
+	if (!strval)
+		return defvalue;
+
+	int val = _strings_ncmp(strings, strval);
+	if (val >= 0)
+		return val;
+
+	/* unknown value: tell the user what was expected */
+	_STARPU_MSG("\n");
+	_STARPU_MSG("Invalid value '%s' for environment variable '%s'\n", strval, str);
+	_STARPU_MSG("Valid values are:\n");
+	int i = 0;
+	while (strings[i] != NULL)
+	{
+		_STARPU_MSG("\t%s\n",strings[i]);
+		i++;
+	}
+	_STARPU_MSG("\n");
+	STARPU_ABORT();
+
+	return val;
+}
+
+/* Remove, in place, every whitespace character (as defined by isspace())
+ * from the NUL-terminated string str. */
+static void remove_spaces(char *str)
+{
+	size_t dst = 0;
+	size_t src;
+
+	for (src = 0; str[src] != '\0'; src++)
+	{
+		/* <ctype.h> functions need a value representable as unsigned
+		 * char (or EOF): passing a plain char, which may be negative,
+		 * is undefined behaviour */
+		if (isspace((unsigned char) str[src]))
+			continue;
+		/* only copy once a hole was opened by a removed character */
+		if (src > dst)
+			str[dst] = str[src];
+		dst++;
+	}
+	/* move the terminator only if something was removed */
+	if (src > dst)
+		str[dst] = '\0';
+}
+
+/* Parse the size given by environment variable str.
+ *
+ * The value is a decimal integer, optionally followed by a unit: b/B (bytes),
+ * k/K (KiB), m/M (MiB) or g/G (GiB); without unit, the number is taken as
+ * KiB. Whitespace is ignored, and anything following the unit letter is
+ * ignored too (so that e.g. "10MB" keeps being accepted).
+ *
+ * Return the size in bytes, or defval when the variable is not set or only
+ * contains whitespace. A value without any digit, with an unknown unit, or
+ * whose size does not fit the int return type, is reported with
+ * _STARPU_ERROR.
+ * NOTE(review): this assumes _STARPU_ERROR does not return, as the
+ * allocation failure check below already relies on -- confirm. */
+int starpu_get_env_size_default(const char *str, int defval)
+{
+	/* largest value of an int, without requiring <limits.h> here */
+	const long long int_max = (long long) (~0U >> 1);
+
+	char *strval = starpu_getenv(str);
+	if (!strval)
+		return defval;
+
+	char *value = strdup(strval);
+	if (value == NULL)
+		_STARPU_ERROR("memory allocation failed\n");
+	remove_spaces(value);
+	if (value[0] == '\0')
+	{
+		free(value);
+		return defval;
+	}
+
+	char *endptr = NULL;
+	long long mult = 1024;
+	errno = 0;
+	long v = strtol(value, &endptr, 10);
+	if (errno != 0)
+		_STARPU_ERROR("could not parse environment variable '%s' with value '%s', strtol failed with error %s\n", str, value, strerror(errno));
+	/* strtol consumed nothing: do not silently read e.g. "k" as 0 */
+	if (endptr == value)
+		_STARPU_ERROR("could not parse environment variable '%s' with value '%s', no number found\n", str, value);
+	if (*endptr != '\0')
+	{
+		switch (*endptr)
+		{
+		case 'b':
+		case 'B': mult = 1; break;
+		case 'k':
+		case 'K': mult = 1024; break;
+		case 'm':
+		case 'M': mult = 1024*1024; break;
+		case 'g':
+		case 'G': mult = 1024*1024*1024; break;
+		default:
+			_STARPU_ERROR("could not parse environment variable '%s' with value '%s' size suffix invalid\n", str, value);
+		}
+	}
+
+	/* Check the number first, so that the product below cannot overflow a
+	 * long long, then check that the size in bytes fits the return type */
+	if (v > int_max || v < -int_max - 1)
+		_STARPU_ERROR("could not parse environment variable '%s' with value '%s', size is too large\n", str, value);
+	long long total = (long long) v * mult;
+	if (total > int_max || total < -int_max - 1)
+		_STARPU_ERROR("could not parse environment variable '%s' with value '%s', size is too large\n", str, value);
+
+	free(value);
+	return (int) total;
+}
+
+/* Display the current bindings by running the external hwloc-ps command;
+ * when StarPU was built without hwloc, only say so. */
+void starpu_display_bindings(void)
+{
+#ifndef STARPU_HAVE_HWLOC
+	_STARPU_DISP("hwloc not available to display bindings.\n");
+#else
+	const int status = system("hwloc-ps -a -t -c");
+	if (status != 0)
+	{
+		/* report the failure of the command, but carry on */
+		_STARPU_DISP("hwloc-ps returned %d\n", status);
+		fflush(stderr);
+	}
+	fflush(stdout);
+#endif
+}

+ 14 - 9
src/core/perfmodel/perfmodel_history.c

@@ -344,7 +344,10 @@ static void dump_reg_model(FILE *f, struct starpu_perfmodel *model, int comb, in
 	double a = nan(""), b = nan(""), c = nan("");
 	double a = nan(""), b = nan(""), c = nan("");
 
 
 	if (model->type == STARPU_NL_REGRESSION_BASED)
 	if (model->type == STARPU_NL_REGRESSION_BASED)
-		_starpu_regression_non_linear_power(per_arch_model->list, &a, &b, &c);
+	{
+		if (_starpu_regression_non_linear_power(per_arch_model->list, &a, &b, &c) != 0)
+			_STARPU_DISP("Warning: could not compute a non-linear regression for model %s\n", model->symbol);
+	}
 
 
 	fprintf(f, "# a\t\tb\t\tc\n");
 	fprintf(f, "# a\t\tb\t\tc\n");
 	_starpu_write_double(f, "%-15e", a);
 	_starpu_write_double(f, "%-15e", a);
@@ -1491,6 +1494,8 @@ int starpu_perfmodel_load_file(const char *filename, struct starpu_perfmodel *mo
 	res = fclose(f);
 	res = fclose(f);
 	STARPU_ASSERT(res == 0);
 	STARPU_ASSERT(res == 0);
 
 
+	if (ret)
+		starpu_perfmodel_unload_model(model);
 	return ret;
 	return ret;
 }
 }
 
 
@@ -1885,20 +1890,20 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 				STARPU_HG_DISABLE_CHECKING(entry->nsample);
 				STARPU_HG_DISABLE_CHECKING(entry->nsample);
 				STARPU_HG_DISABLE_CHECKING(entry->mean);
 				STARPU_HG_DISABLE_CHECKING(entry->mean);
 
 
-				/* Do not take the first measurement into account, it is very often quite bogus */
+				/* For history-based, do not take the first measurement into account, it is very often quite bogus */
 				/* TODO: it'd be good to use a better estimation heuristic, like the median, or latest n values, etc. */
 				/* TODO: it'd be good to use a better estimation heuristic, like the median, or latest n values, etc. */
-				//entry->mean = 0;
-				//entry->sum = 0;
-
-				//entry->deviation = 0.0;
-				//entry->sum2 = 0;
+				if (model->type != STARPU_HISTORY_BASED)
+				{
+					entry->sum = measured;
+					entry->sum2 = measured*measured;
+					entry->nsample = 1;
+					entry->mean = measured;
+				}
 
 
 				entry->size = _starpu_job_get_data_size(model, arch, impl, j);
 				entry->size = _starpu_job_get_data_size(model, arch, impl, j);
 				entry->flops = j->task->flops;
 				entry->flops = j->task->flops;
 
 
 				entry->footprint = key;
 				entry->footprint = key;
-				//entry->nsample = 0;
-				//entry->nerror = 0;
 
 
 				insert_history_entry(entry, list, &per_arch_model->history);
 				insert_history_entry(entry, list, &per_arch_model->history);
 			}
 			}

+ 123 - 45
src/core/perfmodel/regression.c

@@ -20,7 +20,32 @@
 #define MAXREGITER	1000
 #define MAXREGITER	1000
 #define EPS 1.0e-10
 #define EPS 1.0e-10
 
 
-static double compute_b(double c, unsigned n, unsigned *x, double *y)
+/* For measurements close to C, we do not want to try to fit, since we are
+   fitting the distance to C, which won't actually really get smaller */
+#define C_RADIUS 1
+
+/*
+ * Smoothly ramp from 0 to 1 while x goes from 0 to 1:
+ * <= 0: stay 0
+ * >= 1: stay 1
+ * in between, two parabola halves joined at (0.5, 0.5): 2x^2 leaves 0 with a
+ * zero slope, and -2x^2+4x-1 reaches 1 with a zero slope; both have value
+ * 0.5 and slope 2 at x = 0.5, so the result is continuous, monotonic, and
+ * stays within [0, 1].
+ */
+static double level(double x)
+{
+	if (x <= 0.)
+		return 0.;
+	if (x >= 1.)
+		return 1.;
+	if (x < 0.5)
+		return 2*x*x;
+	return -2*x*x+4*x-1;
+}
+
+/* Weight to give to a measurement y made of pop samples, for a given c:
+ * pop scaled by level() of the relative distance of y to c, shifted and
+ * scaled by C_RADIUS. */
+static double fixpop(unsigned pop, double c, double y)
+{
+	const double relative_distance = (y-c)/c;
+	const double ramp = level((relative_distance - C_RADIUS) / C_RADIUS);
+
+	return pop * ramp;
+}
+
+static double compute_b(double c, unsigned n, size_t *x, double *y, unsigned *pop)
 {
 {
 	double b;
 	double b;
 
 
@@ -29,43 +54,55 @@ static double compute_b(double c, unsigned n, unsigned *x, double *y)
 	double sumx = 0.0;
 	double sumx = 0.0;
 	double sumx2 = 0.0;
 	double sumx2 = 0.0;
 	double sumy = 0.0;
 	double sumy = 0.0;
+	double nn = 0;
 
 
 	unsigned i;
 	unsigned i;
 	for (i = 0; i < n; i++)
 	for (i = 0; i < n; i++)
 	{
 	{
 		double xi = log(x[i]);
 		double xi = log(x[i]);
 		double yi = log(y[i]-c);
 		double yi = log(y[i]-c);
+		double popi = fixpop(pop[i], c, y[i]);
+		if (popi <= 0)
+			continue;
 
 
-		sumxy += xi*yi;
-		sumx += xi;
-		sumx2 += xi*xi;
-		sumy += yi;
+		sumxy += xi*yi*popi;
+		sumx += xi*popi;
+		sumx2 += xi*xi*popi;
+		sumy += yi*popi;
+
+		nn += popi;
 	}
 	}
 
 
-	b = (n * sumxy - sumx * sumy) / (n*sumx2 - sumx*sumx);
+	b = (nn * sumxy - sumx * sumy) / (nn*sumx2 - sumx*sumx);
 
 
 	return b;
 	return b;
 }
 }
 
 
-static double compute_a(double c, double b, unsigned n, unsigned *x, double *y)
+static double compute_a(double c, double b, unsigned n, size_t *x, double *y, unsigned *pop)
 {
 {
 	double a;
 	double a;
 
 
 	/* X = log (x) , Y = log (y - c) */
 	/* X = log (x) , Y = log (y - c) */
 	double sumx = 0.0;
 	double sumx = 0.0;
 	double sumy = 0.0;
 	double sumy = 0.0;
+	double nn = 0;
 
 
 	unsigned i;
 	unsigned i;
 	for (i = 0; i < n; i++)
 	for (i = 0; i < n; i++)
 	{
 	{
 		double xi = log(x[i]);
 		double xi = log(x[i]);
 		double yi = log(y[i]-c);
 		double yi = log(y[i]-c);
+		double popi = fixpop(pop[i], c, y[i]);
+		if (popi <= 0)
+			continue;
+
+		sumx += xi*popi;
+		sumy += yi*popi;
 
 
-		sumx += xi;
-		sumy += yi;
+		nn += popi;
 	}
 	}
 
 
-	a = (sumy - b*sumx) / n;
+	a = (sumy - b*sumx) / nn;
 
 
 	return a;
 	return a;
 }
 }
@@ -73,7 +110,7 @@ static double compute_a(double c, double b, unsigned n, unsigned *x, double *y)
 
 
 
 
 /* returns r */
 /* returns r */
-static double test_r(double c, unsigned n, unsigned *x, double *y)
+static double test_r(double c, unsigned n, size_t *x, double *y, unsigned *pop)
 {
 {
 	double r;
 	double r;
 
 
@@ -85,20 +122,26 @@ static double test_r(double c, unsigned n, unsigned *x, double *y)
 	double sumx2 = 0.0;
 	double sumx2 = 0.0;
 	double sumy = 0.0;
 	double sumy = 0.0;
 	double sumy2 = 0.0;
 	double sumy2 = 0.0;
+	double nn = 0;
 
 
 	unsigned i;
 	unsigned i;
 	for (i = 0; i < n; i++)
 	for (i = 0; i < n; i++)
 	{
 	{
 		double xi = log(x[i]);
 		double xi = log(x[i]);
 		double yi = log(y[i]-c);
 		double yi = log(y[i]-c);
+		double popi = fixpop(pop[i], c, y[i]);
+		if (popi <= 0)
+			continue;
 
 
 	//	printf("Xi = %e, Yi = %e\n", xi, yi);
 	//	printf("Xi = %e, Yi = %e\n", xi, yi);
 
 
-		sumxy += xi*yi;
-		sumx += xi;
-		sumx2 += xi*xi;
-		sumy += yi;
-		sumy2 += yi*yi;
+		sumxy += xi*yi*popi;
+		sumx += xi*popi;
+		sumx2 += xi*xi*popi;
+		sumy += yi*popi;
+		sumy2 += yi*yi*popi;
+
+		nn += popi;
 	}
 	}
 
 
 	//printf("sumxy %e\n", sumxy);
 	//printf("sumxy %e\n", sumxy);
@@ -107,7 +150,7 @@ static double test_r(double c, unsigned n, unsigned *x, double *y)
 	//printf("sumy %e\n", sumy);
 	//printf("sumy %e\n", sumy);
 	//printf("sumy2 %e\n", sumy2);
 	//printf("sumy2 %e\n", sumy2);
 
 
-	r = (n * sumxy - sumx * sumy) / sqrt( (n* sumx2 - sumx*sumx) * (n*sumy2 - sumy*sumy) );
+	r = (nn * sumxy - sumx * sumy) / sqrt( (nn* sumx2 - sumx*sumx) * (nn*sumy2 - sumy*sumy) );
 
 
 	return r;
 	return r;
 }
 }
@@ -119,38 +162,52 @@ static unsigned find_list_size(struct starpu_perfmodel_history_list *list_histor
 	struct starpu_perfmodel_history_list *ptr = list_history;
 	struct starpu_perfmodel_history_list *ptr = list_history;
 	while (ptr)
 	while (ptr)
 	{
 	{
-		cnt++;
+		if (ptr->entry->nsample)
+			cnt++;
 		ptr = ptr->next;
 		ptr = ptr->next;
 	}
 	}
 
 
 	return cnt;
 	return cnt;
 }
 }
 
 
-static double find_list_min(double *y, unsigned n)
+static int compar(const void *_a, const void *_b)
 {
 {
-	double min = DBL_MAX;
+	double a = *(double*) _a;
+	double b = *(double*) _b;
+	if (a < b)
+		return -1;
+	if (a > b)
+		return 1;
+	return 0;
+}
 
 
-	unsigned i;
-	for (i = 0; i < n; i++)
-	{
-		min = STARPU_MIN(min, y[i]);
-	}
+static double get_list_fourth(double *y, unsigned n)
+{
+	double sorted[n];
+
+	memcpy(sorted, y, n * sizeof(*sorted));
 
 
-	return min;
+	qsort(sorted, n, sizeof(*sorted), compar);
+
+	return sorted[n/3];
 }
 }
 
 
-static void dump_list(unsigned *x, double *y, struct starpu_perfmodel_history_list *list_history)
+static void dump_list(size_t *x, double *y, unsigned *pop, struct starpu_perfmodel_history_list *list_history)
 {
 {
 	struct starpu_perfmodel_history_list *ptr = list_history;
 	struct starpu_perfmodel_history_list *ptr = list_history;
 	unsigned i = 0;
 	unsigned i = 0;
 
 
 	while (ptr)
 	while (ptr)
 	{
 	{
-		x[i] = ptr->entry->size;
-		y[i] = ptr->entry->mean;
+		if (ptr->entry->nsample)
+		{
+			x[i] = ptr->entry->size;
+			y[i] = ptr->entry->mean;
+			pop[i] = ptr->entry->nsample;
+			i++;
+		}
 
 
 		ptr = ptr->next;
 		ptr = ptr->next;
-		i++;
 	}
 	}
 }
 }
 
 
@@ -159,52 +216,72 @@ static void dump_list(unsigned *x, double *y, struct starpu_perfmodel_history_li
  * 	return 0 if success, -1 otherwise
  * 	return 0 if success, -1 otherwise
  * 	if success, a, b and c are modified
  * 	if success, a, b and c are modified
  * */
  * */
+
+/* See in Cedric Augonnet's PhD thesis's Appendix B for the rationale
+ * Scheduling Tasks over Multicore machines enhanced with Accelerators: a
+ * Runtime System’s Perspective */
 int _starpu_regression_non_linear_power(struct starpu_perfmodel_history_list *ptr, double *a, double *b, double *c)
 int _starpu_regression_non_linear_power(struct starpu_perfmodel_history_list *ptr, double *a, double *b, double *c)
 {
 {
 	unsigned n = find_list_size(ptr);
 	unsigned n = find_list_size(ptr);
-	STARPU_ASSERT(n);
+	if (!n)
+		return -1;
 
 
-	unsigned *x;
-	_STARPU_MALLOC(x, n*sizeof(unsigned));
+	size_t *x;
+	_STARPU_MALLOC(x, n*sizeof(size_t));
 
 
 	double *y;
 	double *y;
 	_STARPU_MALLOC(y, n*sizeof(double));
 	_STARPU_MALLOC(y, n*sizeof(double));
 	STARPU_ASSERT(y);
 	STARPU_ASSERT(y);
 
 
-	dump_list(x, y, ptr);
+	unsigned *pop;
+	_STARPU_MALLOC(pop, n*sizeof(unsigned));
+	STARPU_ASSERT(y);
+
+	dump_list(x, y, pop, ptr);
 
 
 	double cmin = 0.0;
 	double cmin = 0.0;
-	double cmax = find_list_min(y, n);
+	double cmax = get_list_fourth(y, n);
 
 
 	unsigned iter;
 	unsigned iter;
 
 
 	double err = 100000.0;
 	double err = 100000.0;
 
 
+/*
+	unsigned i;
+	for (i = 0; i < 100; i++)
+	{
+		double ci = cmin + (cmax-cmin)*i/100.;
+		fprintf(stderr,"%f: %f\n", ci, 1.0 - test_r(ci, n, x, y, pop));
+	}
+*/
+
+	/* Use dichotomy to find c that gives the best matching */
 	for (iter = 0; iter < MAXREGITER; iter++)
 	for (iter = 0; iter < MAXREGITER; iter++)
 	{
 	{
 		double c1, c2;
 		double c1, c2;
 		double r1, r2;
 		double r1, r2;
 
 
-		double radius = 0.01;
-
-		c1 = cmin + (0.5-radius)*(cmax - cmin);
-		c2 = cmin + (0.5+radius)*(cmax - cmin);
+		c1 = cmin + (0.33)*(cmax - cmin);
+		c2 = cmin + (0.67)*(cmax - cmin);
 
 
-		r1 = test_r(c1, n, x, y);
-		r2 = test_r(c2, n, x, y);
+		r1 = test_r(c1, n, x, y, pop);
+		r2 = test_r(c2, n, x, y, pop);
 
 
 		double err1, err2;
 		double err1, err2;
 		err1 = fabs(1.0 - r1);
 		err1 = fabs(1.0 - r1);
 		err2 = fabs(1.0 - r2);
 		err2 = fabs(1.0 - r2);
 
 
+		//fprintf(stderr,"%f - %f: %f - %f: %f - %f\n", cmin, c1, err1, c2, err2, cmax);
+
 		if (err1 < err2)
 		if (err1 < err2)
 		{
 		{
-			cmax = (cmin + cmax)/2;
+			/* 1 is better */
+			cmax = c2;
 		}
 		}
 		else
 		else
 		{
 		{
 			/* 2 is better */
 			/* 2 is better */
-			cmin = (cmin + cmax)/2;
+			cmin = c1;
 		}
 		}
 
 
 		if (fabs(err - STARPU_MIN(err1, err2)) < EPS)
 		if (fabs(err - STARPU_MIN(err1, err2)) < EPS)
@@ -215,11 +292,12 @@ int _starpu_regression_non_linear_power(struct starpu_perfmodel_history_list *pt
 
 
 	*c = (cmin + cmax)/2;
 	*c = (cmin + cmax)/2;
 
 
-	*b = compute_b(*c, n, x, y);
-	*a = exp(compute_a(*c, *b, n, x, y));
+	*b = compute_b(*c, n, x, y, pop);
+	*a = exp(compute_a(*c, *b, n, x, y, pop));
 
 
 	free(x);
 	free(x);
 	free(y);
 	free(y);
+	free(pop);
 
 
 	return 0;
 	return 0;
 }
 }

+ 2 - 0
src/core/topology.c

@@ -1049,6 +1049,8 @@ static inline unsigned _starpu_get_next_bindid(struct _starpu_machine_config *co
 {
 {
 	struct _starpu_machine_topology *topology = &config->topology;
 	struct _starpu_machine_topology *topology = &config->topology;
 
 
+	STARPU_ASSERT_MSG(topology_is_initialized, "The StarPU core is not initialized yet, have you called starpu_init?");
+
 	unsigned current_preferred;
 	unsigned current_preferred;
 	unsigned nhyperthreads = topology->nhwpus / topology->nhwcpus;
 	unsigned nhyperthreads = topology->nhwpus / topology->nhwcpus;
 	unsigned ncores = topology->nhwpus / nhyperthreads;
 	unsigned ncores = topology->nhwpus / nhyperthreads;

+ 0 - 1
src/core/topology.h

@@ -24,7 +24,6 @@
 #include <common/list.h>
 #include <common/list.h>
 #include <common/fxt.h>
 #include <common/fxt.h>
 
 
-/** TODO actually move this struct into this header */
 struct _starpu_machine_config;
 struct _starpu_machine_config;
 
 
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID

+ 17 - 0
src/core/workers.c

@@ -1059,6 +1059,7 @@ int starpu_conf_init(struct starpu_conf *conf)
 
 
 	memset(conf, 0, sizeof(*conf));
 	memset(conf, 0, sizeof(*conf));
 	conf->magic = 42;
 	conf->magic = 42;
+	conf->will_use_mpi = 0;
 	conf->sched_policy_name = starpu_getenv("STARPU_SCHED");
 	conf->sched_policy_name = starpu_getenv("STARPU_SCHED");
 	conf->sched_policy = NULL;
 	conf->sched_policy = NULL;
 	conf->global_sched_ctx_min_priority = starpu_get_env_number("STARPU_MIN_PRIO");
 	conf->global_sched_ctx_min_priority = starpu_get_env_number("STARPU_MIN_PRIO");
@@ -1143,6 +1144,9 @@ int starpu_conf_init(struct starpu_conf *conf)
 	/* 64MiB by default */
 	/* 64MiB by default */
 	conf->trace_buffer_size = ((uint64_t) starpu_get_env_number_default("STARPU_TRACE_BUFFER_SIZE", 64)) << 20;
 	conf->trace_buffer_size = ((uint64_t) starpu_get_env_number_default("STARPU_TRACE_BUFFER_SIZE", 64)) << 20;
 
 
+	conf->driver_spinning_backoff_min = (unsigned) starpu_get_env_number_default("STARPU_BACKOFF_MIN", 1);
+	conf->driver_spinning_backoff_max = (unsigned) starpu_get_env_number_default("STARPU_BACKOFF_MAX", 32);
+
 	/* Do not start performance counter collection by default */
 	/* Do not start performance counter collection by default */
 	conf->start_perf_counter_collection = 0;
 	conf->start_perf_counter_collection = 0;
 	return 0;
 	return 0;
@@ -1663,6 +1667,15 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
 
 	_starpu_catch_signals();
 	_starpu_catch_signals();
 
 
+	/* if MPI is enabled, binding display will be done later, after MPI initialization */
+	if (!_starpu_config.conf.will_use_mpi && starpu_get_env_number_default("STARPU_DISPLAY_BINDINGS", 0))
+	{
+		fprintf(stdout, "== Binding ==\n");
+		starpu_display_bindings();
+		fprintf(stdout, "== End of binding ==\n");
+		fflush(stdout);
+	}
+
 	return 0;
 	return 0;
 }
 }
 
 
@@ -1751,6 +1764,8 @@ void starpu_pause()
 {
 {
 	STARPU_HG_DISABLE_CHECKING(_starpu_config.pause_depth);
 	STARPU_HG_DISABLE_CHECKING(_starpu_config.pause_depth);
 	_starpu_config.pause_depth += 1;
 	_starpu_config.pause_depth += 1;
+
+	starpu_fxt_trace_user_event_string("starpu_pause");
 }
 }
 
 
 void starpu_resume()
 void starpu_resume()
@@ -1762,6 +1777,8 @@ void starpu_resume()
 		STARPU_PTHREAD_COND_BROADCAST(&pause_cond);
 		STARPU_PTHREAD_COND_BROADCAST(&pause_cond);
 	}
 	}
 	STARPU_PTHREAD_MUTEX_UNLOCK(&pause_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&pause_mutex);
+
+	starpu_fxt_trace_user_event_string("starpu_resume");
 }
 }
 
 
 unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED, struct _starpu_worker *worker STARPU_ATTRIBUTE_UNUSED)
 unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED, struct _starpu_worker *worker STARPU_ATTRIBUTE_UNUSED)

+ 16 - 2
src/core/workers.h

@@ -203,6 +203,10 @@ LIST_TYPE(_starpu_worker,
 
 
 	int enable_knob;
 	int enable_knob;
 	int bindid_requested;
 	int bindid_requested;
+
+	/* Keep this last, so that the data of different workers ends up in
+	  separate cache lines. */
+	char padding[STARPU_CACHELINE_SIZE];
 );
 );
 
 
 struct _starpu_combined_worker
 struct _starpu_combined_worker
@@ -223,6 +227,10 @@ struct _starpu_combined_worker
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
 	hwloc_bitmap_t hwloc_cpu_set;
 	hwloc_bitmap_t hwloc_cpu_set;
 #endif
 #endif
+
+	/* Keep this last, so that the data of different workers ends up in
+	  separate cache lines. */
+	char padding[STARPU_CACHELINE_SIZE];
 };
 };
 
 
 /**
 /**
@@ -389,6 +397,9 @@ struct _starpu_machine_config
 	/** Memory node for MPI, if only one */
 	/** Memory node for MPI, if only one */
 	int mpi_nodeid;
 	int mpi_nodeid;
 
 
+	/* Separate out previous variables from per-worker data. */
+	char padding1[STARPU_CACHELINE_SIZE];
+
 	/** Basic workers : each of this worker is running its own driver and
 	/** Basic workers : each of this worker is running its own driver and
 	 * can be combined with other basic workers. */
 	 * can be combined with other basic workers. */
 	struct _starpu_worker workers[STARPU_NMAXWORKERS];
 	struct _starpu_worker workers[STARPU_NMAXWORKERS];
@@ -397,6 +408,11 @@ struct _starpu_machine_config
 	 * that can run parallel tasks together. */
 	 * that can run parallel tasks together. */
 	struct _starpu_combined_worker combined_workers[STARPU_NMAX_COMBINEDWORKERS];
 	struct _starpu_combined_worker combined_workers[STARPU_NMAX_COMBINEDWORKERS];
 
 
+	starpu_pthread_mutex_t submitted_mutex;
+
+	/* Separate out previous mutex from the rest of the data. */
+	char padding2[STARPU_CACHELINE_SIZE];
+
 	/** Translation table from bindid to worker IDs */
 	/** Translation table from bindid to worker IDs */
 	struct
 	struct
 	{
 	{
@@ -432,8 +448,6 @@ struct _starpu_machine_config
 
 
 	/** When >0, StarPU should stop performance counters collection. */
 	/** When >0, StarPU should stop performance counters collection. */
 	int perf_counter_pause_depth;
 	int perf_counter_pause_depth;
-
-	starpu_pthread_mutex_t submitted_mutex;
 };
 };
 
 
 extern int _starpu_worker_parallel_blocks;
 extern int _starpu_worker_parallel_blocks;

+ 5 - 0
src/datawizard/coherency.h

@@ -281,6 +281,11 @@ struct _starpu_data_state
 
 
 	int partition_automatic_disabled;
 	int partition_automatic_disabled;
 
 
+	/** Application-provided coordinates. The maximum number of
+	  * dimensions (5) is relatively arbitrary. */
+	unsigned dimensions;
+	int coordinates[5];
+
 	/** A generic pointer to data in the user land (could be anything and this
 	/** A generic pointer to data in the user land (could be anything and this
 	 * is not manage by StarPU) */
 	 * is not manage by StarPU) */
 	void *user_data;
 	void *user_data;

+ 24 - 1
src/datawizard/interfaces/data_interface.c

@@ -1117,8 +1117,18 @@ int starpu_data_get_home_node(starpu_data_handle_t handle)
 	return handle->home_node;
 	return handle->home_node;
 }
 }
 
 
-void starpu_data_set_coordinates_array(starpu_data_handle_t handle STARPU_ATTRIBUTE_UNUSED, int dimensions STARPU_ATTRIBUTE_UNUSED, int dims[] STARPU_ATTRIBUTE_UNUSED)
+void starpu_data_set_coordinates_array(starpu_data_handle_t handle, unsigned dimensions, int dims[])
 {
 {
+	unsigned i;
+	unsigned max_dimensions = sizeof(handle->coordinates)/sizeof(handle->coordinates[0]);
+
+	if (dimensions > max_dimensions)
+		dimensions = max_dimensions;
+
+	handle->dimensions = dimensions;
+	for (i = 0; i < dimensions; i++)
+		handle->coordinates[i] = dims[i];
+
 	_STARPU_TRACE_DATA_COORDINATES(handle, dimensions, dims);
 	_STARPU_TRACE_DATA_COORDINATES(handle, dimensions, dims);
 }
 }
 
 
@@ -1135,3 +1145,16 @@ void starpu_data_set_coordinates(starpu_data_handle_t handle, unsigned dimension
 
 
 	starpu_data_set_coordinates_array(handle, dimensions, dims);
 	starpu_data_set_coordinates_array(handle, dimensions, dims);
 }
 }
+
+unsigned starpu_data_get_coordinates_array(starpu_data_handle_t handle, unsigned dimensions, int dims[])
+{
+	unsigned i;
+
+	if (dimensions > handle->dimensions)
+		dimensions = handle->dimensions;
+
+	for (i = 0; i < dimensions; i++)
+		dims[i] = handle->coordinates[i];
+
+	return dimensions;
+}

+ 3 - 5
src/drivers/driver_common/driver_common.c

@@ -28,8 +28,6 @@
 #include <core/debug.h>
 #include <core/debug.h>
 #include <core/task.h>
 #include <core/task.h>
 
 
-#define BACKOFF_MAX 32  /* TODO : use parameter to define them */
-#define BACKOFF_MIN 1
 
 
 void _starpu_driver_start_job(struct _starpu_worker *worker, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, int rank, int profiling)
 void _starpu_driver_start_job(struct _starpu_worker *worker, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, int rank, int profiling)
 {
 {
@@ -374,7 +372,7 @@ static void _starpu_exponential_backoff(struct _starpu_worker *worker)
 {
 {
 	int delay = worker->spinning_backoff;
 	int delay = worker->spinning_backoff;
 
 
-	if (worker->spinning_backoff < BACKOFF_MAX)
+	if (worker->spinning_backoff < worker->config->conf.driver_spinning_backoff_max)
 		worker->spinning_backoff<<=1;
 		worker->spinning_backoff<<=1;
 
 
 	while(delay--)
 	while(delay--)
@@ -504,7 +502,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 	{
 	{
 		_starpu_worker_set_status_sleeping(workerid);
 		_starpu_worker_set_status_sleeping(workerid);
 	}
 	}
-	worker->spinning_backoff = BACKOFF_MIN;
+	worker->spinning_backoff = worker->config->conf.driver_spinning_backoff_min;
 
 
 	_starpu_worker_leave_sched_op(worker);
 	_starpu_worker_leave_sched_op(worker);
 	STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
 	STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
@@ -703,7 +701,7 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 	}
 	}
 
 
 	_starpu_worker_set_status_wakeup(workerid);
 	_starpu_worker_set_status_wakeup(workerid);
-	worker->spinning_backoff = BACKOFF_MIN;
+	worker->spinning_backoff = worker->config->conf.driver_spinning_backoff_min;
 #endif /* !STARPU_SIMGRID */
 #endif /* !STARPU_SIMGRID */
 
 
 	_starpu_worker_leave_sched_op(&workers[0]);
 	_starpu_worker_leave_sched_op(&workers[0]);

+ 20 - 16
src/profiling/profiling.c

@@ -29,6 +29,8 @@
 #include <papi.h>
 #include <papi.h>
 #endif
 #endif
 
 
+/* TODO: move to worker structure */
+
 static struct starpu_profiling_worker_info worker_info[STARPU_NMAXWORKERS];
 static struct starpu_profiling_worker_info worker_info[STARPU_NMAXWORKERS];
 /* TODO: rather use rwlock */
 /* TODO: rather use rwlock */
 static starpu_pthread_mutex_t worker_info_mutex[STARPU_NMAXWORKERS];
 static starpu_pthread_mutex_t worker_info_mutex[STARPU_NMAXWORKERS];
@@ -325,26 +327,28 @@ void _starpu_worker_stop_sleeping(int workerid)
 
 
 		STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]);
 		STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]);
 
 
-		STARPU_ASSERT(worker_registered_sleeping_start[workerid] == 1);
-		sleeping_start = &sleeping_start_date[workerid];
-
-                /* Perhaps that profiling was enabled while the worker was
-                 * already blocked, so we don't measure (end - start), but
-                 * (end - max(start,worker_start)) where worker_start is the
-                 * date of the previous profiling info reset on the worker */
-		struct timespec *worker_start = &worker_info[workerid].start_time;
-		if (starpu_timespec_cmp(sleeping_start, worker_start, <))
+		if (worker_registered_sleeping_start[workerid] == 1)
 		{
 		{
-			/* sleeping_start < worker_start */
-			sleeping_start = worker_start;
-		}
+			sleeping_start = &sleeping_start_date[workerid];
+
+			/* Profiling may have been enabled while the worker was
+			 * already blocked, so we don't measure (end - start), but
+			 * (end - max(start,worker_start)) where worker_start is the
+			 * date of the previous profiling info reset on the worker */
+			struct timespec *worker_start = &worker_info[workerid].start_time;
+			if (starpu_timespec_cmp(sleeping_start, worker_start, <))
+			{
+				/* sleeping_start < worker_start */
+				sleeping_start = worker_start;
+			}
 
 
-		struct timespec sleeping_time;
-		starpu_timespec_sub(&sleep_end_time, sleeping_start, &sleeping_time);
+			struct timespec sleeping_time;
+			starpu_timespec_sub(&sleep_end_time, sleeping_start, &sleeping_time);
 
 
-		starpu_timespec_accumulate(&worker_info[workerid].sleeping_time, &sleeping_time);
+			starpu_timespec_accumulate(&worker_info[workerid].sleeping_time, &sleeping_time);
 
 
-		worker_registered_sleeping_start[workerid] = 0;
+			worker_registered_sleeping_start[workerid] = 0;
+		}
 
 
 		STARPU_PTHREAD_MUTEX_UNLOCK(&worker_info_mutex[workerid]);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&worker_info_mutex[workerid]);
 
 

+ 17 - 11
src/sched_policies/component_heft.c

@@ -77,10 +77,13 @@ static int heft_progress_one(struct starpu_sched_component *component)
 		/* Estimated transfer+task termination for each child */
 		/* Estimated transfer+task termination for each child */
 		double estimated_ends_with_task[component->nchildren * ntasks];
 		double estimated_ends_with_task[component->nchildren * ntasks];
 
 
-		/* Minimum transfer+task termination on all children */
-		double min_exp_end_with_task[ntasks];
-		/* Maximum transfer+task termination on all children */
-		double max_exp_end_with_task[ntasks];
+		/* estimated energy */
+		double local_energy[component->nchildren * ntasks];
+
+		/* Minimum transfer+task termination of the NTASKS tasks over all workers */
+		double min_exp_end_of_task[ntasks];
+		/* Maximum termination of the already-scheduled tasks over all workers */
+		double max_exp_end_of_workers;
 
 
 		unsigned suitable_components[component->nchildren * ntasks];
 		unsigned suitable_components[component->nchildren * ntasks];
 
 
@@ -100,20 +103,23 @@ static int heft_progress_one(struct starpu_sched_component *component)
 					estimated_lengths + offset,
 					estimated_lengths + offset,
 					estimated_transfer_length + offset,
 					estimated_transfer_length + offset,
 					estimated_ends_with_task + offset,
 					estimated_ends_with_task + offset,
-					&min_exp_end_with_task[n], &max_exp_end_with_task[n],
+					&min_exp_end_of_task[n], &max_exp_end_of_workers,
 							  suitable_components + offset, nsuitable_components[n]);
 							  suitable_components + offset, nsuitable_components[n]);
+			
+			/* Compute the energy, if provided*/
+			starpu_mct_compute_energy(component, tasks[n], local_energy + offset, suitable_components + offset, nsuitable_components[n]);
 		}
 		}
 
 
+		/* best_task is the task expected to finish first among the ntasks candidates; best_benefit is its expected termination time. */
 		int best_task = 0;
 		int best_task = 0;
-		double max_benefit = 0;
+		double best_benefit = min_exp_end_of_task[0];
 
 
 		/* Find the task which provides the most computation time benefit */
 		/* Find the task which provides the most computation time benefit */
-		for (n = 0; n < ntasks; n++)
+		for (n = 1; n < ntasks; n++)
 		{
 		{
-			double benefit = max_exp_end_with_task[n] - min_exp_end_with_task[n];
-			if (max_benefit < benefit)
+			if (best_benefit > min_exp_end_of_task[n])
 			{
 			{
-				max_benefit = benefit;
+				best_benefit =  min_exp_end_of_task[n];
 				best_task = n;
 				best_task = n;
 			}
 			}
 		}
 		}
@@ -129,7 +135,7 @@ static int heft_progress_one(struct starpu_sched_component *component)
 
 
 		unsigned offset = component->nchildren * best_task;
 		unsigned offset = component->nchildren * best_task;
 
 
-		int best_icomponent = starpu_mct_get_best_component(d, tasks[best_task], estimated_lengths + offset, estimated_transfer_length + offset, estimated_ends_with_task + offset, min_exp_end_with_task[best_task], max_exp_end_with_task[best_task], suitable_components + offset, nsuitable_components[best_task]);
+		int best_icomponent = starpu_mct_get_best_component(d, tasks[best_task], estimated_lengths + offset, estimated_transfer_length + offset, estimated_ends_with_task + offset, local_energy + offset, min_exp_end_of_task[best_task], max_exp_end_of_workers, suitable_components + offset, nsuitable_components[best_task]);
 
 
 		STARPU_ASSERT(best_icomponent != -1);
 		STARPU_ASSERT(best_icomponent != -1);
 		best_component = component->children[best_icomponent];
 		best_component = component->children[best_icomponent];

+ 28 - 12
src/sched_policies/component_heteroprio.c

@@ -106,10 +106,13 @@ static int heteroprio_progress_accel(struct starpu_sched_component *component, s
 	/* Estimated transfer+task termination for each child */
 	/* Estimated transfer+task termination for each child */
 	double estimated_ends_with_task[component->nchildren];
 	double estimated_ends_with_task[component->nchildren];
 
 
-	/* Minimum transfer+task termination on all children */
-	double min_exp_end_with_task;
-	/* Maximum transfer+task termination on all children */
-	double max_exp_end_with_task;
+	/* provided local energy */
+	double local_energy[component->nchildren];
+
+	/* Minimum transfer+task termination of the task over all workers */
+	double min_exp_end_of_task;
+	/* Maximum termination of the already-scheduled tasks over all workers */
+	double max_exp_end_of_workers;
 
 
 	unsigned suitable_components[component->nchildren];
 	unsigned suitable_components[component->nchildren];
 	unsigned nsuitable_components;
 	unsigned nsuitable_components;
@@ -155,16 +158,21 @@ static int heteroprio_progress_accel(struct starpu_sched_component *component, s
 			estimated_lengths,
 			estimated_lengths,
 			estimated_transfer_length,
 			estimated_transfer_length,
 			estimated_ends_with_task,
 			estimated_ends_with_task,
-			&min_exp_end_with_task, &max_exp_end_with_task,
+			&min_exp_end_of_task, &max_exp_end_of_workers,
 			suitable_components, nsuitable_components);
 			suitable_components, nsuitable_components);
 
 
+	/* Compute the energy, if provided*/
+	starpu_mct_compute_energy(component, task, local_energy, suitable_components, nsuitable_components);
+
 	/* And now find out which worker suits best for this task,
 	/* And now find out which worker suits best for this task,
 	 * including data transfer */
 	 * including data transfer */
+
 	int best_icomponent = starpu_mct_get_best_component(d, task,
 	int best_icomponent = starpu_mct_get_best_component(d, task,
 			estimated_lengths,
 			estimated_lengths,
 			estimated_transfer_length,
 			estimated_transfer_length,
 			estimated_ends_with_task,
 			estimated_ends_with_task,
-			min_exp_end_with_task, max_exp_end_with_task,
+                        local_energy,
+			min_exp_end_of_task, max_exp_end_of_workers,
 			suitable_components, nsuitable_components);
 			suitable_components, nsuitable_components);
 
 
 	if (best_icomponent == -1)
 	if (best_icomponent == -1)
@@ -236,10 +244,13 @@ static int heteroprio_progress_noaccel(struct starpu_sched_component *component,
 	/* Estimated transfer+task termination for each child */
 	/* Estimated transfer+task termination for each child */
 	double estimated_ends_with_task[component->nchildren];
 	double estimated_ends_with_task[component->nchildren];
 
 
-	/* Minimum transfer+task termination on all children */
-	double min_exp_end_with_task;
-	/* Maximum transfer+task termination on all children */
-	double max_exp_end_with_task;
+	/* estimated energy */
+	double local_energy[component->nchildren];
+
+	/* Minimum transfer+task termination of the task over all workers */
+	double min_exp_end_of_task;
+	/* Maximum termination of the already-scheduled tasks over all workers */
+	double max_exp_end_of_workers;
 
 
 	unsigned suitable_components[component->nchildren];
 	unsigned suitable_components[component->nchildren];
 	unsigned nsuitable_components;
 	unsigned nsuitable_components;
@@ -264,16 +275,21 @@ static int heteroprio_progress_noaccel(struct starpu_sched_component *component,
 			estimated_lengths,
 			estimated_lengths,
 			estimated_transfer_length,
 			estimated_transfer_length,
 			estimated_ends_with_task,
 			estimated_ends_with_task,
-			&min_exp_end_with_task, &max_exp_end_with_task,
+			&min_exp_end_of_task, &max_exp_end_of_workers,
 			suitable_components, nsuitable_components);
 			suitable_components, nsuitable_components);
 
 
+	/* Compute the energy, if provided*/
+	starpu_mct_compute_energy(component, task, local_energy, suitable_components, nsuitable_components);
+	
 	/* And now find out which worker suits best for this task,
 	/* And now find out which worker suits best for this task,
 	 * including data transfer */
 	 * including data transfer */
+
 	int best_icomponent = starpu_mct_get_best_component(d, task,
 	int best_icomponent = starpu_mct_get_best_component(d, task,
 			estimated_lengths,
 			estimated_lengths,
 			estimated_transfer_length,
 			estimated_transfer_length,
 			estimated_ends_with_task,
 			estimated_ends_with_task,
-			min_exp_end_with_task, max_exp_end_with_task,
+                        local_energy,
+			min_exp_end_of_task, max_exp_end_of_workers,
 			suitable_components, nsuitable_components);
 			suitable_components, nsuitable_components);
 
 
 	/* If no best component is found, it means that the perfmodel of
 	/* If no best component is found, it means that the perfmodel of

+ 13 - 7
src/sched_policies/component_mct.c

@@ -35,10 +35,13 @@ static int mct_push_task(struct starpu_sched_component * component, struct starp
 	/* Estimated transfer+task termination for each child */
 	/* Estimated transfer+task termination for each child */
 	double estimated_ends_with_task[component->nchildren];
 	double estimated_ends_with_task[component->nchildren];
 
 
-	/* Minimum transfer+task termination on all children */
-	double min_exp_end_with_task;
-	/* Maximum transfer+task termination on all children */
-	double max_exp_end_with_task;
+	/* estimated energy */
+	double local_energy[component->nchildren];
+
+	/* Minimum transfer+task termination of the task over all workers */
+	double min_exp_end_of_task;
+	/* Maximum termination of the already-scheduled tasks over all workers */
+	double max_exp_end_of_workers;
 
 
 	unsigned suitable_components[component->nchildren];
 	unsigned suitable_components[component->nchildren];
 	unsigned nsuitable_components;
 	unsigned nsuitable_components;
@@ -58,12 +61,14 @@ static int mct_push_task(struct starpu_sched_component * component, struct starp
 	   make scheduling decisions at the same time */
 	   make scheduling decisions at the same time */
 	STARPU_COMPONENT_MUTEX_LOCK(&d->scheduling_mutex);
 	STARPU_COMPONENT_MUTEX_LOCK(&d->scheduling_mutex);
 
 
-
 	starpu_mct_compute_expected_times(component, task, estimated_lengths, estimated_transfer_length,
 	starpu_mct_compute_expected_times(component, task, estimated_lengths, estimated_transfer_length,
-					  estimated_ends_with_task, &min_exp_end_with_task, &max_exp_end_with_task, suitable_components, nsuitable_components);
+					  estimated_ends_with_task, &min_exp_end_of_task, &max_exp_end_of_workers, suitable_components, nsuitable_components);
+
+	/* Compute the energy, if provided*/
+	starpu_mct_compute_energy(component, task, local_energy, suitable_components, nsuitable_components);
 
 
 	int best_icomponent = starpu_mct_get_best_component(d, task, estimated_lengths, estimated_transfer_length,
 	int best_icomponent = starpu_mct_get_best_component(d, task, estimated_lengths, estimated_transfer_length,
-					  estimated_ends_with_task, min_exp_end_with_task, max_exp_end_with_task, suitable_components, nsuitable_components);
+							    estimated_ends_with_task, local_energy, min_exp_end_of_task, max_exp_end_of_workers, suitable_components, nsuitable_components);
 
 
 	/* If no best component is found, it means that the perfmodel of
 	/* If no best component is found, it means that the perfmodel of
 	 * the task had been purged since it has been pushed on the mct component.
 	 * the task had been purged since it has been pushed on the mct component.
@@ -105,6 +110,7 @@ static void mct_component_deinit_data(struct starpu_sched_component * component)
 
 
 int starpu_sched_component_is_mct(struct starpu_sched_component * component)
 int starpu_sched_component_is_mct(struct starpu_sched_component * component)
 {
 {
+
 	return component->push_task == mct_push_task;
 	return component->push_task == mct_push_task;
 }
 }
 
 

+ 0 - 0
src/sched_policies/deque_modeling_policy_data_aware.c


Неке датотеке нису приказане због велике количине промена