Browse Source

Merge branch 'master' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu into ft_checkpoint

# Conflicts:
#	ChangeLog
#	julia/examples/Makefile.am
#	julia/examples/axpy/axpy.jl
#	julia/examples/callback/callback.jl
#	julia/examples/check_deps/check_deps.jl
#	julia/examples/dependency/end_dep.jl
#	julia/examples/dependency/tag_dep.jl
#	julia/examples/dependency/task_dep.jl
#	julia/examples/execute.sh.in
#	julia/examples/mandelbrot/cpu_mandelbrot.c
#	julia/examples/mandelbrot/mandelbrot.jl
#	julia/examples/mult/cpu_mult.c
#	julia/examples/mult/mult.jl
#	julia/examples/task_insert_color/task_insert_color.jl
#	julia/examples/vector_scal/vector_scal.jl
#	julia/src/Makefile.am
#	julia/src/StarPU.jl
#	julia/src/blas.c
#	julia/src/blas.h
#	julia/src/blas.jl
#	julia/src/blas_wrapper.c
#	julia/src/data.jl
#	julia/src/dynamic_compiler/Makefile.am
#	julia/src/globals.jl
#	julia/src/init.jl
#	julia/src/jlstarpu.h
#	julia/src/task.jl
#	julia/src/translate_headers.jl
#	mpi/tests/burst.c
#	mpi/tests/burst_gemm.c
#	mpi/tests/burst_helper.c
#	mpi/tests/burst_helper.h
#	mpi/tests/gemm_helper.c
#	mpi/tests/nothing.c
Romain LION 5 years ago
parent
commit
3154eea92e
74 changed files with 4685 additions and 1553 deletions
  1. 1 0
      AUTHORS
  2. 3 0
      ChangeLog
  3. 1 1
      configure.ac
  4. 1 0
      doc/doxygen/chapters/470_simgrid.doxy
  5. 1 0
      doc/doxygen/chapters/501_environment_variables.doxy
  6. 2 0
      examples/mlr/mlr.c
  7. 9 1
      include/starpu_data.h
  8. 146 0
      julia/examples/Makefile.am
  9. 99 0
      julia/examples/axpy/axpy.jl
  10. 77 0
      julia/examples/callback/callback.jl
  11. 32 0
      julia/examples/check_deps/check_deps.jl
  12. 20 0
      julia/examples/cholesky/cholesky.sh
  13. 53 0
      julia/examples/cholesky/cholesky_codelets.jl
  14. 155 0
      julia/examples/cholesky/cholesky_common.jl
  15. 64 0
      julia/examples/cholesky/cholesky_implicit.jl
  16. 139 0
      julia/examples/cholesky/cholesky_native.jl
  17. 86 0
      julia/examples/cholesky/cholesky_tag.jl
  18. 104 0
      julia/examples/dependency/end_dep.jl
  19. 122 0
      julia/examples/dependency/tag_dep.jl
  20. 88 0
      julia/examples/dependency/task_dep.jl
  21. 53 0
      julia/examples/execute.sh.in
  22. 145 0
      julia/examples/gemm/gemm.jl
  23. 22 0
      julia/examples/gemm/gemm.sh
  24. 146 0
      julia/examples/gemm/gemm_bare.jl
  25. 56 0
      julia/examples/gemm/gemm_native.jl
  26. 33 14
      julia/examples/mandelbrot/cpu_mandelbrot.c
  27. 29 10
      julia/examples/mandelbrot/mandelbrot.jl
  28. 25 14
      julia/examples/mult/cpu_mult.c
  29. 45 40
      julia/examples/mult/mult.jl
  30. 30 8
      julia/examples/task_insert_color/task_insert_color.jl
  31. 1 1
      julia/examples/variable/variable.jl
  32. 42 18
      julia/examples/vector_scal/vector_scal.jl
  33. 60 0
      julia/src/Makefile.am
  34. 85 1276
      julia/src/StarPU.jl
  35. 194 0
      julia/src/blas.c
  36. 148 0
      julia/src/blas.h
  37. 21 0
      julia/src/blas.jl
  38. 50 0
      julia/src/blas_wrapper.c
  39. 25 8
      julia/src/compiler/c.jl
  40. 155 25
      julia/src/compiler/cuda.jl
  41. 24 16
      julia/src/compiler/expressions.jl
  42. 17 13
      julia/src/compiler/file_generation.jl
  43. 235 0
      julia/src/data.jl
  44. 48 0
      julia/src/dynamic_compiler/Makefile.am
  45. 50 0
      julia/src/globals.jl
  46. 73 0
      julia/src/init.jl
  47. 17 16
      julia/src/jlstarpu.h
  48. 9 0
      julia/src/openblas_ldflags.jl
  49. 400 0
      julia/src/task.jl
  50. 114 0
      julia/src/translate_headers.jl
  51. 1 1
      mpi/tests/abstract_sendrecv_bench.c
  52. 1 2
      mpi/tests/abstract_sendrecv_bench.h
  53. 1 1
      mpi/tests/bench_helper.c
  54. 1 1
      mpi/tests/bench_helper.h
  55. 75 0
      mpi/tests/burst.c
  56. 210 0
      mpi/tests/burst_gemm.c
  57. 223 0
      mpi/tests/burst_helper.c
  58. 29 0
      mpi/tests/burst_helper.h
  59. 330 0
      mpi/tests/gemm_helper.c
  60. 70 0
      mpi/tests/nothing.c
  61. 1 1
      mpi/tests/sendrecv_parallel_tasks_bench.c
  62. 1 0
      src/common/utils.c
  63. 8 8
      src/core/perfmodel/perfmodel_history.c
  64. 5 0
      src/datawizard/coherency.h
  65. 24 1
      src/datawizard/interfaces/data_interface.c
  66. 17 11
      src/sched_policies/component_heft.c
  67. 28 12
      src/sched_policies/component_heteroprio.c
  68. 13 7
      src/sched_policies/component_mct.c
  69. 25 21
      src/sched_policies/deque_modeling_policy_data_aware.c
  70. 52 21
      src/sched_policies/helper_mct.c
  71. 12 4
      src/sched_policies/helper_mct.h
  72. 1 1
      tools/starpu_env.in
  73. 1 0
      tools/starpu_perfmodel_recdump.c
  74. 1 0
      tools/starpu_smpirun.in

+ 1 - 0
AUTHORS

@@ -12,6 +12,7 @@ Danjean Vincent, University Grenoble Alpes, <Vincent.Danjean@ens-lyon.org>
 Denis Alexandre, Inria, <alexandre.denis@inria.fr>
 Eyraud-Dubois Lionel, Inria, <lionel.eyraud-dubois@inria.fr>
 Furmento Nathalie, CNRS, <nathalie.furmento@labri.fr>
+Guermouche Amina, Télécom SudParis, <amina.guermouche@inria.fr>
 Guilbaud Adrien, Inria, <adrien.guilbaud@inria.fr>
 Henry Sylvain, Université de Bordeaux, <sylvain.henry@inria.fr>
 Hugo Andra, Université de Bordeaux/Inria, <andra.hugo@inria.fr>

+ 3 - 0
ChangeLog

@@ -54,6 +54,9 @@ Small features:
   * Add STARPU_WORKERS_GETBIND environment variable.
   * Add STARPU_SCHED_SIMPLE_DECIDE_ALWAYS modular scheduler flag.
   * And STARPU_LIMIT_BANDWIDTH environment variable.
+  * Add field starpu_conf::precedence_over_environment_variables to ignore
+    environment variables when parameters are set directly in starpu_conf
+  * Add starpu_data_get_coordinates_array
 
 StarPU 1.3.3 (git revision 11afc5b007fe1ab1c729b55b47a5a98ef7f3cfad)
 ====================================================================

File diff suppressed because it is too large
+ 1 - 1
configure.ac


+ 1 - 0
doc/doxygen/chapters/470_simgrid.doxy

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2016       Uppsala University
+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 2 - 0
examples/mlr/mlr.c

@@ -110,7 +110,9 @@ static struct starpu_perfmodel cl_model_init =
    template.
  */
 
+/* M^2 * N^1 * K^0 */
 static unsigned combi1 [3]		= {	2,	1,	0 };
+/* M^0 * N^3 * K^1 */
 static unsigned combi2 [3]		= {	0,	3,	1 };
 
 static unsigned *combinations[] = { combi1, combi2 };

+ 9 - 1
include/starpu_data.h

@@ -123,7 +123,7 @@ void starpu_data_set_name(starpu_data_handle_t handle, const char *name);
    tools. \p dimensions is the size of the \p dims array. This can be
    for instance the tile coordinates within a big matrix.
 */
-void starpu_data_set_coordinates_array(starpu_data_handle_t handle, int dimensions, int dims[]);
+void starpu_data_set_coordinates_array(starpu_data_handle_t handle, unsigned dimensions, int dims[]);
 
 /**
    Set the coordinates of the data, to be shown in various profiling
@@ -133,6 +133,14 @@ void starpu_data_set_coordinates_array(starpu_data_handle_t handle, int dimensio
 void starpu_data_set_coordinates(starpu_data_handle_t handle, unsigned dimensions, ...);
 
 /**
+   Get the coordinates of the data, as set by a previous call to
+   starpu_data_set_coordinates_array() or starpu_data_set_coordinates()
+   \p dimensions is the size of the \p dims array.
+   This returns the actual number of returned coordinates.
+*/
+unsigned starpu_data_get_coordinates_array(starpu_data_handle_t handle, unsigned dimensions, int dims[]);
+
+/**
    Unregister a data \p handle from StarPU. If the data was
    automatically allocated by StarPU because the home node was -1, all
    automatically allocated buffers are freed. Otherwise, a valid copy

+ 146 - 0
julia/examples/Makefile.am

@@ -0,0 +1,146 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+include $(top_srcdir)/starpu.mk
+
+noinst_PROGRAMS		=
+
+if STARPU_HAVE_WINDOWS
+LOADER_BIN		=
+else
+loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+if !STARPU_SIMGRID
+LOADER			=	loader
+LOADER_BIN		=	$(abs_top_builddir)/julia/examples/$(LOADER)
+noinst_PROGRAMS		+=	loader
+endif
+loader_SOURCES		=	../../tests/loader.c
+endif
+
+if STARPU_HAVE_AM111
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+else
+TESTS_ENVIRONMENT 	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(LOADER_BIN)
+endif
+
+BUILT_SOURCES =
+
+CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log
+
+EXTRA_DIST =					\
+	axpy/axpy.jl				\
+	axpy/axpy.sh				\
+	black_scholes/black_scholes.jl		\
+	callback/callback.jl			\
+	callback/callback.sh			\
+	check_deps/check_deps.jl		\
+	check_deps/check_deps.sh		\
+	cholesky/cholesky_codelets.jl		\
+	cholesky/cholesky_common.jl		\
+	cholesky/cholesky_native.jl		\
+	cholesky/cholesky_implicit.jl		\
+	cholesky/cholesky_tag.jl		\
+	cholesky/cholesky.sh			\
+	dependency/end_dep.jl			\
+	dependency/end_dep.sh			\
+	dependency/tag_dep.jl			\
+	dependency/tag_dep.sh			\
+	dependency/task_dep.sh			\
+	dependency/task_dep.jl			\
+	gemm/gemm.jl				\
+	gemm/gemm_native.jl			\
+	gemm/gemm.sh				\
+	mandelbrot/mandelbrot_native.jl		\
+	mandelbrot/mandelbrot.jl		\
+	mandelbrot/mandelbrot.sh		\
+	mult/mult_native.jl			\
+	mult/mult.jl				\
+	mult/perf.sh				\
+	mult/mult_starpu.sh			\
+	task_insert_color/task_insert_color.jl	\
+	task_insert_color/task_insert_color.sh	\
+	variable/variable.jl			\
+	variable/variable_native.jl		\
+	variable/variable.sh			\
+	vector_scal/vector_scal.jl		\
+	vector_scal/vector_scal.sh
+
+examplebindir = $(libdir)/starpu/julia
+
+examplebin_PROGRAMS =
+
+if STARPU_USE_CUDA
+if STARPU_COVERITY
+include $(top_srcdir)/starpu-mynvcc.mk
+else
+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(HWLOC_CFLAGS)
+
+.cu.cubin:
+	$(V_nvcc) $(NVCC) -cubin $< -o $@ $(NVCCFLAGS)
+
+.cu.o:
+	$(V_nvcc) $(NVCC) $< -c -o $@ $(NVCCFLAGS)
+endif
+endif
+
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpujulia-@STARPU_EFFECTIVE_VERSION@.la -lm @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
+
+check_PROGRAMS = $(LOADER) $(starpu_julia_EXAMPLES)
+SHELL_TESTS	=
+STARPU_JULIA_EXAMPLES	=
+
+if BUILD_EXAMPLES
+examplebin_PROGRAMS 	+=	$(STARPU_JULIA_EXAMPLES)
+
+TESTS			=	$(SHELL_TESTS) $(STARPU_JULIA_EXAMPLES)
+endif
+
+######################
+#      Examples      #
+######################
+
+SHELL_TESTS	+=	check_deps/check_deps.sh
+
+STARPU_JULIA_EXAMPLES	+=	mult/mult
+mult_mult_SOURCES	=	mult/mult.c mult/cpu_mult.c
+SHELL_TESTS		+=	mult/mult_starpu.sh
+
+STARPU_JULIA_EXAMPLES				+=	task_insert_color/task_insert_color
+task_insert_color_task_insert_color_SOURCES	=	task_insert_color/task_insert_color.c
+SHELL_TESTS					+=	task_insert_color/task_insert_color.sh
+
+SHELL_TESTS	+=	variable/variable.sh
+SHELL_TESTS	+=	vector_scal/vector_scal.sh
+
+STARPU_JULIA_EXAMPLES		+=	mandelbrot/mandelbrot
+mandelbrot_mandelbrot_SOURCES	=	mandelbrot/mandelbrot.c mandelbrot/cpu_mandelbrot.c mandelbrot/cpu_mandelbrot.h
+SHELL_TESTS			+=	mandelbrot/mandelbrot.sh
+
+STARPU_JULIA_EXAMPLES		+= 	callback/callback
+callback_callback_SOURCES	=	callback/callback.c
+SHELL_TESTS			+=	callback/callback.sh
+
+SHELL_TESTS			+=	dependency/tag_dep.sh
+SHELL_TESTS			+=	dependency/task_dep.sh
+SHELL_TESTS			+=	dependency/end_dep.sh
+
+if !NO_BLAS_LIB
+SHELL_TESTS			+=	axpy/axpy.sh
+SHELL_TESTS			+=	cholesky/cholesky.sh
+SHELL_TESTS			+=	gemm/gemm.sh
+endif

+ 99 - 0
julia/examples/axpy/axpy.jl

@@ -0,0 +1,99 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+using Printf
+const EPSILON = 1e-6
+
+function check(alpha, X, Y)
+    for i in 1:length(X)
+        expected_value = alpha * X[i] + 4.0
+        if abs(Y[i] - expected_value) > expected_value * EPSILON
+            error("at ", i, ", ", alpha, "*", X[i], "+4.0=", Y[i], ", expected ", expected_value)
+        end
+    end
+end
+
+@target STARPU_CPU+STARPU_CUDA
+@codelet function axpy(X :: Vector{Float32}, Y :: Vector{Float32}, alpha ::Float32) :: Nothing
+    STARPU_SAXPY(length(X), alpha, X, 1, Y, 1)
+    return
+end
+
+function axpy(N, NBLOCKS, alpha, display = true)
+    X = Array(fill(1.0f0, N))
+    Y = Array(fill(4.0f0, N))
+
+    starpu_memory_pin(X)
+    starpu_memory_pin(Y)
+
+    block_filter = starpu_data_filter(STARPU_VECTOR_FILTER_BLOCK, NBLOCKS)
+
+    if display
+        println("BEFORE x[0] = ", X[1])
+        println("BEFORE y[0] = ", Y[1])
+    end
+
+    t_start = time_ns()
+
+    @starpu_block let
+        hX,hY = starpu_data_register(X, Y)
+
+        starpu_data_partition(hX, block_filter)
+        starpu_data_partition(hY, block_filter)
+
+        for b in 1:NBLOCKS
+            starpu_task_insert(codelet_name = "axpy",
+                               handles = [hX[b], hY[b]],
+                               cl_arg = (Float32(alpha),),
+                               tag = starpu_tag_t(b),
+                               modes = [STARPU_R, STARPU_RW])
+        end
+
+        starpu_task_wait_for_all()
+    end
+
+    t_end = time_ns()
+
+    timing = (t_end-t_start)/1000
+
+    if display
+        @printf("timing -> %d us %.2f MB/s\n", timing, 3*N*4/timing)
+        println("AFTER y[0] = ", Y[1], " (ALPHA=", alpha, ")")
+    end
+
+    check(alpha, X, Y)
+
+    starpu_memory_unpin(X)
+    starpu_memory_unpin(Y)
+end
+
+function main()
+    N = 16 * 1024 * 1024
+    NBLOCKS = 8
+    alpha = 3.41
+
+    starpu_init()
+    starpu_cublas_init()
+
+    # warmup
+    axpy(10, 1, alpha, false)
+
+    axpy(N, NBLOCKS, alpha)
+
+    starpu_shutdown()
+end
+
+main()

+ 77 - 0
julia/examples/callback/callback.jl

@@ -0,0 +1,77 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+
+@target STARPU_CPU
+@codelet function variable(val ::Ref{Int32}) :: Nothing
+    val[] = val[] + 1
+
+    return
+end
+
+function callback(args)
+    cl = args[1]
+    handles = args[2]
+
+    task = starpu_task(cl = cl, handles=handles)
+    starpu_task_submit(task)
+end
+
+function variable_with_starpu(val ::Ref{Int32})
+    perfmodel = starpu_perfmodel(
+        perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+        symbol = "history_perf"
+    )
+
+    cl = starpu_codelet(
+        cpu_func = "variable",
+        modes = [STARPU_RW],
+        perfmodel = perfmodel
+    )
+
+    @starpu_block let
+	hVal = starpu_data_register(val)
+
+        starpu_task_insert(codelet_name = "variable",
+                           cl = cl,
+                           handles = [hVal],
+                           callback = callback,
+                           callback_arg = (cl, [hVal]))
+
+        starpu_task_wait_for_all()
+    end
+end
+
+function display()
+    v = Ref(Int32(40))
+
+    variable_with_starpu(v)
+
+    println("variable -> ", v[])
+    if v[] == 42
+        println("result is correct")
+    else
+        error("result is incorret")
+    end
+end
+
+# Disable garbage collector because of random segfault/hang when using mutex.
+# This issue should be solved with Julia release 1.5.
+GC.enable(false)
+starpu_init()
+display()
+starpu_shutdown()
+GC.enable(true)

+ 32 - 0
julia/examples/check_deps/check_deps.jl

@@ -0,0 +1,32 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+import Pkg
+
+try
+    using CBinding
+    using Clang
+    using ThreadPools
+catch
+    Pkg.activate((@__DIR__)*"/../..")
+    Pkg.instantiate()
+    using Clang
+    using CBinding
+    using ThreadPools
+end
+
+using StarPU
+
+starpu_translate_headers()

+ 20 - 0
julia/examples/cholesky/cholesky.sh

@@ -0,0 +1,20 @@
+#!/bin/bash
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+$(dirname $0)/../execute.sh cholesky/cholesky_native.jl
+$(dirname $0)/../execute.sh cholesky/cholesky_implicit.jl
+$(dirname $0)/../execute.sh cholesky/cholesky_tag.jl

+ 53 - 0
julia/examples/cholesky/cholesky_codelets.jl

@@ -0,0 +1,53 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+chol_model11 = starpu_perfmodel(
+    perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+    symbol = "chol_model11"
+)
+
+chol_model21 = starpu_perfmodel(
+    perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+    symbol = "chol_model21"
+)
+
+chol_model22 = starpu_perfmodel(
+    perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+    symbol = "chol_model22"
+)
+
+cl_11 = starpu_codelet(
+    cpu_func = "u11",
+    # This kernel cannot be translated to CUDA yet.
+    # cuda_func = "u11",
+    modes = [STARPU_RW],
+    color = 0xffff00,
+    perfmodel = chol_model11
+)
+cl_21 = starpu_codelet(
+    cpu_func = "u21",
+    cuda_func = "u21",
+    modes = [STARPU_R, STARPU_RW],
+    color = 0x8080ff,
+    perfmodel = chol_model21
+)
+cl_22 = starpu_codelet(
+    cpu_func = "u22",
+    cuda_func = "u22",
+    modes = [STARPU_R, STARPU_R, STARPU_RW],
+    color = 0x00ff00,
+    perfmodel = chol_model22
+)

+ 155 - 0
julia/examples/cholesky/cholesky_common.jl

@@ -0,0 +1,155 @@
+# Standard kernels for the Cholesky factorization
+# U22 is the gemm update
+# U21 is the trsm update
+# U11 is the cholesky factorization
+
+@target STARPU_CPU+STARPU_CUDA
+@codelet function u11(sub11 :: Matrix{Float32}) :: Nothing
+    nx :: Int32 = width(sub11)
+    ld :: Int32 = ld(sub11)
+
+    for z in 0:nx-1
+        lambda11 :: Float32 = sqrt(sub11[z+1,z+1])
+        sub11[z+1,z+1] = lambda11
+
+        alpha ::Float32 = 1.0f0 / lambda11
+        X :: Vector{Float32} = view(sub11, z+2:z+2+(nx-z-2), z+1)
+        STARPU_SSCAL(nx-z-1, alpha, X, 1)
+
+        alpha = -1.0f0
+        A :: Matrix{Float32} = view(sub11, z+2:z+2+(nx-z-2), z+2:z+2+(nx-z-2))
+	STARPU_SSYR("L", nx-z-1, alpha, X, 1, A, ld)
+    end
+    return
+end
+
+@target STARPU_CPU+STARPU_CUDA
+@codelet function u21(sub11 :: Matrix{Float32},
+                      sub21 :: Matrix{Float32}) :: Nothing
+    ld11 :: Int32 = ld(sub11)
+    ld21 :: Int32 = ld(sub21)
+    nx21 :: Int32 = width(sub21)
+    ny21 :: Int32 = height(sub21)
+    alpha :: Float32 = 1.0f0
+    STARPU_STRSM("R", "L", "T", "N", nx21, ny21, alpha, sub11, ld11, sub21, ld21)
+    return
+end
+
+@target STARPU_CPU+STARPU_CUDA
+@codelet function u22(left   :: Matrix{Float32},
+                      right  :: Matrix{Float32},
+                      center :: Matrix{Float32}) :: Nothing
+    dx :: Int32 = width(center)
+    dy :: Int32 = height(center)
+    dz :: Int32 = width(left)
+    ld21 :: Int32 = ld(left)
+    ld12 :: Int32 = ld(center)
+    ld22 :: Int32 = ld(right)
+    alpha :: Float32 = -1.0f0
+    beta :: Float32 = 1.0f0
+    STARPU_SGEMM("N", "T", dy, dx, dz, alpha, left, ld21, right, ld12, beta, center, ld22)
+    return
+end
+
+@inline function tag11(k)
+    return starpu_tag_t((UInt64(1)<<60) | UInt64(k))
+end
+
+@inline function tag21(k, j)
+    return starpu_tag_t((UInt64(3)<<60) | (UInt64(k)<<32) |  UInt64(j))
+end
+
+@inline function tag22(k, i, j)
+    return starpu_tag_t((UInt64(4)<<60) | (UInt64(k)<<32) | (UInt64(i)<<16) |  UInt64(j))
+end
+
+function check(mat::Matrix{Float32})
+    size_p = size(mat, 1)
+
+    for i in 1:size_p
+        for j in 1:size_p
+            if j > i
+                mat[i, j] = 0.0f0
+            end
+        end
+    end
+
+    test_mat ::Matrix{Float32} = zeros(Float32, size_p, size_p)
+
+    syrk!('L', 'N', 1.0f0, mat, 0.0f0, test_mat)
+
+    for i in 1:size_p
+        for j in 1:size_p
+            if j <= i
+                orig = (1.0f0/(1.0f0+(i-1)+(j-1))) + ((i == j) ? 1.0f0*size_p : 0.0f0)
+                err = abs(test_mat[i,j] - orig) / orig
+                if err > 0.0001
+                    got = test_mat[i,j]
+                    expected = orig
+                    error("[$i, $j] -> $got != $expected (err $err)")
+                end
+            end
+        end
+    end
+
+    println("Verification successful !")
+end
+
+function clean_tags(nblocks)
+    for k in 1:nblocks
+        starpu_tag_remove(tag11(k))
+
+        for m in k+1:nblocks
+            starpu_tag_remove(tag21(k, m))
+
+            for n in k+1:nblocks
+                if n <= m
+                    starpu_tag_remove(tag22(k, m, n))
+                end
+            end
+        end
+    end
+end
+
+function main(size_p :: Int, nblocks :: Int; verify = false, verbose = false)
+    mat :: Matrix{Float32} = zeros(Float32, size_p, size_p)
+
+    # create a simple definite positive symetric matrix
+    # Hilbert matrix h(i,j) = 1/(i+j+1)
+
+    for i in 1:size_p
+        for j in 1:size_p
+            mat[i, j] = 1.0f0 / (1.0f0+(i-1)+(j-1)) + ((i == j) ? 1.0f0*size_p : 0.0f0)
+        end
+    end
+
+    if verbose
+        display(mat)
+    end
+
+    starpu_memory_pin(mat)
+
+    t_start = time_ns()
+
+    cholesky(mat, size_p, nblocks)
+
+    t_end = time_ns()
+
+    starpu_memory_unpin(mat)
+
+    flop = (1.0*size_p*size_p*size_p)/3.0
+    println("# size\tms\tGFlops")
+    time_ms = (t_end-t_start) / 1e6
+    gflops = flop/(time_ms*1000)/1000
+    println("# $size_p\t$time_ms\t$gflops")
+
+    clean_tags(nblocks)
+
+    if verbose
+        display(mat)
+    end
+
+    if verify
+        check(mat)
+    end
+end

+ 64 - 0
julia/examples/cholesky/cholesky_implicit.jl

@@ -0,0 +1,64 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+using LinearAlgebra.BLAS
+
+include("cholesky_common.jl")
+
+function cholesky(mat :: Matrix{Float32}, size, nblocks)
+    include("cholesky_codelets.jl")
+
+    horiz = starpu_data_filter(STARPU_MATRIX_FILTER_BLOCK, nblocks)
+    vert = starpu_data_filter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nblocks)
+
+    @starpu_block let
+        h_mat = starpu_data_register(mat)
+        starpu_data_map_filters(h_mat, horiz, vert)
+
+        for k in 1:nblocks
+
+            starpu_iteration_push(k)
+
+            starpu_task_insert(cl = cl_11, handles = [h_mat[k, k]], tag_only = tag11(k))
+
+            for m in k+1:nblocks
+                starpu_task_insert(cl = cl_21, handles = [h_mat[k, k], h_mat[m, k]], tag_only = tag21(m, k))
+            end
+            starpu_data_wont_use(h_mat[k, k])
+
+            for m in k+1:nblocks
+                for n in k+1:nblocks
+                    if n <= m
+                        starpu_task_insert(cl = cl_22, handles = [h_mat[m, k], h_mat[n, k], h_mat[m, n]], tag_only= tag22(k, m, n))
+                    end
+                end
+                starpu_data_wont_use(h_mat[m, k])
+            end
+
+            starpu_iteration_pop()
+        end
+
+        starpu_task_wait_for_all()
+    end
+end
+
+starpu_init()
+starpu_cublas_init()
+
+main(1024, 8, verify = true)
+main(15360, 16)
+
+starpu_shutdown()

+ 139 - 0
julia/examples/cholesky/cholesky_native.jl

@@ -0,0 +1,139 @@
+using LinearAlgebra.BLAS
+
+function u11(sub11)
+    nx = size(sub11, 1)
+    ld = size(sub11, 1)
+
+    for z in 0:nx-1
+        lambda11::Float32 = sqrt(sub11[z+1,z+1])
+        sub11[z+1,z+1] = lambda11
+        if lambda11 == 0.0f0
+            error("lamda11")
+        end
+
+        X = view(sub11, z+2:z+2+(nx-z-2), z+1)
+        scal!(nx-z-1, 1.0f0/lambda11, X, 1)
+
+        A = view(sub11, z+2:z+2+(nx-z-2), z+2:z+2+(nx-z-2))
+        syr!('L', -1.0f0, X, A)
+    end
+end
+
+function u21(sub11, sub21)
+    trsm!('R', 'L', 'T', 'N', 1.0f0, sub11, sub21)
+end
+
+function u22(left, right, center)
+    gemm!('N', 'T', -1.0f0, left, right, 1.0f0, center)
+end
+
+function get_block(mat :: Matrix{Float32}, m, n, nblocks)
+    dim = size(mat, 1)
+    if dim != size(mat,2)
+        error("mat must be a square matrix")
+    end
+    if dim % nblocks != 0
+        error("dim must be a multiple of nblocks")
+    end
+
+    stride = Int(dim/nblocks)
+
+    return view(mat,
+                m*stride+1:(m+1)*stride,
+                n*stride+1:(n+1)*stride)
+end
+
+function cholesky(mat :: Matrix{Float32}, size, nblocks)
+    for k in 0:nblocks-1
+        sdatakk = get_block(mat, k, k, nblocks)
+        u11(sdatakk)
+
+        for m in k+1:nblocks-1
+            sdatamk = get_block(mat, m, k, nblocks)
+            u21(sdatakk, sdatamk)
+        end
+
+        for m in k+1:nblocks-1
+            sdatamk = get_block(mat, m, k, nblocks)
+
+            for n in k+1:nblocks-1
+                if n <= m
+                    sdatank = get_block(mat, n, k, nblocks)
+                    sdatamn = get_block(mat, m, n, nblocks)
+                    u22(sdatamk, sdatank, sdatamn)
+                end
+            end
+        end
+
+    end
+end
+
+function check(mat::Matrix{Float32})
+    size_p = size(mat, 1)
+
+    for i in 1:size_p
+        for j in 1:size_p
+            if j > i
+                mat[i, j] = 0.0f0
+            end
+        end
+    end
+
+    test_mat ::Matrix{Float32} = zeros(Float32, size_p, size_p)
+
+    syrk!('L', 'N', 1.0f0, mat, 0.0f0, test_mat)
+
+    for i in 1:size_p
+        for j in 1:size_p
+            if j <= i
+                orig = (1.0f0/(1.0f0+(i-1)+(j-1))) + ((i == j) ? 1.0f0*size_p : 0.0f0)
+                err = abs(test_mat[i,j] - orig) / orig
+                if err > 0.0001
+                    got = test_mat[i,j]
+                    expected = orig
+                    error("[$i, $j] -> $got != $expected (err $err)")
+                end
+            end
+        end
+    end
+
+    println("Verification successful !")
+end
+
+function main(size_p :: Int, nblocks :: Int, display = false)
+    mat :: Matrix{Float32} = zeros(Float32, size_p, size_p)
+
+    # create a simple definite positive symetric matrix
+    # Hilbert matrix h(i,j) = 1/(i+j+1)
+
+    for i in 1:size_p
+        for j in 1:size_p
+            mat[i, j] = 1.0f0 / (1.0f0+(i-1)+(j-1)) + ((i == j) ? 1.0f0*size_p : 0.0f0)
+        end
+    end
+
+    if display
+        display(mat)
+    end
+
+    t_start = time_ns()
+
+    cholesky(mat, size_p, nblocks)
+
+    t_end = time_ns()
+
+    flop = (1.0*size_p*size_p*size_p)/3.0
+    println("# size\tms\tGFlops")
+    time_ms = (t_end-t_start) / 1e6
+    gflops = flop/(time_ms*1000)/1000
+    println("# $size_p\t$time_ms\t$gflops")
+
+    if display
+        display(mat)
+    end
+
+    check(mat)
+end
+
+main(1024*20, 8)
+

+ 86 - 0
julia/examples/cholesky/cholesky_tag.jl

@@ -0,0 +1,86 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+using LinearAlgebra.BLAS
+
+include("cholesky_common.jl")
+
+function cholesky(mat :: Matrix{Float32}, size, nblocks)
+    include("cholesky_codelets.jl")
+
+    horiz = starpu_data_filter(STARPU_MATRIX_FILTER_BLOCK, nblocks)
+    vert = starpu_data_filter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nblocks)
+
+    @starpu_block let
+        h_mat = starpu_data_register(mat)
+        starpu_data_set_sequential_consistency_flag(h_mat, 0)
+        starpu_data_map_filters(h_mat, horiz, vert)
+
+        entry_task = starpu_task(cl = cl_11,
+                                 handles = [h_mat[1, 1]],
+                                 tag = tag11(1))
+
+        for k in 1:nblocks
+
+            starpu_iteration_push(k)
+
+            if k > 1
+                # enforce dependencies...
+                starpu_tag_declare_deps(tag11(k), tag22(k-1, k, k))
+                starpu_task_insert(cl = cl_11,
+                                   handles = [h_mat[k, k]],
+                                   tag = tag11(k))
+            end
+
+            for m in k+1:nblocks
+                # enforce dependencies...
+                if k > 1
+                    starpu_tag_declare_deps(tag21(k, m), tag11(k), tag22(k-1, m, k))
+                else
+                    starpu_tag_declare_deps(tag21(k, m), tag11(k))
+                end
+
+                starpu_task_insert(cl = cl_21, handles = [h_mat[k, k], h_mat[m, k]], tag = tag21(k, m))
+
+                for n in k+1:nblocks
+                    if n <= m
+                        # enforce dependencies...
+                        if k > 1
+                            starpu_tag_declare_deps(tag22(k, m, n), tag22(k-1, m, n), tag21(k, n), tag21(k, m))
+                        else
+                            starpu_tag_declare_deps(tag22(k, m, n), tag21(k, n), tag21(k, m))
+                        end
+
+                        starpu_task_insert(cl = cl_22, handles = [h_mat[m, k], h_mat[n, k], h_mat[m, n]], tag = tag22(k, m, n))
+                    end
+                end
+            end
+
+            starpu_iteration_pop()
+        end
+
+        starpu_task_submit(entry_task)
+        starpu_tag_wait(tag11(nblocks))
+    end
+end
+
+starpu_init()
+starpu_cublas_init()
+
+main(1024, 8, verify = true)
+main(15360, 16)
+
+starpu_shutdown()

+ 104 - 0
julia/examples/dependency/end_dep.jl

@@ -0,0 +1,104 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+
+@target STARPU_CPU
+@codelet function codeletA() :: Nothing
+    # print("[Task A] Value = ", val[]);
+    # do nothing
+end
+
+@target STARPU_CPU
+@codelet function codeletB(val ::Ref{Int32}) :: Nothing
+    # println("[Task B] Value = ", val[]);
+    val[] = val[] *2
+end
+
+function callbackB(task)
+    sleep(1)
+    starpu_task_end_dep_release(task)
+end
+
+@target STARPU_CPU
+@codelet function codeletC(val ::Ref{Int32}) :: Nothing
+    # println("[Task C] Value = ", val[]);
+    val[] = val[] *2
+end
+
+function callbackC(task)
+    starpu_task_end_dep_release(task)
+end
+
+
+function main()
+    value = Ref(Int32(12))
+
+    @starpu_block let
+        perfmodel = starpu_perfmodel(
+            perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+            symbol = "history_perf"
+        )
+
+        clA = starpu_codelet(
+            cpu_func = "codeletA",
+            perfmodel = perfmodel
+        )
+        clB = starpu_codelet(
+            cpu_func = "codeletB",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+        clC = starpu_codelet(
+            cpu_func = "codeletC",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+
+        handle = starpu_data_register(value)
+
+	starpu_data_set_sequential_consistency_flag(handle, 0)
+
+        taskA = starpu_task(cl = clA, detach=0)
+        taskB = starpu_task(cl = clB, handles = [handle], callback=callbackB, callback_arg=taskA)
+	taskC = starpu_task(cl = clC, handles = [handle], callback=callbackC, callback_arg=taskA)
+
+	starpu_task_end_dep_add(taskA, 2)
+        starpu_task_declare_deps(taskC, taskB)
+
+        starpu_task_submit(taskA)
+        starpu_task_submit(taskB)
+        starpu_task_submit(taskC)
+        starpu_task_wait(taskA)
+
+        starpu_data_acquire_on_node(handle, STARPU_MAIN_RAM, STARPU_R);
+	# Waiting for taskA should have also waited for taskB and taskC
+        if value[] != 48
+            error("Incorrect value $(value[]) (expected 48)")
+        end
+	starpu_data_release_on_node(handle, STARPU_MAIN_RAM);
+    end
+
+
+    println("Value = ", value[])
+end
+
+# Disable garbage collector because of random segfault/hang when using mutex.
+# This issue should be solved with Julia release 1.5.
+GC.enable(false)
+starpu_init()
+main()
+starpu_shutdown()
+GC.enable(true)

+ 122 - 0
julia/examples/dependency/tag_dep.jl

@@ -0,0 +1,122 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+
+@target STARPU_CPU
+@codelet function codeletA(val ::Ref{Int32}) :: Nothing
+    # print("[Task A] Value = ", val[]);
+    val[] = val[] * 2
+end
+
+function callbackA(arg)
+    clB = arg[1]
+    handle = arg[2]
+    tagHoldC = arg[3]
+
+    taskB = starpu_task(cl = clB, handles = [handle],
+                        callback = starpu_tag_notify_from_apps,
+                        callback_arg = tagHoldC,
+                        sequential_consistency=false)
+
+    starpu_task_submit(taskB)
+end
+
+@target STARPU_CPU
+@codelet function codeletB(val ::Ref{Int32}) :: Nothing
+    # println("[Task B] Value = ", val[]);
+    val[] = val[] +1
+end
+
+@target STARPU_CPU
+@codelet function codeletC(val ::Ref{Int32}) :: Nothing
+    # println("[Task C] Value = ", val[]);
+    val[] = val[] *2
+end
+
+
+# Submit taskA and hold it
+# Submit taskC and hold it
+# Release taskA
+# Execute taskA       --> callback: submit taskB
+# Execute taskB       --> callback: release taskC
+#
+# All three tasks use the same data in RW, taskB is submitted after
+# taskC, so taskB should normally only execute after taskC but as the
+# sequential consistency for (taskB, data) is unset, taskB can
+# execute straightaway
+function main()
+    value = Ref(Int32(12))
+
+    @starpu_block let
+    tagHoldA :: starpu_tag_t = 32
+    tagHoldC :: starpu_tag_t = 84
+    tagA :: starpu_tag_t = 421
+    tagC :: starpu_tag_t = 842
+
+    starpu_tag_declare_deps(tagA, tagHoldA)
+    starpu_tag_declare_deps(tagC, tagHoldC)
+
+    perfmodel = starpu_perfmodel(
+        perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+        symbol = "history_perf"
+    )
+
+        clA = starpu_codelet(
+            cpu_func = "codeletA",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+        clB = starpu_codelet(
+            cpu_func = "codeletB",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+        clC = starpu_codelet(
+            cpu_func = "codeletC",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+
+        handle = starpu_data_register(value)
+
+        taskA = starpu_task(cl = clA, handles = [handle], tag = tagA,
+                            callback = callbackA,
+                            callback_arg=(clB, handle, tagHoldC))
+        starpu_task_submit(taskA)
+
+        taskC = starpu_task(cl = clC, handles = [handle], tag = tagC)
+        starpu_task_submit(taskC)
+
+        # Release taskA (we want to make sure it will execute after taskC has been submitted)
+        starpu_tag_notify_from_apps(tagHoldA)
+
+        starpu_task_wait_for_all()
+    end
+
+    if value[] != 50
+        error("Incorrect value $(value[]) (expected 50)")
+    end
+
+    println("Value = ", value[])
+end
+
+# Disable garbage collector because of random segfault/hang when using mutex.
+# This issue should be solved with Julia release 1.5.
+GC.enable(false)
+starpu_init()
+main()
+starpu_shutdown()
+GC.enable(true)

+ 88 - 0
julia/examples/dependency/task_dep.jl

@@ -0,0 +1,88 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+
+@target STARPU_CPU
+@codelet function codeletA(val ::Ref{Int32}) :: Nothing
+    # print("[Task A] Value = ", val[]);
+    val[] = val[] * 2
+end
+
+@target STARPU_CPU
+@codelet function codeletB(val ::Ref{Int32}) :: Nothing
+    # println("[Task B] Value = ", val[]);
+    val[] = val[] +1
+end
+
+@target STARPU_CPU
+@codelet function codeletC(val ::Ref{Int32}) :: Nothing
+    # println("[Task C] Value = ", val[]);
+    val[] = val[] *2
+end
+
+function main()
+    value = Ref(Int32(12))
+
+    @starpu_block let
+        perfmodel = starpu_perfmodel(
+            perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+            symbol = "history_perf"
+        )
+
+        clA = starpu_codelet(
+            cpu_func = "codeletA",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+        clB = starpu_codelet(
+            cpu_func = "codeletB",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+        clC = starpu_codelet(
+            cpu_func = "codeletC",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+
+        starpu_data_set_default_sequential_consistency_flag(0)
+
+        handle = starpu_data_register(value)
+
+        taskA = starpu_task(cl = clA, handles = [handle])
+        taskB = starpu_task(cl = clB, handles = [handle])
+        taskC = starpu_task(cl = clC, handles = [handle])
+
+        starpu_task_declare_deps(taskA, taskB)
+        starpu_task_declare_deps(taskC, taskA, taskB)
+
+        starpu_task_submit(taskA)
+        starpu_task_submit(taskB)
+        starpu_task_submit(taskC)
+
+        starpu_task_wait_for_all()
+    end
+
+    if value[] != 52
+        error("Incorrect value $(value[]) (expected 52)")
+    end
+
+    println("Value = ", value[])
+end
+
+starpu_init()
+main()
+starpu_shutdown()

+ 53 - 0
julia/examples/execute.sh.in

@@ -0,0 +1,53 @@
+#!@REALBASH@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+set -x
+export JULIA_LOAD_PATH=@STARPU_SRC_DIR@/julia:$JULIA_LOAD_PATH
+export STARPU_BUILD_DIR=@STARPU_BUILD_DIR@
+export STARPU_SRC_DIR=@STARPU_SRC_DIR@
+export STARPU_JULIA_LIB=@STARPU_BUILD_DIR@/julia/src/.libs/libstarpujulia-1.3
+export STARPU_JULIA_BUILD=@STARPU_BUILD_DIR@/julia
+export LD_LIBRARY_PATH=@STARPU_BUILD_DIR@/julia/src/.libs/:$LD_LIBRARY_PATH
+export JULIA_NUM_THREADS=8
+export STARPU_NOPENCL=0
+export STARPU_SCHED=dmda
+
+srcdir=@STARPU_SRC_DIR@/julia/examples
+
+rm -f genc*.c gencuda*.cu genc*.o
+
+if test "$1" == "-calllib"
+then
+    shift
+    pwd
+    rm -f extern_tasks.so
+    make -f @STARPU_BUILD_DIR@/julia/src/dynamic_compiler/Makefile extern_tasks.so SOURCES_CPU=$srcdir/$1
+    shift
+    export JULIA_TASK_LIB=$PWD/extern_tasks.so
+fi
+
+srcfile=$1
+if test ! -f $srcdir/$srcfile
+then
+    echo "Error. File $srcdir/$srcfile not found"
+    exit 1
+fi
+shift
+#cd $srcdir/$(dirname $srcfile)
+#@JULIA@ $(basename $srcfile) $*
+@JULIA@ $srcdir/$srcfile $*
+

+ 145 - 0
julia/examples/gemm/gemm.jl

@@ -0,0 +1,145 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+using LinearAlgebra.BLAS
+using BenchmarkTools
+
+@target STARPU_CPU+STARPU_CUDA
+@codelet function gemm(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32) :: Nothing
+
+    M :: Int32 = height(A)
+    N :: Int32 = width(B)
+    K :: Int32 = width(A)
+    lda :: Int32 = ld(A)
+    ldb :: Int32 = ld(B)
+    ldc :: Int32 = ld(C)
+    STARPU_SGEMM("N", "N", M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)
+
+    return
+end
+
+function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32, nslicesx, nslicesy)
+    scale= 3
+    tmin=0
+    vert = starpu_data_filter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesx)
+    horiz = starpu_data_filter(STARPU_MATRIX_FILTER_BLOCK, nslicesy)
+    @starpu_block let
+        hA,hB,hC = starpu_data_register(A, B, C)
+        starpu_data_partition(hB, vert)
+        starpu_data_partition(hA, horiz)
+        starpu_data_map_filters(hC, vert, horiz)
+        tmin=0
+
+        for i in (1 : 10 )
+            t=time_ns()
+            @starpu_sync_tasks begin
+                for taskx in (1 : nslicesx)
+                    for tasky in (1 : nslicesy)
+                        starpu_task_insert(codelet_name = "gemm",
+                                           handles = [hA[tasky], hB[taskx], hC[taskx, tasky]],
+                                           cl_arg = (alpha, beta),
+                                           modes = [STARPU_R, STARPU_R, STARPU_RW])
+                    end
+                end
+            end
+            t=time_ns()-t
+            if (tmin==0 || tmin>t)
+                tmin=t
+            end
+        end
+    end
+    return tmin
+end
+
+
+function approximately_equals(
+    A :: Matrix{Cfloat},
+    B :: Matrix{Cfloat},
+    eps = 1e-2
+)
+    (height, width) = size(A)
+
+    for j in (1 : width)
+        for i in (1 : height)
+            if (abs(A[i,j] - B[i,j]) > eps * max(abs(B[i,j]), abs(A[i,j])))
+                println("A[$i,$j] : $(A[i,j]), B[$i,$j] : $(B[i,j])")
+                return false
+            end
+        end
+    end
+
+    return true
+end
+
+function check(expected, A, B, C, alpha, beta)
+    for i in 1 : 10
+        gemm!('N', 'N', alpha, A, B, beta, expected)
+    end
+
+    height,width = size(C)
+    for i in 1:height
+        for j in 1:width
+            got = C[i, j]
+            exp = expected[i, j]
+
+            err = abs(exp - got) / exp
+            if err > 0.0001
+                error("[$i] -> $got != $exp (err $err)")
+            end
+        end
+    end
+end
+
+function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
+    for dim in (start_dim : step_dim : stop_dim)
+        A = Array(rand(Cfloat, dim, dim))
+        B = Array(rand(Cfloat, dim, dim))
+        C = zeros(Float32, dim, dim)
+        C_ref = copy(C)
+        starpu_memory_pin(A)
+        starpu_memory_pin(B)
+        starpu_memory_pin(C)
+        alpha = 4.0f0
+        beta = 2.0f0
+        mt =  multiply_with_starpu(A, B, C, alpha, beta, nslicesx, nslicesy)
+        gflop = 2 * dim * dim * dim * 1.e-9
+        gflops = gflop / (mt * 1.e-9)
+        size=dim*dim*dim*4*3/1024/1024
+        println(io,"$dim $gflops")
+        println("$dim $gflops")
+        starpu_memory_unpin(A)
+        starpu_memory_unpin(B)
+        starpu_memory_unpin(C)
+        check(C_ref, A, B, C, alpha, beta)
+    end
+end
+
+if size(ARGS, 1) < 1
+    filename="x.dat"
+else
+    filename=ARGS[1]
+end
+
+starpu_init()
+starpu_cublas_init()
+nblock_x = Int32(ceil(sqrt(starpu_worker_get_count())))
+nblock_y = nblock_x
+io=open(filename,"w")
+compute_times(io,64,512,4096,nblock_x,nblock_y)
+close(io)
+
+starpu_shutdown()
+

+ 22 - 0
julia/examples/gemm/gemm.sh

@@ -0,0 +1,22 @@
+#!/bin/bash
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+$(dirname $0)/../execute.sh gemm/gemm_native.jl
+
+export OMP_NUM_THREADS=1
+$(dirname $0)/../execute.sh gemm/gemm.jl
+

+ 146 - 0
julia/examples/gemm/gemm_bare.jl

@@ -0,0 +1,146 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+using LinearAlgebra.BLAS
+
+@target STARPU_CPU+STARPU_CUDA
+@codelet function gemm(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32) :: Nothing
+
+    M :: Int32 = height(A)
+    N :: Int32 = width(B)
+    K :: Int32 = width(A)
+    lda :: Int32 = ld(A)
+    ldb :: Int32 = ld(B)
+    ldc :: Int32 = ld(C)
+    STARPU_SGEMM("N", "N", M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)
+
+    return
+end
+
+function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32, nslicesx, nslicesy)
+    scale= 3
+    tmin=0
+    hA,hB,hC = starpu_data_register(A, B, C)
+    tmin=0
+    perfmodel = starpu_perfmodel(
+        perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+        symbol = "gemm"
+    )
+    cl = starpu_codelet(
+        cpu_func  = "gemm",
+        cuda_func = "",
+        modes =[STARPU_R,STARPU_R,STARPU_RW], 
+        perfmodel = perfmodel,
+    )
+    task = starpu_task(cl = cl, handles =[hA,hB,hC], cl_arg = (alpha,beta), callback = nothing,
+		callback_arg = nothing, tag = nothing, tag_only = nothing,
+                       sequential_consistency = true,
+                       detach = 1, color = nothing, where = nothing)
+
+
+    for i in (1 : 10 )
+        t=time_ns()
+starpu_task_submit(Ref(task.c_task))
+        #starpu_task_submit(task)
+        starpu_task_wait_for_all()
+        t=time_ns()-t
+	if (tmin==0 || tmin>t)
+           tmin=t
+        end
+    end
+    starpu_data_unregister(hA)
+    starpu_data_unregister(hB)
+    starpu_data_unregister(hC)
+    return tmin
+end
+
+
+function approximately_equals(
+    A :: Matrix{Cfloat},
+    B :: Matrix{Cfloat},
+    eps = 1e-2
+)
+    (height, width) = size(A)
+
+    for j in (1 : width)
+        for i in (1 : height)
+            if (abs(A[i,j] - B[i,j]) > eps * max(abs(B[i,j]), abs(A[i,j])))
+                println("A[$i,$j] : $(A[i,j]), B[$i,$j] : $(B[i,j])")
+                return false
+            end
+        end
+    end
+
+    return true
+end
+
+function check(expected, A, B, C, alpha, beta)
+    for i in 1 : 10
+        gemm!('N', 'N', alpha, A, B, beta, expected)
+    end
+
+    height,width = size(C)
+    for i in 1:height
+        for j in 1:width
+            got = C[i, j]
+            exp = expected[i, j]
+
+            err = abs(exp - got) / exp
+            if err > 0.0001
+                error("[$i] -> $got != $exp (err $err)")
+            end
+        end
+    end
+end
+
+function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
+    for dim in (start_dim : step_dim : stop_dim)
+        A = Array(rand(Cfloat, dim, dim))
+        B = Array(rand(Cfloat, dim, dim))
+        C = zeros(Float32, dim, dim)
+        C_ref = copy(C)
+        starpu_memory_pin(A)
+        starpu_memory_pin(B)
+        starpu_memory_pin(C)
+        alpha = 4.0f0
+        beta = 2.0f0
+        mt =  multiply_with_starpu(A, B, C, alpha, beta, nslicesx, nslicesy)
+        gflop = 2 * dim * dim * dim * 1.e-9
+        gflops = gflop / (mt * 1.e-9)
+        size=dim*dim*dim*4*3/1024/1024
+        println(io,"$dim $gflops")
+        println("$dim $gflops")
+        starpu_memory_unpin(A)
+        starpu_memory_unpin(B)
+        starpu_memory_unpin(C)
+        #check(C_ref, A, B, C, alpha, beta)
+    end
+end
+
+if size(ARGS, 1) < 1
+    filename="x.dat"
+else
+    filename=ARGS[1]
+end
+
+starpu_init()
+starpu_cublas_init()
+io=open(filename,"w")
+compute_times(io,64,512,4096,1,1)
+close(io)
+
+starpu_shutdown()
+

+ 56 - 0
julia/examples/gemm/gemm_native.jl

@@ -0,0 +1,56 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using LinearAlgebra.BLAS
+
+function gemm_without_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32)
+    tmin = 0
+    for i in (1 : 10 )
+        t=time_ns()
+        gemm!('N', 'N', alpha, A, B, beta, C)
+        t=time_ns() - t
+        if (tmin==0 || tmin>t)
+            tmin=t
+        end
+    end
+    return tmin
+end
+
+
+function compute_times(io,start_dim, step_dim, stop_dim)
+    for dim in (start_dim : step_dim : stop_dim)
+        A = Array(rand(Cfloat, dim, dim))
+        B = Array(rand(Cfloat, dim, dim))
+        C = zeros(Float32, dim, dim)
+        alpha = 4.0f0
+        beta = 2.0f0
+        mt =  gemm_without_starpu(A, B, C, alpha, beta)
+        gflop = 2 * dim * dim * dim * 1.e-9
+        gflops = gflop / (mt * 1.e-9)
+        size=dim*dim*dim*4*3/1024/1024
+        println(io,"$dim $gflops")
+        println("$dim $gflops")
+    end
+end
+
+if size(ARGS, 1) < 1
+    filename="x.dat"
+else
+    filename=ARGS[1]
+end
+io=open(filename,"w")
+compute_times(io,64,512,4096)
+close(io)
+

+ 33 - 14
julia/examples/mandelbrot/cpu_mandelbrot.c

@@ -1,44 +1,62 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
 #include <stdio.h>
 #include <starpu.h>
 #include <math.h>
+#include "cpu_mandelbrot.h"
 
 void cpu_mandelbrot(void *descr[], void *cl_arg)
 {
         long long *pixels;
-        float *params;
 
         pixels = (long long int *)STARPU_MATRIX_GET_PTR(descr[0]);
-        params = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+        struct params *params = (struct params *) cl_arg;
 
-        long long width = STARPU_MATRIX_GET_NY(descr[0]);
-        long long height = STARPU_MATRIX_GET_NX(descr[0]);
+        long width = STARPU_MATRIX_GET_NY(descr[0]);
+        long height = STARPU_MATRIX_GET_NX(descr[0]);
         double zoom = width * 0.25296875;
         double iz = 1. / zoom;
         float diverge = 4.0;
         float max_iterations = (width/2) * 0.049715909 * log10(zoom);
         float imi = 1. / max_iterations;
-        float centerr = params[0];
-        float centeri = params[1];
-        float offset = params[2];
-        float dim = params[3];
+        double centerr = params->centerr;
+        double centeri = params->centeri;
+        long offset = params->offset;
+        long dim = params->dim;
         double cr = 0;
         double zr = 0;
         double ci = 0;
         double zi = 0;
-        long long n = 0;
+        long n = 0;
         double tmp = 0;
         int ldP = STARPU_MATRIX_GET_LD(descr[0]);
 
         long long x,y;
 
-        for (y = 0; y < height; y++){
-                for (x = 0; x < width; x++){
+        for (y = 0; y < height; y++)
+	{
+                for (x = 0; x < width; x++)
+		{
                         cr = centerr + (x - (dim/2)) * iz;
 			zr = cr;
                         ci = centeri + (y+offset - (dim/2)) * iz;
                         zi = ci;
 
-                        for (n = 0; n <= max_iterations; n++) {
+                        for (n = 0; n <= max_iterations; n++)
+			{
 				if (zr*zr + zi*zi>diverge) break;
                                 tmp = zr*zr - zi*zi + cr;
                                 zi = 2*zr*zi + ci;
@@ -53,8 +71,9 @@ void cpu_mandelbrot(void *descr[], void *cl_arg)
 }
 
 char* CPU = "cpu_mandelbrot";
-char* GPU = "gpu_mandelbrot";
-extern char *starpu_find_function(char *name, char *device) {
+char* GPU = "";
+extern char *starpu_find_function(char *name, char *device)
+{
 	if (!strcmp(device,"gpu")) return GPU;
 	return CPU;
 }

+ 29 - 10
julia/examples/mandelbrot/mandelbrot.jl

@@ -1,3 +1,18 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
 import Libdl
 using StarPU
 using LinearAlgebra
@@ -34,7 +49,7 @@ using LinearAlgebra
                 zi = 2*zr*zi + ci
                 zr = tmp
             end
-            
+
             if (n < max_iterations)
                 pixels[y,x] = round(15 * n * imi)
             else
@@ -49,13 +64,16 @@ end
 starpu_init()
 
 function mandelbrot_with_starpu(A ::Matrix{Int64}, cr ::Float64, ci ::Float64, dim ::Int64, nslicesx ::Int64)
-    horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesx)
+    horiz = starpu_data_filter(STARPU_MATRIX_FILTER_BLOCK, nslicesx)
     @starpu_block let
 	hA = starpu_data_register(A)
 	starpu_data_partition(hA,horiz)
 
 	@starpu_sync_tasks for taskx in (1 : nslicesx)
-                @starpu_async_cl mandelbrot(hA[taskx]) [STARPU_W] [cr, ci, (taskx-1)*dim/nslicesx, dim]
+            starpu_task_insert(codelet_name = "mandelbrot",
+                               handles = [hA[taskx]],
+                               modes = [STARPU_W],
+                               cl_arg = (cr, ci, Int64((taskx-1)*dim/nslicesx), dim))
 	end
     end
 end
@@ -73,9 +91,9 @@ function pixels2img(pixels ::Matrix{Int64}, width ::Int64, height ::Int64, filen
     end
 end
 
-function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
+function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64, gen_images)
     tmin=0;
-    
+
     pixels ::Matrix{Int64} = zeros(dim, dim)
     for i = 1:10
         t = time_ns();
@@ -85,20 +103,21 @@ function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
             tmin=t
         end
     end
-    pixels2img(pixels,dim,dim,"out$(dim).ppm")
+    if (gen_images == 1)
+        pixels2img(pixels,dim,dim,"out$(dim).ppm")
+    end
     return tmin
 end
 
-function display_time(cr ::Float64, ci ::Float64, start_dim ::Int64, step_dim ::Int64, stop_dim ::Int64, nslices ::Int64)
+function display_time(cr ::Float64, ci ::Float64, start_dim ::Int64, step_dim ::Int64, stop_dim ::Int64, nslices ::Int64, gen_images)
     for dim in (start_dim : step_dim : stop_dim)
-        res = min_times(cr, ci, dim, nslices)
+        res = min_times(cr, ci, dim, nslices, gen_images)
         res=res/dim/dim; # time per pixel
         println("$(dim) $(res)")
     end
 end
 
 
-display_time(-0.800671,-0.158392,32,32,4096,4)
+display_time(-0.800671,-0.158392,32,32,512,4, 0)
 
 starpu_shutdown()
-

+ 25 - 14
julia/examples/mult/cpu_mult.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2018                                     Alexis Juven
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2018       Alexis Juven
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -13,26 +14,30 @@
  *
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
+
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
 #include <starpu.h>
+
 /*
  * The codelet is passed 3 matrices, the "descr" union-type field gives a
  * description of the layout of those 3 matrices in the local memory (ie. RAM
  * in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
  * registered data with the "matrix" data interface, we use the matrix macros.
  */
-void cpu_mult(void *descr[], void *arg)
+void cpu_mult(void *descr[], void *cl_arg)
 {
-	(void)arg;
+	int stride;
 	float *subA, *subB, *subC;
+
+	stride = *((int *)cl_arg);
+
 	/* .blas.ptr gives a pointer to the first element of the local copy */
 	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
 	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
 	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
 
-
 	/* .blas.nx is the number of rows (consecutive elements) and .blas.ny
 	 * is the number of lines that are separated by .blas.ld elements (ld
 	 * stands for leading dimension).
@@ -50,14 +55,18 @@ void cpu_mult(void *descr[], void *arg)
 	int i,j,k,ii,jj,kk;
 	for (i = 0; i < nyC*nxC; i++) subC[i] = 0;
 	//fprintf(stderr,"inside cpu_mult %dx%dx%d %d/%d on %d\n",nyC,nyA,nxC,starpu_worker_get_id(),STARPU_NMAXWORKERS,starpu_worker_get_devid(starpu_worker_get_id()));
-	for (i=0;i<nyC;i+=STRIDE) {
-		for (k=0;k<nyA;k+=STRIDE) {
-			for (j=0;j<nxC;j+=STRIDE) {
-				
-				for (ii = i; ii < i+STRIDE; ii+=2) {
+	for (i=0;i<nyC;i+=stride)
+	{
+		for (k=0;k<nyA;k+=stride)
+		{
+			for (j=0;j<nxC;j+=stride)
+			{
+				for (ii = i; ii < i+stride; ii+=2)
+				{
 					float *sC0=subC+ii*ldC+j;
 					float *sC1=subC+ii*ldC+ldC+j;
-					for (kk = k; kk < k+STRIDE; kk+=4) {
+					for (kk = k; kk < k+stride; kk+=4)
+					{
 						float alpha00=subB[kk +  ii*ldB];
 						float alpha01=subB[kk+1+ii*ldB];
 						float alpha10=subB[kk+  ii*ldB+ldB];
@@ -70,7 +79,8 @@ void cpu_mult(void *descr[], void *arg)
 						float *sA1=subA+kk*ldA+ldA+j;
 						float *sA2=subA+kk*ldA+2*ldA+j;
 						float *sA3=subA+kk*ldA+3*ldA+j;
-						for (jj = 0; jj < STRIDE; jj+=1) {
+						for (jj = 0; jj < stride; jj+=1)
+						{
 							sC0[jj] += alpha00*sA0[jj]+alpha01*sA1[jj]+alpha02*sA2[jj]+alpha03*sA3[jj];
 							sC1[jj] += alpha10*sA0[jj]+alpha11*sA1[jj]+alpha12*sA2[jj]+alpha13*sA3[jj];
 						}
@@ -80,11 +90,12 @@ void cpu_mult(void *descr[], void *arg)
 		}
 	}
 	//fprintf(stderr,"inside cpu_mult %dx%dx%d\n",nyC,nyA,nxC);
-
 }
+
 char* CPU = "cpu_mult";
-char* GPU = "gpu_mult";
-extern char *starpu_find_function(char *name, char *device) {
+char* GPU = "";
+extern char *starpu_find_function(char *name, char *device)
+{
 	if (!strcmp(device,"gpu")) return GPU;
 	return CPU;
 }

+ 45 - 40
julia/examples/mult/mult.jl

@@ -1,12 +1,24 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
 import Libdl
 using StarPU
 using LinearAlgebra
 
-#shoud be the same as in the makefile
-const STRIDE = 72
-
 @target STARPU_CPU+STARPU_CUDA
-@codelet function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}) :: Nothing
+@codelet function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}, stride ::Int32) :: Nothing
 
     width_m2 :: Int32 = width(m2)
     height_m1 :: Int32 = height(m1)
@@ -59,38 +71,27 @@ end
 
 starpu_init()
 
-function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
+function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy, stride)
     scale= 3
     tmin=0
-    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesx)
-    horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesy)
+    vert = starpu_data_filter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesx)
+    horiz = starpu_data_filter(STARPU_MATRIX_FILTER_BLOCK, nslicesy)
     @starpu_block let
         hA,hB,hC = starpu_data_register(A, B, C)
         starpu_data_partition(hB, vert)
         starpu_data_partition(hA, horiz)
         starpu_data_map_filters(hC, vert, horiz)
         tmin=0
-        perfmodel = StarpuPerfmodel(
-            perf_type = STARPU_HISTORY_BASED,
-            symbol = "history_perf"
-        )
-        cl = StarpuCodelet(
-            cpu_func = CPU_CODELETS["matrix_mult"],
-            # cuda_func = CUDA_CODELETS["matrix_mult"],
-            #opencl_func="ocl_matrix_mult",
-            modes = [STARPU_R, STARPU_R, STARPU_W],
-            perfmodel = perfmodel
-        )
 
         for i in (1 : 10 )
             t=time_ns()
             @starpu_sync_tasks begin
                 for taskx in (1 : nslicesx)
                     for tasky in (1 : nslicesy)
-                        handles = [hA[tasky], hB[taskx], hC[taskx, tasky]]
-                        task = StarpuTask(cl = cl, handles = handles)
-                        starpu_task_submit(task)
-                        #@starpu_async_cl matrix_mult(hA[tasky], hB[taskx], hC[taskx, tasky])
+                        starpu_task_insert(codelet_name = "matrix_mult",
+                                           modes = [STARPU_R, STARPU_R, STARPU_W],
+                                           handles = [hA[tasky], hB[taskx], hC[taskx, tasky]],
+                                           cl_arg = (Int32(stride),))
                     end
                 end
             end
@@ -104,41 +105,45 @@ function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: M
 end
 
 
-function approximately_equals(
-    A :: Matrix{Cfloat},
-    B :: Matrix{Cfloat},
-    eps = 1e-2
-)
-    (height, width) = size(A)
+function check(A, B, C)
+    expected = A * B
+    height,width = size(C)
+    for i in 1:height
+        for j in 1:width
+            got = C[i, j]
+            exp = expected[i, j]
 
-    for j in (1 : width)
-        for i in (1 : height)
-            if (abs(A[i,j] - B[i,j]) > eps * max(abs(B[i,j]), abs(A[i,j])))
-                println("A[$i,$j] : $(A[i,j]), B[$i,$j] : $(B[i,j])")
-                return false
+            err = abs(exp - got) / exp
+            if err > 0.0001
+                error("[$i] -> $got != $exp (err $err)")
             end
         end
     end
-
-    return true
 end
 
-function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
+function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy, stride)
     for dim in (start_dim : step_dim : stop_dim)
         A = Array(rand(Cfloat, dim, dim))
         B = Array(rand(Cfloat, dim, dim))
         C = zeros(Float32, dim, dim)
-        mt =  multiply_with_starpu(A, B, C, nslicesx, nslicesy)
+        mt =  multiply_with_starpu(A, B, C, nslicesx, nslicesy, stride)
         flops = (2*dim-1)*dim*dim/mt
         size=dim*dim*4*3/1024/1024
         println(io,"$size $flops")
         println("$size $flops")
+        check(A, B, C)
     end
 end
 
-
-io=open(ARGS[1],"w")
-compute_times(io,16*STRIDE,4*STRIDE,4096,2,2)
+if size(ARGS, 1) < 2
+    stride=4
+    filename="x.dat"
+else
+    stride=parse(Int, ARGS[1])
+    filename=ARGS[2]
+end
+io=open(filename,"w")
+compute_times(io,16*stride,4*stride,128*stride,2,2,stride)
 close(io)
 
 starpu_shutdown()

+ 30 - 8
julia/examples/task_insert_color/task_insert_color.jl

@@ -1,3 +1,18 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
 import Libdl
 using StarPU
 
@@ -14,27 +29,34 @@ function task_insert_color_with_starpu(val ::Ref{Int32})
     @starpu_block let
 	hVal = starpu_data_register(val)
 
-        cl1 = StarpuCodelet(
-            cpu_func = CPU_CODELETS["task_insert_color"],
-            modes = [STARPU_RW]
+        perfmodel = starpu_perfmodel(
+            perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+            symbol = "history_perf"
         )
 
-        cl2 = StarpuCodelet(
-            cpu_func = CPU_CODELETS["task_insert_color"],
+        cl1 = starpu_codelet(
+            cpu_func = "task_insert_color",
             modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+
+        cl2 = starpu_codelet(
+            cpu_func = "task_insert_color",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel,
             color = 0x0000FF
         )
 
 	@starpu_sync_tasks begin
 
             # In the trace file, the following task should be green (executed on CPU)
-            starpu_task_submit(StarpuTask(cl = cl1, handles = [hVal]))
+            starpu_task_submit(starpu_task(cl = cl1, handles = [hVal]))
 
             # In the trace file, the following task will be blue as specified by the field color of cl2
-            starpu_task_submit(StarpuTask(cl = cl2, handles = [hVal]))
+            starpu_task_submit(starpu_task(cl = cl2, handles = [hVal]))
 
             # In the trace file, the following tasks will be red as specified in @starpu_async_cl
-            @starpu_async_cl task_insert_color(hVal) [STARPU_RW] [] 0xFF0000
+            @starpu_async_cl task_insert_color(hVal) [STARPU_RW] () 0xFF0000
 
 	end
     end

+ 1 - 1
julia/examples/variable/variable.jl

@@ -29,7 +29,7 @@ function display(niter)
     if foo[] == niter
         println("result is correct")
     else
-        println("result is incorret")
+        error("result is incorret")
     end
 end
 

+ 42 - 18
julia/examples/vector_scal/vector_scal.jl

@@ -1,3 +1,18 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
 import Libdl
 using StarPU
 using LinearAlgebra
@@ -21,28 +36,15 @@ function vector_scal_with_starpu(v :: Vector{Float32}, m :: Int32, k :: Float32,
     @starpu_block let
         hV = starpu_data_register(v)
         tmin=0
-        perfmodel = StarpuPerfmodel(
-            perf_type = STARPU_HISTORY_BASED,
-            symbol = "history_perf"
-        )
-        cl = StarpuCodelet(
-            cpu_func = CPU_CODELETS["vector_scal"],
-            # cuda_func = CUDA_CODELETS["vector_scal"],
-            #opencl_func="ocl_matrix_mult",
-            modes = [STARPU_RW],
-            perfmodel = perfmodel
-        )
 
         for i in (1 : 1)
             t=time_ns()
             @starpu_sync_tasks begin
-                handles = [hV]
-                task = StarpuTask(cl = cl, handles = handles, cl_arg=[m, k, l])
-                starpu_task_submit(task)
+                starpu_task_insert(codelet_name = "vector_scal",
+                                   modes = [STARPU_RW],
+                                   handles = [hV],
+                                   cl_arg=(m, k, l))
             end
-            # @starpu_sync_tasks for task in (1:1)
-            #     @starpu_async_cl vector_scal(hV, STARPU_RW, [m, k, l])
-            # end
             t=time_ns()-t
             if (tmin==0 || tmin>t)
                 tmin=t
@@ -52,9 +54,24 @@ function vector_scal_with_starpu(v :: Vector{Float32}, m :: Int32, k :: Float32,
     return tmin
 end
 
+function check(ref, res, m, k, l)
+    expected = ref .* m .+ (k+l)
+
+    for i in 1:length(expected)
+        got = res[i]
+        exp = expected[i]
+
+        err = abs(exp - got) / exp
+        if err > 0.0001
+            error("[$i] -> $got != $exp (err $err)")
+        end
+    end
+end
+
 function compute_times(io,start_dim, step_dim, stop_dim)
     for size in (start_dim : step_dim : stop_dim)
         V = Array(rand(Cfloat, size))
+        V_ref = copy(V)
         starpu_memory_pin(V)
 
         m :: Int32 = 10
@@ -70,11 +87,18 @@ function compute_times(io,start_dim, step_dim, stop_dim)
         println("OUTPUT ", V[1:10])
         println(io,"$size $mt")
         println("$size $mt")
+
+        check(V_ref, V, m, k, l)
     end
 end
 
+if size(ARGS, 1) < 1
+    filename="x.dat"
+else
+    filename=ARGS[1]
+end
 
-io=open(ARGS[1],"w")
+io=open(filename,"w")
 compute_times(io,1024,1024,4096)
 close(io)
 

+ 60 - 0
julia/src/Makefile.am

@@ -0,0 +1,60 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+include $(top_srcdir)/starpu-notests.mk
+
+CLEANFILES = *.gcno *.gcda
+
+AM_CFLAGS = $(GLOBAL_AM_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(HWLOC_CFLAGS) $(FXT_CFLAGS) -fPIC
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ `@JULIA@ $(top_srcdir)/julia/src/openblas_ldflags.jl`
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_builddir)/src -I$(top_srcdir)/src -I$(top_srcdir)/julia/src
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) -no-undefined
+
+SUBDIRS = dynamic_compiler
+
+lib_LTLIBRARIES = libstarpujulia-@STARPU_EFFECTIVE_VERSION@.la
+
+noinst_HEADERS =
+
+libstarpujulia_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined			\
+  -version-info $(LIBSTARPUJULIA_INTERFACE_CURRENT):$(LIBSTARPUJULIA_INTERFACE_REVISION):$(LIBSTARPUJULIA_INTERFACE_AGE)
+
+libstarpujulia_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
+	callback_wrapper.c \
+	blas_wrapper.c \
+	blas.c
+
+EXTRA_DIST = blas.h				\
+	blas.jl  				\
+	data.jl					\
+	destructible.jl				\
+	globals.jl				\
+	init.jl					\
+	linked_list.jl				\
+	perfmodel.jl				\
+	StarPU.jl				\
+	task_dep.jl				\
+	task.jl					\
+	translate_headers.jl			\
+	utils.jl				\
+	compiler/c.jl				\
+	compiler/cuda.jl			\
+	compiler/expression_manipulation.jl	\
+	compiler/expressions.jl			\
+	compiler/file_generation.jl		\
+	compiler/include.jl			\
+	compiler/parsing.jl			\
+	compiler/utils.jl

File diff suppressed because it is too large
+ 85 - 1276
julia/src/StarPU.jl


+ 194 - 0
julia/src/blas.c

@@ -0,0 +1,194 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <ctype.h>
+#include <stdio.h>
+
+#include "blas.h"
+
+inline void STARPU_SGEMM(char *transa, char *transb, BLASINT M, BLASINT N, BLASINT K, 
+			float alpha, const float *A, BLASINT lda, const float *B, BLASINT ldb, 
+			float beta, float *C, BLASINT ldc)
+{
+	sgemm_64_(transa, transb, &M, &N, &K, &alpha,
+			 A, &lda, B, &ldb,
+			 &beta, C, &ldc);	
+}
+
+inline void STARPU_DGEMM(char *transa, char *transb, BLASINT M, BLASINT N, BLASINT K, 
+			double alpha, double *A, BLASINT lda, double *B, BLASINT ldb, 
+			double beta, double *C, BLASINT ldc)
+{
+	dgemm_64_(transa, transb, &M, &N, &K, &alpha,
+			 A, &lda, B, &ldb,
+			 &beta, C, &ldc);	
+}
+
+
+inline void STARPU_SGEMV(char *transa, BLASINT M, BLASINT N, float alpha, float *A, BLASINT lda,
+		float *X, BLASINT incX, float beta, float *Y, BLASINT incY)
+{
+	sgemv_64_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+}
+
+inline void STARPU_DGEMV(char *transa, BLASINT M, BLASINT N, double alpha, double *A, BLASINT lda,
+		double *X, BLASINT incX, double beta, double *Y, BLASINT incY)
+{
+	dgemv_64_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+}
+
+inline float STARPU_SASUM(BLASINT N, float *X, BLASINT incX)
+{
+	return sasum_64_(&N, X, &incX);
+}
+
+inline double STARPU_DASUM(BLASINT N, double *X, BLASINT incX)
+{
+	return dasum_64_(&N, X, &incX);
+}
+
+void STARPU_SSCAL(BLASINT N, float alpha, float *X, BLASINT incX)
+{
+	sscal_64_(&N, &alpha, X, &incX);
+}
+
+void STARPU_DSCAL(BLASINT N, double alpha, double *X, BLASINT incX)
+{
+	dscal_64_(&N, &alpha, X, &incX);
+}
+
+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const BLASINT m, const BLASINT n,
+                   const float alpha, const float *A, const BLASINT lda,
+                   float *B, const BLASINT ldb)
+{
+	strsm_64_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const BLASINT m, const BLASINT n,
+                   const double alpha, const double *A, const BLASINT lda,
+                   double *B, const BLASINT ldb)
+{
+	dtrsm_64_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void STARPU_SSYR (const char *uplo, const BLASINT n, const float alpha,
+                  const float *x, const BLASINT incx, float *A, const BLASINT lda)
+{
+	ssyr_64_(uplo, &n, &alpha, x, &incx, A, &lda); 
+}
+
+void STARPU_SSYRK (const char *uplo, const char *trans, const BLASINT n,
+                   const BLASINT k, const float alpha, const float *A,
+                   const BLASINT lda, const float beta, float *C,
+                   const BLASINT ldc)
+{
+	ssyrk_64_(uplo, trans, &n, &k, &alpha, A, &lda, &beta, C, &ldc); 
+}
+
+void STARPU_SGER(const BLASINT m, const BLASINT n, const float alpha,
+                  const float *x, const BLASINT incx, const float *y,
+                  const BLASINT incy, float *A, const BLASINT lda)
+{
+	sger_64_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
+}
+
+void STARPU_DGER(const BLASINT m, const BLASINT n, const double alpha,
+                  const double *x, const BLASINT incx, const double *y,
+                  const BLASINT incy, double *A, const BLASINT lda)
+{
+	dger_64_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
+}
+
+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
+                   const BLASINT n, const float *A, const BLASINT lda, float *x, 
+                   const BLASINT incx)
+{
+	strsv_64_(uplo, trans, diag, &n, A, &lda, x, &incx);
+}
+
+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const BLASINT m, const BLASINT n,
+                 const float alpha, const float *A, const BLASINT lda,
+                 float *B, const BLASINT ldb)
+{
+	strmm_64_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const BLASINT m, const BLASINT n,
+                 const double alpha, const double *A, const BLASINT lda,
+                 double *B, const BLASINT ldb)
+{
+	dtrmm_64_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
+                 const BLASINT n, const float *A, const BLASINT lda, float *X,
+                 const BLASINT incX)
+{
+	strmv_64_(uplo, transA, diag, &n, A, &lda, X, &incX);
+}
+
+void STARPU_SAXPY(const BLASINT n, const float alpha, float *X, const BLASINT incX, float *Y, const BLASINT incY)
+{
+	saxpy_64_(&n, &alpha, X, &incX, Y, &incY);
+}
+
+void STARPU_DAXPY(const BLASINT n, const double alpha, double *X, const BLASINT incX, double *Y, const BLASINT incY)
+{
+	daxpy_64_(&n, &alpha, X, &incX, Y, &incY);
+}
+
+BLASINT STARPU_ISAMAX (const BLASINT n, float *X, const BLASINT incX)
+{
+    BLASINT retVal;
+    retVal = isamax_64_ (&n, X, &incX);
+    return retVal;
+}
+
+BLASINT STARPU_IDAMAX (const BLASINT n, double *X, const BLASINT incX)
+{
+    BLASINT retVal;
+    retVal = idamax_64_ (&n, X, &incX);
+    return retVal;
+}
+
+float STARPU_SDOT(const BLASINT n, const float *x, const BLASINT incx, const float *y, const BLASINT incy)
+{
+	float retVal = 0;
+
+	/* GOTOBLAS will return a FLOATRET which is a double, not a float */
+	retVal = (float)sdot_64_(&n, x, &incx, y, &incy);
+
+	return retVal;
+}
+
+double STARPU_DDOT(const BLASINT n, const double *x, const BLASINT incx, const double *y, const BLASINT incy)
+{
+	return ddot_64_(&n, x, &incx, y, &incy);
+}
+
+void STARPU_SSWAP(const BLASINT n, float *X, const BLASINT incX, float *Y, const BLASINT incY)
+{
+	sswap_64_(&n, X, &incX, Y, &incY);
+}
+
+void STARPU_DSWAP(const BLASINT n, double *X, const BLASINT incX, double *Y, const BLASINT incY)
+{
+	dswap_64_(&n, X, &incX, Y, &incY);
+}

+ 148 - 0
julia/src/blas.h

@@ -0,0 +1,148 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __BLAS_H__
+#define __BLAS_H__
+
+#include <stdint.h>
+
+#define BLASINT int64_t
+
+void STARPU_SGEMM(char *transa, char *transb, BLASINT M, BLASINT N, BLASINT K, float alpha, const float *A, BLASINT lda, 
+		const float *B, BLASINT ldb, float beta, float *C, BLASINT ldc);
+void STARPU_DGEMM(char *transa, char *transb, BLASINT M, BLASINT N, BLASINT K, double alpha, double *A, BLASINT lda, 
+		double *B, BLASINT ldb, double beta, double *C, BLASINT ldc);
+void STARPU_SGEMV(char *transa, BLASINT M, BLASINT N, float alpha, float *A, BLASINT lda,
+		float *X, BLASINT incX, float beta, float *Y, BLASINT incY);
+void STARPU_DGEMV(char *transa, BLASINT M, BLASINT N, double alpha, double *A, BLASINT lda,
+		double *X, BLASINT incX, double beta, double *Y, BLASINT incY);
+float STARPU_SASUM(BLASINT N, float *X, BLASINT incX);
+double STARPU_DASUM(BLASINT N, double *X, BLASINT incX);
+void STARPU_SSCAL(BLASINT N, float alpha, float *X, BLASINT incX);
+void STARPU_DSCAL(BLASINT N, double alpha, double *X, BLASINT incX);
+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const BLASINT m, const BLASINT n,
+                   const float alpha, const float *A, const BLASINT lda,
+                   float *B, const BLASINT ldb);
+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const BLASINT m, const BLASINT n,
+                   const double alpha, const double *A, const BLASINT lda,
+                   double *B, const BLASINT ldb);
+void STARPU_SSYR (const char *uplo, const BLASINT n, const float alpha,
+                  const float *x, const BLASINT incx, float *A, const BLASINT lda);
+void STARPU_SSYRK (const char *uplo, const char *trans, const BLASINT n,
+                   const BLASINT k, const float alpha, const float *A,
+                   const BLASINT lda, const float beta, float *C,
+                   const BLASINT ldc);
+void STARPU_SGER (const BLASINT m, const BLASINT n, const float alpha,
+                  const float *x, const BLASINT incx, const float *y,
+                  const BLASINT incy, float *A, const BLASINT lda);
+void STARPU_DGER(const BLASINT m, const BLASINT n, const double alpha,
+                  const double *x, const BLASINT incx, const double *y,
+                  const BLASINT incy, double *A, const BLASINT lda);
+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
+                   const BLASINT n, const float *A, const BLASINT lda, float *x, 
+                   const BLASINT incx);
+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const BLASINT m, const BLASINT n,
+                 const float alpha, const float *A, const BLASINT lda,
+                 float *B, const BLASINT ldb);
+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const BLASINT m, const BLASINT n,
+                 const double alpha, const double *A, const BLASINT lda,
+                 double *B, const BLASINT ldb);
+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
+                 const BLASINT n, const float *A, const BLASINT lda, float *X,
+                 const BLASINT incX);
+void STARPU_SAXPY(const BLASINT n, const float alpha, float *X, const BLASINT incX, float *Y, const BLASINT incy);
+void STARPU_DAXPY(const BLASINT n, const double alpha, double *X, const BLASINT incX, double *Y, const BLASINT incY);
+BLASINT STARPU_ISAMAX (const BLASINT n, float *X, const BLASINT incX);
+BLASINT STARPU_IDAMAX (const BLASINT n, double *X, const BLASINT incX);
+float STARPU_SDOT(const BLASINT n, const float *x, const BLASINT incx, const float *y, const BLASINT incy);
+double STARPU_DDOT(const BLASINT n, const double *x, const BLASINT incx, const double *y, const BLASINT incy);
+void STARPU_SSWAP(const BLASINT n, float *x, const BLASINT incx, float *y, const BLASINT incy);
+void STARPU_DSWAP(const BLASINT n, double *x, const BLASINT incx, double *y, const BLASINT incy);
+
+
+extern void sgemm_64_ (const char *transa, const char *transb, const BLASINT *m,
+                   const BLASINT *n, const BLASINT *k, const float *alpha, 
+                   const float *A, const BLASINT *lda, const float *B, 
+                   const BLASINT *ldb, const float *beta, float *C, 
+                   const BLASINT *ldc);
+extern void dgemm_64_ (const char *transa, const char *transb, const BLASINT *m,
+                   const BLASINT *n, const BLASINT *k, const double *alpha, 
+                   const double *A, const BLASINT *lda, const double *B, 
+                   const BLASINT *ldb, const double *beta, double *C, 
+                   const BLASINT *ldc);
+extern void sgemv_64_(const char *trans, const BLASINT *m, const BLASINT *n, const float *alpha,
+                   const float *a, const BLASINT *lda, const float *x, const BLASINT *incx, 
+                   const float *beta, float *y, const BLASINT *incy);
+extern void dgemv_64_(const char *trans, const BLASINT *m, const BLASINT *n, const double *alpha,
+                   const double *a, const BLASINT *lda, const double *x, const BLASINT *incx,
+                   const double *beta, double *y, const BLASINT *incy);
+extern void ssyr_64_ (const char *uplo, const BLASINT *n, const float *alpha,
+                  const float *x, const BLASINT *incx, float *A, const BLASINT *lda);
+extern void ssyrk_64_ (const char *uplo, const char *trans, const BLASINT *n,
+                   const BLASINT *k, const float *alpha, const float *A,
+                   const BLASINT *lda, const float *beta, float *C,
+                   const BLASINT *ldc);
+extern void strsm_64_ (const char *side, const char *uplo, const char *transa, 
+                   const char *diag, const BLASINT *m, const BLASINT *n,
+                   const float *alpha, const float *A, const BLASINT *lda,
+                   float *B, const BLASINT *ldb);
+extern void dtrsm_64_ (const char *side, const char *uplo, const char *transa, 
+                   const char *diag, const BLASINT *m, const BLASINT *n,
+                   const double *alpha, const double *A, const BLASINT *lda,
+                   double *B, const BLASINT *ldb);
+extern double sasum_64_ (const BLASINT *n, const float *x, const BLASINT *incx);
+extern double dasum_64_ (const BLASINT *n, const double *x, const BLASINT *incx);
+extern void sscal_64_ (const BLASINT *n, const float *alpha, float *x,
+                   const BLASINT *incx);
+extern void dscal_64_ (const BLASINT *n, const double *alpha, double *x,
+                   const BLASINT *incx);
+extern void sger_64_(const BLASINT *m, const BLASINT *n, const float *alpha,
+                  const float *x, const BLASINT *incx, const float *y,
+                  const BLASINT *incy, float *A, const BLASINT *lda);
+extern void dger_64_(const BLASINT *m, const BLASINT *n, const double *alpha,
+                  const double *x, const BLASINT *incx, const double *y,
+                  const BLASINT *incy, double *A, const BLASINT *lda);
+extern void strsv_64_ (const char *uplo, const char *trans, const char *diag, 
+                   const BLASINT *n, const float *A, const BLASINT *lda, float *x, 
+                   const BLASINT *incx);
+extern void strmm_64_(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const BLASINT *m, const BLASINT *n,
+                 const float *alpha, const float *A, const BLASINT *lda,
+                 float *B, const BLASINT *ldb);
+extern void dtrmm_64_(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const BLASINT *m, const BLASINT *n,
+                 const double *alpha, const double *A, const BLASINT *lda,
+                 double *B, const BLASINT *ldb);
+extern void strmv_64_(const char *uplo, const char *transA, const char *diag,
+                 const BLASINT *n, const float *A, const BLASINT *lda, float *X,
+                 const BLASINT *incX);
+extern void saxpy_64_(const BLASINT *n, const float *alpha, const float *X, const BLASINT *incX,
+		float *Y, const BLASINT *incy);
+extern void daxpy_64_(const BLASINT *n, const double *alpha, const double *X, const BLASINT *incX,
+		double *Y, const BLASINT *incy);
+extern BLASINT isamax_64_(const BLASINT *n, const float *X, const BLASINT *incX);
+extern BLASINT idamax_64_(const BLASINT *n, const double *X, const BLASINT *incX);
+/* for some reason, FLOATRET is not a float but a double in GOTOBLAS */
+extern double sdot_64_(const BLASINT *n, const float *x, const BLASINT *incx, const float *y, const BLASINT *incy);
+extern double ddot_64_(const BLASINT *n, const double *x, const BLASINT *incx, const double *y, const BLASINT *incy);
+extern void sswap_64_(const BLASINT *n, float *x, const BLASINT *incx, float *y, const BLASINT *incy);
+extern void dswap_64_(const BLASINT *n, double *x, const BLASINT *incx, double *y, const BLASINT *incy);
+
+#endif /* __BLAS_H__ */

+ 21 - 0
julia/src/blas.jl

@@ -0,0 +1,21 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+@enum STARPU_BLAS begin
+    STARPU_SAXPY
+end
+
+cuda_blas_codelets = Dict(STARPU_SAXPY => "julia_saxpy_cuda_codelet")
+cpu_blas_codelets = Dict(STARPU_SAXPY => "julia_saxpy_cpu_codelet")

+ 50 - 0
julia/src/blas_wrapper.c

@@ -0,0 +1,50 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <blas.h>
+
+#if defined(STARPU_ATLAS) || defined(STARPU_OPENBLAS) || defined(STARPU_MKL)
+void julia_saxpy_cpu_codelet(void *descr[], void *arg)
+{
+	float alpha = *((float *)arg);
+
+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+
+	float *block_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
+	float *block_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	STARPU_SAXPY((int)n, alpha, block_x, 1, block_y, 1);
+}
+#endif
+
+#ifdef STARPU_USE_CUDA
+
+#include <starpu_cublas_v2.h>
+
+void julia_saxpy_cuda_codelet(void *descr[], void *arg)
+{
+	float alpha = *((float *)arg);
+
+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+
+	float *block_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
+	float *block_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	cublasStatus_t status = cublasSaxpy(starpu_cublas_get_local_handle(), (int)n, &alpha, block_x, 1, block_y, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+}
+#endif

+ 25 - 8
julia/src/compiler/c.jl

@@ -58,15 +58,16 @@ function transform_to_cpu_kernel(expr :: StarpuExprFunction)
     output = add_for_loop_declarations(expr)
     output = substitute_args(output)
     output = substitute_func_calls(output)
+    output = substitute_views(output)
     output = substitute_indexing(output)
     output = flatten_blocks(output)
 
     return output
 end
 
-function generate_c_struct_param_declaration(funcname)
-    scalar_parameters = CODELETS_SCALARS[funcname]
-    struct_params_name = CODELETS_PARAMS_STRUCT[funcname]
+function generate_c_struct_param_declaration(codelet_name)
+    scalar_parameters = CODELETS_SCALARS[codelet_name]
+    struct_params_name = CODELETS_PARAMS_STRUCT[codelet_name]
 
     output = "struct $struct_params_name {\n"
     for p in scalar_parameters
@@ -197,18 +198,18 @@ function substitute_args(expr :: StarpuExprFunction)
 
 
     new_args = [
-                    starpu_parse(:($buffer_arg_name :: Matrix{Nothing})),
-                    starpu_parse(:($cl_arg_name :: Vector{Nothing}))
-                ]
+        starpu_parse(:($buffer_arg_name :: Ptr{Ptr{Nothing}})),
+        starpu_parse(:($cl_arg_name :: Vector{Nothing}))
+    ]
     new_body = StarpuExprBlock([function_start_affectations..., new_body.exprs...])
 
     return StarpuExprFunction(expr.ret_type, expr.func, new_args, new_body)
 end
 
 func_substitution = Dict(
-    :width => :STARPU_MATRIX_GET_NY,
+    :width  => :STARPU_MATRIX_GET_NY,
     :height => :STARPU_MATRIX_GET_NX,
-
+    :ld     => :STARPU_MATRIX_GET_LD,
     :length => :STARPU_VECTOR_GET_NX
 )
 
@@ -228,6 +229,22 @@ function substitute_func_calls(expr :: StarpuExpr)
     return apply(func_to_apply, expr)
 end
 
+function substitute_views(expr :: StarpuExpr)
+    function func_to_apply(x :: StarpuExpr)
+
+        if !isa(x, StarpuExprCall) || x.func != :view
+            return x
+        end
+
+        ref = x.args[1]
+        indexes = map(i -> isa(i, StarpuExprInterval) ? i.start : i, x.args[2:end])
+
+        return StarpuExprAddress(StarpuExprRef(ref, indexes))
+    end
+
+    return apply(func_to_apply, expr)
+
+end
 
 function substitute_indexing(expr :: StarpuExpr)
 

+ 155 - 25
julia/src/compiler/cuda.jl

@@ -129,7 +129,134 @@ function add_device_to_interval_call(expr :: StarpuExpr)
     return apply(func_to_apply, expr)
 end
 
+function translate_cublas(expr :: StarpuExpr)
+    function func_to_run(x :: StarpuExpr)
+        # STARPU_BLAS => (CUBLAS, TRANS, FILLMODE, ALPHA, SIDE, DIAG)
+        blas_to_cublas = Dict(:STARPU_SGEMM  => (:cublasSgemm, [1, 2], [], [6, 11], [], []),
+                              :STARPU_DGEMM  => (:cublasDgemm, [1, 2], [], [6, 11], [], []),
+                              :STARPU_SGEMV  => (:cublasSgemv, [1], [], [4,9], [], []),
+                              :STARPU_DGEMV  => (:cublasDgemv, [1], [], [4,9], [], []),
+                              :STARPU_SSCAL  => (:cublasSscal, [], [], [2], [], []),
+                              :STARPU_DSCAL  => (:cublasDscal, [], [], [2], [], []),
+                              :STARPU_STRSM  => (:cublasStrsm, [3], [2], [7], [1], [4]),
+                              :STARPU_DTRSM  => (:cublasDtrsm, [3], [2], [7], [1], [4]),
+                              :STARPU_SSYR   => (:cublasSsyr, [], [1], [3], [], []),
+                              :STARPU_SSYRK  => (:cublasSsyrk, [2], [1], [5,8], [], []),
+                              :STARPU_SGER   => (:cublasSger, [], [], [3], [], []),
+                              :STARPU_DGER   => (:cublasDger, [], [], [3], [], []),
+                              :STARPU_STRSV  => (:cublasStrsv, [2], [1], [], [], [3]),
+                              :STARPU_STRMM  => (:cublasStrmm, [3], [2], [7], [1], [4]),
+                              :STARPU_DTRMM  => (:cublasDtrmm, [3], [2], [7], [1], [4]),
+                              :STARPU_STRMV  => (:cublasStrmv, [2], [1], [], [], [3]),
+                              :STARPU_SAXPY  => (:cublasSaxpy, [], [], [2], [], []),
+                              :STARPU_DAXPY  => (:cublasDaxpy, [], [], [2], [], []),
+                              :STARPU_SSWAP  => (:cublasSswap, [], [], [], [], []),
+                              :STARPU_DSWAP  => (:cublasDswap, [], [], [], [], []))
+
+        if !(isa(x, StarpuExprCall) && x.func in keys(blas_to_cublas))
+            return x
+        end
+
+        new_args = x.args
+
+        # cublasOperation_t parameters (e.g. StarpuExprValue("N"))
+        for i in blas_to_cublas[x.func][2]
+            if !isa(new_args[i], StarpuExprValue) || !isa(new_args[i].value, String)
+                error("Argument $i of ", x.func, " must be a string")
+            end
+
+            value = new_args[i].value
+
+            if value == "N" || value == "n"
+                new_args[i] = StarpuExprVar(:CUBLAS_OP_N)
+            elseif value == "T" || value == "t"
+                new_args[i] = StarpuExprVar(:CUBLAS_OP_T)
+            elseif value == "C" || value == "c"
+                new_args[i] = StarpuExprVar(:CUBLAS_OP_C)
+            else
+                error("Unhandled value for argument $i of ", x.func, ": ", value,
+                      ", expecting (\"N\", \"T\", or \"C\")")
+            end
+        end
+
+        # cublasFillMode_t parameters (e.g. StarpuExprValue("L"))
+        for i in blas_to_cublas[x.func][3]
+            if !isa(new_args[i], StarpuExprValue) || !isa(new_args[i].value, String)
+                error("Argument $i of ", x.func, " must be a string")
+            end
+
+            value = new_args[i].value
+
+            if value == "L" || value == "l"
+                new_args[i] = StarpuExprVar(:CUBLAS_FILL_MODE_LOWER)
+            elseif value == "U" || value == "u"
+                new_args[i] = StarpuExprVar(:CUBLAS_FILL_MODE_UPPER)
+            else
+                error("Unhandled value for argument $i of ", x.func, ": ", value,
+                      ", expecting (\"L\" or \"U\")")
+            end
+        end
+
+        # scalar parameters (alpha, beta, ...):  alpha -> &alpha
+        for i in blas_to_cublas[x.func][4]
+            if !isa(new_args[i], StarpuExprVar)
+                error("Argument $i of ", x.func, " must be a variable")
+            end
+            var_name = new_args[i].name
+            new_args[i] = StarpuExprVar(Symbol("&$var_name"))
+        end
+
+        # cublasSideMode_t parameters (e.g. StarpuExprValue("L"))
+        for i in blas_to_cublas[x.func][5]
+            if !isa(new_args[i], StarpuExprValue) || !isa(new_args[i].value, String)
+                error("Argument $i of ", x.func, " must be a string, got: ", new_args[i])
+            end
 
+            value = new_args[i].value
+
+            if value == "L" || value == "l"
+                new_args[i] = StarpuExprVar(:CUBLAS_SIDE_LEFT)
+            elseif value == "R" || value == "r"
+                new_args[i] = StarpuExprVar(:CUBLAS_SIDE_RIGHT)
+            else
+                error("Unhandled value for argument $i of ", x.func, ": ", value,
+                      ", expecting (\"L\" or \"R\")")
+            end
+        end
+
+        # cublasDiagType_t parameters (e.g. StarpuExprValue("N"))
+        for i in blas_to_cublas[x.func][6]
+            if !isa(new_args[i], StarpuExprValue) || !isa(new_args[i].value, String)
+                error("Argument $i of ", x.func, " must be a string")
+            end
+
+            value = new_args[i].value
+
+            if value == "N" || value == "n"
+                new_args[i] = StarpuExprVar(:CUBLAS_DIAG_NON_UNIT)
+            elseif value == "U" || value == "u"
+                new_args[i] = StarpuExprVar(:CUBLAS_DIAG_UNIT)
+            else
+                error("Unhandled value for argument $i of ", x.func, ": ", value,
+                      ", expecting (\"N\" or \"U\")")
+            end
+        end
+
+        new_args = [@parse(starpu_cublas_get_local_handle()), x.args...]
+
+        status_varname = "status"*rand_string()
+        status_var = StarpuExprVar(Symbol("cublasStatus_t "*status_varname))
+        call_expr = StarpuExprCall(blas_to_cublas[x.func][1], new_args)
+
+        return StarpuExprBlock([StarpuExprAffect(status_var, call_expr),
+                                starpu_parse(Meta.parse("""if $status_varname != CUBLAS_STATUS_SUCCESS
+                                                              STARPU_CUBLAS_REPORT_ERROR($status_varname)
+                                                          end""")),
+                                @parse cudaStreamSynchronize(starpu_cuda_get_local_stream())])
+    end
+
+    return apply(func_to_run, expr)
+end
 
 function transform_to_cuda_kernel(func :: StarpuExprFunction)
 
@@ -137,45 +264,48 @@ function transform_to_cuda_kernel(func :: StarpuExprFunction)
 
     init, indep, finish = extract_init_indep_finish(cpu_func.body)
 
-    if indep == nothing
-        error("No independant for loop has been found") # TODO can fail because extraction is not correct yet
-    end
+    cpu_instr = init
+    kernel = nothing
 
-    prekernel_instr, kernel_args, kernel_instr = analyse_sets(indep)
+    # Generate a CUDA kernel only if there is an independent loop (@parallel macro).
+    if (indep != nothing)
+        prekernel_instr, kernel_args, kernel_instr = analyse_sets(indep)
 
-    kernel_call = StarpuExprCudaCall(:cudaKernel, (@parse nblocks), (@parse THREADS_PER_BLOCK), StarpuExpr[])
-    prekernel_instr = vcat(init, prekernel_instr)
-    kernel_instr = vcat(kernel_instr, indep.body)
+        kernel_call = StarpuExprCudaCall(:cudaKernel, (@parse nblocks), (@parse THREADS_PER_BLOCK), StarpuExpr[])
+        cpu_instr = vcat(cpu_instr, prekernel_instr)
+        kernel_instr = vcat(kernel_instr, indep.body)
 
-    indep_for_def, indep_for_undef = analyse_variable_declarations(StarpuExprBlock(kernel_instr), kernel_args)
-    prekernel_def, prekernel_undef = analyse_variable_declarations(StarpuExprBlock(prekernel_instr), cpu_func.args)
+        indep_for_def, indep_for_undef = analyse_variable_declarations(StarpuExprBlock(kernel_instr), kernel_args)
+        prekernel_def, prekernel_undef = analyse_variable_declarations(StarpuExprBlock(cpu_instr), cpu_func.args)
 
-    for undef_var in indep_for_undef
+        for undef_var in indep_for_undef
 
-        found_var = find_variable(undef_var, prekernel_def)
+            found_var = find_variable(undef_var, prekernel_def)
 
-        if found_var == nothing # TODO : error then ?
-            continue
+            if found_var == nothing # TODO : error then ?
+                continue
+            end
+
+            push!(kernel_args, found_var)
         end
 
-        push!(kernel_args, found_var)
+        call_args = map((x -> StarpuExprVar(x.name)), kernel_args)
+        kernelname=Symbol("KERNEL_",func.func);
+        cuda_call = StarpuExprCudaCall(kernelname, (@parse nblocks), (@parse THREADS_PER_BLOCK), call_args)
+        push!(cpu_instr, cuda_call)
+        push!(cpu_instr, @parse cudaStreamSynchronize(starpu_cuda_get_local_stream()))
+        kernel = StarpuExprFunction(Nothing, kernelname, kernel_args, StarpuExprBlock(kernel_instr))
+        kernel = add_device_to_interval_call(kernel)
+        kernel = flatten_blocks(kernel)
     end
 
-    call_args = map((x -> StarpuExprVar(x.name)), kernel_args)
-    kernelname=Symbol("KERNEL_",func.func);
-    cuda_call = StarpuExprCudaCall(kernelname, (@parse nblocks), (@parse THREADS_PER_BLOCK), call_args)
-    push!(prekernel_instr, cuda_call)
-    push!(prekernel_instr, @parse cudaStreamSynchronize(starpu_cuda_get_local_stream()))
-    prekernel_instr = vcat(prekernel_instr, finish)
+    cpu_instr = vcat(cpu_instr, finish)
 
     prekernel_name = Symbol("CUDA_", func.func)
-    prekernel = StarpuExprFunction(Nothing, prekernel_name, cpu_func.args, StarpuExprBlock(prekernel_instr))
+    prekernel = StarpuExprFunction(Nothing, prekernel_name, cpu_func.args, StarpuExprBlock(cpu_instr))
+    prekernel = translate_cublas(prekernel)
     prekernel = flatten_blocks(prekernel)
 
-    kernel = StarpuExprFunction(Nothing, kernelname, kernel_args, StarpuExprBlock(kernel_instr))
-    kernel = add_device_to_interval_call(kernel)
-    kernel = flatten_blocks(kernel)
-    
     return prekernel, kernel
 end
 

+ 24 - 16
julia/src/compiler/expressions.jl

@@ -121,6 +121,9 @@ struct StarpuExprWhile <: StarpuExpr
     body :: StarpuExpr
 end
 
+struct StarpuExprAddress <: StarpuExpr
+    ref :: StarpuExpr
+end
 
 function starpu_parse_affect(x :: Expr)
 
@@ -247,7 +250,7 @@ function starpu_parse_call(x :: Expr)
 end
 
 
-starpu_infix_operators = (:(+), :(*), :(-), :(/), :(<), :(>), :(<=), :(>=), :(%))
+starpu_infix_operators = (:(+), :(*), :(-), :(/), :(<), :(>), :(<=), :(>=), :(!=), :(%))
 
 
 function print_prefix(io :: IO, x :: StarpuExprCall ; indent = 0, restrict=false)
@@ -293,7 +296,6 @@ function apply(func :: Function, expr :: StarpuExprCall)
     return func(StarpuExprCall(expr.func, map((x -> apply(func, x)), expr.args)))
 end
 
-
 #======================================================
                 CUDA KERNEL CALL
 ======================================================#
@@ -731,8 +733,6 @@ function print(io :: IO, x :: StarpuExprRef ; indent = 0,restrict=false)
 
 end
 
-
-
 function apply(func :: Function, expr :: StarpuExprRef)
 
     ref = apply(func, expr.ref)
@@ -741,6 +741,16 @@ function apply(func :: Function, expr :: StarpuExprRef)
     return func(StarpuExprRef(ref, indexes))
 end
 
+function print(io :: IO, x :: StarpuExprAddress ; indent = 0, restrict=false)
+    print(io, "&")
+    print(io, x.ref, indent = indent)
+end
+
+function apply(func :: Function, expr :: StarpuExprAddress)
+    ref = apply(func, expr.ref)
+    return func(StarpuExprAddress(ref))
+end
+
 #======================================================
                 BREAK EXPRESSION
 ======================================================#
@@ -796,7 +806,7 @@ function apply(func :: Function, expr :: StarpuExpr)
     return func(expr)
 end
 
-print(io :: IO, x :: StarpuExprVar ; indent = 0) = print(io, x.name)
+print(io :: IO, x :: StarpuExprVar ; indent = 0, restrict = false) = print(io, x.name)
 
 function print(io :: IO, x :: StarpuExprValue ; indent = 0,restrict=false)
 
@@ -866,26 +876,24 @@ end
 
 function starpu_type_traduction(x)
     if x <: Array
-        return starpu_type_traduction_array(x)
+        return starpu_type_traduction(eltype(x)) * "*"
     end
 
     if x <: Ptr
-        return starpu_type_traduction(eltype(x)) * "*"
+        depth = 1
+        type = eltype(x)
+        while type <: Ptr
+            depth +=1
+            type = eltype(type)
+        end
+
+        return starpu_type_traduction(type) * "*"^depth
     end
 
     return starpu_type_traduction_dict[x]
 
 end
 
-function starpu_type_traduction_array(x :: Type{Array{T,N}})  where {T,N}
-    output = starpu_type_traduction(T)
-    for i in (1 : N)
-        output *= "*"
-    end
-
-    return output
-end
-
 function print(io :: IO, x :: StarpuExprTyped ; indent = 0,restrict=false)
 
     if (isa(x, StarpuExprTypedVar))

+ 17 - 13
julia/src/compiler/file_generation.jl

@@ -12,6 +12,8 @@ const cpu_kernel_file_start = "#include <stdio.h>
 #include <starpu.h>
 #include <math.h>
 
+#include \"blas.h\"
+
 static inline long long jlstarpu_max(long long a, long long b)
 {
 	return (a > b) ? a : b;
@@ -32,15 +34,16 @@ const cuda_kernel_file_start = "#include <stdio.h>
 #include <stdint.h>
 #include <starpu.h>
 #include <math.h>
+#include <starpu_cublas_v2.h>
 
 #define THREADS_PER_BLOCK 64
 
-static inline long long jlstarpu_max(long long a, long long b)
+__attribute__((unused)) static inline long long jlstarpu_max(long long a, long long b)
 {
 	return (a > b) ? a : b;
 }
 
-static inline long long jlstarpu_interval_size(long long start, long long step, long long stop)
+__attribute__((unused)) static inline long long jlstarpu_interval_size(long long start, long long step, long long stop)
 {
     if (stop >= start){
             return jlstarpu_max(0, (stop - start + 1) / step);
@@ -50,12 +53,12 @@ static inline long long jlstarpu_interval_size(long long start, long long step,
 }
 
 
-__device__ static inline long long jlstarpu_max__device(long long a, long long b)
+__attribute__((unused)) __device__ static inline long long jlstarpu_max__device(long long a, long long b)
 {
 	return (a > b) ? a : b;
 }
 
-__device__ static inline long long jlstarpu_interval_size__device(long long start, long long step, long long stop)
+__attribute__((unused)) __device__ static inline long long jlstarpu_interval_size__device(long long start, long long step, long long stop)
 {
 	if (stop >= start){
 		return jlstarpu_max__device(0, (stop - start + 1) / step);
@@ -64,7 +67,6 @@ __device__ static inline long long jlstarpu_interval_size__device(long long star
 	}
 }
 
-
 "
 
 """
@@ -109,7 +111,7 @@ macro codelet(x)
     cpu_name = name
     cuda_name = "CUDA_"*name
     dump(name)
-    parse_scalar_parameters(parsed, cpu_name, cuda_name)
+    parse_scalar_parameters(parsed, name)
     c_struct_param_decl = generate_c_struct_param_declaration(name)
     cpu_expr = transform_to_cpu_kernel(parsed)
 
@@ -130,11 +132,15 @@ macro codelet(x)
         CPU_CODELETS[name]=cpu_name
     end
 
-    if starpu_target & STARPU_CUDA!=0
+    if (starpu_target & STARPU_CUDA!=0) && STARPU_USE_CUDA == 1
         kernel_file = open(generated_cuda_kernel_file_name, "w")
         debug_print("generating ", generated_cuda_kernel_file_name)
         print(kernel_file, cuda_kernel_file_start)
-        print(kernel_file, "__global__ ", kernel)
+
+        if kernel != nothing
+            print(kernel_file, "__global__ ", kernel)
+        end
+
         print(kernel_file, c_struct_param_decl)
         print(kernel_file, "\nextern \"C\" ", prekernel)
         close(kernel_file)
@@ -142,7 +148,7 @@ macro codelet(x)
     end
 end
 
-function parse_scalar_parameters(expr :: StarpuExprFunction, cpu_name::String, cuda_name::String)
+function parse_scalar_parameters(expr :: StarpuExprFunction, codelet_name)
     scalar_parameters = []
     for i in (1 : length(expr.args))
         type = expr.args[i].typ
@@ -151,8 +157,7 @@ function parse_scalar_parameters(expr :: StarpuExprFunction, cpu_name::String, c
         end
     end
 
-    CODELETS_SCALARS[cpu_name] = scalar_parameters
-    CODELETS_SCALARS[cuda_name] = scalar_parameters
+    CODELETS_SCALARS[codelet_name] = scalar_parameters
 
     # declare structure carrying scalar parameters
     struct_params_name = Symbol("params_", rand_string())
@@ -168,6 +173,5 @@ function parse_scalar_parameters(expr :: StarpuExprFunction, cpu_name::String, c
     eval(Meta.parse(add_to_dict_str))
 
     # save structure name
-    CODELETS_PARAMS_STRUCT[cpu_name] = struct_params_name
-    CODELETS_PARAMS_STRUCT[cuda_name] = struct_params_name
+    CODELETS_PARAMS_STRUCT[codelet_name] = struct_params_name
 end

+ 235 - 0
julia/src/data.jl

@@ -0,0 +1,235 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+const StarpuDataHandlePointer = Ptr{Cvoid}
+StarpuDataHandle = StarpuDestructible{StarpuDataHandlePointer}
+
+@enum(StarpuDataFilterFunc,
+      STARPU_MATRIX_FILTER_VERTICAL_BLOCK = 0,
+      STARPU_MATRIX_FILTER_BLOCK = 1,
+      STARPU_VECTOR_FILTER_BLOCK = 2,
+)
+
+export starpu_data_filter
+function starpu_data_filter(filter_func ::StarpuDataFilterFunc, nchildren ::Integer)
+    output = starpu_data_filter(zero)
+    output.nchildren = UInt32(nchildren)
+
+    if filter_func == STARPU_MATRIX_FILTER_VERTICAL_BLOCK
+        output.filter_func = Libdl.dlsym(starpu_wrapper_library_handle, "starpu_matrix_filter_vertical_block")
+    elseif filter_func == STARPU_MATRIX_FILTER_BLOCK
+        output.filter_func = Libdl.dlsym(starpu_wrapper_library_handle, "starpu_matrix_filter_block")
+    else filter_func == STARPU_VECTOR_FILTER_BLOCK
+        output.filter_func = Libdl.dlsym(starpu_wrapper_library_handle, "starpu_vector_filter_block")
+    end
+
+    return output
+end
+
+function starpu_memory_pin(data :: Union{Vector{T}, Matrix{T}}) where T
+    starpu_memory_pin(data, sizeof(data))::Cint
+end
+
+function starpu_memory_unpin(data :: Union{Vector{T}, Matrix{T}}) where T
+    starpu_memory_unpin(data, sizeof(data))::Cint
+end
+
+function StarpuNewDataHandle(ptr :: StarpuDataHandlePointer, destr :: Function...) :: StarpuDataHandle
+    return StarpuDestructible(ptr, destr...)
+end
+
+
+
+function starpu_data_unregister_pointer(ptr :: StarpuDataHandlePointer)
+    starpu_data_unregister(ptr)
+end
+
+function starpu_data_unregister(handles :: StarpuDataHandle...)
+    for h in handles
+        starpu_execute_destructor!(h, starpu_data_unregister_pointer)
+    end
+end
+
+function starpu_data_register(v :: Vector{T}) where T
+    output = Ref{Ptr{Cvoid}}(0)
+    data_pointer = pointer(v)
+
+    starpu_vector_data_register(output, STARPU_MAIN_RAM, data_pointer, length(v), sizeof(T))
+    return StarpuNewDataHandle(output[], starpu_data_unregister_pointer)#, [starpu_data_unregister_pointer])
+end
+
+function starpu_data_register(m :: Matrix{T}) where T
+
+    output = Ref{Ptr{Cvoid}}(0)
+    data_pointer = pointer(m)
+    (height, width) = size(m)
+
+    starpu_matrix_data_register(output, STARPU_MAIN_RAM, data_pointer, height, height, width, sizeof(T))
+    return StarpuNewDataHandle(output[], starpu_data_unregister_pointer)#, [starpu_data_unregister_pointer])
+end
+
+function starpu_data_register(block :: Array{T,3}) where T
+
+    output = Ref{Ptr{Cvoid}}(0)
+    data_pointer = pointer(block)
+    (height, width, depth) = size(block)
+
+    starpu_block_data_register(output, STARPU_MAIN_RAM, data_pointer, height, height * width, height, width, depth, sizeof(T))
+    return StarpuNewDataHandle(output[], starpu_data_unregister_pointer)
+end
+
+function starpu_data_register(ref :: Ref{T}) where T
+
+    output = Ref{Ptr{Cvoid}}(0)
+
+    starpu_variable_data_register(output, STARPU_MAIN_RAM, ref, sizeof(T))
+    return StarpuNewDataHandle(output[], starpu_data_unregister_pointer)
+end
+
+function starpu_data_register(x1, x2, next_args...)
+
+    handle_1 = starpu_data_register(x1)
+    handle_2 = starpu_data_register(x2)
+
+    next_handles = map(starpu_data_register, next_args)
+
+    return [handle_1, handle_2, next_handles...]
+end
+
+import Base.getindex
+function Base.getindex(handle :: StarpuDataHandle, indexes...)
+    output = starpu_data_get_sub_data(handle.object, length(indexes),
+                                      map(x->x-1, indexes)...)
+    return StarpuNewDataHandle(output)
+end
+
+function starpu_data_unpartition_pointer(ptr :: StarpuDataHandlePointer)
+    starpu_data_unpartition(ptr, STARPU_MAIN_RAM)
+end
+
+function starpu_data_partition(handle :: StarpuDataHandle, filter :: starpu_data_filter)
+
+    starpu_add_destructor!(handle, starpu_data_unpartition_pointer)
+    starpu_data_partition(handle.object, pointer_from_objref(filter))
+end
+
+function starpu_data_unpartition(handles :: StarpuDataHandle...)
+
+    for h in handles
+        starpu_execute_destructor!(h, starpu_data_unpartition_pointer)
+    end
+
+    return nothing
+end
+
+function starpu_data_map_filters(handle :: StarpuDataHandle, filter :: starpu_data_filter)
+    starpu_add_destructor!(handle, starpu_data_unpartition_pointer)
+    starpu_data_map_filters(handle.object, 1, pointer_from_objref(filter))
+end
+
+function starpu_data_map_filters(handle :: StarpuDataHandle, filter_1 :: starpu_data_filter, filter_2 :: starpu_data_filter)
+    starpu_add_destructor!(handle, starpu_data_unpartition_pointer)
+    starpu_data_map_filters(handle.object, 2, pointer_from_objref(filter_1), pointer_from_objref(filter_2))
+end
+
+function starpu_data_get_sequential_consistency_flag(handle :: StarpuDataHandle)
+    return starpu_data_get_sequential_consistency_flag(handle.object)
+end
+
+function starpu_data_set_sequential_consistency_flag(handle :: StarpuDataHandle, flag :: Int)
+    starpu_data_set_sequential_consistency_flag(handle.object, flag)
+end
+
+function starpu_data_acquire_on_node(handle :: StarpuDataHandle, node :: Int, mode)
+    starpu_data_acquire_on_node(handle.object, node, mode)
+end
+
+function starpu_data_release_on_node(handle :: StarpuDataHandle, node :: Int)
+    starpu_data_release_on_node(handle.object, node)
+end
+
+function starpu_data_wont_use(handle :: StarpuDataHandle)
+    starpu_data_wont_use(handle.object)
+end
+
+function repl(x::Symbol)
+    return x
+end
+function repl(x::Number)
+    return x
+end
+function repl(x :: Expr)
+    if (x.head == :call && x.args[1] == :+)
+        if (x.args[2] == :_)
+            return x.args[3]
+        elseif (x.args[3] == :_)
+            return x.args[2]
+        else return Expr(:call,:+,repl(x.args[2]),repl(x.args[3]))
+        end
+    elseif (x.head == :call && x.args[1] == :-)
+        if (x.args[2] == :_)
+            return Expr(:call,:-,x.args[3])
+        elseif (x.args[3] == :_)
+            return x.args[2]
+        else return Expr(:call,:-,repl(x.args[2]),repl(x.args[3]))
+        end
+    else return Expr(:call,x.args[1],repl(x.args[2]),repl(x.args[3]))
+    end
+end
+"""
+    Declares a subarray.
+    Ex : @starpu_filter ha = A[ _:_+1, : ] 
+ 
+"""
+macro starpu_filter(expr)
+    #dump(expr, maxdepth=20)
+    if (expr.head==Symbol("="))
+        region = expr.args[2]
+        if (region.head == Symbol("ref"))
+            farray = expr.args[1]
+            println("starpu filter")
+            index = 0
+            filter2=nothing
+            filter3=nothing
+            if (region.args[2]==Symbol(":"))
+                index = 3
+                filter2=:(STARPU_MATRIX_FILTER_BLOCK)
+            elseif (region.args[3] == Symbol(":"))
+                index = 2
+                filter3=:(STARPU_MATRIX_FILTER_VERTICAL_BLOCK)
+            else
+            end
+            ex = repl(region.args[index].args[3])
+            if (region.args[index].args[2] != Symbol("_"))
+                throw(AssertionError("LHS must be _"))
+            end
+            ret = quote
+                # escape and not global for farray!
+                $(esc(farray)) = starpu_data_register($(esc(region.args[1])))
+                starpu_data_partition( $(esc(farray)),starpu_data_filter($(esc(filter)),$(esc(ex))))
+            end
+            return ret
+        else
+            ret = quote
+                $(esc(farray))= starpu_data_register($(esc(region.args[1])))
+            end
+            
+            dump("coucou"); #dump(region.args[2])
+            #                dump(region.args[2])
+            #                dump(region.args[3])
+            return ret
+        end
+    end
+end

+ 48 - 0
julia/src/dynamic_compiler/Makefile.am

@@ -0,0 +1,48 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+LD=$(CC_OR_NVCC)
+AM_CPPFLAGS = -I$(abs_top_srcdir)/include/ -I$(abs_top_builddir)/src -I$(abs_top_builddir)/include \
+	 -I$(abs_top_srcdir)/julia/src/
+
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
+AM_CFLAGS += -fPIC -O3 -g -DSTRIDE=${STRIDE} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@
+CUDA_CFLAGS = $(STARPU_CUDA_CPPFLAGS) -Wno-deprecated-gpu-targets
+LDFLAGS = -L @STARPU_BUILD_DIR@/julia/src/.libs/ -lstarpujulia-1.3
+EXTERNLIB=extern_tasks.so
+GENERATEDLIB=generated_tasks.so
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+
+if STARPU_USE_CUDA
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+else
+CUDA_OBJECTS=
+endif
+
+%.o: %.c
+	$(CC) -c $(AM_CPPFLAGS) $(AM_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(AM_CPPFLAGS) $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+
+${EXTERNLIB}: $(SOURCES_CPU)
+	$(CC) $(AM_CPPFLAGS) $(AM_CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $^ -o $@ $(LDFLAGS)
+

+ 50 - 0
julia/src/globals.jl

@@ -0,0 +1,50 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+global starpu_wrapper_library_handle = C_NULL
+
+global starpu_tasks_library_handle = C_NULL
+
+global starpu_target=STARPU_CPU
+
+global generated_cuda_kernel_file_name = "PRINT TO STDOUT"
+global generated_cpu_kernel_file_name = "PRINT TO STDOUT"
+
+global CPU_CODELETS=Dict{String,String}()
+global CUDA_CODELETS=Dict{String,String}()
+
+global CODELETS_SCALARS=Dict{String,Any}()
+global CODELETS_PARAMS_STRUCT=Dict{String,Any}()
+
+global starpu_type_traduction_dict = Dict(
+    Int32 => "int32_t",
+    UInt32 => "uint32_t",
+    Float32 => "float",
+    Int64 => "int64_t",
+    UInt64 => "uint64_t",
+    Float64 => "double",
+    Nothing => "void"
+)
+export starpu_type_traduction_dict
+
+global mutex = Threads.SpinLock()
+
+# detect CUDA support
+try
+    STARPU_USE_CUDA == 1
+catch
+    global const STARPU_USE_CUDA = 0
+end

+ 73 - 0
julia/src/init.jl

@@ -0,0 +1,73 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+"""
+    Must be called before any other starpu function. Field extern_task_path is the
+    shared library path which will be used to find StarpuCodelet
+    cpu and gpu function names
+"""
+function starpu_init()
+    debug_print("starpu_init")
+
+    if (get(ENV,"JULIA_TASK_LIB",0)!=0)
+        global starpu_tasks_library_handle= Libdl.dlopen(ENV["JULIA_TASK_LIB"])
+        debug_print("Loading external codelet library")
+        ff = Libdl.dlsym(starpu_tasks_library_handle,:starpu_find_function)
+        dump(ff)
+        for k in keys(CPU_CODELETS)
+            CPU_CODELETS[k]=unsafe_string(ccall(ff,Cstring, (Cstring,Cstring),Cstring_from_String(string(k)),Cstring_from_String("cpu")))
+            if STARPU_USE_CUDA == 1
+                CUDA_CODELETS[k]=unsafe_string(ccall(ff,Cstring, (Cstring,Cstring),Cstring_from_String(string(k)),Cstring_from_String("gpu")))
+            end
+            print(k,">>>>",CPU_CODELETS[k],"\n")
+        end
+    else
+        srcdir=get(ENV,"STARPU_JULIA_BUILD",0)
+        if (srcdir == 0)
+            error("Must define environment variable STARPU_JULIA_BUILD")
+        end
+        makefile=string(srcdir, "/src/dynamic_compiler/Makefile")
+        debug_print("generating codelet library with ")
+        debug_print(makefile)
+        run(`make -f $makefile generated_tasks.so`)
+        global starpu_tasks_library_handle=Libdl.dlopen("generated_tasks.so")
+    end
+    global starpu_wrapper_library_handle= Libdl.dlopen(starpu_wrapper_library_name)
+    output = starpu_init(C_NULL)
+
+    global task_pool = ThreadPools.QueuePool(2)
+
+    starpu_enter_new_block()
+
+    return output
+end
+
+"""
+    Must be called at the end of the program
+"""
+function starpu_shutdown()
+    debug_print("starpu_shutdown")
+
+    starpu_exit_block()
+    @starpucall starpu_shutdown Cvoid ()
+
+    lock(mutex)
+    empty!(perfmodel_list)
+    empty!(codelet_list)
+    empty!(task_list)
+    unlock(mutex)
+
+    return nothing
+end

+ 17 - 16
julia/src/jlstarpu.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2018                                     Alexis Juven
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -13,22 +13,23 @@
  *
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
-/*
- * jlstarpu.h
- *
- *  Created on: 27 juin 2018
- *      Author: ajuven
- */
 
-#ifndef JLSTARPU_H_
-#define JLSTARPU_H_
+#ifndef __MPI_TESTS_GEMM_HELPER__
+#define __MPI_TESTS_GEMM_HELPER__
+
+#include <starpu_config.h>
+
+extern unsigned nslices;
+extern unsigned matrix_dim;
+extern unsigned check;
+extern int comm_thread_cpuid;
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <starpu.h>
-#include <pthread.h>
 
-#include "jlstarpu_utils.h"
-#include "jlstarpu_task.h"
+void gemm_alloc_data();
+int gemm_init_data();
+int gemm_submit_tasks();
+void gemm_release();
+void gemm_add_polling_dependencies();
+int gemm_submit_tasks_with_tags(int with_tags);
 
-#endif /* JLSTARPU_H_ */
+#endif /* __MPI_TESTS_GEMM_HELPER__ */

+ 9 - 0
julia/src/openblas_ldflags.jl

@@ -0,0 +1,9 @@
+import LinearAlgebra.BLAS
+import Libdl
+
+
+libdir = normpath(joinpath(splitpath(filter(x->occursin(Base.libblas_name,x), Libdl.dllist())[1])[1:end-1]...))
+libpath = normpath(filter(x->occursin(Base.libblas_name,x), Libdl.dllist())[1])
+libname = Base.libblas_name[4:end]
+println("-Wl,-rpath,$libpath -L$libdir -l$libname")
+

+ 400 - 0
julia/src/task.jl

@@ -0,0 +1,400 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using ThreadPools
+
+mutable struct jl_starpu_codelet
+    c_codelet :: starpu_codelet
+    perfmodel :: starpu_perfmodel
+    cpu_func :: Union{String, STARPU_BLAS}
+    cuda_func :: Union{String, STARPU_BLAS}
+    opencl_func :: String
+    modes
+end
+
+global codelet_list = Vector{jl_starpu_codelet}()
+
+function starpu_codelet(;
+                        cpu_func :: Union{String, STARPU_BLAS, Cvoid} = "",
+                        cuda_func :: Union{String, STARPU_BLAS, Cvoid} = "",
+                        opencl_func :: String = "",
+                        modes = [],
+                        perfmodel :: starpu_perfmodel,
+                        where_to_execute :: Union{Cvoid, UInt32} = nothing,
+                        color :: UInt32 = 0x00000000
+                        )
+
+    if (length(modes) > STARPU_NMAXBUFS)
+        error("Codelet has too many buffers ($(length(modes)) but only $STARPU_NMAXBUFS are allowed)")
+    end
+
+
+    if (where_to_execute == nothing)
+        real_where = ((cpu_func != nothing && cpu_func != "") * STARPU_CPU) | ((cuda_func != nothing && cuda_func != "") * STARPU_CUDA)
+    else
+        real_where = where_to_execute
+    end
+
+    output = jl_starpu_codelet(starpu_codelet(zero), perfmodel, cpu_func, cuda_func, opencl_func, modes)
+    ## TODO: starpu_codelet_init
+
+    output.c_codelet.where = real_where
+
+    for i in 1:length(modes)
+        output.c_codelet.modes[i] = modes[i]
+    end
+    output.c_codelet.nbuffers = length(modes)
+    output.c_codelet.model = pointer_from_objref(perfmodel)
+    output.c_codelet.color = color
+
+    if typeof(cpu_func) == STARPU_BLAS
+        output.cpu_func = cpu_blas_codelets[cpu_func]
+        output.c_codelet.cpu_func = load_wrapper_function_pointer(output.cpu_func)
+    else
+        output.c_codelet.cpu_func = load_starpu_function_pointer(get(CPU_CODELETS, cpu_func, ""))
+    end
+
+    if typeof(cuda_func) == STARPU_BLAS
+        output.cuda_func = cuda_blas_codelets[cuda_func]
+        output.c_codelet.cuda_func = load_wrapper_function_pointer(output.cuda_func)
+        output.c_codelet.cuda_flags[1] = STARPU_CUDA_ASYNC
+    else
+        output.c_codelet.cuda_func = load_starpu_function_pointer(get(CUDA_CODELETS, cuda_func, ""))
+    end
+
+    output.c_codelet.opencl_func = load_starpu_function_pointer("")
+
+    # Codelets must not be garbage collected before starpu shutdown is called.
+    lock(mutex)
+    push!(codelet_list, output)
+    unlock(mutex)
+
+    return output
+end
+
+mutable struct jl_starpu_task
+
+    cl :: jl_starpu_codelet
+    handles :: Vector{StarpuDataHandle}
+    handle_pointers :: Vector{StarpuDataHandlePointer}
+    synchronous :: Bool
+    cl_arg # type depends on codelet
+    callback_signal :: Vector{Cint}
+    callback_function :: Union{Cvoid, Function}
+    callback_arg
+    c_task :: starpu_task
+end
+
+task_list = Vector{jl_starpu_task}()
+
+"""
+            starpu_task(; cl :: jl_starpu_codelet, handles :: Vector{StarpuDataHandle}, cl_arg :: Ref)
+
+            Creates a new task which will run the specified codelet on handle buffers and cl_args data
+        """
+function starpu_task(;
+                     cl :: Union{Cvoid, jl_starpu_codelet} = nothing,
+                     handles :: Vector{StarpuDataHandle} = StarpuDataHandle[],
+                     cl_arg = (),
+                     callback :: Union{Cvoid, Function} = nothing,
+                     callback_arg = nothing,
+                     tag :: Union{Cvoid, starpu_tag_t} = nothing,
+                     tag_only :: Union{Cvoid, starpu_tag_t} = nothing,
+                     sequential_consistency = true,
+                     detach = 1,
+                     color :: Union{Cvoid, UInt32} = nothing,
+                     where :: Union{Cvoid, Int32} = nothing)
+    if (cl == nothing)
+        error("\"cl\" field can't be empty when creating a StarpuTask")
+    end
+
+    output = jl_starpu_task(cl, handles, map((x -> x.object), handles), false, nothing, Vector{Cint}(undef, 1), callback, callback_arg, starpu_task(zero))
+
+    # handle scalar_parameters
+    codelet_name = ""
+    if isa(cl.cpu_func, String) && cl.cpu_func != ""
+        codelet_name = cl.cpu_func
+    elseif isa(cl.cuda_func, String) && cl.cuda_func != ""
+        codelet_name = cl.cuda_func
+    end
+    scalar_parameters = get(CODELETS_SCALARS, codelet_name, nothing)
+    if scalar_parameters != nothing
+        nb_scalar_required = length(scalar_parameters)
+        nb_scalar_provided = tuple_len(cl_arg)
+        if (nb_scalar_provided != nb_scalar_required)
+            error("$nb_scalar_provided scalar parameters provided but $nb_scalar_required are required by $codelet_name.")
+        end
+        output.cl_arg = create_param_struct_from_clarg(codelet_name, cl_arg)
+    else
+        output.cl_arg = cl_arg
+    end
+
+    starpu_task_init(Ref(output.c_task))
+    output.c_task.cl = pointer_from_objref(cl.c_codelet)
+    output.c_task.synchronous = false
+    output.c_task.sequential_consistency = sequential_consistency
+    output.c_task.detach = detach
+
+    ## TODO: check num handles equals num codelet buffers
+    for i in 1:length(handles)
+        output.c_task.handles[i] = output.handle_pointers[i]
+    end
+    if tuple_len(cl_arg) > 0
+        output.c_task.cl_arg = Base.unsafe_convert(Ptr{Cvoid}, Ref(output.cl_arg))
+        output.c_task.cl_arg_size = sizeof(output.cl_arg)
+    end
+
+    # callback
+    if output.callback_function != nothing
+        output.callback_signal[1] = 0
+        output.c_task.callback_arg = Base.unsafe_convert(Ptr{Cvoid}, output.callback_signal)
+        output.c_task.callback_func = load_wrapper_function_pointer("julia_callback_func")
+    end
+
+    if tag != nothing
+        output.c_task.tag_id = tag
+        output.c_task.use_tag = 1
+    end
+
+    if tag_only != nothing
+        output.c_task.tag_id = tag_only
+    end
+
+    if color != nothing
+        output.c_task.color = color
+    end
+
+    if where != nothing
+        output.c_task.where = where
+    end
+
+    # Tasks must not be garbage collected before starpu_task_wait_for_all is called.
+    # This is necessary in particular for tasks created inside callback functions.
+    lock(mutex)
+    push!(task_list, output)
+    unlock(mutex)
+
+    return output
+end
+
+
+function create_param_struct_from_clarg(codelet_name, cl_arg)
+    struct_params_name = get(CODELETS_PARAMS_STRUCT, codelet_name, nothing)
+
+    if struct_params_name == nothing
+        error("structure name not found in CODELET_PARAMS_STRUCT")
+    end
+
+    nb_scalar_provided = length(cl_arg)
+    create_struct_param_str = "output = $struct_params_name("
+    for i in 1:nb_scalar_provided-1
+        arg = cl_arg[i]
+        create_struct_param_str *= "$arg, "
+    end
+    if (nb_scalar_provided > 0)
+        arg = cl_arg[nb_scalar_provided]
+        create_struct_param_str *= "$arg"
+    end
+    create_struct_param_str *= ")"
+    eval(Meta.parse(create_struct_param_str))
+    return output
+end
+
+"""
+    Launches task execution, if "synchronous" task field is set to "false", call
+    returns immediately
+"""
+function starpu_task_submit(task :: jl_starpu_task)
+    if (length(task.handles) != length(task.cl.modes))
+        error("Invalid number of handles for task : $(length(task.handles)) were given while codelet has $(length(task.cl.modes)) modes")
+    end
+
+    starpu_task_submit(Ref(task.c_task))
+
+    if task.callback_function != nothing
+        callback_arg = task.callback_arg
+        callback_signal = task.callback_signal
+        callback_function = task.callback_function
+
+        lock(mutex)
+        put!(task_pool) do
+
+            # Active waiting loop
+            @starpucall(julia_wait_signal, Cvoid, (Ptr{Cvoid},), Base.unsafe_convert(Ptr{Cvoid}, callback_signal))
+
+            # We've received the signal from the pthread, now execute the callback.
+            callback_function(callback_arg)
+
+            # Tell the pthread that the callback is done.
+            callback_signal[1] = 0
+        end
+        unlock(mutex)
+    end
+end
+
+function starpu_modes(x :: Symbol)
+    if (x == Symbol("STARPU_RW"))
+        return STARPU_RW
+    elseif (x == Symbol("STARPU_R"))
+        return STARPU_R
+    else return STARPU_W
+    end
+end
+
+default_codelet = Dict{String, jl_starpu_codelet}()
+default_perfmodel = Dict{String, starpu_perfmodel}()
+
+function get_default_perfmodel(name)
+    if name in keys(default_perfmodel)
+        return default_perfmodel[name]
+    end
+
+    perfmodel = starpu_perfmodel(
+        perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+        symbol = name
+    )
+    default_perfmodel[name] = perfmodel
+    return perfmodel
+end
+
+function get_default_codelet(codelet_name, perfmodel, modes) :: jl_starpu_codelet
+    if codelet_name in keys(default_codelet)
+        return default_codelet[codelet_name]
+    end
+
+    cl = starpu_codelet(
+        cpu_func  = codelet_name in keys(CPU_CODELETS) ? codelet_name : "",
+        cuda_func = codelet_name in keys(CUDA_CODELETS) ? codelet_name : "",
+        modes = modes,
+        perfmodel = perfmodel,
+    )
+    default_codelet[codelet_name] = cl
+    return cl
+end
+
+function starpu_task_insert(;
+                            codelet_name :: Union{Cvoid, String} = nothing,
+                            cl :: Union{Cvoid, jl_starpu_codelet} = nothing,
+                            perfmodel :: Union{starpu_perfmodel, Cvoid} = nothing,
+                            handles :: Vector{StarpuDataHandle} = StarpuDataHandle[],
+                            cl_arg = (),
+                            callback :: Union{Cvoid, Function} = nothing,
+                            callback_arg = nothing,
+                            tag :: Union{Cvoid, starpu_tag_t} = nothing,
+                            tag_only :: Union{Cvoid, starpu_tag_t} = nothing,
+                            sequential_consistency = true,
+                            detach = 1,
+                            where :: Union{Cvoid, Int32} = nothing,
+                            color :: Union{Cvoid, UInt32} = nothing,
+                            modes = nothing)
+    if cl == nothing && codelet_name == nothing
+        error("At least one of the two parameters codelet_name or cl must be provided when calling starpu_task_insert.")
+
+    end
+    if cl == nothing && modes == nothing
+        error("Modes must be defined when calling starpu_task_insert without a codelet.")
+    end
+
+    if perfmodel == nothing
+        perfmodel = get_default_perfmodel(codelet_name == nothing ? "default" : codelet_name)
+    end
+
+    if cl == nothing
+        cl = get_default_codelet(codelet_name, perfmodel, modes)
+    end
+
+    task = starpu_task(cl = cl, handles = handles, cl_arg = cl_arg, callback = callback,
+                       callback_arg = callback_arg, tag = tag, tag_only = tag_only,
+                       sequential_consistency = sequential_consistency,
+                       detach = detach, color = color, where = where)
+
+    starpu_task_submit(task)
+end
+
+"""
+    Creates and submits an asynchronous task running cl Codelet function.
+    Ex : @starpu_async_cl cl(handle1, handle2)
+"""
+macro starpu_async_cl(expr, modes, cl_arg=(), color ::UInt32=0x00000000)
+
+    if (!isa(expr, Expr) || expr.head != :call)
+        error("Invalid task submit syntax")
+    end
+    if (!isa(modes, Expr) || modes.head != :vect)
+        error("Invalid task submit syntax")
+    end
+    perfmodel = starpu_perfmodel(
+        perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+        symbol = "history_perf"
+    )
+    println(CPU_CODELETS[string(expr.args[1])])
+    cl = starpu_codelet(
+        cpu_func  = string(expr.args[1]),
+        cuda_func = string(expr.args[1]),
+        #opencl_func="ocl_matrix_mult",
+        ### TODO: CORRECT !
+        modes = map((x -> starpu_modes(x)),modes.args),
+        perfmodel = perfmodel,
+        color = color
+    )
+    handles = Expr(:vect, expr.args[2:end]...)
+    #dump(handles)
+    quote
+        task = starpu_task(cl = $(esc(cl)), handles = $(esc(handles)), cl_arg=$(esc(cl_arg)))
+        starpu_task_submit(task)
+    end
+end
+
+function starpu_task_wait(task :: jl_starpu_task)
+    @threadcall(@starpufunc(:starpu_task_wait),
+                Cint, (Ptr{Cvoid},), Ref(task.c_task))
+
+    # starpu_task_wait(Ref(task.c_task))
+end
+
+
+"""
+    Blocks until every submitted task has finished.
+"""
+function starpu_task_wait_for_all()
+    @threadcall(@starpufunc(:starpu_task_wait_for_all),
+                Cint, ())
+
+    lock(mutex)
+    empty!(task_list)
+    unlock(mutex)
+end
+
+"""
+    Blocks until every submitted task has finished.
+    Ex : @starpu_sync_tasks begin
+                [...]
+                starpu_task_submit(task)
+                [...]
+        end
+
+    TODO : Make the macro only wait for tasks declared inside the following expression.
+            (similar mechanism as @starpu_block)
+"""
+macro starpu_sync_tasks(expr)
+    quote
+        $(esc(expr))
+        starpu_task_wait_for_all()
+    end
+end
+
+function starpu_task_destroy(task :: jl_starpu_task)
+    starpu_task_destroy(Ref(task.c_task))
+end

+ 114 - 0
julia/src/translate_headers.jl

@@ -0,0 +1,114 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using Clang
+using Clang.LibClang.LLVM_jll
+
+function starpu_translate_headers()
+    debug_print("Translating StarPU headers...")
+
+    if !isdir(joinpath(fstarpu_build_dir(), "julia/gen"))
+        mkdir(joinpath(fstarpu_build_dir(), "julia/gen"))
+    end
+
+    STARPU_BUILD_INCLUDE=joinpath(fstarpu_build_dir(), "include")
+    STARPU_SRC_INCLUDE=joinpath(fstarpu_src_dir(), "include")
+    STARPU_HEADERS = [joinpath(STARPU_BUILD_INCLUDE, header) for header in readdir(STARPU_BUILD_INCLUDE) if endswith(header, ".h")]
+    if STARPU_SRC_INCLUDE != STARPU_BUILD_INCLUDE
+        for header in readdir(STARPU_SRC_INCLUDE)
+            if endswith(header, ".h")
+                push!(STARPU_HEADERS, joinpath(STARPU_SRC_INCLUDE, header))
+            end
+        end
+    end
+
+    LIBCLANG_INCLUDE = joinpath(dirname(LLVM_jll.libclang_path), "..", "include", "clang-c") |> normpath
+
+    clang_args = ["-I", STARPU_BUILD_INCLUDE, "-I", STARPU_SRC_INCLUDE]
+
+    for header in find_std_headers()
+        push!(clang_args, "-I")
+        push!(clang_args, header)
+    end
+
+    only_select_symbols = Set(["starpu_task",
+                               "starpu_cublas_init",
+                               "starpu_codelet",
+                               "starpu_data_filter",
+                               "starpu_tag_t",
+                               "starpu_perfmodel",
+                               "starpu_perfmodel_type",
+                               "starpu_data_handle_t",
+                               "starpu_init",
+                               "starpu_data_acquire_on_node",
+                               "starpu_data_release_on_node",
+                               "starpu_data_unregister",
+                               "starpu_data_partition",
+                               "starpu_data_unpartition",
+                               "starpu_data_get_sub_data",
+                               "starpu_data_map_filters",
+                               "starpu_data_get_default_sequential_consistency_flag",
+                               "starpu_data_set_default_sequential_consistency_flag",
+                               "starpu_data_get_sequential_consistency_flag",
+                               "starpu_data_set_sequential_consistency_flag",
+                               "starpu_data_wont_use",
+                               "starpu_matrix_data_register",
+                               "starpu_block_data_register",
+                               "starpu_vector_data_register",
+                               "starpu_variable_data_register",
+                               "starpu_memory_pin",
+                               "starpu_memory_unpin",
+                               "starpu_task_end_dep_add",
+                               "starpu_task_end_dep_release",
+                               "starpu_task_init",
+                               "starpu_task_destroy",
+                               "starpu_task_submit",
+                               "starpu_task_wait",
+                               "starpu_task_wait_for_n_submitted",
+                               "starpu_tag_remove",
+                               "starpu_tag_wait",
+                               "starpu_tag_declare_deps_array",
+                               "starpu_tag_notify_from_apps",
+                               "starpu_task_declare_end_deps_array",
+                               "starpu_task_declare_deps_array",
+                               "starpu_iteration_push",
+                               "starpu_iteration_pop",
+                               "starpu_worker_get_count",
+                               "starpu_cpu_worker_get_count",
+                               "starpu_cuda_worker_get_count",
+                               "starpu_opencl_worker_get_count",
+                               "starpu_mic_worker_get_count",
+                               "STARPU_CPU",
+                               "STARPU_CUDA",
+                               "STARPU_CUDA_ASYNC",
+                               "STARPU_OPENCL",
+                               "STARPU_MAIN_RAM",
+                               "STARPU_NMAXBUFS",
+                               "STARPU_USE_CUDA"])
+
+    wc = init(; headers = STARPU_HEADERS,
+              output_file = joinpath(fstarpu_build_dir(), "julia/gen/libstarpu_api.jl"),
+              common_file = joinpath(fstarpu_build_dir(), "julia/gen/libstarpu_common.jl"),
+              clang_includes = vcat(LIBCLANG_INCLUDE, CLANG_INCLUDE),
+              clang_args = clang_args,
+              header_library = x->"starpu_wrapper_library_name",
+              clang_diagnostics = false,
+              rewriter = x->x,
+              only_select_symbols = only_select_symbols,
+              fields_align = Dict((:starpu_pthread_spinlock_t,:taken) => 16)
+              )
+
+    run(wc)
+end

+ 1 - 1
mpi/tests/abstract_sendrecv_bench.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 2
mpi/tests/abstract_sendrecv_bench.h

@@ -1,7 +1,6 @@
-
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/tests/bench_helper.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/tests/bench_helper.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 75 - 0
mpi/tests/burst.c

@@ -0,0 +1,75 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This test sends simultaneously many communications, with various configurations.
+ *
+ * Global purpose is to run with trace recording, to watch the behaviour of communications.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+#include "burst_helper.h"
+
+void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-nreqs") == 0)
+		{
+			burst_nb_requests = atoi(argv[++i]);
+		}
+		else if (strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0)
+		{
+			fprintf(stderr,"Usage: %s [-nreqs nreqs]\n", argv[0]);
+			fprintf(stderr,"Currently selected: %d requests in each burst\n", burst_nb_requests);
+			exit(EXIT_SUCCESS);
+		}
+		else
+		{
+			fprintf(stderr,"Unrecognized option %s\n", argv[i]);
+			exit(EXIT_FAILURE);
+		}
+	}
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, mpi_init, other_rank;
+
+	parse_args(argc, argv);
+
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
+
+	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+
+	burst_init_data(rank);
+
+	burst_all(rank);
+
+	/* Clear up */
+	burst_free_data(rank);
+
+	starpu_mpi_shutdown();
+	if (!mpi_init)
+		MPI_Finalize();
+
+	return 0;
+}

+ 210 - 0
mpi/tests/burst_gemm.c

@@ -0,0 +1,210 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * Program to be executed with trace recording to watch the impact of
+ * computations (or task polling) on communications.
+ */
+#include <limits.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <starpu_mpi.h>
+#include <starpu_fxt.h>
+
+#include "helper.h"
+#include "gemm_helper.h"
+#include "burst_helper.h"
+
+static int gemm_warmup = 1;
+static int gemm_warmup_wait = 0;
+
+/* Parse command-line options for the burst+GEMM test.
+ * Supported options:
+ *   -nblocks n        : number of slices per matrix dimension (size = 320*n)
+ *   -size s           : matrix dimension (must be a multiple of 320)
+ *   -check            : verify the GEMM result at the end
+ *   -nreqs N          : number of requests in each communication burst
+ *   -no-gemm-warmup   : skip the warmup GEMM pass
+ *   -gemm-warmup-wait : make all warmup GEMMs start simultaneously
+ *   -h/-help/--help   : print usage and exit
+ * Exits with EXIT_FAILURE on any unrecognized option. */
+void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
+			char *argptr;
+			nslices = strtol(argv[++i], &argptr, 10);
+			matrix_dim = 320 * nslices;
+		}
+		else if (strcmp(argv[i], "-size") == 0)
+		{
+			char *argptr;
+			unsigned matrix_dim_tmp = strtol(argv[++i], &argptr, 10);
+			if (matrix_dim_tmp % 320 != 0)
+			{
+				/* Invalid size: warn and keep the previous value. */
+				fprintf(stderr, "Matrix size has to be a multiple of 320\n");
+			}
+			else
+			{
+				matrix_dim = matrix_dim_tmp;
+				nslices = matrix_dim / 320;
+			}
+		}
+		else if (strcmp(argv[i], "-check") == 0)
+		{
+			check = 1;
+		}
+		else if (strcmp(argv[i], "-nreqs") == 0)
+		{
+			burst_nb_requests = atoi(argv[++i]);
+		}
+		else if (strcmp(argv[i], "-no-gemm-warmup") == 0)
+		{
+			gemm_warmup = 0;
+		}
+		else if (strcmp(argv[i], "-gemm-warmup-wait") == 0)
+		{
+			/* All warmup GEMMs will start at the same moment */
+			gemm_warmup_wait = 1;
+		}
+		else if (strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0)
+		{
+			fprintf(stderr,"Usage: %s [-nblocks n] [-size size] [-check] [-nreqs nreqs] [-no-gemm-warmup] [-gemm-warmup-wait]\n", argv[0]);
+			fprintf(stderr,"Currently selected: matrix size: %u - %u blocks - %d requests in each burst - gemm warmup: %d -gemm-warmup-wait: %d\n", matrix_dim, nslices, burst_nb_requests, gemm_warmup, gemm_warmup_wait);
+			exit(EXIT_SUCCESS);
+		}
+		else
+		{
+			fprintf(stderr,"Unrecognized option %s\n", argv[i]);
+			exit(EXIT_FAILURE);
+		}
+	}
+}
+
+/* Entry point: interleave communication bursts with GEMM computations in
+ * several configurations (workers polling, paused, computing, ...) so that
+ * the impact of computation on communication can be observed in traces.
+ * Requires at least 2 MPI processes; returns 77/STARPU_TEST_SKIPPED when
+ * the environment does not allow the test to run.
+ * NOTE(review): after starpu_mpi_init_conf() succeeds, ret stays 0; a later
+ * `goto enodev` therefore returns 0 (success) even though the test body was
+ * skipped — confirm whether STARPU_TEST_SKIPPED was intended there. */
+int main(int argc, char **argv)
+{
+	int ret, mpi_init, worldsize, mpi_rank;
+
+	parse_args(argc, argv);
+
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
+	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &mpi_rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
+
+	if (worldsize < 2)
+	{
+		if (mpi_rank == 0)
+			FPRINTF(stderr, "We need 2 processes.\n");
+
+		starpu_mpi_shutdown();
+		if (!mpi_init)
+			MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	gemm_alloc_data();
+	if (gemm_init_data() == -ENODEV)
+		goto enodev;
+
+	/* GEMM warmup, to really load the BLAS library */
+	if (gemm_warmup)
+	{
+		if (gemm_warmup_wait)
+		{
+			/* Pause workers so all warmup GEMMs are released at once. */
+			starpu_task_wait_for_all();
+			starpu_pause();
+		}
+
+		if(gemm_submit_tasks() == -ENODEV)
+			goto enodev;
+
+		if (gemm_warmup_wait)
+		{
+			starpu_resume();
+		}
+	}
+
+	burst_init_data(mpi_rank);
+
+	/* Wait for everything and everybody: */
+	starpu_task_wait_for_all();
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	FPRINTF(stderr, "** Burst warmup **\n");
+	burst_all(mpi_rank);
+
+	starpu_sleep(0.3); // sleep to easily distinguish different bursts in traces
+
+	FPRINTF(stderr, "** Burst while there is no task available, but workers are polling **\n");
+	burst_all(mpi_rank);
+
+	starpu_sleep(0.3); // sleep to easily distinguish different bursts in traces
+
+	FPRINTF(stderr, "** Burst while there is no task available, workers are paused **\n");
+	starpu_pause();
+	burst_all(mpi_rank);
+
+	starpu_sleep(0.3); // sleep to easily distinguish different bursts in traces
+
+	FPRINTF(stderr, "** Burst while workers are really working **\n");
+	if(gemm_submit_tasks() == -ENODEV)
+		goto enodev;
+	starpu_resume();
+
+	burst_all(mpi_rank);
+
+	FPRINTF(stderr, "Burst done, now waiting for computing tasks to finish\n");
+
+	/* Wait for everything and everybody: */
+	starpu_task_wait_for_all();
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	starpu_sleep(0.3); // sleep to easily distinguish different parts in traces
+
+	FPRINTF(stderr, "** Workers are computing, without communications **\n");
+	starpu_pause();
+	if(gemm_submit_tasks() == -ENODEV)
+		goto enodev;
+	starpu_resume();
+
+	/* Wait for everything and everybody: */
+	starpu_task_wait_for_all();
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	starpu_sleep(0.3); // sleep to easily distinguish different parts in traces
+
+	FPRINTF(stderr, "** Burst while workers are computing, but polling a moment between each task **\n");
+	starpu_pause();
+	/* Dependencies must be declared before the tagged tasks are submitted. */
+	gemm_add_polling_dependencies();
+	if(gemm_submit_tasks_with_tags(/* enable task tags */ 1) == -ENODEV)
+		goto enodev;
+	starpu_resume();
+
+	burst_all(mpi_rank);
+
+	/* Wait for everything and everybody: */
+	starpu_task_wait_for_all();
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+enodev:
+	gemm_release();
+	burst_free_data(mpi_rank);
+
+	starpu_mpi_shutdown();
+	if (!mpi_init)
+		MPI_Finalize();
+
+	return ret;
+}

+ 223 - 0
mpi/tests/burst_helper.c

@@ -0,0 +1,223 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu_mpi.h>
+
+#include "helper.h"
+#include "burst_helper.h"
+
+/* Default number of requests per burst: reduced for quick/simulated runs. */
+#if defined(STARPU_SIMGRID) || defined(STARPU_QUICK_CHECK)
+#define NB_REQUESTS 10
+#else
+#define NB_REQUESTS 50
+#endif
+/* Number of floats per exchanged vector (one 320x320 tile). */
+#define NX_ARRAY (320 * 320)
+
+/* Per-rank arrays of size burst_nb_requests, allocated by burst_init_data()
+ * and released by burst_free_data(); only ranks 0 and 1 use them. */
+static starpu_data_handle_t* recv_handles;
+static starpu_data_handle_t* send_handles;
+static float** recv_buffers;
+static float** send_buffers;
+static starpu_mpi_req* recv_reqs;
+static starpu_mpi_req* send_reqs;
+
+/* Number of requests per burst; overridable through -nreqs in the tests. */
+int burst_nb_requests = NB_REQUESTS;
+
+/* Allocate and register, on ranks 0 and 1 only, the burst_nb_requests
+ * send/receive buffers (zero-initialized vectors of NX_ARRAY floats) and the
+ * matching request arrays. Counterpart of burst_free_data().
+ * NOTE(review): malloc() results are not checked before use — acceptable for
+ * a test, but confirm this matches the test-suite convention. */
+void burst_init_data(int rank)
+{
+	if (rank == 0 || rank == 1)
+	{
+		recv_handles = malloc(burst_nb_requests * sizeof(starpu_data_handle_t));
+		send_handles = malloc(burst_nb_requests * sizeof(starpu_data_handle_t));
+		recv_buffers = malloc(burst_nb_requests * sizeof(float*));
+		send_buffers = malloc(burst_nb_requests * sizeof(float*));
+		recv_reqs = malloc(burst_nb_requests * sizeof(starpu_mpi_req));
+		send_reqs = malloc(burst_nb_requests * sizeof(starpu_mpi_req));
+
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			send_buffers[i] = malloc(NX_ARRAY * sizeof(float));
+			memset(send_buffers[i], 0, NX_ARRAY * sizeof(float));
+			starpu_vector_data_register(&send_handles[i], STARPU_MAIN_RAM, (uintptr_t) send_buffers[i], NX_ARRAY, sizeof(float));
+
+			recv_buffers[i] = malloc(NX_ARRAY * sizeof(float));
+			memset(recv_buffers[i], 0, NX_ARRAY * sizeof(float));
+			starpu_vector_data_register(&recv_handles[i], STARPU_MAIN_RAM, (uintptr_t) recv_buffers[i], NX_ARRAY, sizeof(float));
+		}
+	}
+}
+
+/* Unregister all handles and free all buffers allocated by burst_init_data().
+ * Must be called with the same rank value, after all bursts completed. */
+void burst_free_data(int rank)
+{
+	if (rank == 0 || rank == 1)
+	{
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			/* Unregister before freeing: StarPU may still hold the buffer. */
+			starpu_data_unregister(send_handles[i]);
+			free(send_buffers[i]);
+
+			starpu_data_unregister(recv_handles[i]);
+			free(recv_buffers[i]);
+		}
+
+		free(recv_handles);
+		free(send_handles);
+		free(recv_buffers);
+		free(send_buffers);
+		free(recv_reqs);
+		free(send_reqs);
+	}
+}
+
+/* Burst simultaneous from both nodes: 0 and 1 post all the recvs, synchronise, and then post all the sends */
+void burst_bidir(int rank)
+{
+	int other_rank = (rank == 0) ? 1 : 0;
+
+	FPRINTF(stderr, "Simultaneous....start (rank %d)\n", rank);
+
+	if (rank == 0 || rank == 1)
+	{
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			recv_reqs[i] = NULL;
+			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], other_rank, i, MPI_COMM_WORLD);
+		}
+	}
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	if (rank == 0 || rank == 1)
+	{
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			send_reqs[i] = NULL;
+			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
+		}
+
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			if (recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
+			if (send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
+		}
+	}
+
+	FPRINTF(stderr, "Simultaneous....end (rank %d)\n", rank);
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+}
+
+/* One-way burst: `receiver` posts all recvs, a barrier guarantees they are
+ * in place, then `sender` posts all sends (tag i, priority i). Ranks other
+ * than sender/receiver only take part in the barriers. */
+void burst_unidir(int sender, int receiver, int rank)
+{
+	FPRINTF(stderr, "%d -> %d... start (rank %d)\n", sender, receiver, rank);
+
+	if (rank == receiver)
+	{
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			recv_reqs[i] = NULL;
+			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], sender, i, MPI_COMM_WORLD);
+		}
+	}
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	if (rank == sender)
+	{
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			send_reqs[i] = NULL;
+			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], receiver, i, i, MPI_COMM_WORLD);
+		}
+	}
+
+	if (rank == sender || rank == receiver)
+	{
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			/* Inside this guard, "rank != sender" means the receiver. */
+			if (rank != sender && recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
+			if (rank == sender && send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
+		}
+	}
+
+	FPRINTF(stderr, "%d -> %d... end (rank %d)\n", sender, receiver, rank);
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+}
+
+/* Half burst from both nodes, second half burst is triggered after some requests finished. */
+/* Half burst from both nodes, second half burst is triggered after some requests finished. */
+/* Same protocol as burst_bidir(), except each side sends the first half of
+ * the requests, waits for one specific recv (index burst_nb_requests/4) to
+ * complete, then sends the second half. */
+void burst_bidir_half_postponed(int rank)
+{
+	int other_rank = (rank == 0) ? 1 : 0;
+	/* NOTE(review): `received` is set but never used — candidate for removal. */
+	int received = 0;
+
+	FPRINTF(stderr, "Half/half burst...start (rank %d)\n", rank);
+
+	if (rank == 0 || rank == 1)
+	{
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			recv_reqs[i] = NULL;
+			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], other_rank, i, MPI_COMM_WORLD);
+		}
+	}
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	if (rank == 0 || rank == 1)
+	{
+		for (int i = 0; i < (burst_nb_requests / 2); i++)
+		{
+			send_reqs[i] = NULL;
+			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
+		}
+
+		/* Trigger point: wait for one recv from the first half to finish. */
+		if (recv_reqs[burst_nb_requests / 4]) starpu_mpi_wait(&recv_reqs[burst_nb_requests / 4], MPI_STATUS_IGNORE);
+
+		for (int i = (burst_nb_requests / 2); i < burst_nb_requests; i++)
+		{
+			send_reqs[i] = NULL;
+			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
+		}
+
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			if (recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
+			if (send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
+		}
+	}
+
+	FPRINTF(stderr, "Half/half burst...done (rank %d)\n", rank);
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+}
+
+/* Run the four burst patterns in sequence and report the total wall-clock
+ * time. starpu_timing_now() is in microseconds, hence the /1000 for ms. */
+void burst_all(int rank)
+{
+	double start, end;
+	start = starpu_timing_now();
+
+	/* Burst simultaneous from both nodes: 0 and 1 post all the recvs, synchronise, and then post all the sends */
+	burst_bidir(rank);
+
+	/* Burst from 0 to 1 : rank 1 posts all the recvs, barrier, then rank 0 posts all the sends */
+	burst_unidir(0, 1, rank);
+
+	/* Burst from 1 to 0 : rank 0 posts all the recvs, barrier, then rank 1 posts all the sends */
+	burst_unidir(1, 0, rank);
+
+	/* Half burst from both nodes, second half burst is triggered after some requests finished. */
+	burst_bidir_half_postponed(rank);
+
+	end = starpu_timing_now();
+	FPRINTF(stderr, "All bursts took %.0f ms\n", (end - start) / 1000.0);
+}

+ 29 - 0
mpi/tests/burst_helper.h

@@ -0,0 +1,29 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __MPI_TESTS_BURST_HELPER__
+#define __MPI_TESTS_BURST_HELPER__
+
+extern int burst_nb_requests;
+
+void burst_init_data(int rank);
+void burst_free_data(int rank);
+void burst_bidir(int rank);
+void burst_unidir(int sender, int receiver, int rank);
+void burst_bidir_half_postponed(int rank);
+void burst_all(int rank);
+
+#endif /* __MPI_TESTS_BURST_HELPER__ */

+ 330 - 0
mpi/tests/gemm_helper.c

@@ -0,0 +1,330 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <common/blas.h>
+#include "../../examples/mult/simple.h"
+#include "helper.h"
+#include "gemm_helper.h"
+
+
+#define CHECK_TASK_SUBMIT(ret) do {				\
+	if (ret == -ENODEV)					\
+	{							\
+		return -ENODEV;					\
+	}							\
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");	\
+} while(0)
+
+
+/* Number of slices per matrix dimension; matrix_dim = 320 * nslices. */
+unsigned nslices = 4;
+#if defined(STARPU_QUICK_CHECK) && !defined(STARPU_SIMGRID)
+unsigned matrix_dim = 256;
+#else
+unsigned matrix_dim = 320 * 4;
+#endif
+/* When non-zero, gemm_release() verifies C == A*B before freeing. */
+unsigned check = 0;
+int comm_thread_cpuid = -1;
+
+/* Full matrices (owned here) and their StarPU handles; C = A * B. */
+static TYPE *A, *B, *C;
+static starpu_data_handle_t A_handle, B_handle, C_handle;
+
+/* Verify the GEMM result in place: compute C -= A*B with the reference BLAS,
+ * then check that the absolute sum of C is below a size-relative tolerance.
+ * Destroys C. On failure, reports the largest remaining error. */
+static void check_output(void)
+{
+	/* compute C = C - AB */
+	CPU_GEMM("N", "N", matrix_dim, matrix_dim, matrix_dim, (TYPE)-1.0f, A, matrix_dim, B, matrix_dim, (TYPE)1.0f, C, matrix_dim);
+
+	/* make sure C = 0 */
+	TYPE err;
+	err = CPU_ASUM(matrix_dim*matrix_dim, C, 1);
+
+	/* Tolerance scales with the number of elements. */
+	if (err < matrix_dim*matrix_dim*0.001)
+	{
+		FPRINTF(stderr, "Results are OK\n");
+	}
+	else
+	{
+		int max;
+		max = CPU_IAMAX(matrix_dim*matrix_dim, C, 1);
+
+		FPRINTF(stderr, "There were errors ... err = %f\n", err);
+		FPRINTF(stderr, "Max error : %e\n", C[max]);
+	}
+}
+
+
+/* Register A, B, C as StarPU matrices and partition them for the blocked
+ * GEMM: A into nslices horizontal bands, B into nslices vertical bands, and
+ * C into an nslices x nslices grid (vertical then horizontal filters). */
+static void partition_mult_data(void)
+{
+	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A,
+		matrix_dim, matrix_dim, matrix_dim, sizeof(TYPE));
+	starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B,
+		matrix_dim, matrix_dim, matrix_dim, sizeof(TYPE));
+	starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C,
+		matrix_dim, matrix_dim, matrix_dim, sizeof(TYPE));
+
+	struct starpu_data_filter vert;
+	memset(&vert, 0, sizeof(vert));
+	vert.filter_func = starpu_matrix_filter_vertical_block;
+	vert.nchildren = nslices;
+
+	struct starpu_data_filter horiz;
+	memset(&horiz, 0, sizeof(horiz));
+	horiz.filter_func = starpu_matrix_filter_block;
+	horiz.nchildren = nslices;
+
+	starpu_data_partition(B_handle, &vert);
+	starpu_data_partition(A_handle, &horiz);
+
+	/* C gets both filters: sub-block (x, y) is addressed with 2 indices. */
+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
+}
+
+
+/* Codelet kernel: fill two matrix sub-blocks (descr[0] and descr[1]) with
+ * pseudo-random values. Both blocks are assumed to have the same extent as
+ * descr[0] — TODO confirm against the submitting code in gemm_init_data(). */
+static void cpu_init_matrix_random(void *descr[], void *arg)
+{
+	(void)arg;
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
+
+	for (unsigned i = 0; i < nx *ny; i++)
+	{
+		subA[i] = (TYPE) (starpu_drand48());
+		subB[i] = (TYPE) (starpu_drand48());
+	}
+}
+
+
+/* Codelet kernel: zero out one matrix sub-block (descr[0]). */
+static void cpu_init_matrix_zero(void *descr[], void *arg)
+{
+	(void)arg;
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
+
+	for (unsigned i = 0; i < nx *ny; i++)
+	{
+		subA[i] = (TYPE) (0);
+	}
+}
+
+
+/* Codelet kernel: subC = subA * subB for one (A band, B band, C block)
+ * triple, via BLAS GEMM. When run as a combined (parallel) worker, the
+ * computation is split along C's second dimension, each worker handling a
+ * block of ~nyC/worker_size columns. */
+static void cpu_mult(void *descr[], void *arg)
+{
+	(void)arg;
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+	TYPE *subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	unsigned nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	unsigned nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	unsigned nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	unsigned ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
+
+	int worker_size = starpu_combined_worker_get_size();
+
+	if (worker_size == 1)
+	{
+		/* Sequential CPU task */
+		CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
+	}
+	else
+	{
+		/* Parallel CPU task */
+		unsigned rank = starpu_combined_worker_get_rank();
+
+		/* Ceiling division; the last worker may get a smaller block. */
+		unsigned block_size = (nyC + worker_size - 1)/worker_size;
+		unsigned new_nyC = STARPU_MIN(nyC, block_size*(rank+1)) - block_size*rank;
+
+		STARPU_ASSERT(nyC == STARPU_MATRIX_GET_NY(descr[1]));
+
+		/* NOTE(review): the per-rank offset is block_size*rank elements, not
+		 * block_size*rank*ld — assumes the column offset maps to a plain
+		 * element offset in this layout; confirm against examples/mult. */
+		TYPE *new_subB = &subB[block_size*rank];
+		TYPE *new_subC = &subC[block_size*rank];
+
+		CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, subA, ldA, new_subB, ldB, (TYPE)0.0, new_subC, ldC);
+	}
+}
+
+/* History-based performance model shared with the other GEMM examples
+ * (same symbol, so calibration data is reused). */
+static struct starpu_perfmodel starpu_gemm_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = STARPU_GEMM_STR(gemm)
+};
+
+/* GEMM codelet: reads an A band and a B band, updates one C block. */
+static struct starpu_codelet cl =
+{
+	.type = STARPU_SEQ, /* changed to STARPU_SPMD if -spmd is passed */
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {cpu_mult},
+	.cpu_funcs_name = {"cpu_mult"},
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
+	.model = &starpu_gemm_model
+};
+
+/* Initialization codelet: fills an A band and a B band with random values. */
+static struct starpu_codelet cl_init_matrix_random =
+{
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {cpu_init_matrix_random},
+	.cpu_funcs_name = {"cpu_init_matrix_random"},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_W},
+	.name = "init_matrix_random",
+	.color = 0xffa500 // orange
+};
+
+/* Initialization codelet: zeroes one C block. */
+static struct starpu_codelet cl_init_matrix_zero =
+{
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {cpu_init_matrix_zero},
+	.cpu_funcs_name = {"cpu_init_matrix_zero"},
+	.nbuffers = 1,
+	.modes = {STARPU_W},
+	.name = "init_matrix_zero",
+	.color = 0x808000 // olive
+};
+
+/* Allocate and partition buffers */
+/* Allocates the three matrix_dim x matrix_dim matrices (pinned; folded under
+ * simulation) and registers/partitions them via partition_mult_data().
+ * Counterpart of gemm_release(). */
+void gemm_alloc_data()
+{
+	starpu_malloc_flags((void **)&A, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&B, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&C, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	partition_mult_data();
+}
+
+/* Submit tasks to initialize matrices: fill them with zeros or random numbers */
+/* Returns -ENODEV if no worker can run the tasks, 0 otherwise. Under
+ * STARPU_SIMGRID no task is submitted and 0 is returned directly (the
+ * actual matrix contents do not matter in simulation). */
+int gemm_init_data()
+{
+#ifndef STARPU_SIMGRID
+	int ret;
+	unsigned x, y;
+
+	for (x = 0; x < nslices; x++)
+	{
+		/* One task per slice fills the A and B bands with random data. */
+		struct starpu_task *task = starpu_task_create();
+		task->cl = &cl_init_matrix_random;
+		task->handles[0] = starpu_data_get_sub_data(A_handle, 1, x);
+		task->handles[1] = starpu_data_get_sub_data(B_handle, 1, x);
+		ret = starpu_task_submit(task);
+		CHECK_TASK_SUBMIT(ret);
+
+		for (y = 0; y < nslices; y++)
+		{
+			/* One task per C block zeroes it. */
+			task = starpu_task_create();
+			task->cl = &cl_init_matrix_zero;
+			task->handles[0] = starpu_data_get_sub_data(C_handle, 2, x, y);
+			ret = starpu_task_submit(task);
+			CHECK_TASK_SUBMIT(ret);
+		}
+	}
+#endif
+	return 0;
+}
+
+/* Submit tasks to compute the GEMM */
+/* Convenience wrapper: submit all GEMM tasks without task tags.
+ * Returns -ENODEV if no worker can execute them, 0 otherwise. */
+int gemm_submit_tasks()
+{
+	return gemm_submit_tasks_with_tags(/* by default, disable task tags */ 0);
+}
+
+/* Submit one GEMM task per C block (nslices*nslices tasks in total).
+ * When with_tags is non-zero, tasks get consecutive tags starting at 1, so
+ * gemm_add_polling_dependencies() can chain them; in that case the
+ * dependencies must have been declared beforehand.
+ * Returns -ENODEV if no worker can execute the tasks, 0 otherwise. */
+int gemm_submit_tasks_with_tags(int with_tags)
+{
+	int ret;
+	unsigned x, y;
+	starpu_tag_t task_tag = 0;
+
+	for (x = 0; x < nslices; x++)
+	for (y = 0; y < nslices; y++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		task->cl = &cl;
+		task->handles[0] = starpu_data_get_sub_data(A_handle, 1, y);
+		task->handles[1] = starpu_data_get_sub_data(B_handle, 1, x);
+		task->handles[2] = starpu_data_get_sub_data(C_handle, 2, x, y);
+		/* 2*m*n*k flops for one block product, used by the perf model. */
+		task->flops = 2ULL * (matrix_dim/nslices) * (matrix_dim/nslices) * matrix_dim;
+
+		if (with_tags)
+		{
+			/* Pre-increment: the first task gets tag 1, not 0. */
+			task->use_tag = 1;
+			task->tag_id = ++task_tag;
+		}
+
+		ret = starpu_task_submit(task);
+		CHECK_TASK_SUBMIT(ret);
+		/* Let StarPU evict the C block once the task has run. */
+		starpu_data_wont_use(starpu_data_get_sub_data(C_handle, 2, x, y));
+	}
+
+	return 0;
+}
+
+/* Add dependencies between GEMM tasks to see the impact of polling workers which will at the end get a task.
+ * The new dependency graph has the following shape:
+ * - the same number of GEMMs as the number of workers are executed in parallel on all workers ("a column of tasks")
+ * - then a GEMM waits all tasks of the previous column of tasks, and is executed on a worker
+ * - the next column of tasks waits for the previous GEMM
+ * - and so on...
+ *
+ * worker 0 |  1  |  4  |  5  |  8  |  9  |
+ * worker 1 |  2  |     |  6  |     | 10  |  ...
+ * worker 2 |  3  |     |  7  |     | 11  |
+ *
+ * This function has to be called before gemm_submit_tasks_with_tags(1).
+ */
+/* Declare tag dependencies between GEMM tasks (tags 1..nb_tasks, as assigned
+ * by gemm_submit_tasks_with_tags(1)) following the column pattern described
+ * above: every (nb_workers+1)-th tag acts as a synchronisation point that
+ * waits for the previous column and releases the next one.
+ * NOTE(review): `nb_tasks` is a signed int compared against unsigned
+ * starpu_tag_t values — fine while nslices*nslices stays positive, but an
+ * unsigned type would be cleaner; confirm. */
+void gemm_add_polling_dependencies()
+{
+	int nb_tasks = nslices * nslices;
+	unsigned nb_workers = starpu_worker_get_count();
+
+	for (starpu_tag_t synchro_tag = nb_workers+1; synchro_tag <= nb_tasks; synchro_tag += (nb_workers+1))
+	{
+		// this synchro tag depends on tasks of previous column of tasks:
+		for (starpu_tag_t previous_tag = synchro_tag - nb_workers; previous_tag < synchro_tag; previous_tag++)
+		{
+			starpu_tag_declare_deps(synchro_tag, 1, previous_tag);
+		}
+
+		// tasks of the next column of tasks depend on this synchro tag:
+		// this actually allows workers to poll for new tasks, while no task is available
+		for (starpu_tag_t next_tag = synchro_tag+1; next_tag < (synchro_tag + nb_workers + 1) && next_tag <= nb_tasks; next_tag++)
+		{
+			starpu_tag_declare_deps(next_tag, 1, synchro_tag);
+		}
+	}
+
+}
+
+/* Tear down everything set up by gemm_alloc_data(): unpartition and
+ * unregister the three matrices (gathering data back to main RAM), optionally
+ * verify the result (-check), then free the buffers with flags matching the
+ * allocation. */
+void gemm_release()
+{
+	starpu_data_unpartition(C_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(B_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
+
+	starpu_data_unregister(A_handle);
+	starpu_data_unregister(B_handle);
+	starpu_data_unregister(C_handle);
+
+	/* Check before freeing: check_output() reads (and destroys) C. */
+	if (check)
+		check_output();
+
+	starpu_free_flags(A, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_free_flags(B, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_free_flags(C, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+}
+
+

+ 70 - 0
mpi/tests/nothing.c

@@ -0,0 +1,70 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This program does nothing. It waits until it is interrupted by the user.
+ * Useful to check binding while StarPU is running.
+ */
+
+#include <starpu_mpi.h>
+#include <unistd.h>
+#include "helper.h"
+
+
+/* Entry point: bring up StarPU-MPI, pause the workers, print the hostname of
+ * each rank, then sleep forever so the user can inspect thread binding.
+ * Only terminated by an external signal; the code after the loop is
+ * unreachable and kept for symmetry with the other tests. */
+int main(int argc, char **argv)
+{
+	int ret, rank, worldsize;
+	int mpi_init;
+
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
+	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	starpu_pause(); // our program will only wait, no need to stress cores by polling workers
+
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	char hostname[65];
+	gethostname(hostname, sizeof(hostname));
+
+	printf("[rank %d on %s] ready to wait !\n", rank, hostname);
+
+	if (rank == 0)
+	{
+		printf("You can now check if thread binding is correct, for instance.\n");
+	}
+
+	/* Flush so the messages appear even though we never exit normally. */
+	fflush(stdout);
+
+	while(1)
+	{
+		sleep(1);
+	}
+
+	// TODO: maybe better handle the user interruption ?
+
+
+	starpu_resume();
+
+	starpu_mpi_shutdown();
+	if (!mpi_init)
+		MPI_Finalize();
+
+	return 0;
+}

+ 1 - 1
mpi/tests/sendrecv_parallel_tasks_bench.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 0
src/common/utils.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 8 - 8
src/core/perfmodel/perfmodel_history.c

@@ -1885,20 +1885,20 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 				STARPU_HG_DISABLE_CHECKING(entry->nsample);
 				STARPU_HG_DISABLE_CHECKING(entry->mean);
 
-				/* Do not take the first measurement into account, it is very often quite bogus */
+				/* For history-based, do not take the first measurement into account, it is very often quite bogus */
 				/* TODO: it'd be good to use a better estimation heuristic, like the median, or latest n values, etc. */
-				//entry->mean = 0;
-				//entry->sum = 0;
-
-				//entry->deviation = 0.0;
-				//entry->sum2 = 0;
+				if (model->type != STARPU_HISTORY_BASED)
+				{
+					entry->sum = measured;
+					entry->sum2 = measured*measured;
+					entry->nsample = 1;
+					entry->mean = measured;
+				}
 
 				entry->size = _starpu_job_get_data_size(model, arch, impl, j);
 				entry->flops = j->task->flops;
 
 				entry->footprint = key;
-				//entry->nsample = 0;
-				//entry->nerror = 0;
 
 				insert_history_entry(entry, list, &per_arch_model->history);
 			}

+ 5 - 0
src/datawizard/coherency.h

@@ -281,6 +281,11 @@ struct _starpu_data_state
 
 	int partition_automatic_disabled;
 
+	/** Application-provided coordinates. The maximum dimension (5) is
+	  * relatively arbitrary. */
+	unsigned dimensions;
+	int coordinates[5];
+
 	/** A generic pointer to data in the user land (could be anything and this
 	 * is not manage by StarPU) */
 	void *user_data;

+ 24 - 1
src/datawizard/interfaces/data_interface.c

@@ -1117,8 +1117,18 @@ int starpu_data_get_home_node(starpu_data_handle_t handle)
 	return handle->home_node;
 }
 
-void starpu_data_set_coordinates_array(starpu_data_handle_t handle STARPU_ATTRIBUTE_UNUSED, int dimensions STARPU_ATTRIBUTE_UNUSED, int dims[] STARPU_ATTRIBUTE_UNUSED)
+void starpu_data_set_coordinates_array(starpu_data_handle_t handle, unsigned dimensions, int dims[])
 {
+	unsigned i;
+	unsigned max_dimensions = sizeof(handle->coordinates)/sizeof(handle->coordinates[0]);
+
+	if (dimensions > max_dimensions)
+		dimensions = max_dimensions;
+
+	handle->dimensions = dimensions;
+	for (i = 0; i < dimensions; i++)
+		handle->coordinates[i] = dims[i];
+
 	_STARPU_TRACE_DATA_COORDINATES(handle, dimensions, dims);
 }
 
@@ -1135,3 +1145,16 @@ void starpu_data_set_coordinates(starpu_data_handle_t handle, unsigned dimension
 
 	starpu_data_set_coordinates_array(handle, dimensions, dims);
 }
+
+unsigned starpu_data_get_coordinates_array(starpu_data_handle_t handle, unsigned dimensions, int dims[])
+{
+	unsigned i;
+
+	if (dimensions > handle->dimensions)
+		dimensions = handle->dimensions;
+
+	for (i = 0; i < dimensions; i++)
+		dims[i] = handle->coordinates[i];
+
+	return dimensions;
+}

+ 17 - 11
src/sched_policies/component_heft.c

@@ -77,10 +77,13 @@ static int heft_progress_one(struct starpu_sched_component *component)
 		/* Estimated transfer+task termination for each child */
 		double estimated_ends_with_task[component->nchildren * ntasks];
 
-		/* Minimum transfer+task termination on all children */
-		double min_exp_end_with_task[ntasks];
-		/* Maximum transfer+task termination on all children */
-		double max_exp_end_with_task[ntasks];
+		/* estimated energy */
+		double local_energy[component->nchildren * ntasks];
+
+		/* Minimum transfer+task termination of the NTASKS tasks over all workers */
+		double min_exp_end_of_task[ntasks];
+		/* Maximum termination of the already-scheduled tasks over all workers */
+		double max_exp_end_of_workers;
 
 		unsigned suitable_components[component->nchildren * ntasks];
 
@@ -100,20 +103,23 @@ static int heft_progress_one(struct starpu_sched_component *component)
 					estimated_lengths + offset,
 					estimated_transfer_length + offset,
 					estimated_ends_with_task + offset,
-					&min_exp_end_with_task[n], &max_exp_end_with_task[n],
+					&min_exp_end_of_task[n], &max_exp_end_of_workers,
 							  suitable_components + offset, nsuitable_components[n]);
+			
+			/* Compute the energy, if provided*/
+			starpu_mct_compute_energy(component, tasks[n], local_energy + offset, suitable_components + offset, nsuitable_components[n]);
 		}
 
+		/* best_task is the task that will finish first among the ntasks, while best_benefit is its expected execution time*/
 		int best_task = 0;
-		double max_benefit = 0;
+		double best_benefit = min_exp_end_of_task[0];
 
 		/* Find the task which provides the most computation time benefit */
-		for (n = 0; n < ntasks; n++)
+		for (n = 1; n < ntasks; n++)
 		{
-			double benefit = max_exp_end_with_task[n] - min_exp_end_with_task[n];
-			if (max_benefit < benefit)
+			if (best_benefit > min_exp_end_of_task[n])
 			{
-				max_benefit = benefit;
+				best_benefit =  min_exp_end_of_task[n];
 				best_task = n;
 			}
 		}
@@ -129,7 +135,7 @@ static int heft_progress_one(struct starpu_sched_component *component)
 
 		unsigned offset = component->nchildren * best_task;
 
-		int best_icomponent = starpu_mct_get_best_component(d, tasks[best_task], estimated_lengths + offset, estimated_transfer_length + offset, estimated_ends_with_task + offset, min_exp_end_with_task[best_task], max_exp_end_with_task[best_task], suitable_components + offset, nsuitable_components[best_task]);
+		int best_icomponent = starpu_mct_get_best_component(d, tasks[best_task], estimated_lengths + offset, estimated_transfer_length + offset, estimated_ends_with_task + offset, local_energy + offset, min_exp_end_of_task[best_task], max_exp_end_of_workers, suitable_components + offset, nsuitable_components[best_task]);
 
 		STARPU_ASSERT(best_icomponent != -1);
 		best_component = component->children[best_icomponent];

+ 28 - 12
src/sched_policies/component_heteroprio.c

@@ -106,10 +106,13 @@ static int heteroprio_progress_accel(struct starpu_sched_component *component, s
 	/* Estimated transfer+task termination for each child */
 	double estimated_ends_with_task[component->nchildren];
 
-	/* Minimum transfer+task termination on all children */
-	double min_exp_end_with_task;
-	/* Maximum transfer+task termination on all children */
-	double max_exp_end_with_task;
+	/* provided local energy */
+	double local_energy[component->nchildren];
+
+	/* Minimum transfer+task termination of the task over all workers */
+	double min_exp_end_of_task;
+	/* Maximum termination of the already-scheduled tasks over all workers */
+	double max_exp_end_of_workers;
 
 	unsigned suitable_components[component->nchildren];
 	unsigned nsuitable_components;
@@ -155,16 +158,21 @@ static int heteroprio_progress_accel(struct starpu_sched_component *component, s
 			estimated_lengths,
 			estimated_transfer_length,
 			estimated_ends_with_task,
-			&min_exp_end_with_task, &max_exp_end_with_task,
+			&min_exp_end_of_task, &max_exp_end_of_workers,
 			suitable_components, nsuitable_components);
 
+	/* Compute the energy, if provided */
+	starpu_mct_compute_energy(component, task, local_energy, suitable_components, nsuitable_components);
+
 	/* And now find out which worker suits best for this task,
 	 * including data transfer */
+
 	int best_icomponent = starpu_mct_get_best_component(d, task,
 			estimated_lengths,
 			estimated_transfer_length,
 			estimated_ends_with_task,
-			min_exp_end_with_task, max_exp_end_with_task,
+                        local_energy,
+			min_exp_end_of_task, max_exp_end_of_workers,
 			suitable_components, nsuitable_components);
 
 	if (best_icomponent == -1)
@@ -236,10 +244,13 @@ static int heteroprio_progress_noaccel(struct starpu_sched_component *component,
 	/* Estimated transfer+task termination for each child */
 	double estimated_ends_with_task[component->nchildren];
 
-	/* Minimum transfer+task termination on all children */
-	double min_exp_end_with_task;
-	/* Maximum transfer+task termination on all children */
-	double max_exp_end_with_task;
+	/* estimated energy */
+	double local_energy[component->nchildren];
+
+	/* Minimum transfer+task termination of the task over all workers */
+	double min_exp_end_of_task;
+	/* Maximum termination of the already-scheduled tasks over all workers */
+	double max_exp_end_of_workers;
 
 	unsigned suitable_components[component->nchildren];
 	unsigned nsuitable_components;
@@ -264,16 +275,21 @@ static int heteroprio_progress_noaccel(struct starpu_sched_component *component,
 			estimated_lengths,
 			estimated_transfer_length,
 			estimated_ends_with_task,
-			&min_exp_end_with_task, &max_exp_end_with_task,
+			&min_exp_end_of_task, &max_exp_end_of_workers,
 			suitable_components, nsuitable_components);
 
+	/* Compute the energy, if provided */
+	starpu_mct_compute_energy(component, task, local_energy, suitable_components, nsuitable_components);
+	
 	/* And now find out which worker suits best for this task,
 	 * including data transfer */
+
 	int best_icomponent = starpu_mct_get_best_component(d, task,
 			estimated_lengths,
 			estimated_transfer_length,
 			estimated_ends_with_task,
-			min_exp_end_with_task, max_exp_end_with_task,
+                        local_energy,
+			min_exp_end_of_task, max_exp_end_of_workers,
 			suitable_components, nsuitable_components);
 
 	/* If no best component is found, it means that the perfmodel of

+ 13 - 7
src/sched_policies/component_mct.c

@@ -35,10 +35,13 @@ static int mct_push_task(struct starpu_sched_component * component, struct starp
 	/* Estimated transfer+task termination for each child */
 	double estimated_ends_with_task[component->nchildren];
 
-	/* Minimum transfer+task termination on all children */
-	double min_exp_end_with_task;
-	/* Maximum transfer+task termination on all children */
-	double max_exp_end_with_task;
+	/* estimated energy */
+	double local_energy[component->nchildren];
+
+	/* Minimum transfer+task termination of the task over all workers */
+	double min_exp_end_of_task;
+	/* Maximum termination of the already-scheduled tasks over all workers */
+	double max_exp_end_of_workers;
 
 	unsigned suitable_components[component->nchildren];
 	unsigned nsuitable_components;
@@ -58,12 +61,14 @@ static int mct_push_task(struct starpu_sched_component * component, struct starp
 	   make scheduling decisions at the same time */
 	STARPU_COMPONENT_MUTEX_LOCK(&d->scheduling_mutex);
 
-
 	starpu_mct_compute_expected_times(component, task, estimated_lengths, estimated_transfer_length,
-					  estimated_ends_with_task, &min_exp_end_with_task, &max_exp_end_with_task, suitable_components, nsuitable_components);
+					  estimated_ends_with_task, &min_exp_end_of_task, &max_exp_end_of_workers, suitable_components, nsuitable_components);
+
+	/* Compute the energy, if provided */
+	starpu_mct_compute_energy(component, task, local_energy, suitable_components, nsuitable_components);
 
 	int best_icomponent = starpu_mct_get_best_component(d, task, estimated_lengths, estimated_transfer_length,
-					  estimated_ends_with_task, min_exp_end_with_task, max_exp_end_with_task, suitable_components, nsuitable_components);
+							    estimated_ends_with_task, local_energy, min_exp_end_of_task, max_exp_end_of_workers, suitable_components, nsuitable_components);
 
 	/* If no best component is found, it means that the perfmodel of
 	 * the task had been purged since it has been pushed on the mct component.
@@ -105,6 +110,7 @@ static void mct_component_deinit_data(struct starpu_sched_component * component)
 
 int starpu_sched_component_is_mct(struct starpu_sched_component * component)
 {
+
 	return component->push_task == mct_push_task;
 }
 

+ 25 - 21
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -445,7 +445,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	int best = -1;
 
-	double best_exp_end = 0.0;
+	double best_exp_end_of_task = 0.0;
 	double model_best = 0.0;
 	double transfer_model_best = 0.0;
 
@@ -552,10 +552,10 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 
 			exp_end = exp_start + fifo->exp_len + local_length;
 
-			if (best == -1 || exp_end < best_exp_end)
+			if (best == -1 || exp_end < best_exp_end_of_task)
 			{
 				/* a better solution was found */
-				best_exp_end = exp_end;
+				best_exp_end_of_task = exp_end;
 				best = worker;
 				model_best = local_length;
 				transfer_model_best = local_penalty;
@@ -589,15 +589,15 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 						unsigned nworkers,
 						double local_task_length[nworkers][STARPU_MAXIMPLEMENTATIONS],
 						double exp_end[nworkers][STARPU_MAXIMPLEMENTATIONS],
-						double *max_exp_endp,
-						double *best_exp_endp,
+						double *max_exp_endp_of_workers,
+						double *min_exp_endp_of_task,
 						double local_data_penalty[nworkers][STARPU_MAXIMPLEMENTATIONS],
 						double local_energy[nworkers][STARPU_MAXIMPLEMENTATIONS],
 						int *forced_worker, int *forced_impl, unsigned sched_ctx_id, unsigned sorted_decision)
 {
 	int calibrating = 0;
-	double max_exp_end = DBL_MIN;
-	double best_exp_end = DBL_MAX;
+	double max_exp_end_of_workers = DBL_MIN;
+	double best_exp_end_of_task = DBL_MAX;
 	int ntasks_best = -1;
 	int nimpl_best = 0;
 	double ntasks_best_end = 0.0;
@@ -664,8 +664,8 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 			}
 
 			exp_end[worker_ctx][nimpl] = exp_start + prev_exp_len;
-			if (exp_end[worker_ctx][nimpl] > max_exp_end)
-				max_exp_end = exp_end[worker_ctx][nimpl];
+			if (exp_end[worker_ctx][nimpl] > max_exp_end_of_workers)
+				max_exp_end_of_workers = exp_end[worker_ctx][nimpl];
 
 			//_STARPU_DEBUG("Scheduler dmda: task length (%lf) workerid (%u) kernel (%u) \n", local_task_length[workerid][nimpl],workerid,nimpl);
 
@@ -742,10 +742,10 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
 			exp_end[worker_ctx][nimpl] = task_starting_time + local_task_length[worker_ctx][nimpl];
 
-			if (exp_end[worker_ctx][nimpl] < best_exp_end)
+			if (exp_end[worker_ctx][nimpl] < best_exp_end_of_task)
 			{
 				/* a better solution was found */
-				best_exp_end = exp_end[worker_ctx][nimpl];
+				best_exp_end_of_task = exp_end[worker_ctx][nimpl];
 				nimpl_best = nimpl;
 			}
 
@@ -766,8 +766,8 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 	}
 #endif
 
-	*best_exp_endp = best_exp_end;
-	*max_exp_endp = max_exp_end;
+	*min_exp_endp_of_task = best_exp_end_of_task;
+	*max_exp_endp_of_workers = max_exp_end_of_workers;
 }
 
 static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id, unsigned simulate, unsigned sorted_decision)
@@ -794,10 +794,10 @@ static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned
 	double exp_end[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
 
 	/* This is the minimum among the exp_end[] matrix */
-	double best_exp_end;
+	double min_exp_end_of_task;
 
 	/* This is the maximum termination time of already-scheduled tasks over all workers */
-	double max_exp_end = 0.0;
+	double max_exp_end_of_workers = 0.0;
 
 	double fitness[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
 
@@ -806,8 +806,8 @@ static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned
 					    nworkers_ctx,
 					    local_task_length,
 					    exp_end,
-					    &max_exp_end,
-					    &best_exp_end,
+					    &max_exp_end_of_workers,
+					    &min_exp_end_of_task,
 					    local_data_penalty,
 					    local_energy,
 					    &forced_best,
@@ -836,16 +836,18 @@ static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned
 					/* no one on that queue may execute this task */
 					continue;
 				}
-				fitness[worker_ctx][nimpl] = dt->alpha * __s_alpha__value *(exp_end[worker_ctx][nimpl] - best_exp_end)
+				fitness[worker_ctx][nimpl] = dt->alpha * __s_alpha__value *(exp_end[worker_ctx][nimpl] - min_exp_end_of_task)
 					+ dt->beta * __s_beta__value *(local_data_penalty[worker_ctx][nimpl])
 					+ dt->_gamma * __s_gamma__value *(local_energy[worker_ctx][nimpl]);
 
-				if (exp_end[worker_ctx][nimpl] > max_exp_end)
+				if (exp_end[worker_ctx][nimpl] > max_exp_end_of_workers)
 				{
 					/* This placement will make the computation
 					 * longer, take into account the idle
 					 * consumption of other cpus */
-					fitness[worker_ctx][nimpl] += dt->_gamma * __s_gamma__value * dt->idle_power * __s_idle_power__value * (exp_end[worker_ctx][nimpl] - max_exp_end) / 1000000.0;
-					fitness[worker_ctx][nimpl] += dt->_gamma * __s_gamma__value * dt->idle_power * __s_idle_power__value * (exp_end[worker_ctx][nimpl] - max_exp_end) / 1000000.0; /* Since gamma is the cost in us of one Joule, 
+																									  then  d->idle_power * (exp_end - max_exp_end_of_workers) 
+																									  must be in Joules, thus the / 1000000.0 */
 				}
 
 				if (best == -1 || fitness[worker_ctx][nimpl] < best_fitness)
@@ -856,7 +858,7 @@ static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned
 					best_in_ctx = worker_ctx;
 					selected_impl = nimpl;
 
-					//_STARPU_DEBUG("best fitness (worker %d) %e = alpha*(%e) + beta(%e) +gamma(%e)\n", worker, best_fitness, exp_end[worker][nimpl] - best_exp_end, local_data_penalty[worker][nimpl], local_energy[worker][nimpl]);
+					//_STARPU_DEBUG("best fitness (worker %d) %e = alpha*(%e) + beta(%e) +gamma(%e)\n", worker, best_fitness, exp_end[worker][nimpl] - min_exp_end_of_task, local_data_penalty[worker][nimpl], local_energy[worker][nimpl]);
 
 				}
 			}
@@ -1026,7 +1028,9 @@ static void initialize_dmda_policy(unsigned sched_ctx_id)
 
 	dt->alpha = starpu_get_env_float_default("STARPU_SCHED_ALPHA", _STARPU_SCHED_ALPHA_DEFAULT);
 	dt->beta = starpu_get_env_float_default("STARPU_SCHED_BETA", _STARPU_SCHED_BETA_DEFAULT);
+	/* data->_gamma: cost of one Joule in us. If gamma is set to 10^6, then one Joule costs 1s */
 	dt->_gamma = starpu_get_env_float_default("STARPU_SCHED_GAMMA", _STARPU_SCHED_GAMMA_DEFAULT);
+	/* data->idle_power: Idle power of the whole machine in Watt */
 	dt->idle_power = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0);
 
 	if(starpu_sched_ctx_min_priority_is_set(sched_ctx_id) != 0 && starpu_sched_ctx_max_priority_is_set(sched_ctx_id) != 0)

+ 52 - 21
src/sched_policies/helper_mct.c

@@ -36,8 +36,10 @@ struct _starpu_mct_data *starpu_mct_init_parameters(struct starpu_sched_componen
 	{
 		data->alpha = params->alpha;
 		data->beta = params->beta;
+		/* data->_gamma: cost of one Joule in us. If gamma is set to 10^6, then one Joule costs 1s */
 		data->_gamma = params->_gamma;
-		data->idle_power = params->idle_power;
+		/* data->idle_power: Idle power of the whole machine in Watt */
+		data->idle_power = params->idle_power; 
 	}
 	else
 	{
@@ -79,14 +81,21 @@ static double compute_expected_time(double now, double predicted_end, double pre
 	return predicted_end;
 }
 
-double starpu_mct_compute_fitness(struct _starpu_mct_data * d, double exp_end, double min_exp_end, double max_exp_end, double transfer_len, double local_energy)
+double starpu_mct_compute_fitness(struct _starpu_mct_data * d, double exp_end, double min_exp_end_of_task, double max_exp_end_of_workers, double transfer_len, double local_energy)
 {
 	/* Note: the expected end includes the data transfer duration, which we want to be able to tune separately */
-
-	return d->alpha * (exp_end - min_exp_end)
-		+ d->beta * transfer_len
-		+ d->_gamma * local_energy
-		+ d->_gamma * d->idle_power * (exp_end - max_exp_end);
+	
+	/* min_exp_end_of_task is the minimum end time of the task over all workers */
+	double fitness = d->alpha * (exp_end - min_exp_end_of_task) + d->beta * transfer_len + d->_gamma * local_energy;
+	
+	/* max_exp_end_of_workers is the maximum end time of the workers. If the total execution time is increased, then an
+          additional energy penalty must be considered */
+	if(exp_end > max_exp_end_of_workers)
+		fitness += d->_gamma * d->idle_power * (exp_end - max_exp_end_of_workers) / 1000000.0; /* Since gamma is the cost in us of one Joule, 
+											       then  d->idle_power * (exp_end - max_exp_end) 
+											       must be in Joules, thus the / 1000000.0 */
+
+	return fitness;
 }
 
 unsigned starpu_mct_compute_execution_times(struct starpu_sched_component *component, struct starpu_task *task,
@@ -120,12 +129,12 @@ unsigned starpu_mct_compute_execution_times(struct starpu_sched_component *compo
 
 void starpu_mct_compute_expected_times(struct starpu_sched_component *component, struct starpu_task *task STARPU_ATTRIBUTE_UNUSED,
 		double *estimated_lengths, double *estimated_transfer_length, double *estimated_ends_with_task,
-				       double *min_exp_end_with_task, double *max_exp_end_with_task, unsigned *suitable_components, unsigned nsuitable_components)
+				       double *min_exp_end_of_task, double *max_exp_end_of_workers, unsigned *suitable_components, unsigned nsuitable_components)
 {
 	unsigned i;
 	double now = starpu_timing_now();
-	*min_exp_end_with_task = DBL_MAX;
-	*max_exp_end_with_task = 0.0;
+	*min_exp_end_of_task = DBL_MAX;
+	*max_exp_end_of_workers = 0.0;
 	for(i = 0; i < nsuitable_components; i++)
 	{
 		unsigned icomponent = suitable_components[i];
@@ -138,14 +147,39 @@ void starpu_mct_compute_expected_times(struct starpu_sched_component *component,
 								    estimated_end,
 								    estimated_lengths[icomponent],
 								    estimated_transfer_length[icomponent]);
-		if(estimated_ends_with_task[icomponent] < *min_exp_end_with_task)
-			*min_exp_end_with_task = estimated_ends_with_task[icomponent];
-		if(estimated_ends_with_task[icomponent] > *max_exp_end_with_task)
-			*max_exp_end_with_task = estimated_ends_with_task[icomponent];
+		
+		/* estimated_ends_with_task[icomponent]: estimated end of execution on the worker icomponent
+		   estimated_end: estimated end of the worker
+		   min_exp_end_of_task: minimum estimated execution time of the task over all workers
+		   max_exp_end_of_workers: maximum estimated end of the already-scheduled tasks over all workers
+		*/
+		if(estimated_ends_with_task[icomponent] < *min_exp_end_of_task)
+			*min_exp_end_of_task = estimated_ends_with_task[icomponent];
+		if(estimated_end > *max_exp_end_of_workers)
+			*max_exp_end_of_workers = estimated_end;
+	}
+}
+
+/* This function retrieves the energy consumption of a task in Joules */
+void starpu_mct_compute_energy(struct starpu_sched_component *component, struct starpu_task *task , double *local_energy, unsigned *suitable_components, unsigned nsuitable_components)
+{
+	unsigned i;
+	for(i = 0; i < nsuitable_components; i++)
+	{
+		unsigned icomponent = suitable_components[i];
+		int nimpl = 0;
+		local_energy[icomponent] = starpu_task_worker_expected_energy(task, icomponent,  component->tree->sched_ctx_id, nimpl);
+		for (nimpl  = 1; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+		{
+			double e;
+			e = starpu_task_worker_expected_energy(task, icomponent,  component->tree->sched_ctx_id, nimpl);
+			if (e < local_energy[icomponent])
+				local_energy[icomponent] = e;
+		}
 	}
 }
 
-int starpu_mct_get_best_component(struct _starpu_mct_data *d, struct starpu_task *task, double *estimated_lengths, double *estimated_transfer_length, double *estimated_ends_with_task, double min_exp_end_with_task, double max_exp_end_with_task, unsigned *suitable_components, unsigned nsuitable_components)
+int starpu_mct_get_best_component(struct _starpu_mct_data *d, struct starpu_task *task, double *estimated_lengths, double *estimated_transfer_length, double *estimated_ends_with_task, double *local_energy, double min_exp_end_of_task, double max_exp_end_of_workers, unsigned *suitable_components, unsigned nsuitable_components)
 {
 	double best_fitness = DBL_MAX;
 	int best_icomponent = -1;
@@ -154,15 +188,12 @@ int starpu_mct_get_best_component(struct _starpu_mct_data *d, struct starpu_task
 	for(i = 0; i < nsuitable_components; i++)
 	{
 		int icomponent = suitable_components[i];
-#ifdef STARPU_DEVEL
-#warning FIXME: take energy consumption into account
-#endif
 		double tmp = starpu_mct_compute_fitness(d,
 					     estimated_ends_with_task[icomponent],
-					     min_exp_end_with_task,
-					     max_exp_end_with_task,
+					     min_exp_end_of_task,
+					     max_exp_end_of_workers,
 					     estimated_transfer_length[icomponent],
-					     0.0);
+					     local_energy[icomponent]);
 
 		if(tmp < best_fitness)
 		{

+ 12 - 4
src/sched_policies/helper_mct.h

@@ -39,8 +39,8 @@ void starpu_mct_compute_expected_times(struct starpu_sched_component *component,
 				       double *estimated_lengths,
 				       double *estimated_transfer_length,
 				       double *estimated_ends_with_task,
-				       double *min_exp_end_with_task,
-				       double *max_exp_end_with_task,
+				       double *min_exp_end_of_task,
+				       double *max_exp_end_of_workers,
 				       unsigned *suitable_components,
 				       unsigned nsuitable_components);
 
@@ -56,7 +56,15 @@ int starpu_mct_get_best_component(struct _starpu_mct_data *d,
 				  double *estimated_lengths,
 				  double *estimated_transfer_length,
 				  double *estimated_ends_with_task,
-				  double min_exp_end_with_task,
-				  double max_exp_end_with_task,
+				  double *local_energy,
+				  double min_exp_end_of_task,
+				  double max_exp_end_of_workers,
 				  unsigned *suitable_components,
 				  unsigned nsuitable_components);
+
+
+void starpu_mct_compute_energy(struct starpu_sched_component *component,
+			       struct starpu_task *task ,
+			       double *local_energy,
+			       unsigned *suitable_components,
+			       unsigned nsuitable_components);

+ 1 - 1
tools/starpu_env.in

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by

+ 1 - 0
tools/starpu_perfmodel_recdump.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2011-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2011       Télécom-SudParis
+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 0
tools/starpu_smpirun.in

@@ -2,6 +2,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2014-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+# Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by