Browse Source

Merge branch 'master' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu into ft_checkpoint

# Conflicts:
#	ChangeLog
#	julia/examples/Makefile.am
#	julia/examples/axpy/axpy.jl
#	julia/examples/callback/callback.jl
#	julia/examples/check_deps/check_deps.jl
#	julia/examples/dependency/end_dep.jl
#	julia/examples/dependency/tag_dep.jl
#	julia/examples/dependency/task_dep.jl
#	julia/examples/execute.sh.in
#	julia/examples/mandelbrot/cpu_mandelbrot.c
#	julia/examples/mandelbrot/mandelbrot.jl
#	julia/examples/mult/cpu_mult.c
#	julia/examples/mult/mult.jl
#	julia/examples/task_insert_color/task_insert_color.jl
#	julia/examples/vector_scal/vector_scal.jl
#	julia/src/Makefile.am
#	julia/src/StarPU.jl
#	julia/src/blas.c
#	julia/src/blas.h
#	julia/src/blas.jl
#	julia/src/blas_wrapper.c
#	julia/src/data.jl
#	julia/src/dynamic_compiler/Makefile.am
#	julia/src/globals.jl
#	julia/src/init.jl
#	julia/src/jlstarpu.h
#	julia/src/task.jl
#	julia/src/translate_headers.jl
#	mpi/tests/burst.c
#	mpi/tests/burst_gemm.c
#	mpi/tests/burst_helper.c
#	mpi/tests/burst_helper.h
#	mpi/tests/gemm_helper.c
#	mpi/tests/nothing.c
Romain LION 5 years ago
parent
commit
3154eea92e
74 changed files with 4685 additions and 1553 deletions
  1. 1 0
      AUTHORS
  2. 3 0
      ChangeLog
  3. 1 1
      configure.ac
  4. 1 0
      doc/doxygen/chapters/470_simgrid.doxy
  5. 1 0
      doc/doxygen/chapters/501_environment_variables.doxy
  6. 2 0
      examples/mlr/mlr.c
  7. 9 1
      include/starpu_data.h
  8. 146 0
      julia/examples/Makefile.am
  9. 99 0
      julia/examples/axpy/axpy.jl
  10. 77 0
      julia/examples/callback/callback.jl
  11. 32 0
      julia/examples/check_deps/check_deps.jl
  12. 20 0
      julia/examples/cholesky/cholesky.sh
  13. 53 0
      julia/examples/cholesky/cholesky_codelets.jl
  14. 155 0
      julia/examples/cholesky/cholesky_common.jl
  15. 64 0
      julia/examples/cholesky/cholesky_implicit.jl
  16. 139 0
      julia/examples/cholesky/cholesky_native.jl
  17. 86 0
      julia/examples/cholesky/cholesky_tag.jl
  18. 104 0
      julia/examples/dependency/end_dep.jl
  19. 122 0
      julia/examples/dependency/tag_dep.jl
  20. 88 0
      julia/examples/dependency/task_dep.jl
  21. 53 0
      julia/examples/execute.sh.in
  22. 145 0
      julia/examples/gemm/gemm.jl
  23. 22 0
      julia/examples/gemm/gemm.sh
  24. 146 0
      julia/examples/gemm/gemm_bare.jl
  25. 56 0
      julia/examples/gemm/gemm_native.jl
  26. 33 14
      julia/examples/mandelbrot/cpu_mandelbrot.c
  27. 29 10
      julia/examples/mandelbrot/mandelbrot.jl
  28. 25 14
      julia/examples/mult/cpu_mult.c
  29. 45 40
      julia/examples/mult/mult.jl
  30. 30 8
      julia/examples/task_insert_color/task_insert_color.jl
  31. 1 1
      julia/examples/variable/variable.jl
  32. 42 18
      julia/examples/vector_scal/vector_scal.jl
  33. 60 0
      julia/src/Makefile.am
  34. 85 1276
      julia/src/StarPU.jl
  35. 194 0
      julia/src/blas.c
  36. 148 0
      julia/src/blas.h
  37. 21 0
      julia/src/blas.jl
  38. 50 0
      julia/src/blas_wrapper.c
  39. 25 8
      julia/src/compiler/c.jl
  40. 155 25
      julia/src/compiler/cuda.jl
  41. 24 16
      julia/src/compiler/expressions.jl
  42. 17 13
      julia/src/compiler/file_generation.jl
  43. 235 0
      julia/src/data.jl
  44. 48 0
      julia/src/dynamic_compiler/Makefile.am
  45. 50 0
      julia/src/globals.jl
  46. 73 0
      julia/src/init.jl
  47. 17 16
      julia/src/jlstarpu.h
  48. 9 0
      julia/src/openblas_ldflags.jl
  49. 400 0
      julia/src/task.jl
  50. 114 0
      julia/src/translate_headers.jl
  51. 1 1
      mpi/tests/abstract_sendrecv_bench.c
  52. 1 2
      mpi/tests/abstract_sendrecv_bench.h
  53. 1 1
      mpi/tests/bench_helper.c
  54. 1 1
      mpi/tests/bench_helper.h
  55. 75 0
      mpi/tests/burst.c
  56. 210 0
      mpi/tests/burst_gemm.c
  57. 223 0
      mpi/tests/burst_helper.c
  58. 29 0
      mpi/tests/burst_helper.h
  59. 330 0
      mpi/tests/gemm_helper.c
  60. 70 0
      mpi/tests/nothing.c
  61. 1 1
      mpi/tests/sendrecv_parallel_tasks_bench.c
  62. 1 0
      src/common/utils.c
  63. 8 8
      src/core/perfmodel/perfmodel_history.c
  64. 5 0
      src/datawizard/coherency.h
  65. 24 1
      src/datawizard/interfaces/data_interface.c
  66. 17 11
      src/sched_policies/component_heft.c
  67. 28 12
      src/sched_policies/component_heteroprio.c
  68. 13 7
      src/sched_policies/component_mct.c
  69. 25 21
      src/sched_policies/deque_modeling_policy_data_aware.c
  70. 52 21
      src/sched_policies/helper_mct.c
  71. 12 4
      src/sched_policies/helper_mct.h
  72. 1 1
      tools/starpu_env.in
  73. 1 0
      tools/starpu_perfmodel_recdump.c
  74. 1 0
      tools/starpu_smpirun.in

+ 1 - 0
AUTHORS

@@ -12,6 +12,7 @@ Danjean Vincent, University Grenoble Alpes, <Vincent.Danjean@ens-lyon.org>
 Denis Alexandre, Inria, <alexandre.denis@inria.fr>
 Eyraud-Dubois Lionel, Inria, <lionel.eyraud-dubois@inria.fr>
 Furmento Nathalie, CNRS, <nathalie.furmento@labri.fr>
+Guermouche Amina, Télécom SudParis, <amina.guermouche@inria.fr>
 Guilbaud Adrien, Inria, <adrien.guilbaud@inria.fr>
 Henry Sylvain, Université de Bordeaux, <sylvain.henry@inria.fr>
 Hugo Andra, Université de Bordeaux/Inria, <andra.hugo@inria.fr>

+ 3 - 0
ChangeLog

@@ -54,6 +54,9 @@ Small features:
   * Add STARPU_WORKERS_GETBIND environment variable.
   * Add STARPU_SCHED_SIMPLE_DECIDE_ALWAYS modular scheduler flag.
   * And STARPU_LIMIT_BANDWIDTH environment variable.
+  * Add field starpu_conf::precedence_over_environment_variables to ignore
+    environment variables when parameters are set directly in starpu_conf
+  * Add starpu_data_get_coordinates_array
 
 StarPU 1.3.3 (git revision 11afc5b007fe1ab1c729b55b47a5a98ef7f3cfad)
 ====================================================================

File diff suppressed because it is too large
+ 1 - 1
configure.ac


+ 1 - 0
doc/doxygen/chapters/470_simgrid.doxy

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2016       Uppsala University
+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 2 - 0
examples/mlr/mlr.c

@@ -110,7 +110,9 @@ static struct starpu_perfmodel cl_model_init =
    template.
  */
 
+/* M^2 * N^1 * K^0 */
 static unsigned combi1 [3]		= {	2,	1,	0 };
+/* M^0 * N^3 * K^1 */
 static unsigned combi2 [3]		= {	0,	3,	1 };
 
 static unsigned *combinations[] = { combi1, combi2 };

+ 9 - 1
include/starpu_data.h

@@ -123,7 +123,7 @@ void starpu_data_set_name(starpu_data_handle_t handle, const char *name);
    tools. \p dimensions is the size of the \p dims array. This can be
    for instance the tile coordinates within a big matrix.
 */
-void starpu_data_set_coordinates_array(starpu_data_handle_t handle, int dimensions, int dims[]);
+void starpu_data_set_coordinates_array(starpu_data_handle_t handle, unsigned dimensions, int dims[]);
 
 /**
    Set the coordinates of the data, to be shown in various profiling
@@ -133,6 +133,14 @@ void starpu_data_set_coordinates_array(starpu_data_handle_t handle, int dimensio
 void starpu_data_set_coordinates(starpu_data_handle_t handle, unsigned dimensions, ...);
 
 /**
+   Get the coordinates of the data, as set by a previous call to
+   starpu_data_set_coordinates_array() or starpu_data_set_coordinates()
+   \p dimensions is the size of the \p dims array.
+   This returns the actual number of returned coordinates.
+*/
+unsigned starpu_data_get_coordinates_array(starpu_data_handle_t handle, unsigned dimensions, int dims[]);
+
+/**
    Unregister a data \p handle from StarPU. If the data was
    automatically allocated by StarPU because the home node was -1, all
    automatically allocated buffers are freed. Otherwise, a valid copy

+ 146 - 0
julia/examples/Makefile.am

@@ -0,0 +1,146 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+include $(top_srcdir)/starpu.mk
+
+noinst_PROGRAMS		=
+
+if STARPU_HAVE_WINDOWS
+LOADER_BIN		=
+else
+loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+if !STARPU_SIMGRID
+LOADER			=	loader
+LOADER_BIN		=	$(abs_top_builddir)/julia/examples/$(LOADER)
+noinst_PROGRAMS		+=	loader
+endif
+loader_SOURCES		=	../../tests/loader.c
+endif
+
+if STARPU_HAVE_AM111
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+else
+TESTS_ENVIRONMENT 	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(LOADER_BIN)
+endif
+
+BUILT_SOURCES =
+
+CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log
+
+EXTRA_DIST =					\
+	axpy/axpy.jl				\
+	axpy/axpy.sh				\
+	black_scholes/black_scholes.jl		\
+	callback/callback.jl			\
+	callback/callback.sh			\
+	check_deps/check_deps.jl		\
+	check_deps/check_deps.sh		\
+	cholesky/cholesky_codelets.jl		\
+	cholesky/cholesky_common.jl		\
+	cholesky/cholesky_native.jl		\
+	cholesky/cholesky_implicit.jl		\
+	cholesky/cholesky_tag.jl		\
+	cholesky/cholesky.sh			\
+	dependency/end_dep.jl			\
+	dependency/end_dep.sh			\
+	dependency/tag_dep.jl			\
+	dependency/tag_dep.sh			\
+	dependency/task_dep.sh			\
+	dependency/task_dep.jl			\
+	gemm/gemm.jl				\
+	gemm/gemm_native.jl			\
+	gemm/gemm.sh				\
+	mandelbrot/mandelbrot_native.jl		\
+	mandelbrot/mandelbrot.jl		\
+	mandelbrot/mandelbrot.sh		\
+	mult/mult_native.jl			\
+	mult/mult.jl				\
+	mult/perf.sh				\
+	mult/mult_starpu.sh			\
+	task_insert_color/task_insert_color.jl	\
+	task_insert_color/task_insert_color.sh	\
+	variable/variable.jl			\
+	variable/variable_native.jl		\
+	variable/variable.sh			\
+	vector_scal/vector_scal.jl		\
+	vector_scal/vector_scal.sh
+
+examplebindir = $(libdir)/starpu/julia
+
+examplebin_PROGRAMS =
+
+if STARPU_USE_CUDA
+if STARPU_COVERITY
+include $(top_srcdir)/starpu-mynvcc.mk
+else
+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(HWLOC_CFLAGS)
+
+.cu.cubin:
+	$(V_nvcc) $(NVCC) -cubin $< -o $@ $(NVCCFLAGS)
+
+.cu.o:
+	$(V_nvcc) $(NVCC) $< -c -o $@ $(NVCCFLAGS)
+endif
+endif
+
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpujulia-@STARPU_EFFECTIVE_VERSION@.la -lm @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
+
+check_PROGRAMS = $(LOADER) $(starpu_julia_EXAMPLES)
+SHELL_TESTS	=
+STARPU_JULIA_EXAMPLES	=
+
+if BUILD_EXAMPLES
+examplebin_PROGRAMS 	+=	$(STARPU_JULIA_EXAMPLES)
+
+TESTS			=	$(SHELL_TESTS) $(STARPU_JULIA_EXAMPLES)
+endif
+
+######################
+#      Examples      #
+######################
+
+SHELL_TESTS	+=	check_deps/check_deps.sh
+
+STARPU_JULIA_EXAMPLES	+=	mult/mult
+mult_mult_SOURCES	=	mult/mult.c mult/cpu_mult.c
+SHELL_TESTS		+=	mult/mult_starpu.sh
+
+STARPU_JULIA_EXAMPLES				+=	task_insert_color/task_insert_color
+task_insert_color_task_insert_color_SOURCES	=	task_insert_color/task_insert_color.c
+SHELL_TESTS					+=	task_insert_color/task_insert_color.sh
+
+SHELL_TESTS	+=	variable/variable.sh
+SHELL_TESTS	+=	vector_scal/vector_scal.sh
+
+STARPU_JULIA_EXAMPLES		+=	mandelbrot/mandelbrot
+mandelbrot_mandelbrot_SOURCES	=	mandelbrot/mandelbrot.c mandelbrot/cpu_mandelbrot.c mandelbrot/cpu_mandelbrot.h
+SHELL_TESTS			+=	mandelbrot/mandelbrot.sh
+
+STARPU_JULIA_EXAMPLES		+= 	callback/callback
+callback_callback_SOURCES	=	callback/callback.c
+SHELL_TESTS			+=	callback/callback.sh
+
+SHELL_TESTS			+=	dependency/tag_dep.sh
+SHELL_TESTS			+=	dependency/task_dep.sh
+SHELL_TESTS			+=	dependency/end_dep.sh
+
+if !NO_BLAS_LIB
+SHELL_TESTS			+=	axpy/axpy.sh
+SHELL_TESTS			+=	cholesky/cholesky.sh
+SHELL_TESTS			+=	gemm/gemm.sh
+endif

+ 99 - 0
julia/examples/axpy/axpy.jl

@@ -0,0 +1,99 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+using Printf
+const EPSILON = 1e-6
+
+function check(alpha, X, Y)
+    for i in 1:length(X)
+        expected_value = alpha * X[i] + 4.0
+        if abs(Y[i] - expected_value) > expected_value * EPSILON
+            error("at ", i, ", ", alpha, "*", X[i], "+4.0=", Y[i], ", expected ", expected_value)
+        end
+    end
+end
+
+@target STARPU_CPU+STARPU_CUDA
+@codelet function axpy(X :: Vector{Float32}, Y :: Vector{Float32}, alpha ::Float32) :: Nothing
+    STARPU_SAXPY(length(X), alpha, X, 1, Y, 1)
+    return
+end
+
+function axpy(N, NBLOCKS, alpha, display = true)
+    X = Array(fill(1.0f0, N))
+    Y = Array(fill(4.0f0, N))
+
+    starpu_memory_pin(X)
+    starpu_memory_pin(Y)
+
+    block_filter = starpu_data_filter(STARPU_VECTOR_FILTER_BLOCK, NBLOCKS)
+
+    if display
+        println("BEFORE x[0] = ", X[1])
+        println("BEFORE y[0] = ", Y[1])
+    end
+
+    t_start = time_ns()
+
+    @starpu_block let
+        hX,hY = starpu_data_register(X, Y)
+
+        starpu_data_partition(hX, block_filter)
+        starpu_data_partition(hY, block_filter)
+
+        for b in 1:NBLOCKS
+            starpu_task_insert(codelet_name = "axpy",
+                               handles = [hX[b], hY[b]],
+                               cl_arg = (Float32(alpha),),
+                               tag = starpu_tag_t(b),
+                               modes = [STARPU_R, STARPU_RW])
+        end
+
+        starpu_task_wait_for_all()
+    end
+
+    t_end = time_ns()
+
+    timing = (t_end-t_start)/1000
+
+    if display
+        @printf("timing -> %d us %.2f MB/s\n", timing, 3*N*4/timing)
+        println("AFTER y[0] = ", Y[1], " (ALPHA=", alpha, ")")
+    end
+
+    check(alpha, X, Y)
+
+    starpu_memory_unpin(X)
+    starpu_memory_unpin(Y)
+end
+
+function main()
+    N = 16 * 1024 * 1024
+    NBLOCKS = 8
+    alpha = 3.41
+
+    starpu_init()
+    starpu_cublas_init()
+
+    # warmup
+    axpy(10, 1, alpha, false)
+
+    axpy(N, NBLOCKS, alpha)
+
+    starpu_shutdown()
+end
+
+main()

+ 77 - 0
julia/examples/callback/callback.jl

@@ -0,0 +1,77 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+
+@target STARPU_CPU
+@codelet function variable(val ::Ref{Int32}) :: Nothing
+    val[] = val[] + 1
+
+    return
+end
+
+function callback(args)
+    cl = args[1]
+    handles = args[2]
+
+    task = starpu_task(cl = cl, handles=handles)
+    starpu_task_submit(task)
+end
+
+function variable_with_starpu(val ::Ref{Int32})
+    perfmodel = starpu_perfmodel(
+        perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+        symbol = "history_perf"
+    )
+
+    cl = starpu_codelet(
+        cpu_func = "variable",
+        modes = [STARPU_RW],
+        perfmodel = perfmodel
+    )
+
+    @starpu_block let
+	hVal = starpu_data_register(val)
+
+        starpu_task_insert(codelet_name = "variable",
+                           cl = cl,
+                           handles = [hVal],
+                           callback = callback,
+                           callback_arg = (cl, [hVal]))
+
+        starpu_task_wait_for_all()
+    end
+end
+
+function display()
+    v = Ref(Int32(40))
+
+    variable_with_starpu(v)
+
+    println("variable -> ", v[])
+    if v[] == 42
+        println("result is correct")
+    else
+        error("result is incorret")
+    end
+end
+
+# Disable garbage collector because of random segfault/hang when using mutex.
+# This issue should be solved with Julia release 1.5.
+GC.enable(false)
+starpu_init()
+display()
+starpu_shutdown()
+GC.enable(true)

+ 32 - 0
julia/examples/check_deps/check_deps.jl

@@ -0,0 +1,32 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+import Pkg
+
+try
+    using CBinding
+    using Clang
+    using ThreadPools
+catch
+    Pkg.activate((@__DIR__)*"/../..")
+    Pkg.instantiate()
+    using Clang
+    using CBinding
+    using ThreadPools
+end
+
+using StarPU
+
+starpu_translate_headers()

+ 20 - 0
julia/examples/cholesky/cholesky.sh

@@ -0,0 +1,20 @@
+#!/bin/bash
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+$(dirname $0)/../execute.sh cholesky/cholesky_native.jl
+$(dirname $0)/../execute.sh cholesky/cholesky_implicit.jl
+$(dirname $0)/../execute.sh cholesky/cholesky_tag.jl

+ 53 - 0
julia/examples/cholesky/cholesky_codelets.jl

@@ -0,0 +1,53 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+chol_model11 = starpu_perfmodel(
+    perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+    symbol = "chol_model11"
+)
+
+chol_model21 = starpu_perfmodel(
+    perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+    symbol = "chol_model21"
+)
+
+chol_model22 = starpu_perfmodel(
+    perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+    symbol = "chol_model22"
+)
+
+cl_11 = starpu_codelet(
+    cpu_func = "u11",
+    # This kernel cannot be translated to CUDA yet.
+    # cuda_func = "u11",
+    modes = [STARPU_RW],
+    color = 0xffff00,
+    perfmodel = chol_model11
+)
+cl_21 = starpu_codelet(
+    cpu_func = "u21",
+    cuda_func = "u21",
+    modes = [STARPU_R, STARPU_RW],
+    color = 0x8080ff,
+    perfmodel = chol_model21
+)
+cl_22 = starpu_codelet(
+    cpu_func = "u22",
+    cuda_func = "u22",
+    modes = [STARPU_R, STARPU_R, STARPU_RW],
+    color = 0x00ff00,
+    perfmodel = chol_model22
+)

+ 155 - 0
julia/examples/cholesky/cholesky_common.jl

@@ -0,0 +1,155 @@
+# Standard kernels for the Cholesky factorization
+# U22 is the gemm update
+# U21 is the trsm update
+# U11 is the cholesky factorization
+
+@target STARPU_CPU+STARPU_CUDA
+@codelet function u11(sub11 :: Matrix{Float32}) :: Nothing
+    nx :: Int32 = width(sub11)
+    ld :: Int32 = ld(sub11)
+
+    for z in 0:nx-1
+        lambda11 :: Float32 = sqrt(sub11[z+1,z+1])
+        sub11[z+1,z+1] = lambda11
+
+        alpha ::Float32 = 1.0f0 / lambda11
+        X :: Vector{Float32} = view(sub11, z+2:z+2+(nx-z-2), z+1)
+        STARPU_SSCAL(nx-z-1, alpha, X, 1)
+
+        alpha = -1.0f0
+        A :: Matrix{Float32} = view(sub11, z+2:z+2+(nx-z-2), z+2:z+2+(nx-z-2))
+	STARPU_SSYR("L", nx-z-1, alpha, X, 1, A, ld)
+    end
+    return
+end
+
+@target STARPU_CPU+STARPU_CUDA
+@codelet function u21(sub11 :: Matrix{Float32},
+                      sub21 :: Matrix{Float32}) :: Nothing
+    ld11 :: Int32 = ld(sub11)
+    ld21 :: Int32 = ld(sub21)
+    nx21 :: Int32 = width(sub21)
+    ny21 :: Int32 = height(sub21)
+    alpha :: Float32 = 1.0f0
+    STARPU_STRSM("R", "L", "T", "N", nx21, ny21, alpha, sub11, ld11, sub21, ld21)
+    return
+end
+
+@target STARPU_CPU+STARPU_CUDA
+@codelet function u22(left   :: Matrix{Float32},
+                      right  :: Matrix{Float32},
+                      center :: Matrix{Float32}) :: Nothing
+    dx :: Int32 = width(center)
+    dy :: Int32 = height(center)
+    dz :: Int32 = width(left)
+    ld21 :: Int32 = ld(left)
+    ld12 :: Int32 = ld(center)
+    ld22 :: Int32 = ld(right)
+    alpha :: Float32 = -1.0f0
+    beta :: Float32 = 1.0f0
+    STARPU_SGEMM("N", "T", dy, dx, dz, alpha, left, ld21, right, ld12, beta, center, ld22)
+    return
+end
+
+@inline function tag11(k)
+    return starpu_tag_t((UInt64(1)<<60) | UInt64(k))
+end
+
+@inline function tag21(k, j)
+    return starpu_tag_t((UInt64(3)<<60) | (UInt64(k)<<32) |  UInt64(j))
+end
+
+@inline function tag22(k, i, j)
+    return starpu_tag_t((UInt64(4)<<60) | (UInt64(k)<<32) | (UInt64(i)<<16) |  UInt64(j))
+end
+
+function check(mat::Matrix{Float32})
+    size_p = size(mat, 1)
+
+    for i in 1:size_p
+        for j in 1:size_p
+            if j > i
+                mat[i, j] = 0.0f0
+            end
+        end
+    end
+
+    test_mat ::Matrix{Float32} = zeros(Float32, size_p, size_p)
+
+    syrk!('L', 'N', 1.0f0, mat, 0.0f0, test_mat)
+
+    for i in 1:size_p
+        for j in 1:size_p
+            if j <= i
+                orig = (1.0f0/(1.0f0+(i-1)+(j-1))) + ((i == j) ? 1.0f0*size_p : 0.0f0)
+                err = abs(test_mat[i,j] - orig) / orig
+                if err > 0.0001
+                    got = test_mat[i,j]
+                    expected = orig
+                    error("[$i, $j] -> $got != $expected (err $err)")
+                end
+            end
+        end
+    end
+
+    println("Verification successful !")
+end
+
+function clean_tags(nblocks)
+    for k in 1:nblocks
+        starpu_tag_remove(tag11(k))
+
+        for m in k+1:nblocks
+            starpu_tag_remove(tag21(k, m))
+
+            for n in k+1:nblocks
+                if n <= m
+                    starpu_tag_remove(tag22(k, m, n))
+                end
+            end
+        end
+    end
+end
+
+function main(size_p :: Int, nblocks :: Int; verify = false, verbose = false)
+    mat :: Matrix{Float32} = zeros(Float32, size_p, size_p)
+
+    # create a simple definite positive symetric matrix
+    # Hilbert matrix h(i,j) = 1/(i+j+1)
+
+    for i in 1:size_p
+        for j in 1:size_p
+            mat[i, j] = 1.0f0 / (1.0f0+(i-1)+(j-1)) + ((i == j) ? 1.0f0*size_p : 0.0f0)
+        end
+    end
+
+    if verbose
+        display(mat)
+    end
+
+    starpu_memory_pin(mat)
+
+    t_start = time_ns()
+
+    cholesky(mat, size_p, nblocks)
+
+    t_end = time_ns()
+
+    starpu_memory_unpin(mat)
+
+    flop = (1.0*size_p*size_p*size_p)/3.0
+    println("# size\tms\tGFlops")
+    time_ms = (t_end-t_start) / 1e6
+    gflops = flop/(time_ms*1000)/1000
+    println("# $size_p\t$time_ms\t$gflops")
+
+    clean_tags(nblocks)
+
+    if verbose
+        display(mat)
+    end
+
+    if verify
+        check(mat)
+    end
+end

+ 64 - 0
julia/examples/cholesky/cholesky_implicit.jl

@@ -0,0 +1,64 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+using LinearAlgebra.BLAS
+
+include("cholesky_common.jl")
+
+function cholesky(mat :: Matrix{Float32}, size, nblocks)
+    include("cholesky_codelets.jl")
+
+    horiz = starpu_data_filter(STARPU_MATRIX_FILTER_BLOCK, nblocks)
+    vert = starpu_data_filter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nblocks)
+
+    @starpu_block let
+        h_mat = starpu_data_register(mat)
+        starpu_data_map_filters(h_mat, horiz, vert)
+
+        for k in 1:nblocks
+
+            starpu_iteration_push(k)
+
+            starpu_task_insert(cl = cl_11, handles = [h_mat[k, k]], tag_only = tag11(k))
+
+            for m in k+1:nblocks
+                starpu_task_insert(cl = cl_21, handles = [h_mat[k, k], h_mat[m, k]], tag_only = tag21(m, k))
+            end
+            starpu_data_wont_use(h_mat[k, k])
+
+            for m in k+1:nblocks
+                for n in k+1:nblocks
+                    if n <= m
+                        starpu_task_insert(cl = cl_22, handles = [h_mat[m, k], h_mat[n, k], h_mat[m, n]], tag_only= tag22(k, m, n))
+                    end
+                end
+                starpu_data_wont_use(h_mat[m, k])
+            end
+
+            starpu_iteration_pop()
+        end
+
+        starpu_task_wait_for_all()
+    end
+end
+
+starpu_init()
+starpu_cublas_init()
+
+main(1024, 8, verify = true)
+main(15360, 16)
+
+starpu_shutdown()

+ 139 - 0
julia/examples/cholesky/cholesky_native.jl

@@ -0,0 +1,139 @@
+using LinearAlgebra.BLAS
+
+function u11(sub11)
+    nx = size(sub11, 1)
+    ld = size(sub11, 1)
+
+    for z in 0:nx-1
+        lambda11::Float32 = sqrt(sub11[z+1,z+1])
+        sub11[z+1,z+1] = lambda11
+        if lambda11 == 0.0f0
+            error("lamda11")
+        end
+
+        X = view(sub11, z+2:z+2+(nx-z-2), z+1)
+        scal!(nx-z-1, 1.0f0/lambda11, X, 1)
+
+        A = view(sub11, z+2:z+2+(nx-z-2), z+2:z+2+(nx-z-2))
+        syr!('L', -1.0f0, X, A)
+    end
+end
+
+function u21(sub11, sub21)
+    trsm!('R', 'L', 'T', 'N', 1.0f0, sub11, sub21)
+end
+
+function u22(left, right, center)
+    gemm!('N', 'T', -1.0f0, left, right, 1.0f0, center)
+end
+
+function get_block(mat :: Matrix{Float32}, m, n, nblocks)
+    dim = size(mat, 1)
+    if dim != size(mat,2)
+        error("mat must be a square matrix")
+    end
+    if dim % nblocks != 0
+        error("dim must be a multiple of nblocks")
+    end
+
+    stride = Int(dim/nblocks)
+
+    return view(mat,
+                m*stride+1:(m+1)*stride,
+                n*stride+1:(n+1)*stride)
+end
+
+function cholesky(mat :: Matrix{Float32}, size, nblocks)
+    for k in 0:nblocks-1
+        sdatakk = get_block(mat, k, k, nblocks)
+        u11(sdatakk)
+
+        for m in k+1:nblocks-1
+            sdatamk = get_block(mat, m, k, nblocks)
+            u21(sdatakk, sdatamk)
+        end
+
+        for m in k+1:nblocks-1
+            sdatamk = get_block(mat, m, k, nblocks)
+
+            for n in k+1:nblocks-1
+                if n <= m
+                    sdatank = get_block(mat, n, k, nblocks)
+                    sdatamn = get_block(mat, m, n, nblocks)
+                    u22(sdatamk, sdatank, sdatamn)
+                end
+            end
+        end
+
+    end
+end
+
+function check(mat::Matrix{Float32})
+    size_p = size(mat, 1)
+
+    for i in 1:size_p
+        for j in 1:size_p
+            if j > i
+                mat[i, j] = 0.0f0
+            end
+        end
+    end
+
+    test_mat ::Matrix{Float32} = zeros(Float32, size_p, size_p)
+
+    syrk!('L', 'N', 1.0f0, mat, 0.0f0, test_mat)
+
+    for i in 1:size_p
+        for j in 1:size_p
+            if j <= i
+                orig = (1.0f0/(1.0f0+(i-1)+(j-1))) + ((i == j) ? 1.0f0*size_p : 0.0f0)
+                err = abs(test_mat[i,j] - orig) / orig
+                if err > 0.0001
+                    got = test_mat[i,j]
+                    expected = orig
+                    error("[$i, $j] -> $got != $expected (err $err)")
+                end
+            end
+        end
+    end
+
+    println("Verification successful !")
+end
+
+function main(size_p :: Int, nblocks :: Int, display = false)
+    mat :: Matrix{Float32} = zeros(Float32, size_p, size_p)
+
+    # create a simple definite positive symetric matrix
+    # Hilbert matrix h(i,j) = 1/(i+j+1)
+
+    for i in 1:size_p
+        for j in 1:size_p
+            mat[i, j] = 1.0f0 / (1.0f0+(i-1)+(j-1)) + ((i == j) ? 1.0f0*size_p : 0.0f0)
+        end
+    end
+
+    if display
+        display(mat)
+    end
+
+    t_start = time_ns()
+
+    cholesky(mat, size_p, nblocks)
+
+    t_end = time_ns()
+
+    flop = (1.0*size_p*size_p*size_p)/3.0
+    println("# size\tms\tGFlops")
+    time_ms = (t_end-t_start) / 1e6
+    gflops = flop/(time_ms*1000)/1000
+    println("# $size_p\t$time_ms\t$gflops")
+
+    if display
+        display(mat)
+    end
+
+    check(mat)
+end
+
+main(1024*20, 8)
+

+ 86 - 0
julia/examples/cholesky/cholesky_tag.jl

@@ -0,0 +1,86 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+using LinearAlgebra.BLAS
+
+include("cholesky_common.jl")
+
+function cholesky(mat :: Matrix{Float32}, size, nblocks)
+    include("cholesky_codelets.jl")
+
+    horiz = starpu_data_filter(STARPU_MATRIX_FILTER_BLOCK, nblocks)
+    vert = starpu_data_filter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nblocks)
+
+    @starpu_block let
+        h_mat = starpu_data_register(mat)
+        starpu_data_set_sequential_consistency_flag(h_mat, 0)
+        starpu_data_map_filters(h_mat, horiz, vert)
+
+        entry_task = starpu_task(cl = cl_11,
+                                 handles = [h_mat[1, 1]],
+                                 tag = tag11(1))
+
+        for k in 1:nblocks
+
+            starpu_iteration_push(k)
+
+            if k > 1
+                # enforce dependencies...
+                starpu_tag_declare_deps(tag11(k), tag22(k-1, k, k))
+                starpu_task_insert(cl = cl_11,
+                                   handles = [h_mat[k, k]],
+                                   tag = tag11(k))
+            end
+
+            for m in k+1:nblocks
+                # enforce dependencies...
+                if k > 1
+                    starpu_tag_declare_deps(tag21(k, m), tag11(k), tag22(k-1, m, k))
+                else
+                    starpu_tag_declare_deps(tag21(k, m), tag11(k))
+                end
+
+                starpu_task_insert(cl = cl_21, handles = [h_mat[k, k], h_mat[m, k]], tag = tag21(k, m))
+
+                for n in k+1:nblocks
+                    if n <= m
+                        # enforce dependencies...
+                        if k > 1
+                            starpu_tag_declare_deps(tag22(k, m, n), tag22(k-1, m, n), tag21(k, n), tag21(k, m))
+                        else
+                            starpu_tag_declare_deps(tag22(k, m, n), tag21(k, n), tag21(k, m))
+                        end
+
+                        starpu_task_insert(cl = cl_22, handles = [h_mat[m, k], h_mat[n, k], h_mat[m, n]], tag = tag22(k, m, n))
+                    end
+                end
+            end
+
+            starpu_iteration_pop()
+        end
+
+        starpu_task_submit(entry_task)
+        starpu_tag_wait(tag11(nblocks))
+    end
+end
+
+starpu_init()
+starpu_cublas_init()
+
+main(1024, 8, verify = true)
+main(15360, 16)
+
+starpu_shutdown()

+ 104 - 0
julia/examples/dependency/end_dep.jl

@@ -0,0 +1,104 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+
+@target STARPU_CPU
+@codelet function codeletA() :: Nothing
+    # print("[Task A] Value = ", val[]);
+    # do nothing
+end
+
+@target STARPU_CPU
+@codelet function codeletB(val ::Ref{Int32}) :: Nothing
+    # println("[Task B] Value = ", val[]);
+    val[] = val[] *2
+end
+
+function callbackB(task)
+    sleep(1)
+    starpu_task_end_dep_release(task)
+end
+
+@target STARPU_CPU
+@codelet function codeletC(val ::Ref{Int32}) :: Nothing
+    # println("[Task C] Value = ", val[]);
+    val[] = val[] *2
+end
+
+function callbackC(task)
+    starpu_task_end_dep_release(task)
+end
+
+
+function main()
+    value = Ref(Int32(12))
+
+    @starpu_block let
+        perfmodel = starpu_perfmodel(
+            perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+            symbol = "history_perf"
+        )
+
+        clA = starpu_codelet(
+            cpu_func = "codeletA",
+            perfmodel = perfmodel
+        )
+        clB = starpu_codelet(
+            cpu_func = "codeletB",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+        clC = starpu_codelet(
+            cpu_func = "codeletC",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+
+        handle = starpu_data_register(value)
+
+	starpu_data_set_sequential_consistency_flag(handle, 0)
+
+        taskA = starpu_task(cl = clA, detach=0)
+        taskB = starpu_task(cl = clB, handles = [handle], callback=callbackB, callback_arg=taskA)
+	taskC = starpu_task(cl = clC, handles = [handle], callback=callbackC, callback_arg=taskA)
+
+	starpu_task_end_dep_add(taskA, 2)
+        starpu_task_declare_deps(taskC, taskB)
+
+        starpu_task_submit(taskA)
+        starpu_task_submit(taskB)
+        starpu_task_submit(taskC)
+        starpu_task_wait(taskA)
+
+        starpu_data_acquire_on_node(handle, STARPU_MAIN_RAM, STARPU_R);
+	# Waiting for taskA should have also waited for taskB and taskC
+        if value[] != 48
+            error("Incorrect value $(value[]) (expected 48)")
+        end
+	starpu_data_release_on_node(handle, STARPU_MAIN_RAM);
+    end
+
+
+    println("Value = ", value[])
+end
+
+# Disable garbage collector because of random segfault/hang when using mutex.
+# This issue should be solved with Julia release 1.5.
+GC.enable(false)
+starpu_init()
+main()
+starpu_shutdown()
+GC.enable(true)

+ 122 - 0
julia/examples/dependency/tag_dep.jl

@@ -0,0 +1,122 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+
+@target STARPU_CPU
+@codelet function codeletA(val ::Ref{Int32}) :: Nothing
+    # print("[Task A] Value = ", val[]);
+    val[] = val[] * 2
+end
+
+function callbackA(arg)
+    clB = arg[1]
+    handle = arg[2]
+    tagHoldC = arg[3]
+
+    taskB = starpu_task(cl = clB, handles = [handle],
+                        callback = starpu_tag_notify_from_apps,
+                        callback_arg = tagHoldC,
+                        sequential_consistency=false)
+
+    starpu_task_submit(taskB)
+end
+
+@target STARPU_CPU
+@codelet function codeletB(val ::Ref{Int32}) :: Nothing
+    # println("[Task B] Value = ", val[]);
+    val[] = val[] +1
+end
+
+@target STARPU_CPU
+@codelet function codeletC(val ::Ref{Int32}) :: Nothing
+    # println("[Task C] Value = ", val[]);
+    val[] = val[] *2
+end
+
+
+# Submit taskA and hold it
+# Submit taskC and hold it
+# Release taskA
+# Execute taskA       --> callback: submit taskB
+# Execute taskB       --> callback: release taskC
+#
+# All three tasks use the same data in RW, taskB is submitted after
+# taskC, so taskB should normally only execute after taskC but as the
+# sequential consistency for (taskB, data) is unset, taskB can
+# execute straightaway
+function main()
+    value = Ref(Int32(12))
+
+    @starpu_block let
+    tagHoldA :: starpu_tag_t = 32
+    tagHoldC :: starpu_tag_t = 84
+    tagA :: starpu_tag_t = 421
+    tagC :: starpu_tag_t = 842
+
+    starpu_tag_declare_deps(tagA, tagHoldA)
+    starpu_tag_declare_deps(tagC, tagHoldC)
+
+    perfmodel = starpu_perfmodel(
+        perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+        symbol = "history_perf"
+    )
+
+        clA = starpu_codelet(
+            cpu_func = "codeletA",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+        clB = starpu_codelet(
+            cpu_func = "codeletB",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+        clC = starpu_codelet(
+            cpu_func = "codeletC",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+
+        handle = starpu_data_register(value)
+
+        taskA = starpu_task(cl = clA, handles = [handle], tag = tagA,
+                            callback = callbackA,
+                            callback_arg=(clB, handle, tagHoldC))
+        starpu_task_submit(taskA)
+
+        taskC = starpu_task(cl = clC, handles = [handle], tag = tagC)
+        starpu_task_submit(taskC)
+
+        # Release taskA (we want to make sure it will execute after taskC has been submitted)
+        starpu_tag_notify_from_apps(tagHoldA)
+
+        starpu_task_wait_for_all()
+    end
+
+    if value[] != 50
+        error("Incorrect value $(value[]) (expected 50)")
+    end
+
+    println("Value = ", value[])
+end
+
+# Disable garbage collector because of random segfault/hang when using mutex.
+# This issue should be solved with Julia release 1.5.
+GC.enable(false)
+starpu_init()
+main()
+starpu_shutdown()
+GC.enable(true)

+ 88 - 0
julia/examples/dependency/task_dep.jl

@@ -0,0 +1,88 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+
+@target STARPU_CPU
+@codelet function codeletA(val ::Ref{Int32}) :: Nothing
+    # print("[Task A] Value = ", val[]);
+    val[] = val[] * 2
+end
+
+@target STARPU_CPU
+@codelet function codeletB(val ::Ref{Int32}) :: Nothing
+    # println("[Task B] Value = ", val[]);
+    val[] = val[] +1
+end
+
+@target STARPU_CPU
+@codelet function codeletC(val ::Ref{Int32}) :: Nothing
+    # println("[Task C] Value = ", val[]);
+    val[] = val[] *2
+end
+
+function main()
+    value = Ref(Int32(12))
+
+    @starpu_block let
+        perfmodel = starpu_perfmodel(
+            perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+            symbol = "history_perf"
+        )
+
+        clA = starpu_codelet(
+            cpu_func = "codeletA",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+        clB = starpu_codelet(
+            cpu_func = "codeletB",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+        clC = starpu_codelet(
+            cpu_func = "codeletC",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+
+        starpu_data_set_default_sequential_consistency_flag(0)
+
+        handle = starpu_data_register(value)
+
+        taskA = starpu_task(cl = clA, handles = [handle])
+        taskB = starpu_task(cl = clB, handles = [handle])
+        taskC = starpu_task(cl = clC, handles = [handle])
+
+        starpu_task_declare_deps(taskA, taskB)
+        starpu_task_declare_deps(taskC, taskA, taskB)
+
+        starpu_task_submit(taskA)
+        starpu_task_submit(taskB)
+        starpu_task_submit(taskC)
+
+        starpu_task_wait_for_all()
+    end
+
+    if value[] != 52
+        error("Incorrect value $(value[]) (expected 52)")
+    end
+
+    println("Value = ", value[])
+end
+
+starpu_init()
+main()
+starpu_shutdown()

+ 53 - 0
julia/examples/execute.sh.in

@@ -0,0 +1,53 @@
+#!@REALBASH@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+set -x
+export JULIA_LOAD_PATH=@STARPU_SRC_DIR@/julia:$JULIA_LOAD_PATH
+export STARPU_BUILD_DIR=@STARPU_BUILD_DIR@
+export STARPU_SRC_DIR=@STARPU_SRC_DIR@
+export STARPU_JULIA_LIB=@STARPU_BUILD_DIR@/julia/src/.libs/libstarpujulia-1.3
+export STARPU_JULIA_BUILD=@STARPU_BUILD_DIR@/julia
+export LD_LIBRARY_PATH=@STARPU_BUILD_DIR@/julia/src/.libs/:$LD_LIBRARY_PATH
+export JULIA_NUM_THREADS=8
+export STARPU_NOPENCL=0
+export STARPU_SCHED=dmda
+
+srcdir=@STARPU_SRC_DIR@/julia/examples
+
+rm -f genc*.c gencuda*.cu genc*.o
+
+if test "$1" == "-calllib"
+then
+    shift
+    pwd
+    rm -f extern_tasks.so
+    make -f @STARPU_BUILD_DIR@/julia/src/dynamic_compiler/Makefile extern_tasks.so SOURCES_CPU=$srcdir/$1
+    shift
+    export JULIA_TASK_LIB=$PWD/extern_tasks.so
+fi
+
+srcfile=$1
+if test ! -f $srcdir/$srcfile
+then
+    echo "Error. File $srcdir/$srcfile not found"
+    exit 1
+fi
+shift
+#cd $srcdir/$(dirname $srcfile)
+#@JULIA@ $(basename $srcfile) $*
+@JULIA@ $srcdir/$srcfile $*
+

+ 145 - 0
julia/examples/gemm/gemm.jl

@@ -0,0 +1,145 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+using LinearAlgebra.BLAS
+using BenchmarkTools
+
+@target STARPU_CPU+STARPU_CUDA
+@codelet function gemm(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32) :: Nothing
+
+    M :: Int32 = height(A)
+    N :: Int32 = width(B)
+    K :: Int32 = width(A)
+    lda :: Int32 = ld(A)
+    ldb :: Int32 = ld(B)
+    ldc :: Int32 = ld(C)
+    STARPU_SGEMM("N", "N", M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)
+
+    return
+end
+
+function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32, nslicesx, nslicesy)
+    scale= 3
+    tmin=0
+    vert = starpu_data_filter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesx)
+    horiz = starpu_data_filter(STARPU_MATRIX_FILTER_BLOCK, nslicesy)
+    @starpu_block let
+        hA,hB,hC = starpu_data_register(A, B, C)
+        starpu_data_partition(hB, vert)
+        starpu_data_partition(hA, horiz)
+        starpu_data_map_filters(hC, vert, horiz)
+        tmin=0
+
+        for i in (1 : 10 )
+            t=time_ns()
+            @starpu_sync_tasks begin
+                for taskx in (1 : nslicesx)
+                    for tasky in (1 : nslicesy)
+                        starpu_task_insert(codelet_name = "gemm",
+                                           handles = [hA[tasky], hB[taskx], hC[taskx, tasky]],
+                                           cl_arg = (alpha, beta),
+                                           modes = [STARPU_R, STARPU_R, STARPU_RW])
+                    end
+                end
+            end
+            t=time_ns()-t
+            if (tmin==0 || tmin>t)
+                tmin=t
+            end
+        end
+    end
+    return tmin
+end
+
+
+function approximately_equals(
+    A :: Matrix{Cfloat},
+    B :: Matrix{Cfloat},
+    eps = 1e-2
+)
+    (height, width) = size(A)
+
+    for j in (1 : width)
+        for i in (1 : height)
+            if (abs(A[i,j] - B[i,j]) > eps * max(abs(B[i,j]), abs(A[i,j])))
+                println("A[$i,$j] : $(A[i,j]), B[$i,$j] : $(B[i,j])")
+                return false
+            end
+        end
+    end
+
+    return true
+end
+
+function check(expected, A, B, C, alpha, beta)
+    for i in 1 : 10
+        gemm!('N', 'N', alpha, A, B, beta, expected)
+    end
+
+    height,width = size(C)
+    for i in 1:height
+        for j in 1:width
+            got = C[i, j]
+            exp = expected[i, j]
+
+            err = abs(exp - got) / exp
+            if err > 0.0001
+                error("[$i] -> $got != $exp (err $err)")
+            end
+        end
+    end
+end
+
+function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
+    for dim in (start_dim : step_dim : stop_dim)
+        A = Array(rand(Cfloat, dim, dim))
+        B = Array(rand(Cfloat, dim, dim))
+        C = zeros(Float32, dim, dim)
+        C_ref = copy(C)
+        starpu_memory_pin(A)
+        starpu_memory_pin(B)
+        starpu_memory_pin(C)
+        alpha = 4.0f0
+        beta = 2.0f0
+        mt =  multiply_with_starpu(A, B, C, alpha, beta, nslicesx, nslicesy)
+        gflop = 2 * dim * dim * dim * 1.e-9
+        gflops = gflop / (mt * 1.e-9)
+        size=dim*dim*dim*4*3/1024/1024
+        println(io,"$dim $gflops")
+        println("$dim $gflops")
+        starpu_memory_unpin(A)
+        starpu_memory_unpin(B)
+        starpu_memory_unpin(C)
+        check(C_ref, A, B, C, alpha, beta)
+    end
+end
+
+if size(ARGS, 1) < 1
+    filename="x.dat"
+else
+    filename=ARGS[1]
+end
+
+starpu_init()
+starpu_cublas_init()
+nblock_x = Int32(ceil(sqrt(starpu_worker_get_count())))
+nblock_y = nblock_x
+io=open(filename,"w")
+compute_times(io,64,512,4096,nblock_x,nblock_y)
+close(io)
+
+starpu_shutdown()
+

+ 22 - 0
julia/examples/gemm/gemm.sh

@@ -0,0 +1,22 @@
+#!/bin/bash
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+$(dirname $0)/../execute.sh gemm/gemm_native.jl
+
+export OMP_NUM_THREADS=1
+$(dirname $0)/../execute.sh gemm/gemm.jl
+

+ 146 - 0
julia/examples/gemm/gemm_bare.jl

@@ -0,0 +1,146 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+using LinearAlgebra.BLAS
+
+@target STARPU_CPU+STARPU_CUDA
+@codelet function gemm(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32) :: Nothing
+
+    M :: Int32 = height(A)
+    N :: Int32 = width(B)
+    K :: Int32 = width(A)
+    lda :: Int32 = ld(A)
+    ldb :: Int32 = ld(B)
+    ldc :: Int32 = ld(C)
+    STARPU_SGEMM("N", "N", M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)
+
+    return
+end
+
+function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32, nslicesx, nslicesy)
+    scale= 3
+    tmin=0
+    hA,hB,hC = starpu_data_register(A, B, C)
+    tmin=0
+    perfmodel = starpu_perfmodel(
+        perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+        symbol = "gemm"
+    )
+    cl = starpu_codelet(
+        cpu_func  = "gemm",
+        cuda_func = "",
+        modes =[STARPU_R,STARPU_R,STARPU_RW], 
+        perfmodel = perfmodel,
+    )
+    task = starpu_task(cl = cl, handles =[hA,hB,hC], cl_arg = (alpha,beta), callback = nothing,
+		callback_arg = nothing, tag = nothing, tag_only = nothing,
+                       sequential_consistency = true,
+                       detach = 1, color = nothing, where = nothing)
+
+
+    for i in (1 : 10 )
+        t=time_ns()
+starpu_task_submit(Ref(task.c_task))
+        #starpu_task_submit(task)
+        starpu_task_wait_for_all()
+        t=time_ns()-t
+	if (tmin==0 || tmin>t)
+           tmin=t
+        end
+    end
+    starpu_data_unregister(hA)
+    starpu_data_unregister(hB)
+    starpu_data_unregister(hC)
+    return tmin
+end
+
+
+function approximately_equals(
+    A :: Matrix{Cfloat},
+    B :: Matrix{Cfloat},
+    eps = 1e-2
+)
+    (height, width) = size(A)
+
+    for j in (1 : width)
+        for i in (1 : height)
+            if (abs(A[i,j] - B[i,j]) > eps * max(abs(B[i,j]), abs(A[i,j])))
+                println("A[$i,$j] : $(A[i,j]), B[$i,$j] : $(B[i,j])")
+                return false
+            end
+        end
+    end
+
+    return true
+end
+
+function check(expected, A, B, C, alpha, beta)
+    for i in 1 : 10
+        gemm!('N', 'N', alpha, A, B, beta, expected)
+    end
+
+    height,width = size(C)
+    for i in 1:height
+        for j in 1:width
+            got = C[i, j]
+            exp = expected[i, j]
+
+            err = abs(exp - got) / exp
+            if err > 0.0001
+                error("[$i] -> $got != $exp (err $err)")
+            end
+        end
+    end
+end
+
+function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
+    for dim in (start_dim : step_dim : stop_dim)
+        A = Array(rand(Cfloat, dim, dim))
+        B = Array(rand(Cfloat, dim, dim))
+        C = zeros(Float32, dim, dim)
+        C_ref = copy(C)
+        starpu_memory_pin(A)
+        starpu_memory_pin(B)
+        starpu_memory_pin(C)
+        alpha = 4.0f0
+        beta = 2.0f0
+        mt =  multiply_with_starpu(A, B, C, alpha, beta, nslicesx, nslicesy)
+        gflop = 2 * dim * dim * dim * 1.e-9
+        gflops = gflop / (mt * 1.e-9)
+        size=dim*dim*dim*4*3/1024/1024
+        println(io,"$dim $gflops")
+        println("$dim $gflops")
+        starpu_memory_unpin(A)
+        starpu_memory_unpin(B)
+        starpu_memory_unpin(C)
+        #check(C_ref, A, B, C, alpha, beta)
+    end
+end
+
+if size(ARGS, 1) < 1
+    filename="x.dat"
+else
+    filename=ARGS[1]
+end
+
+starpu_init()
+starpu_cublas_init()
+io=open(filename,"w")
+compute_times(io,64,512,4096,1,1)
+close(io)
+
+starpu_shutdown()
+

+ 56 - 0
julia/examples/gemm/gemm_native.jl

@@ -0,0 +1,56 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using LinearAlgebra.BLAS
+
+function gemm_without_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, alpha :: Float32, beta :: Float32)
+    tmin = 0
+    for i in (1 : 10 )
+        t=time_ns()
+        gemm!('N', 'N', alpha, A, B, beta, C)
+        t=time_ns() - t
+        if (tmin==0 || tmin>t)
+            tmin=t
+        end
+    end
+    return tmin
+end
+
+
+function compute_times(io,start_dim, step_dim, stop_dim)
+    for dim in (start_dim : step_dim : stop_dim)
+        A = Array(rand(Cfloat, dim, dim))
+        B = Array(rand(Cfloat, dim, dim))
+        C = zeros(Float32, dim, dim)
+        alpha = 4.0f0
+        beta = 2.0f0
+        mt =  gemm_without_starpu(A, B, C, alpha, beta)
+        gflop = 2 * dim * dim * dim * 1.e-9
+        gflops = gflop / (mt * 1.e-9)
+        size=dim*dim*dim*4*3/1024/1024
+        println(io,"$dim $gflops")
+        println("$dim $gflops")
+    end
+end
+
+if size(ARGS, 1) < 1
+    filename="x.dat"
+else
+    filename=ARGS[1]
+end
+io=open(filename,"w")
+compute_times(io,64,512,4096)
+close(io)
+

+ 33 - 14
julia/examples/mandelbrot/cpu_mandelbrot.c

@@ -1,44 +1,62 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
 #include <stdio.h>
 #include <starpu.h>
 #include <math.h>
+#include "cpu_mandelbrot.h"
 
 void cpu_mandelbrot(void *descr[], void *cl_arg)
 {
         long long *pixels;
-        float *params;
 
         pixels = (long long int *)STARPU_MATRIX_GET_PTR(descr[0]);
-        params = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+        struct params *params = (struct params *) cl_arg;
 
-        long long width = STARPU_MATRIX_GET_NY(descr[0]);
-        long long height = STARPU_MATRIX_GET_NX(descr[0]);
+        long width = STARPU_MATRIX_GET_NY(descr[0]);
+        long height = STARPU_MATRIX_GET_NX(descr[0]);
         double zoom = width * 0.25296875;
         double iz = 1. / zoom;
         float diverge = 4.0;
         float max_iterations = (width/2) * 0.049715909 * log10(zoom);
         float imi = 1. / max_iterations;
-        float centerr = params[0];
-        float centeri = params[1];
-        float offset = params[2];
-        float dim = params[3];
+        double centerr = params->centerr;
+        double centeri = params->centeri;
+        long offset = params->offset;
+        long dim = params->dim;
         double cr = 0;
         double zr = 0;
         double ci = 0;
         double zi = 0;
-        long long n = 0;
+        long n = 0;
         double tmp = 0;
         int ldP = STARPU_MATRIX_GET_LD(descr[0]);
 
         long long x,y;
 
-        for (y = 0; y < height; y++){
-                for (x = 0; x < width; x++){
+        for (y = 0; y < height; y++)
+	{
+                for (x = 0; x < width; x++)
+		{
                         cr = centerr + (x - (dim/2)) * iz;
 			zr = cr;
                         ci = centeri + (y+offset - (dim/2)) * iz;
                         zi = ci;
 
-                        for (n = 0; n <= max_iterations; n++) {
+                        for (n = 0; n <= max_iterations; n++)
+			{
 				if (zr*zr + zi*zi>diverge) break;
                                 tmp = zr*zr - zi*zi + cr;
                                 zi = 2*zr*zi + ci;
@@ -53,8 +71,9 @@ void cpu_mandelbrot(void *descr[], void *cl_arg)
 }
 
 char* CPU = "cpu_mandelbrot";
-char* GPU = "gpu_mandelbrot";
-extern char *starpu_find_function(char *name, char *device) {
+char* GPU = "";
+extern char *starpu_find_function(char *name, char *device)
+{
 	if (!strcmp(device,"gpu")) return GPU;
 	return CPU;
 }

+ 29 - 10
julia/examples/mandelbrot/mandelbrot.jl

@@ -1,3 +1,18 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
 import Libdl
 using StarPU
 using LinearAlgebra
@@ -34,7 +49,7 @@ using LinearAlgebra
                 zi = 2*zr*zi + ci
                 zr = tmp
             end
-            
+
             if (n < max_iterations)
                 pixels[y,x] = round(15 * n * imi)
             else
@@ -49,13 +64,16 @@ end
 starpu_init()
 
 function mandelbrot_with_starpu(A ::Matrix{Int64}, cr ::Float64, ci ::Float64, dim ::Int64, nslicesx ::Int64)
-    horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesx)
+    horiz = starpu_data_filter(STARPU_MATRIX_FILTER_BLOCK, nslicesx)
     @starpu_block let
 	hA = starpu_data_register(A)
 	starpu_data_partition(hA,horiz)
 
 	@starpu_sync_tasks for taskx in (1 : nslicesx)
-                @starpu_async_cl mandelbrot(hA[taskx]) [STARPU_W] [cr, ci, (taskx-1)*dim/nslicesx, dim]
+            starpu_task_insert(codelet_name = "mandelbrot",
+                               handles = [hA[taskx]],
+                               modes = [STARPU_W],
+                               cl_arg = (cr, ci, Int64((taskx-1)*dim/nslicesx), dim))
 	end
     end
 end
@@ -73,9 +91,9 @@ function pixels2img(pixels ::Matrix{Int64}, width ::Int64, height ::Int64, filen
     end
 end
 
-function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
+function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64, gen_images)
     tmin=0;
-    
+
     pixels ::Matrix{Int64} = zeros(dim, dim)
     for i = 1:10
         t = time_ns();
@@ -85,20 +103,21 @@ function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
             tmin=t
         end
     end
-    pixels2img(pixels,dim,dim,"out$(dim).ppm")
+    if (gen_images == 1)
+        pixels2img(pixels,dim,dim,"out$(dim).ppm")
+    end
     return tmin
 end
 
-function display_time(cr ::Float64, ci ::Float64, start_dim ::Int64, step_dim ::Int64, stop_dim ::Int64, nslices ::Int64)
+function display_time(cr ::Float64, ci ::Float64, start_dim ::Int64, step_dim ::Int64, stop_dim ::Int64, nslices ::Int64, gen_images)
     for dim in (start_dim : step_dim : stop_dim)
-        res = min_times(cr, ci, dim, nslices)
+        res = min_times(cr, ci, dim, nslices, gen_images)
         res=res/dim/dim; # time per pixel
         println("$(dim) $(res)")
     end
 end
 
 
-display_time(-0.800671,-0.158392,32,32,4096,4)
+display_time(-0.800671,-0.158392,32,32,512,4, 0)
 
 starpu_shutdown()
-

+ 25 - 14
julia/examples/mult/cpu_mult.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2018                                     Alexis Juven
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2018       Alexis Juven
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -13,26 +14,30 @@
  *
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
+
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
 #include <starpu.h>
+
 /*
  * The codelet is passed 3 matrices, the "descr" union-type field gives a
  * description of the layout of those 3 matrices in the local memory (ie. RAM
  * in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
  * registered data with the "matrix" data interface, we use the matrix macros.
  */
-void cpu_mult(void *descr[], void *arg)
+void cpu_mult(void *descr[], void *cl_arg)
 {
-	(void)arg;
+	int stride;
 	float *subA, *subB, *subC;
+
+	stride = *((int *)cl_arg);
+
 	/* .blas.ptr gives a pointer to the first element of the local copy */
 	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
 	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
 	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
 
-
 	/* .blas.nx is the number of rows (consecutive elements) and .blas.ny
 	 * is the number of lines that are separated by .blas.ld elements (ld
 	 * stands for leading dimension).
@@ -50,14 +55,18 @@ void cpu_mult(void *descr[], void *arg)
 	int i,j,k,ii,jj,kk;
 	for (i = 0; i < nyC*nxC; i++) subC[i] = 0;
 	//fprintf(stderr,"inside cpu_mult %dx%dx%d %d/%d on %d\n",nyC,nyA,nxC,starpu_worker_get_id(),STARPU_NMAXWORKERS,starpu_worker_get_devid(starpu_worker_get_id()));
-	for (i=0;i<nyC;i+=STRIDE) {
-		for (k=0;k<nyA;k+=STRIDE) {
-			for (j=0;j<nxC;j+=STRIDE) {
-				
-				for (ii = i; ii < i+STRIDE; ii+=2) {
+	for (i=0;i<nyC;i+=stride)
+	{
+		for (k=0;k<nyA;k+=stride)
+		{
+			for (j=0;j<nxC;j+=stride)
+			{
+				for (ii = i; ii < i+stride; ii+=2)
+				{
 					float *sC0=subC+ii*ldC+j;
 					float *sC1=subC+ii*ldC+ldC+j;
-					for (kk = k; kk < k+STRIDE; kk+=4) {
+					for (kk = k; kk < k+stride; kk+=4)
+					{
 						float alpha00=subB[kk +  ii*ldB];
 						float alpha01=subB[kk+1+ii*ldB];
 						float alpha10=subB[kk+  ii*ldB+ldB];
@@ -70,7 +79,8 @@ void cpu_mult(void *descr[], void *arg)
 						float *sA1=subA+kk*ldA+ldA+j;
 						float *sA2=subA+kk*ldA+2*ldA+j;
 						float *sA3=subA+kk*ldA+3*ldA+j;
-						for (jj = 0; jj < STRIDE; jj+=1) {
+						for (jj = 0; jj < stride; jj+=1)
+						{
 							sC0[jj] += alpha00*sA0[jj]+alpha01*sA1[jj]+alpha02*sA2[jj]+alpha03*sA3[jj];
 							sC1[jj] += alpha10*sA0[jj]+alpha11*sA1[jj]+alpha12*sA2[jj]+alpha13*sA3[jj];
 						}
@@ -80,11 +90,12 @@ void cpu_mult(void *descr[], void *arg)
 		}
 	}
 	//fprintf(stderr,"inside cpu_mult %dx%dx%d\n",nyC,nyA,nxC);
-
 }
+
 char* CPU = "cpu_mult";
-char* GPU = "gpu_mult";
-extern char *starpu_find_function(char *name, char *device) {
+char* GPU = "";
+extern char *starpu_find_function(char *name, char *device)
+{
 	if (!strcmp(device,"gpu")) return GPU;
 	return CPU;
 }

+ 45 - 40
julia/examples/mult/mult.jl

@@ -1,12 +1,24 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
 import Libdl
 using StarPU
 using LinearAlgebra
 
-#shoud be the same as in the makefile
-const STRIDE = 72
-
 @target STARPU_CPU+STARPU_CUDA
-@codelet function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}) :: Nothing
+@codelet function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}, stride ::Int32) :: Nothing
 
     width_m2 :: Int32 = width(m2)
     height_m1 :: Int32 = height(m1)
@@ -59,38 +71,27 @@ end
 
 starpu_init()
 
-function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
+function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy, stride)
     scale= 3
     tmin=0
-    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesx)
-    horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesy)
+    vert = starpu_data_filter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesx)
+    horiz = starpu_data_filter(STARPU_MATRIX_FILTER_BLOCK, nslicesy)
     @starpu_block let
         hA,hB,hC = starpu_data_register(A, B, C)
         starpu_data_partition(hB, vert)
         starpu_data_partition(hA, horiz)
         starpu_data_map_filters(hC, vert, horiz)
         tmin=0
-        perfmodel = StarpuPerfmodel(
-            perf_type = STARPU_HISTORY_BASED,
-            symbol = "history_perf"
-        )
-        cl = StarpuCodelet(
-            cpu_func = CPU_CODELETS["matrix_mult"],
-            # cuda_func = CUDA_CODELETS["matrix_mult"],
-            #opencl_func="ocl_matrix_mult",
-            modes = [STARPU_R, STARPU_R, STARPU_W],
-            perfmodel = perfmodel
-        )
 
         for i in (1 : 10 )
             t=time_ns()
             @starpu_sync_tasks begin
                 for taskx in (1 : nslicesx)
                     for tasky in (1 : nslicesy)
-                        handles = [hA[tasky], hB[taskx], hC[taskx, tasky]]
-                        task = StarpuTask(cl = cl, handles = handles)
-                        starpu_task_submit(task)
-                        #@starpu_async_cl matrix_mult(hA[tasky], hB[taskx], hC[taskx, tasky])
+                        starpu_task_insert(codelet_name = "matrix_mult",
+                                           modes = [STARPU_R, STARPU_R, STARPU_W],
+                                           handles = [hA[tasky], hB[taskx], hC[taskx, tasky]],
+                                           cl_arg = (Int32(stride),))
                     end
                 end
             end
@@ -104,41 +105,45 @@ function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: M
 end
 
 
-function approximately_equals(
-    A :: Matrix{Cfloat},
-    B :: Matrix{Cfloat},
-    eps = 1e-2
-)
-    (height, width) = size(A)
+function check(A, B, C)
+    expected = A * B
+    height,width = size(C)
+    for i in 1:height
+        for j in 1:width
+            got = C[i, j]
+            exp = expected[i, j]
 
-    for j in (1 : width)
-        for i in (1 : height)
-            if (abs(A[i,j] - B[i,j]) > eps * max(abs(B[i,j]), abs(A[i,j])))
-                println("A[$i,$j] : $(A[i,j]), B[$i,$j] : $(B[i,j])")
-                return false
+            err = abs(exp - got) / exp
+            if err > 0.0001
+                error("[$i] -> $got != $exp (err $err)")
             end
         end
     end
-
-    return true
 end
 
-function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
+function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy, stride)
     for dim in (start_dim : step_dim : stop_dim)
         A = Array(rand(Cfloat, dim, dim))
         B = Array(rand(Cfloat, dim, dim))
         C = zeros(Float32, dim, dim)
-        mt =  multiply_with_starpu(A, B, C, nslicesx, nslicesy)
+        mt =  multiply_with_starpu(A, B, C, nslicesx, nslicesy, stride)
         flops = (2*dim-1)*dim*dim/mt
         size=dim*dim*4*3/1024/1024
         println(io,"$size $flops")
         println("$size $flops")
+        check(A, B, C)
     end
 end
 
-
-io=open(ARGS[1],"w")
-compute_times(io,16*STRIDE,4*STRIDE,4096,2,2)
+if size(ARGS, 1) < 2
+    stride=4
+    filename="x.dat"
+else
+    stride=parse(Int, ARGS[1])
+    filename=ARGS[2]
+end
+io=open(filename,"w")
+compute_times(io,16*stride,4*stride,128*stride,2,2,stride)
 close(io)
 
 starpu_shutdown()

+ 30 - 8
julia/examples/task_insert_color/task_insert_color.jl

@@ -1,3 +1,18 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
 import Libdl
 using StarPU
 
@@ -14,27 +29,34 @@ function task_insert_color_with_starpu(val ::Ref{Int32})
     @starpu_block let
 	hVal = starpu_data_register(val)
 
-        cl1 = StarpuCodelet(
-            cpu_func = CPU_CODELETS["task_insert_color"],
-            modes = [STARPU_RW]
+        perfmodel = starpu_perfmodel(
+            perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+            symbol = "history_perf"
         )
 
-        cl2 = StarpuCodelet(
-            cpu_func = CPU_CODELETS["task_insert_color"],
+        cl1 = starpu_codelet(
+            cpu_func = "task_insert_color",
             modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+
+        cl2 = starpu_codelet(
+            cpu_func = "task_insert_color",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel,
             color = 0x0000FF
         )
 
 	@starpu_sync_tasks begin
 
             # In the trace file, the following task should be green (executed on CPU)
-            starpu_task_submit(StarpuTask(cl = cl1, handles = [hVal]))
+            starpu_task_submit(starpu_task(cl = cl1, handles = [hVal]))
 
             # In the trace file, the following task will be blue as specified by the field color of cl2
-            starpu_task_submit(StarpuTask(cl = cl2, handles = [hVal]))
+            starpu_task_submit(starpu_task(cl = cl2, handles = [hVal]))
 
             # In the trace file, the following tasks will be red as specified in @starpu_async_cl
-            @starpu_async_cl task_insert_color(hVal) [STARPU_RW] [] 0xFF0000
+            @starpu_async_cl task_insert_color(hVal) [STARPU_RW] () 0xFF0000
 
 	end
     end

+ 1 - 1
julia/examples/variable/variable.jl

@@ -29,7 +29,7 @@ function display(niter)
     if foo[] == niter
         println("result is correct")
     else
-        println("result is incorret")
+        error("result is incorret")
     end
 end
 

+ 42 - 18
julia/examples/vector_scal/vector_scal.jl

@@ -1,3 +1,18 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
 import Libdl
 using StarPU
 using LinearAlgebra
@@ -21,28 +36,15 @@ function vector_scal_with_starpu(v :: Vector{Float32}, m :: Int32, k :: Float32,
     @starpu_block let
         hV = starpu_data_register(v)
         tmin=0
-        perfmodel = StarpuPerfmodel(
-            perf_type = STARPU_HISTORY_BASED,
-            symbol = "history_perf"
-        )
-        cl = StarpuCodelet(
-            cpu_func = CPU_CODELETS["vector_scal"],
-            # cuda_func = CUDA_CODELETS["vector_scal"],
-            #opencl_func="ocl_matrix_mult",
-            modes = [STARPU_RW],
-            perfmodel = perfmodel
-        )
 
         for i in (1 : 1)
             t=time_ns()
             @starpu_sync_tasks begin
-                handles = [hV]
-                task = StarpuTask(cl = cl, handles = handles, cl_arg=[m, k, l])
-                starpu_task_submit(task)
+                starpu_task_insert(codelet_name = "vector_scal",
+                                   modes = [STARPU_RW],
+                                   handles = [hV],
+                                   cl_arg=(m, k, l))
             end
-            # @starpu_sync_tasks for task in (1:1)
-            #     @starpu_async_cl vector_scal(hV, STARPU_RW, [m, k, l])
-            # end
             t=time_ns()-t
             if (tmin==0 || tmin>t)
                 tmin=t
@@ -52,9 +54,24 @@ function vector_scal_with_starpu(v :: Vector{Float32}, m :: Int32, k :: Float32,
     return tmin
 end
 
+function check(ref, res, m, k, l)
+    expected = ref .* m .+ (k+l)
+
+    for i in 1:length(expected)
+        got = res[i]
+        exp = expected[i]
+
+        err = abs(exp - got) / exp
+        if err > 0.0001
+            error("[$i] -> $got != $exp (err $err)")
+        end
+    end
+end
+
 function compute_times(io,start_dim, step_dim, stop_dim)
     for size in (start_dim : step_dim : stop_dim)
         V = Array(rand(Cfloat, size))
+        V_ref = copy(V)
         starpu_memory_pin(V)
 
         m :: Int32 = 10
@@ -70,11 +87,18 @@ function compute_times(io,start_dim, step_dim, stop_dim)
         println("OUTPUT ", V[1:10])
         println(io,"$size $mt")
         println("$size $mt")
+
+        check(V_ref, V, m, k, l)
     end
 end
 
+if size(ARGS, 1) < 1
+    filename="x.dat"
+else
+    filename=ARGS[1]
+end
 
-io=open(ARGS[1],"w")
+io=open(filename,"w")
 compute_times(io,1024,1024,4096)
 close(io)
 

+ 60 - 0
julia/src/Makefile.am

@@ -0,0 +1,60 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+include $(top_srcdir)/starpu-notests.mk
+
+CLEANFILES = *.gcno *.gcda
+
+AM_CFLAGS = $(GLOBAL_AM_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(HWLOC_CFLAGS) $(FXT_CFLAGS) -fPIC
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ `@JULIA@ $(top_srcdir)/julia/src/openblas_ldflags.jl`
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_builddir)/src -I$(top_srcdir)/src -I$(top_srcdir)/julia/src
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) -no-undefined
+
+SUBDIRS = dynamic_compiler
+
+lib_LTLIBRARIES = libstarpujulia-@STARPU_EFFECTIVE_VERSION@.la
+
+noinst_HEADERS =
+
+libstarpujulia_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined			\
+  -version-info $(LIBSTARPUJULIA_INTERFACE_CURRENT):$(LIBSTARPUJULIA_INTERFACE_REVISION):$(LIBSTARPUJULIA_INTERFACE_AGE)
+
+libstarpujulia_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
+	callback_wrapper.c \
+	blas_wrapper.c \
+	blas.c
+
+EXTRA_DIST = blas.h				\
+	blas.jl  				\
+	data.jl					\
+	destructible.jl				\
+	globals.jl				\
+	init.jl					\
+	linked_list.jl				\
+	perfmodel.jl				\
+	StarPU.jl				\
+	task_dep.jl				\
+	task.jl					\
+	translate_headers.jl			\
+	utils.jl				\
+	compiler/c.jl				\
+	compiler/cuda.jl			\
+	compiler/expression_manipulation.jl	\
+	compiler/expressions.jl			\
+	compiler/file_generation.jl		\
+	compiler/include.jl			\
+	compiler/parsing.jl			\
+	compiler/utils.jl

File diff suppressed because it is too large
+ 85 - 1276
julia/src/StarPU.jl


+ 194 - 0
julia/src/blas.c

@@ -0,0 +1,194 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <ctype.h>
+#include <stdio.h>
+
+#include "blas.h"
+
+inline void STARPU_SGEMM(char *transa, char *transb, BLASINT M, BLASINT N, BLASINT K, 
+			float alpha, const float *A, BLASINT lda, const float *B, BLASINT ldb, 
+			float beta, float *C, BLASINT ldc)
+{
+	sgemm_64_(transa, transb, &M, &N, &K, &alpha,
+			 A, &lda, B, &ldb,
+			 &beta, C, &ldc);	
+}
+
+inline void STARPU_DGEMM(char *transa, char *transb, BLASINT M, BLASINT N, BLASINT K, 
+			double alpha, double *A, BLASINT lda, double *B, BLASINT ldb, 
+			double beta, double *C, BLASINT ldc)
+{
+	dgemm_64_(transa, transb, &M, &N, &K, &alpha,
+			 A, &lda, B, &ldb,
+			 &beta, C, &ldc);	
+}
+
+
+inline void STARPU_SGEMV(char *transa, BLASINT M, BLASINT N, float alpha, float *A, BLASINT lda,
+		float *X, BLASINT incX, float beta, float *Y, BLASINT incY)
+{
+	sgemv_64_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+}
+
+inline void STARPU_DGEMV(char *transa, BLASINT M, BLASINT N, double alpha, double *A, BLASINT lda,
+		double *X, BLASINT incX, double beta, double *Y, BLASINT incY)
+{
+	dgemv_64_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+}
+
+inline float STARPU_SASUM(BLASINT N, float *X, BLASINT incX)
+{
+	return sasum_64_(&N, X, &incX);
+}
+
+inline double STARPU_DASUM(BLASINT N, double *X, BLASINT incX)
+{
+	return dasum_64_(&N, X, &incX);
+}
+
+void STARPU_SSCAL(BLASINT N, float alpha, float *X, BLASINT incX)
+{
+	sscal_64_(&N, &alpha, X, &incX);
+}
+
+void STARPU_DSCAL(BLASINT N, double alpha, double *X, BLASINT incX)
+{
+	dscal_64_(&N, &alpha, X, &incX);
+}
+
+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const BLASINT m, const BLASINT n,
+                   const float alpha, const float *A, const BLASINT lda,
+                   float *B, const BLASINT ldb)
+{
+	strsm_64_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const BLASINT m, const BLASINT n,
+                   const double alpha, const double *A, const BLASINT lda,
+                   double *B, const BLASINT ldb)
+{
+	dtrsm_64_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void STARPU_SSYR (const char *uplo, const BLASINT n, const float alpha,
+                  const float *x, const BLASINT incx, float *A, const BLASINT lda)
+{
+	ssyr_64_(uplo, &n, &alpha, x, &incx, A, &lda); 
+}
+
+void STARPU_SSYRK (const char *uplo, const char *trans, const BLASINT n,
+                   const BLASINT k, const float alpha, const float *A,
+                   const BLASINT lda, const float beta, float *C,
+                   const BLASINT ldc)
+{
+	ssyrk_64_(uplo, trans, &n, &k, &alpha, A, &lda, &beta, C, &ldc); 
+}
+
+void STARPU_SGER(const BLASINT m, const BLASINT n, const float alpha,
+                  const float *x, const BLASINT incx, const float *y,
+                  const BLASINT incy, float *A, const BLASINT lda)
+{
+	sger_64_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
+}
+
+void STARPU_DGER(const BLASINT m, const BLASINT n, const double alpha,
+                  const double *x, const BLASINT incx, const double *y,
+                  const BLASINT incy, double *A, const BLASINT lda)
+{
+	dger_64_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
+}
+
+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
+                   const BLASINT n, const float *A, const BLASINT lda, float *x, 
+                   const BLASINT incx)
+{
+	strsv_64_(uplo, trans, diag, &n, A, &lda, x, &incx);
+}
+
+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const BLASINT m, const BLASINT n,
+                 const float alpha, const float *A, const BLASINT lda,
+                 float *B, const BLASINT ldb)
+{
+	strmm_64_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const BLASINT m, const BLASINT n,
+                 const double alpha, const double *A, const BLASINT lda,
+                 double *B, const BLASINT ldb)
+{
+	dtrmm_64_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
+                 const BLASINT n, const float *A, const BLASINT lda, float *X,
+                 const BLASINT incX)
+{
+	strmv_64_(uplo, transA, diag, &n, A, &lda, X, &incX);
+}
+
+void STARPU_SAXPY(const BLASINT n, const float alpha, float *X, const BLASINT incX, float *Y, const BLASINT incY)
+{
+	saxpy_64_(&n, &alpha, X, &incX, Y, &incY);
+}
+
+void STARPU_DAXPY(const BLASINT n, const double alpha, double *X, const BLASINT incX, double *Y, const BLASINT incY)
+{
+	daxpy_64_(&n, &alpha, X, &incX, Y, &incY);
+}
+
+BLASINT STARPU_ISAMAX (const BLASINT n, float *X, const BLASINT incX)
+{
+    BLASINT retVal;
+    retVal = isamax_64_ (&n, X, &incX);
+    return retVal;
+}
+
+BLASINT STARPU_IDAMAX (const BLASINT n, double *X, const BLASINT incX)
+{
+    BLASINT retVal;
+    retVal = idamax_64_ (&n, X, &incX);
+    return retVal;
+}
+
+float STARPU_SDOT(const BLASINT n, const float *x, const BLASINT incx, const float *y, const BLASINT incy)
+{
+	float retVal = 0;
+
+	/* GOTOBLAS will return a FLOATRET which is a double, not a float */
+	retVal = (float)sdot_64_(&n, x, &incx, y, &incy);
+
+	return retVal;
+}
+
+double STARPU_DDOT(const BLASINT n, const double *x, const BLASINT incx, const double *y, const BLASINT incy)
+{
+	return ddot_64_(&n, x, &incx, y, &incy);
+}
+
+void STARPU_SSWAP(const BLASINT n, float *X, const BLASINT incX, float *Y, const BLASINT incY)
+{
+	sswap_64_(&n, X, &incX, Y, &incY);
+}
+
+void STARPU_DSWAP(const BLASINT n, double *X, const BLASINT incX, double *Y, const BLASINT incY)
+{
+	dswap_64_(&n, X, &incX, Y, &incY);
+}

+ 148 - 0
julia/src/blas.h

@@ -0,0 +1,148 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __BLAS_H__
+#define __BLAS_H__
+
+#include <stdint.h>
+
+#define BLASINT int64_t
+
+void STARPU_SGEMM(char *transa, char *transb, BLASINT M, BLASINT N, BLASINT K, float alpha, const float *A, BLASINT lda, 
+		const float *B, BLASINT ldb, float beta, float *C, BLASINT ldc);
+void STARPU_DGEMM(char *transa, char *transb, BLASINT M, BLASINT N, BLASINT K, double alpha, double *A, BLASINT lda, 
+		double *B, BLASINT ldb, double beta, double *C, BLASINT ldc);
+void STARPU_SGEMV(char *transa, BLASINT M, BLASINT N, float alpha, float *A, BLASINT lda,
+		float *X, BLASINT incX, float beta, float *Y, BLASINT incY);
+void STARPU_DGEMV(char *transa, BLASINT M, BLASINT N, double alpha, double *A, BLASINT lda,
+		double *X, BLASINT incX, double beta, double *Y, BLASINT incY);
+float STARPU_SASUM(BLASINT N, float *X, BLASINT incX);
+double STARPU_DASUM(BLASINT N, double *X, BLASINT incX);
+void STARPU_SSCAL(BLASINT N, float alpha, float *X, BLASINT incX);
+void STARPU_DSCAL(BLASINT N, double alpha, double *X, BLASINT incX);
+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const BLASINT m, const BLASINT n,
+                   const float alpha, const float *A, const BLASINT lda,
+                   float *B, const BLASINT ldb);
+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const BLASINT m, const BLASINT n,
+                   const double alpha, const double *A, const BLASINT lda,
+                   double *B, const BLASINT ldb);
+void STARPU_SSYR (const char *uplo, const BLASINT n, const float alpha,
+                  const float *x, const BLASINT incx, float *A, const BLASINT lda);
+void STARPU_SSYRK (const char *uplo, const char *trans, const BLASINT n,
+                   const BLASINT k, const float alpha, const float *A,
+                   const BLASINT lda, const float beta, float *C,
+                   const BLASINT ldc);
+void STARPU_SGER (const BLASINT m, const BLASINT n, const float alpha,
+                  const float *x, const BLASINT incx, const float *y,
+                  const BLASINT incy, float *A, const BLASINT lda);
+void STARPU_DGER(const BLASINT m, const BLASINT n, const double alpha,
+                  const double *x, const BLASINT incx, const double *y,
+                  const BLASINT incy, double *A, const BLASINT lda);
+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
+                   const BLASINT n, const float *A, const BLASINT lda, float *x, 
+                   const BLASINT incx);
+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const BLASINT m, const BLASINT n,
+                 const float alpha, const float *A, const BLASINT lda,
+                 float *B, const BLASINT ldb);
+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const BLASINT m, const BLASINT n,
+                 const double alpha, const double *A, const BLASINT lda,
+                 double *B, const BLASINT ldb);
+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
+                 const BLASINT n, const float *A, const BLASINT lda, float *X,
+                 const BLASINT incX);
+void STARPU_SAXPY(const BLASINT n, const float alpha, float *X, const BLASINT incX, float *Y, const BLASINT incy);
+void STARPU_DAXPY(const BLASINT n, const double alpha, double *X, const BLASINT incX, double *Y, const BLASINT incY);
+BLASINT STARPU_ISAMAX (const BLASINT n, float *X, const BLASINT incX);
+BLASINT STARPU_IDAMAX (const BLASINT n, double *X, const BLASINT incX);
+float STARPU_SDOT(const BLASINT n, const float *x, const BLASINT incx, const float *y, const BLASINT incy);
+double STARPU_DDOT(const BLASINT n, const double *x, const BLASINT incx, const double *y, const BLASINT incy);
+void STARPU_SSWAP(const BLASINT n, float *x, const BLASINT incx, float *y, const BLASINT incy);
+void STARPU_DSWAP(const BLASINT n, double *x, const BLASINT incx, double *y, const BLASINT incy);
+
+
+extern void sgemm_64_ (const char *transa, const char *transb, const BLASINT *m,
+                   const BLASINT *n, const BLASINT *k, const float *alpha, 
+                   const float *A, const BLASINT *lda, const float *B, 
+                   const BLASINT *ldb, const float *beta, float *C, 
+                   const BLASINT *ldc);
+extern void dgemm_64_ (const char *transa, const char *transb, const BLASINT *m,
+                   const BLASINT *n, const BLASINT *k, const double *alpha, 
+                   const double *A, const BLASINT *lda, const double *B, 
+                   const BLASINT *ldb, const double *beta, double *C, 
+                   const BLASINT *ldc);
+extern void sgemv_64_(const char *trans, const BLASINT *m, const BLASINT *n, const float *alpha,
+                   const float *a, const BLASINT *lda, const float *x, const BLASINT *incx, 
+                   const float *beta, float *y, const BLASINT *incy);
+extern void dgemv_64_(const char *trans, const BLASINT *m, const BLASINT *n, const double *alpha,
+                   const double *a, const BLASINT *lda, const double *x, const BLASINT *incx,
+                   const double *beta, double *y, const BLASINT *incy);
+extern void ssyr_64_ (const char *uplo, const BLASINT *n, const float *alpha,
+                  const float *x, const BLASINT *incx, float *A, const BLASINT *lda);
+extern void ssyrk_64_ (const char *uplo, const char *trans, const BLASINT *n,
+                   const BLASINT *k, const float *alpha, const float *A,
+                   const BLASINT *lda, const float *beta, float *C,
+                   const BLASINT *ldc);
+extern void strsm_64_ (const char *side, const char *uplo, const char *transa, 
+                   const char *diag, const BLASINT *m, const BLASINT *n,
+                   const float *alpha, const float *A, const BLASINT *lda,
+                   float *B, const BLASINT *ldb);
+extern void dtrsm_64_ (const char *side, const char *uplo, const char *transa, 
+                   const char *diag, const BLASINT *m, const BLASINT *n,
+                   const double *alpha, const double *A, const BLASINT *lda,
+                   double *B, const BLASINT *ldb);
+extern double sasum_64_ (const BLASINT *n, const float *x, const BLASINT *incx);
+extern double dasum_64_ (const BLASINT *n, const double *x, const BLASINT *incx);
+extern void sscal_64_ (const BLASINT *n, const float *alpha, float *x,
+                   const BLASINT *incx);
+extern void dscal_64_ (const BLASINT *n, const double *alpha, double *x,
+                   const BLASINT *incx);
+extern void sger_64_(const BLASINT *m, const BLASINT *n, const float *alpha,
+                  const float *x, const BLASINT *incx, const float *y,
+                  const BLASINT *incy, float *A, const BLASINT *lda);
+extern void dger_64_(const BLASINT *m, const BLASINT *n, const double *alpha,
+                  const double *x, const BLASINT *incx, const double *y,
+                  const BLASINT *incy, double *A, const BLASINT *lda);
+extern void strsv_64_ (const char *uplo, const char *trans, const char *diag, 
+                   const BLASINT *n, const float *A, const BLASINT *lda, float *x, 
+                   const BLASINT *incx);
+extern void strmm_64_(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const BLASINT *m, const BLASINT *n,
+                 const float *alpha, const float *A, const BLASINT *lda,
+                 float *B, const BLASINT *ldb);
+extern void dtrmm_64_(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const BLASINT *m, const BLASINT *n,
+                 const double *alpha, const double *A, const BLASINT *lda,
+                 double *B, const BLASINT *ldb);
+extern void strmv_64_(const char *uplo, const char *transA, const char *diag,
+                 const BLASINT *n, const float *A, const BLASINT *lda, float *X,
+                 const BLASINT *incX);
+extern void saxpy_64_(const BLASINT *n, const float *alpha, const float *X, const BLASINT *incX,
+		float *Y, const BLASINT *incy);
+extern void daxpy_64_(const BLASINT *n, const double *alpha, const double *X, const BLASINT *incX,
+		double *Y, const BLASINT *incy);
+extern BLASINT isamax_64_(const BLASINT *n, const float *X, const BLASINT *incX);
+extern BLASINT idamax_64_(const BLASINT *n, const double *X, const BLASINT *incX);
+/* for some reason, FLOATRET is not a float but a double in GOTOBLAS */
+extern double sdot_64_(const BLASINT *n, const float *x, const BLASINT *incx, const float *y, const BLASINT *incy);
+extern double ddot_64_(const BLASINT *n, const double *x, const BLASINT *incx, const double *y, const BLASINT *incy);
+extern void sswap_64_(const BLASINT *n, float *x, const BLASINT *incx, float *y, const BLASINT *incy);
+extern void dswap_64_(const BLASINT *n, double *x, const BLASINT *incx, double *y, const BLASINT *incy);
+
+#endif /* __BLAS_H__ */

+ 21 - 0
julia/src/blas.jl

@@ -0,0 +1,21 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+@enum STARPU_BLAS begin
+    STARPU_SAXPY
+end
+
+cuda_blas_codelets = Dict(STARPU_SAXPY => "julia_saxpy_cuda_codelet")
+cpu_blas_codelets = Dict(STARPU_SAXPY => "julia_saxpy_cpu_codelet")

+ 50 - 0
julia/src/blas_wrapper.c

@@ -0,0 +1,50 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <blas.h>
+
+#if defined(STARPU_ATLAS) || defined(STARPU_OPENBLAS) || defined(STARPU_MKL)
+void julia_saxpy_cpu_codelet(void *descr[], void *arg)
+{
+	float alpha = *((float *)arg);
+
+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+
+	float *block_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
+	float *block_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	STARPU_SAXPY((int)n, alpha, block_x, 1, block_y, 1);
+}
+#endif
+
+#ifdef STARPU_USE_CUDA
+
+#include <starpu_cublas_v2.h>
+
+void julia_saxpy_cuda_codelet(void *descr[], void *arg)
+{
+	float alpha = *((float *)arg);
+
+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+
+	float *block_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
+	float *block_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	cublasStatus_t status = cublasSaxpy(starpu_cublas_get_local_handle(), (int)n, &alpha, block_x, 1, block_y, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+}
+#endif

+ 25 - 8
julia/src/compiler/c.jl

@@ -58,15 +58,16 @@ function transform_to_cpu_kernel(expr :: StarpuExprFunction)
     output = add_for_loop_declarations(expr)
     output = substitute_args(output)
     output = substitute_func_calls(output)
+    output = substitute_views(output)
     output = substitute_indexing(output)
     output = flatten_blocks(output)
 
     return output
 end
 
-function generate_c_struct_param_declaration(funcname)
-    scalar_parameters = CODELETS_SCALARS[funcname]
-    struct_params_name = CODELETS_PARAMS_STRUCT[funcname]
+function generate_c_struct_param_declaration(codelet_name)
+    scalar_parameters = CODELETS_SCALARS[codelet_name]
+    struct_params_name = CODELETS_PARAMS_STRUCT[codelet_name]
 
     output = "struct $struct_params_name {\n"
     for p in scalar_parameters
@@ -197,18 +198,18 @@ function substitute_args(expr :: StarpuExprFunction)
 
 
     new_args = [
-                    starpu_parse(:($buffer_arg_name :: Matrix{Nothing})),
-                    starpu_parse(:($cl_arg_name :: Vector{Nothing}))
-                ]
+        starpu_parse(:($buffer_arg_name :: Ptr{Ptr{Nothing}})),
+        starpu_parse(:($cl_arg_name :: Vector{Nothing}))
+    ]
     new_body = StarpuExprBlock([function_start_affectations..., new_body.exprs...])
 
     return StarpuExprFunction(expr.ret_type, expr.func, new_args, new_body)
 end
 
 func_substitution = Dict(
-    :width => :STARPU_MATRIX_GET_NY,
+    :width  => :STARPU_MATRIX_GET_NY,
     :height => :STARPU_MATRIX_GET_NX,
-
+    :ld     => :STARPU_MATRIX_GET_LD,
     :length => :STARPU_VECTOR_GET_NX
 )
 
@@ -228,6 +229,22 @@ function substitute_func_calls(expr :: StarpuExpr)
     return apply(func_to_apply, expr)
 end
 
+function substitute_views(expr :: StarpuExpr)
+    function func_to_apply(x :: StarpuExpr)
+
+        if !isa(x, StarpuExprCall) || x.func != :view
+            return x
+        end
+
+        ref = x.args[1]
+        indexes = map(i -> isa(i, StarpuExprInterval) ? i.start : i, x.args[2:end])
+
+        return StarpuExprAddress(StarpuExprRef(ref, indexes))
+    end
+
+    return apply(func_to_apply, expr)
+
+end
 
 function substitute_indexing(expr :: StarpuExpr)
 

+ 155 - 25
julia/src/compiler/cuda.jl

@@ -129,7 +129,134 @@ function add_device_to_interval_call(expr :: StarpuExpr)
     return apply(func_to_apply, expr)
 end
 
+function translate_cublas(expr :: StarpuExpr)
+    function func_to_run(x :: StarpuExpr)
+        # STARPU_BLAS => (CUBLAS, TRANS, FILLMODE, ALPHA, SIDE, DIAG)
+        blas_to_cublas = Dict(:STARPU_SGEMM  => (:cublasSgemm, [1, 2], [], [6, 11], [], []),
+                              :STARPU_DGEMM  => (:cublasDgemm, [1, 2], [], [6, 11], [], []),
+                              :STARPU_SGEMV  => (:cublasSgemv, [1], [], [4,9], [], []),
+                              :STARPU_DGEMV  => (:cublasDgemv, [1], [], [4,9], [], []),
+                              :STARPU_SSCAL  => (:cublasSscal, [], [], [2], [], []),
+                              :STARPU_DSCAL  => (:cublasDscal, [], [], [2], [], []),
+                              :STARPU_STRSM  => (:cublasStrsm, [3], [2], [7], [1], [4]),
+                              :STARPU_DTRSM  => (:cublasDtrsm, [3], [2], [7], [1], [4]),
+                              :STARPU_SSYR   => (:cublasSsyr, [], [1], [3], [], []),
+                              :STARPU_SSYRK  => (:cublasSsyrk, [2], [1], [5,8], [], []),
+                              :STARPU_SGER   => (:cublasSger, [], [], [3], [], []),
+                              :STARPU_DGER   => (:cublasDger, [], [], [3], [], []),
+                              :STARPU_STRSV  => (:cublasStrsv, [2], [1], [], [], [3]),
+                              :STARPU_STRMM  => (:cublasStrmm, [3], [2], [7], [1], [4]),
+                              :STARPU_DTRMM  => (:cublasDtrmm, [3], [2], [7], [1], [4]),
+                              :STARPU_STRMV  => (:cublasStrmv, [2], [1], [], [], [3]),
+                              :STARPU_SAXPY  => (:cublasSaxpy, [], [], [2], [], []),
+                              :STARPU_DAXPY  => (:cublasDaxpy, [], [], [2], [], []),
+                              :STARPU_SSWAP  => (:cublasSswap, [], [], [], [], []),
+                              :STARPU_DSWAP  => (:cublasDswap, [], [], [], [], []))
+
+        if !(isa(x, StarpuExprCall) && x.func in keys(blas_to_cublas))
+            return x
+        end
+
+        new_args = x.args
+
+        # cublasOperation_t parameters (e.g. StarpuExprValue("N"))
+        for i in blas_to_cublas[x.func][2]
+            if !isa(new_args[i], StarpuExprValue) || !isa(new_args[i].value, String)
+                error("Argument $i of ", x.func, " must be a string")
+            end
+
+            value = new_args[i].value
+
+            if value == "N" || value == "n"
+                new_args[i] = StarpuExprVar(:CUBLAS_OP_N)
+            elseif value == "T" || value == "t"
+                new_args[i] = StarpuExprVar(:CUBLAS_OP_T)
+            elseif value == "C" || value == "c"
+                new_args[i] = StarpuExprVar(:CUBLAS_OP_C)
+            else
+                error("Unhandled value for argument $i of ", x.func, ": ", value,
+                      ", expecting (\"N\", \"T\", or \"C\")")
+            end
+        end
+
+        # cublasFillMode_t parameters (e.g. StarpuExprValue("L"))
+        for i in blas_to_cublas[x.func][3]
+            if !isa(new_args[i], StarpuExprValue) || !isa(new_args[i].value, String)
+                error("Argument $i of ", x.func, " must be a string")
+            end
+
+            value = new_args[i].value
+
+            if value == "L" || value == "l"
+                new_args[i] = StarpuExprVar(:CUBLAS_FILL_MODE_LOWER)
+            elseif value == "U" || value == "u"
+                new_args[i] = StarpuExprVar(:CUBLAS_FILL_MODE_UPPER)
+            else
+                error("Unhandled value for argument $i of ", x.func, ": ", value,
+                      ", expecting (\"L\" or \"U\")")
+            end
+        end
+
+        # scalar parameters (alpha, beta, ...):  alpha -> &alpha
+        for i in blas_to_cublas[x.func][4]
+            if !isa(new_args[i], StarpuExprVar)
+                error("Argument $i of ", x.func, " must be a variable")
+            end
+            var_name = new_args[i].name
+            new_args[i] = StarpuExprVar(Symbol("&$var_name"))
+        end
+
+        # cublasSideMode_t parameters (e.g. StarpuExprValue("L"))
+        for i in blas_to_cublas[x.func][5]
+            if !isa(new_args[i], StarpuExprValue) || !isa(new_args[i].value, String)
+                error("Argument $i of ", x.func, " must be a string, got: ", new_args[i])
+            end
 
+            value = new_args[i].value
+
+            if value == "L" || value == "l"
+                new_args[i] = StarpuExprVar(:CUBLAS_SIDE_LEFT)
+            elseif value == "R" || value == "r"
+                new_args[i] = StarpuExprVar(:CUBLAS_SIDE_RIGHT)
+            else
+                error("Unhandled value for argument $i of ", x.func, ": ", value,
+                      ", expecting (\"L\" or \"R\")")
+            end
+        end
+
+        # cublasDiagType_t parameters (e.g. StarpuExprValue("N"))
+        for i in blas_to_cublas[x.func][6]
+            if !isa(new_args[i], StarpuExprValue) || !isa(new_args[i].value, String)
+                error("Argument $i of ", x.func, " must be a string")
+            end
+
+            value = new_args[i].value
+
+            if value == "N" || value == "n"
+                new_args[i] = StarpuExprVar(:CUBLAS_DIAG_NON_UNIT)
+            elseif value == "U" || value == "u"
+                new_args[i] = StarpuExprVar(:CUBLAS_DIAG_UNIT)
+            else
+                error("Unhandled value for argument $i of ", x.func, ": ", value,
+                      ", expecting (\"N\" or \"U\")")
+            end
+        end
+
+        new_args = [@parse(starpu_cublas_get_local_handle()), x.args...]
+
+        status_varname = "status"*rand_string()
+        status_var = StarpuExprVar(Symbol("cublasStatus_t "*status_varname))
+        call_expr = StarpuExprCall(blas_to_cublas[x.func][1], new_args)
+
+        return StarpuExprBlock([StarpuExprAffect(status_var, call_expr),
+                                starpu_parse(Meta.parse("""if $status_varname != CUBLAS_STATUS_SUCCESS
+                                                              STARPU_CUBLAS_REPORT_ERROR($status_varname)
+                                                          end""")),
+                                @parse cudaStreamSynchronize(starpu_cuda_get_local_stream())])
+    end
+
+    return apply(func_to_run, expr)
+end
 
 function transform_to_cuda_kernel(func :: StarpuExprFunction)
 
@@ -137,45 +264,48 @@ function transform_to_cuda_kernel(func :: StarpuExprFunction)
 
     init, indep, finish = extract_init_indep_finish(cpu_func.body)
 
-    if indep == nothing
-        error("No independant for loop has been found") # TODO can fail because extraction is not correct yet
-    end
+    cpu_instr = init
+    kernel = nothing
 
-    prekernel_instr, kernel_args, kernel_instr = analyse_sets(indep)
+    # Generate a CUDA kernel only if there is an independent loop (@parallel macro).
+    if (indep != nothing)
+        prekernel_instr, kernel_args, kernel_instr = analyse_sets(indep)
 
-    kernel_call = StarpuExprCudaCall(:cudaKernel, (@parse nblocks), (@parse THREADS_PER_BLOCK), StarpuExpr[])
-    prekernel_instr = vcat(init, prekernel_instr)
-    kernel_instr = vcat(kernel_instr, indep.body)
+        kernel_call = StarpuExprCudaCall(:cudaKernel, (@parse nblocks), (@parse THREADS_PER_BLOCK), StarpuExpr[])
+        cpu_instr = vcat(cpu_instr, prekernel_instr)
+        kernel_instr = vcat(kernel_instr, indep.body)
 
-    indep_for_def, indep_for_undef = analyse_variable_declarations(StarpuExprBlock(kernel_instr), kernel_args)
-    prekernel_def, prekernel_undef = analyse_variable_declarations(StarpuExprBlock(prekernel_instr), cpu_func.args)
+        indep_for_def, indep_for_undef = analyse_variable_declarations(StarpuExprBlock(kernel_instr), kernel_args)
+        prekernel_def, prekernel_undef = analyse_variable_declarations(StarpuExprBlock(cpu_instr), cpu_func.args)
 
-    for undef_var in indep_for_undef
+        for undef_var in indep_for_undef
 
-        found_var = find_variable(undef_var, prekernel_def)
+            found_var = find_variable(undef_var, prekernel_def)
 
-        if found_var == nothing # TODO : error then ?
-            continue
+            if found_var == nothing # TODO : error then ?
+                continue
+            end
+
+            push!(kernel_args, found_var)
         end
 
-        push!(kernel_args, found_var)
+        call_args = map((x -> StarpuExprVar(x.name)), kernel_args)
+        kernelname=Symbol("KERNEL_",func.func);
+        cuda_call = StarpuExprCudaCall(kernelname, (@parse nblocks), (@parse THREADS_PER_BLOCK), call_args)
+        push!(cpu_instr, cuda_call)
+        push!(cpu_instr, @parse cudaStreamSynchronize(starpu_cuda_get_local_stream()))
+        kernel = StarpuExprFunction(Nothing, kernelname, kernel_args, StarpuExprBlock(kernel_instr))
+        kernel = add_device_to_interval_call(kernel)
+        kernel = flatten_blocks(kernel)
     end
 
-    call_args = map((x -> StarpuExprVar(x.name)), kernel_args)
-    kernelname=Symbol("KERNEL_",func.func);
-    cuda_call = StarpuExprCudaCall(kernelname, (@parse nblocks), (@parse THREADS_PER_BLOCK), call_args)
-    push!(prekernel_instr, cuda_call)
-    push!(prekernel_instr, @parse cudaStreamSynchronize(starpu_cuda_get_local_stream()))
-    prekernel_instr = vcat(prekernel_instr, finish)
+    cpu_instr = vcat(cpu_instr, finish)
 
     prekernel_name = Symbol("CUDA_", func.func)
-    prekernel = StarpuExprFunction(Nothing, prekernel_name, cpu_func.args, StarpuExprBlock(prekernel_instr))
+    prekernel = StarpuExprFunction(Nothing, prekernel_name, cpu_func.args, StarpuExprBlock(cpu_instr))
+    prekernel = translate_cublas(prekernel)
     prekernel = flatten_blocks(prekernel)
 
-    kernel = StarpuExprFunction(Nothing, kernelname, kernel_args, StarpuExprBlock(kernel_instr))
-    kernel = add_device_to_interval_call(kernel)
-    kernel = flatten_blocks(kernel)
-    
     return prekernel, kernel
 end
 

+ 24 - 16
julia/src/compiler/expressions.jl

@@ -121,6 +121,9 @@ struct StarpuExprWhile <: StarpuExpr
     body :: StarpuExpr
 end
 
+struct StarpuExprAddress <: StarpuExpr
+    ref :: StarpuExpr
+end
 
 function starpu_parse_affect(x :: Expr)
 
@@ -247,7 +250,7 @@ function starpu_parse_call(x :: Expr)
 end
 
 
-starpu_infix_operators = (:(+), :(*), :(-), :(/), :(<), :(>), :(<=), :(>=), :(%))
+starpu_infix_operators = (:(+), :(*), :(-), :(/), :(<), :(>), :(<=), :(>=), :(!=), :(%))
 
 
 function print_prefix(io :: IO, x :: StarpuExprCall ; indent = 0, restrict=false)
@@ -293,7 +296,6 @@ function apply(func :: Function, expr :: StarpuExprCall)
     return func(StarpuExprCall(expr.func, map((x -> apply(func, x)), expr.args)))
 end
 
-
 #======================================================
                 CUDA KERNEL CALL
 ======================================================#
@@ -731,8 +733,6 @@ function print(io :: IO, x :: StarpuExprRef ; indent = 0,restrict=false)
 
 end
 
-
-
 function apply(func :: Function, expr :: StarpuExprRef)
 
     ref = apply(func, expr.ref)
@@ -741,6 +741,16 @@ function apply(func :: Function, expr :: StarpuExprRef)
     return func(StarpuExprRef(ref, indexes))
 end
 
+function print(io :: IO, x :: StarpuExprAddress ; indent = 0, restrict=false)
+    print(io, "&")
+    print(io, x.ref, indent = indent)
+end
+
+function apply(func :: Function, expr :: StarpuExprAddress)
+    ref = apply(func, expr.ref)
+    return func(StarpuExprAddress(ref))
+end
+
 #======================================================
                 BREAK EXPRESSION
 ======================================================#
@@ -796,7 +806,7 @@ function apply(func :: Function, expr :: StarpuExpr)
     return func(expr)
 end
 
-print(io :: IO, x :: StarpuExprVar ; indent = 0) = print(io, x.name)
+print(io :: IO, x :: StarpuExprVar ; indent = 0, restrict = false) = print(io, x.name)
 
 function print(io :: IO, x :: StarpuExprValue ; indent = 0,restrict=false)
 
@@ -866,26 +876,24 @@ end
 
 function starpu_type_traduction(x)
     if x <: Array
-        return starpu_type_traduction_array(x)
+        return starpu_type_traduction(eltype(x)) * "*"
     end
 
     if x <: Ptr
-        return starpu_type_traduction(eltype(x)) * "*"
+        depth = 1
+        type = eltype(x)
+        while type <: Ptr
+            depth +=1
+            type = eltype(type)
+        end
+
+        return starpu_type_traduction(type) * "*"^depth
     end
 
     return starpu_type_traduction_dict[x]
 
 end
 
-function starpu_type_traduction_array(x :: Type{Array{T,N}})  where {T,N}
-    output = starpu_type_traduction(T)
-    for i in (1 : N)
-        output *= "*"
-    end
-
-    return output
-end
-
 function print(io :: IO, x :: StarpuExprTyped ; indent = 0,restrict=false)
 
     if (isa(x, StarpuExprTypedVar))

+ 17 - 13
julia/src/compiler/file_generation.jl

@@ -12,6 +12,8 @@ const cpu_kernel_file_start = "#include <stdio.h>
 #include <starpu.h>
 #include <math.h>
 
+#include \"blas.h\"
+
 static inline long long jlstarpu_max(long long a, long long b)
 {
 	return (a > b) ? a : b;
@@ -32,15 +34,16 @@ const cuda_kernel_file_start = "#include <stdio.h>
 #include <stdint.h>
 #include <starpu.h>
 #include <math.h>
+#include <starpu_cublas_v2.h>
 
 #define THREADS_PER_BLOCK 64
 
-static inline long long jlstarpu_max(long long a, long long b)
+__attribute__((unused)) static inline long long jlstarpu_max(long long a, long long b)
 {
 	return (a > b) ? a : b;
 }
 
-static inline long long jlstarpu_interval_size(long long start, long long step, long long stop)
+__attribute__((unused)) static inline long long jlstarpu_interval_size(long long start, long long step, long long stop)
 {
     if (stop >= start){
             return jlstarpu_max(0, (stop - start + 1) / step);
@@ -50,12 +53,12 @@ static inline long long jlstarpu_interval_size(long long start, long long step,
 }
 
 
-__device__ static inline long long jlstarpu_max__device(long long a, long long b)
+__attribute__((unused)) __device__ static inline long long jlstarpu_max__device(long long a, long long b)
 {
 	return (a > b) ? a : b;
 }
 
-__device__ static inline long long jlstarpu_interval_size__device(long long start, long long step, long long stop)
+__attribute__((unused)) __device__ static inline long long jlstarpu_interval_size__device(long long start, long long step, long long stop)
 {
 	if (stop >= start){
 		return jlstarpu_max__device(0, (stop - start + 1) / step);
@@ -64,7 +67,6 @@ __device__ static inline long long jlstarpu_interval_size__device(long long star
 	}
 }
 
-
 "
 
 """
@@ -109,7 +111,7 @@ macro codelet(x)
     cpu_name = name
     cuda_name = "CUDA_"*name
     dump(name)
-    parse_scalar_parameters(parsed, cpu_name, cuda_name)
+    parse_scalar_parameters(parsed, name)
     c_struct_param_decl = generate_c_struct_param_declaration(name)
     cpu_expr = transform_to_cpu_kernel(parsed)
 
@@ -130,11 +132,15 @@ macro codelet(x)
         CPU_CODELETS[name]=cpu_name
     end
 
-    if starpu_target & STARPU_CUDA!=0
+    if (starpu_target & STARPU_CUDA!=0) && STARPU_USE_CUDA == 1
         kernel_file = open(generated_cuda_kernel_file_name, "w")
         debug_print("generating ", generated_cuda_kernel_file_name)
         print(kernel_file, cuda_kernel_file_start)
-        print(kernel_file, "__global__ ", kernel)
+
+        if kernel != nothing
+            print(kernel_file, "__global__ ", kernel)
+        end
+
         print(kernel_file, c_struct_param_decl)
         print(kernel_file, "\nextern \"C\" ", prekernel)
         close(kernel_file)
@@ -142,7 +148,7 @@ macro codelet(x)
     end
 end
 
-function parse_scalar_parameters(expr :: StarpuExprFunction, cpu_name::String, cuda_name::String)
+function parse_scalar_parameters(expr :: StarpuExprFunction, codelet_name)
     scalar_parameters = []
     for i in (1 : length(expr.args))
         type = expr.args[i].typ
@@ -151,8 +157,7 @@ function parse_scalar_parameters(expr :: StarpuExprFunction, cpu_name::String, c
         end
     end
 
-    CODELETS_SCALARS[cpu_name] = scalar_parameters
-    CODELETS_SCALARS[cuda_name] = scalar_parameters
+    CODELETS_SCALARS[codelet_name] = scalar_parameters
 
     # declare structure carrying scalar parameters
     struct_params_name = Symbol("params_", rand_string())
@@ -168,6 +173,5 @@ function parse_scalar_parameters(expr :: StarpuExprFunction, cpu_name::String, c
     eval(Meta.parse(add_to_dict_str))
 
     # save structure name
-    CODELETS_PARAMS_STRUCT[cpu_name] = struct_params_name
-    CODELETS_PARAMS_STRUCT[cuda_name] = struct_params_name
+    CODELETS_PARAMS_STRUCT[codelet_name] = struct_params_name
 end

+ 235 - 0
julia/src/data.jl

@@ -0,0 +1,235 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+const StarpuDataHandlePointer = Ptr{Cvoid}
+StarpuDataHandle = StarpuDestructible{StarpuDataHandlePointer}
+
+@enum(StarpuDataFilterFunc,
+      STARPU_MATRIX_FILTER_VERTICAL_BLOCK = 0,
+      STARPU_MATRIX_FILTER_BLOCK = 1,
+      STARPU_VECTOR_FILTER_BLOCK = 2,
+)
+
+export starpu_data_filter
+function starpu_data_filter(filter_func ::StarpuDataFilterFunc, nchildren ::Integer)
+    output = starpu_data_filter(zero)
+    output.nchildren = UInt32(nchildren)
+
+    if filter_func == STARPU_MATRIX_FILTER_VERTICAL_BLOCK
+        output.filter_func = Libdl.dlsym(starpu_wrapper_library_handle, "starpu_matrix_filter_vertical_block")
+    elseif filter_func == STARPU_MATRIX_FILTER_BLOCK
+        output.filter_func = Libdl.dlsym(starpu_wrapper_library_handle, "starpu_matrix_filter_block")
+    else filter_func == STARPU_VECTOR_FILTER_BLOCK
+        output.filter_func = Libdl.dlsym(starpu_wrapper_library_handle, "starpu_vector_filter_block")
+    end
+
+    return output
+end
+
+function starpu_memory_pin(data :: Union{Vector{T}, Matrix{T}}) where T
+    starpu_memory_pin(data, sizeof(data))::Cint
+end
+
+function starpu_memory_unpin(data :: Union{Vector{T}, Matrix{T}}) where T
+    starpu_memory_unpin(data, sizeof(data))::Cint
+end
+
+function StarpuNewDataHandle(ptr :: StarpuDataHandlePointer, destr :: Function...) :: StarpuDataHandle
+    return StarpuDestructible(ptr, destr...)
+end
+
+
+
+function starpu_data_unregister_pointer(ptr :: StarpuDataHandlePointer)
+    starpu_data_unregister(ptr)
+end
+
+function starpu_data_unregister(handles :: StarpuDataHandle...)
+    for h in handles
+        starpu_execute_destructor!(h, starpu_data_unregister_pointer)
+    end
+end
+
+function starpu_data_register(v :: Vector{T}) where T
+    output = Ref{Ptr{Cvoid}}(0)
+    data_pointer = pointer(v)
+
+    starpu_vector_data_register(output, STARPU_MAIN_RAM, data_pointer, length(v), sizeof(T))
+    return StarpuNewDataHandle(output[], starpu_data_unregister_pointer)#, [starpu_data_unregister_pointer])
+end
+
+function starpu_data_register(m :: Matrix{T}) where T
+
+    output = Ref{Ptr{Cvoid}}(0)
+    data_pointer = pointer(m)
+    (height, width) = size(m)
+
+    starpu_matrix_data_register(output, STARPU_MAIN_RAM, data_pointer, height, height, width, sizeof(T))
+    return StarpuNewDataHandle(output[], starpu_data_unregister_pointer)#, [starpu_data_unregister_pointer])
+end
+
+function starpu_data_register(block :: Array{T,3}) where T
+
+    output = Ref{Ptr{Cvoid}}(0)
+    data_pointer = pointer(block)
+    (height, width, depth) = size(block)
+
+    starpu_block_data_register(output, STARPU_MAIN_RAM, data_pointer, height, height * width, height, width, depth, sizeof(T))
+    return StarpuNewDataHandle(output[], starpu_data_unregister_pointer)
+end
+
+function starpu_data_register(ref :: Ref{T}) where T
+
+    output = Ref{Ptr{Cvoid}}(0)
+
+    starpu_variable_data_register(output, STARPU_MAIN_RAM, ref, sizeof(T))
+    return StarpuNewDataHandle(output[], starpu_data_unregister_pointer)
+end
+
+function starpu_data_register(x1, x2, next_args...)
+
+    handle_1 = starpu_data_register(x1)
+    handle_2 = starpu_data_register(x2)
+
+    next_handles = map(starpu_data_register, next_args)
+
+    return [handle_1, handle_2, next_handles...]
+end
+
+import Base.getindex
+function Base.getindex(handle :: StarpuDataHandle, indexes...)
+    output = starpu_data_get_sub_data(handle.object, length(indexes),
+                                      map(x->x-1, indexes)...)
+    return StarpuNewDataHandle(output)
+end
+
+function starpu_data_unpartition_pointer(ptr :: StarpuDataHandlePointer)
+    starpu_data_unpartition(ptr, STARPU_MAIN_RAM)
+end
+
+function starpu_data_partition(handle :: StarpuDataHandle, filter :: starpu_data_filter)
+
+    starpu_add_destructor!(handle, starpu_data_unpartition_pointer)
+    starpu_data_partition(handle.object, pointer_from_objref(filter))
+end
+
+function starpu_data_unpartition(handles :: StarpuDataHandle...)
+
+    for h in handles
+        starpu_execute_destructor!(h, starpu_data_unpartition_pointer)
+    end
+
+    return nothing
+end
+
+function starpu_data_map_filters(handle :: StarpuDataHandle, filter :: starpu_data_filter)
+    starpu_add_destructor!(handle, starpu_data_unpartition_pointer)
+    starpu_data_map_filters(handle.object, 1, pointer_from_objref(filter))
+end
+
+function starpu_data_map_filters(handle :: StarpuDataHandle, filter_1 :: starpu_data_filter, filter_2 :: starpu_data_filter)
+    starpu_add_destructor!(handle, starpu_data_unpartition_pointer)
+    starpu_data_map_filters(handle.object, 2, pointer_from_objref(filter_1), pointer_from_objref(filter_2))
+end
+
+function starpu_data_get_sequential_consistency_flag(handle :: StarpuDataHandle)
+    return starpu_data_get_sequential_consistency_flag(handle.object)
+end
+
+function starpu_data_set_sequential_consistency_flag(handle :: StarpuDataHandle, flag :: Int)
+    starpu_data_set_sequential_consistency_flag(handle.object, flag)
+end
+
+function starpu_data_acquire_on_node(handle :: StarpuDataHandle, node :: Int, mode)
+    starpu_data_acquire_on_node(handle.object, node, mode)
+end
+
+function starpu_data_release_on_node(handle :: StarpuDataHandle, node :: Int)
+    starpu_data_release_on_node(handle.object, node)
+end
+
+function starpu_data_wont_use(handle :: StarpuDataHandle)
+    starpu_data_wont_use(handle.object)
+end
+
+function repl(x::Symbol)
+    return x
+end
+function repl(x::Number)
+    return x
+end
+function repl(x :: Expr)
+    if (x.head == :call && x.args[1] == :+)
+        if (x.args[2] == :_)
+            return x.args[3]
+        elseif (x.args[3] == :_)
+            return x.args[2]
+        else return Expr(:call,:+,repl(x.args[2]),repl(x.args[3]))
+        end
+    elseif (x.head == :call && x.args[1] == :-)
+        if (x.args[2] == :_)
+            return Expr(:call,:-,x.args[3])
+        elseif (x.args[3] == :_)
+            return x.args[2]
+        else return Expr(:call,:-,repl(x.args[2]),repl(x.args[3]))
+        end
+    else return Expr(:call,x.args[1],repl(x.args[2]),repl(x.args[3]))
+    end
+end
+"""
+    Declares a subarray.
+    Ex : @starpu_filter ha = A[ _:_+1, : ] 
+ 
+"""
+macro starpu_filter(expr)
+    #dump(expr, maxdepth=20)
+    if (expr.head==Symbol("="))
+        region = expr.args[2]
+        if (region.head == Symbol("ref"))
+            farray = expr.args[1]
+            println("starpu filter")
+            index = 0
+            filter2=nothing
+            filter3=nothing
+            if (region.args[2]==Symbol(":"))
+                index = 3
+                filter2=:(STARPU_MATRIX_FILTER_BLOCK)
+            elseif (region.args[3] == Symbol(":"))
+                index = 2
+                filter3=:(STARPU_MATRIX_FILTER_VERTICAL_BLOCK)
+            else
+            end
+            ex = repl(region.args[index].args[3])
+            if (region.args[index].args[2] != Symbol("_"))
+                throw(AssertionError("LHS must be _"))
+            end
+            ret = quote
+                # escape and not global for farray!
+                $(esc(farray)) = starpu_data_register($(esc(region.args[1])))
+                starpu_data_partition( $(esc(farray)),starpu_data_filter($(esc(filter)),$(esc(ex))))
+            end
+            return ret
+        else
+            ret = quote
+                $(esc(farray))= starpu_data_register($(esc(region.args[1])))
+            end
+            
+            dump("coucou"); #dump(region.args[2])
+            #                dump(region.args[2])
+            #                dump(region.args[3])
+            return ret
+        end
+    end
+end

+ 48 - 0
julia/src/dynamic_compiler/Makefile.am

@@ -0,0 +1,48 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+LD=$(CC_OR_NVCC)
+AM_CPPFLAGS = -I$(abs_top_srcdir)/include/ -I$(abs_top_builddir)/src -I$(abs_top_builddir)/include \
+	 -I$(abs_top_srcdir)/julia/src/
+
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
+AM_CFLAGS += -fPIC -O3 -g -DSTRIDE=${STRIDE} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@
+CUDA_CFLAGS = $(STARPU_CUDA_CPPFLAGS) -Wno-deprecated-gpu-targets
+LDFLAGS = -L @STARPU_BUILD_DIR@/julia/src/.libs/ -lstarpujulia-1.3
+EXTERNLIB=extern_tasks.so
+GENERATEDLIB=generated_tasks.so
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+
+if STARPU_USE_CUDA
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+else
+CUDA_OBJECTS=
+endif
+
+%.o: %.c
+	$(CC) -c $(AM_CPPFLAGS) $(AM_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(AM_CPPFLAGS) $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+
+${EXTERNLIB}: $(SOURCES_CPU)
+	$(CC) $(AM_CPPFLAGS) $(AM_CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $^ -o $@ $(LDFLAGS)
+

+ 50 - 0
julia/src/globals.jl

@@ -0,0 +1,50 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+global starpu_wrapper_library_handle = C_NULL
+
+global starpu_tasks_library_handle = C_NULL
+
+global starpu_target=STARPU_CPU
+
+global generated_cuda_kernel_file_name = "PRINT TO STDOUT"
+global generated_cpu_kernel_file_name = "PRINT TO STDOUT"
+
+global CPU_CODELETS=Dict{String,String}()
+global CUDA_CODELETS=Dict{String,String}()
+
+global CODELETS_SCALARS=Dict{String,Any}()
+global CODELETS_PARAMS_STRUCT=Dict{String,Any}()
+
+global starpu_type_traduction_dict = Dict(
+    Int32 => "int32_t",
+    UInt32 => "uint32_t",
+    Float32 => "float",
+    Int64 => "int64_t",
+    UInt64 => "uint64_t",
+    Float64 => "double",
+    Nothing => "void"
+)
+export starpu_type_traduction_dict
+
+global mutex = Threads.SpinLock()
+
+# detect CUDA support
+try
+    STARPU_USE_CUDA == 1
+catch
+    global const STARPU_USE_CUDA = 0
+end

+ 73 - 0
julia/src/init.jl

@@ -0,0 +1,73 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+"""
+    Must be called before any other starpu function. Field extern_task_path is the
+    shared library path which will be used to find StarpuCodelet
+    cpu and gpu function names
+"""
+function starpu_init()
+    debug_print("starpu_init")
+
+    if (get(ENV,"JULIA_TASK_LIB",0)!=0)
+        global starpu_tasks_library_handle= Libdl.dlopen(ENV["JULIA_TASK_LIB"])
+        debug_print("Loading external codelet library")
+        ff = Libdl.dlsym(starpu_tasks_library_handle,:starpu_find_function)
+        dump(ff)
+        for k in keys(CPU_CODELETS)
+            CPU_CODELETS[k]=unsafe_string(ccall(ff,Cstring, (Cstring,Cstring),Cstring_from_String(string(k)),Cstring_from_String("cpu")))
+            if STARPU_USE_CUDA == 1
+                CUDA_CODELETS[k]=unsafe_string(ccall(ff,Cstring, (Cstring,Cstring),Cstring_from_String(string(k)),Cstring_from_String("gpu")))
+            end
+            print(k,">>>>",CPU_CODELETS[k],"\n")
+        end
+    else
+        srcdir=get(ENV,"STARPU_JULIA_BUILD",0)
+        if (srcdir == 0)
+            error("Must define environment variable STARPU_JULIA_BUILD")
+        end
+        makefile=string(srcdir, "/src/dynamic_compiler/Makefile")
+        debug_print("generating codelet library with ")
+        debug_print(makefile)
+        run(`make -f $makefile generated_tasks.so`)
+        global starpu_tasks_library_handle=Libdl.dlopen("generated_tasks.so")
+    end
+    global starpu_wrapper_library_handle= Libdl.dlopen(starpu_wrapper_library_name)
+    output = starpu_init(C_NULL)
+
+    global task_pool = ThreadPools.QueuePool(2)
+
+    starpu_enter_new_block()
+
+    return output
+end
+
+"""
+    Must be called at the end of the program
+"""
+function starpu_shutdown()
+    debug_print("starpu_shutdown")
+
+    starpu_exit_block()
+    @starpucall starpu_shutdown Cvoid ()
+
+    lock(mutex)
+    empty!(perfmodel_list)
+    empty!(codelet_list)
+    empty!(task_list)
+    unlock(mutex)
+
+    return nothing
+end

+ 17 - 16
julia/src/jlstarpu.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2018                                     Alexis Juven
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -13,22 +13,23 @@
  *
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
-/*
- * jlstarpu.h
- *
- *  Created on: 27 juin 2018
- *      Author: ajuven
- */
 
-#ifndef JLSTARPU_H_
-#define JLSTARPU_H_
+#ifndef __MPI_TESTS_GEMM_HELPER__
+#define __MPI_TESTS_GEMM_HELPER__
+
+#include <starpu_config.h>
+
+extern unsigned nslices;
+extern unsigned matrix_dim;
+extern unsigned check;
+extern int comm_thread_cpuid;
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <starpu.h>
-#include <pthread.h>
 
-#include "jlstarpu_utils.h"
-#include "jlstarpu_task.h"
+void gemm_alloc_data();
+int gemm_init_data();
+int gemm_submit_tasks();
+void gemm_release();
+void gemm_add_polling_dependencies();
+int gemm_submit_tasks_with_tags(int with_tags);
 
-#endif /* JLSTARPU_H_ */
+#endif /* __MPI_TESTS_GEMM_HELPER__ */

+ 9 - 0
julia/src/openblas_ldflags.jl

@@ -0,0 +1,9 @@
+import LinearAlgebra.BLAS
+import Libdl
+
+
+libdir = normpath(joinpath(splitpath(filter(x->occursin(Base.libblas_name,x), Libdl.dllist())[1])[1:end-1]...))
+libpath = normpath(filter(x->occursin(Base.libblas_name,x), Libdl.dllist())[1])
+libname = Base.libblas_name[4:end]
+println("-Wl,-rpath,$libpath -L$libdir -l$libname")
+

+ 400 - 0
julia/src/task.jl

@@ -0,0 +1,400 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using ThreadPools
+
+mutable struct jl_starpu_codelet
+    c_codelet :: starpu_codelet
+    perfmodel :: starpu_perfmodel
+    cpu_func :: Union{String, STARPU_BLAS}
+    cuda_func :: Union{String, STARPU_BLAS}
+    opencl_func :: String
+    modes
+end
+
+global codelet_list = Vector{jl_starpu_codelet}()
+
+function starpu_codelet(;
+                        cpu_func :: Union{String, STARPU_BLAS, Cvoid} = "",
+                        cuda_func :: Union{String, STARPU_BLAS, Cvoid} = "",
+                        opencl_func :: String = "",
+                        modes = [],
+                        perfmodel :: starpu_perfmodel,
+                        where_to_execute :: Union{Cvoid, UInt32} = nothing,
+                        color :: UInt32 = 0x00000000
+                        )
+
+    if (length(modes) > STARPU_NMAXBUFS)
+        error("Codelet has too many buffers ($(length(modes)) but only $STARPU_NMAXBUFS are allowed)")
+    end
+
+
+    if (where_to_execute == nothing)
+        real_where = ((cpu_func != nothing && cpu_func != "") * STARPU_CPU) | ((cuda_func != nothing && cuda_func != "") * STARPU_CUDA)
+    else
+        real_where = where_to_execute
+    end
+
+    output = jl_starpu_codelet(starpu_codelet(zero), perfmodel, cpu_func, cuda_func, opencl_func, modes)
+    ## TODO: starpu_codelet_init
+
+    output.c_codelet.where = real_where
+
+    for i in 1:length(modes)
+        output.c_codelet.modes[i] = modes[i]
+    end
+    output.c_codelet.nbuffers = length(modes)
+    output.c_codelet.model = pointer_from_objref(perfmodel)
+    output.c_codelet.color = color
+
+    if typeof(cpu_func) == STARPU_BLAS
+        output.cpu_func = cpu_blas_codelets[cpu_func]
+        output.c_codelet.cpu_func = load_wrapper_function_pointer(output.cpu_func)
+    else
+        output.c_codelet.cpu_func = load_starpu_function_pointer(get(CPU_CODELETS, cpu_func, ""))
+    end
+
+    if typeof(cuda_func) == STARPU_BLAS
+        output.cuda_func = cuda_blas_codelets[cuda_func]
+        output.c_codelet.cuda_func = load_wrapper_function_pointer(output.cuda_func)
+        output.c_codelet.cuda_flags[1] = STARPU_CUDA_ASYNC
+    else
+        output.c_codelet.cuda_func = load_starpu_function_pointer(get(CUDA_CODELETS, cuda_func, ""))
+    end
+
+    output.c_codelet.opencl_func = load_starpu_function_pointer("")
+
+    # Codelets must not be garbage collected before starpu shutdown is called.
+    lock(mutex)
+    push!(codelet_list, output)
+    unlock(mutex)
+
+    return output
+end
+
+mutable struct jl_starpu_task
+
+    cl :: jl_starpu_codelet
+    handles :: Vector{StarpuDataHandle}
+    handle_pointers :: Vector{StarpuDataHandlePointer}
+    synchronous :: Bool
+    cl_arg # type depends on codelet
+    callback_signal :: Vector{Cint}
+    callback_function :: Union{Cvoid, Function}
+    callback_arg
+    c_task :: starpu_task
+end
+
+task_list = Vector{jl_starpu_task}()
+
+"""
+            starpu_task(; cl :: jl_starpu_codelet, handles :: Vector{StarpuDataHandle}, cl_arg :: Ref)
+
+            Creates a new task which will run the specified codelet on handle buffers and cl_args data
+        """
+function starpu_task(;
+                     cl :: Union{Cvoid, jl_starpu_codelet} = nothing,
+                     handles :: Vector{StarpuDataHandle} = StarpuDataHandle[],
+                     cl_arg = (),
+                     callback :: Union{Cvoid, Function} = nothing,
+                     callback_arg = nothing,
+                     tag :: Union{Cvoid, starpu_tag_t} = nothing,
+                     tag_only :: Union{Cvoid, starpu_tag_t} = nothing,
+                     sequential_consistency = true,
+                     detach = 1,
+                     color :: Union{Cvoid, UInt32} = nothing,
+                     where :: Union{Cvoid, Int32} = nothing)
+    if (cl == nothing)
+        error("\"cl\" field can't be empty when creating a StarpuTask")
+    end
+
+    output = jl_starpu_task(cl, handles, map((x -> x.object), handles), false, nothing, Vector{Cint}(undef, 1), callback, callback_arg, starpu_task(zero))
+
+    # handle scalar_parameters
+    codelet_name = ""
+    if isa(cl.cpu_func, String) && cl.cpu_func != ""
+        codelet_name = cl.cpu_func
+    elseif isa(cl.cuda_func, String) && cl.cuda_func != ""
+        codelet_name = cl.cuda_func
+    end
+    scalar_parameters = get(CODELETS_SCALARS, codelet_name, nothing)
+    if scalar_parameters != nothing
+        nb_scalar_required = length(scalar_parameters)
+        nb_scalar_provided = tuple_len(cl_arg)
+        if (nb_scalar_provided != nb_scalar_required)
+            error("$nb_scalar_provided scalar parameters provided but $nb_scalar_required are required by $codelet_name.")
+        end
+        output.cl_arg = create_param_struct_from_clarg(codelet_name, cl_arg)
+    else
+        output.cl_arg = cl_arg
+    end
+
+    starpu_task_init(Ref(output.c_task))
+    output.c_task.cl = pointer_from_objref(cl.c_codelet)
+    output.c_task.synchronous = false
+    output.c_task.sequential_consistency = sequential_consistency
+    output.c_task.detach = detach
+
+    ## TODO: check num handles equals num codelet buffers
+    for i in 1:length(handles)
+        output.c_task.handles[i] = output.handle_pointers[i]
+    end
+    if tuple_len(cl_arg) > 0
+        output.c_task.cl_arg = Base.unsafe_convert(Ptr{Cvoid}, Ref(output.cl_arg))
+        output.c_task.cl_arg_size = sizeof(output.cl_arg)
+    end
+
+    # callback
+    if output.callback_function != nothing
+        output.callback_signal[1] = 0
+        output.c_task.callback_arg = Base.unsafe_convert(Ptr{Cvoid}, output.callback_signal)
+        output.c_task.callback_func = load_wrapper_function_pointer("julia_callback_func")
+    end
+
+    if tag != nothing
+        output.c_task.tag_id = tag
+        output.c_task.use_tag = 1
+    end
+
+    if tag_only != nothing
+        output.c_task.tag_id = tag_only
+    end
+
+    if color != nothing
+        output.c_task.color = color
+    end
+
+    if where != nothing
+        output.c_task.where = where
+    end
+
+    # Tasks must not be garbage collected before starpu_task_wait_for_all is called.
+    # This is necessary in particular for tasks created inside callback functions.
+    lock(mutex)
+    push!(task_list, output)
+    unlock(mutex)
+
+    return output
+end
+
+
+function create_param_struct_from_clarg(codelet_name, cl_arg)
+    struct_params_name = get(CODELETS_PARAMS_STRUCT, codelet_name, nothing)
+
+    if struct_params_name == nothing
+        error("structure name not found in CODELET_PARAMS_STRUCT")
+    end
+
+    nb_scalar_provided = length(cl_arg)
+    create_struct_param_str = "output = $struct_params_name("
+    for i in 1:nb_scalar_provided-1
+        arg = cl_arg[i]
+        create_struct_param_str *= "$arg, "
+    end
+    if (nb_scalar_provided > 0)
+        arg = cl_arg[nb_scalar_provided]
+        create_struct_param_str *= "$arg"
+    end
+    create_struct_param_str *= ")"
+    eval(Meta.parse(create_struct_param_str))
+    return output
+end
+
+"""
+    Launches task execution, if "synchronous" task field is set to "false", call
+    returns immediately
+"""
+function starpu_task_submit(task :: jl_starpu_task)
+    if (length(task.handles) != length(task.cl.modes))
+        error("Invalid number of handles for task : $(length(task.handles)) were given while codelet has $(length(task.cl.modes)) modes")
+    end
+
+    starpu_task_submit(Ref(task.c_task))
+
+    if task.callback_function != nothing
+        callback_arg = task.callback_arg
+        callback_signal = task.callback_signal
+        callback_function = task.callback_function
+
+        lock(mutex)
+        put!(task_pool) do
+
+            # Active waiting loop
+            @starpucall(julia_wait_signal, Cvoid, (Ptr{Cvoid},), Base.unsafe_convert(Ptr{Cvoid}, callback_signal))
+
+            # We've received the signal from the pthread, now execute the callback.
+            callback_function(callback_arg)
+
+            # Tell the pthread that the callback is done.
+            callback_signal[1] = 0
+        end
+        unlock(mutex)
+    end
+end
+
+function starpu_modes(x :: Symbol)
+    if (x == Symbol("STARPU_RW"))
+        return STARPU_RW
+    elseif (x == Symbol("STARPU_R"))
+        return STARPU_R
+    else return STARPU_W
+    end
+end
+
+default_codelet = Dict{String, jl_starpu_codelet}()
+default_perfmodel = Dict{String, starpu_perfmodel}()
+
+function get_default_perfmodel(name)
+    if name in keys(default_perfmodel)
+        return default_perfmodel[name]
+    end
+
+    perfmodel = starpu_perfmodel(
+        perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+        symbol = name
+    )
+    default_perfmodel[name] = perfmodel
+    return perfmodel
+end
+
+function get_default_codelet(codelet_name, perfmodel, modes) :: jl_starpu_codelet
+    if codelet_name in keys(default_codelet)
+        return default_codelet[codelet_name]
+    end
+
+    cl = starpu_codelet(
+        cpu_func  = codelet_name in keys(CPU_CODELETS) ? codelet_name : "",
+        cuda_func = codelet_name in keys(CUDA_CODELETS) ? codelet_name : "",
+        modes = modes,
+        perfmodel = perfmodel,
+    )
+    default_codelet[codelet_name] = cl
+    return cl
+end
+
+function starpu_task_insert(;
+                            codelet_name :: Union{Cvoid, String} = nothing,
+                            cl :: Union{Cvoid, jl_starpu_codelet} = nothing,
+                            perfmodel :: Union{starpu_perfmodel, Cvoid} = nothing,
+                            handles :: Vector{StarpuDataHandle} = StarpuDataHandle[],
+                            cl_arg = (),
+                            callback :: Union{Cvoid, Function} = nothing,
+                            callback_arg = nothing,
+                            tag :: Union{Cvoid, starpu_tag_t} = nothing,
+                            tag_only :: Union{Cvoid, starpu_tag_t} = nothing,
+                            sequential_consistency = true,
+                            detach = 1,
+                            where :: Union{Cvoid, Int32} = nothing,
+                            color :: Union{Cvoid, UInt32} = nothing,
+                            modes = nothing)
+    if cl == nothing && codelet_name == nothing
+        error("At least one of the two parameters codelet_name or cl must be provided when calling starpu_task_insert.")
+
+    end
+    if cl == nothing && modes == nothing
+        error("Modes must be defined when calling starpu_task_insert without a codelet.")
+    end
+
+    if perfmodel == nothing
+        perfmodel = get_default_perfmodel(codelet_name == nothing ? "default" : codelet_name)
+    end
+
+    if cl == nothing
+        cl = get_default_codelet(codelet_name, perfmodel, modes)
+    end
+
+    task = starpu_task(cl = cl, handles = handles, cl_arg = cl_arg, callback = callback,
+                       callback_arg = callback_arg, tag = tag, tag_only = tag_only,
+                       sequential_consistency = sequential_consistency,
+                       detach = detach, color = color, where = where)
+
+    starpu_task_submit(task)
+end
+
+"""
+    Creates and submits an asynchronous task running cl Codelet function.
+    Ex : @starpu_async_cl cl(handle1, handle2)
+"""
+macro starpu_async_cl(expr, modes, cl_arg=(), color ::UInt32=0x00000000)
+
+    if (!isa(expr, Expr) || expr.head != :call)
+        error("Invalid task submit syntax")
+    end
+    if (!isa(modes, Expr) || modes.head != :vect)
+        error("Invalid task submit syntax")
+    end
+    perfmodel = starpu_perfmodel(
+        perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+        symbol = "history_perf"
+    )
+    println(CPU_CODELETS[string(expr.args[1])])
+    cl = starpu_codelet(
+        cpu_func  = string(expr.args[1]),
+        cuda_func = string(expr.args[1]),
+        #opencl_func="ocl_matrix_mult",
+        ### TODO: CORRECT !
+        modes = map((x -> starpu_modes(x)),modes.args),
+        perfmodel = perfmodel,
+        color = color
+    )
+    handles = Expr(:vect, expr.args[2:end]...)
+    #dump(handles)
+    quote
+        task = starpu_task(cl = $(esc(cl)), handles = $(esc(handles)), cl_arg=$(esc(cl_arg)))
+        starpu_task_submit(task)
+    end
+end
+
+function starpu_task_wait(task :: jl_starpu_task)
+    @threadcall(@starpufunc(:starpu_task_wait),
+                Cint, (Ptr{Cvoid},), Ref(task.c_task))
+
+    # starpu_task_wait(Ref(task.c_task))
+end
+
+
+"""
+    Blocks until every submitted task has finished.
+"""
+function starpu_task_wait_for_all()
+    @threadcall(@starpufunc(:starpu_task_wait_for_all),
+                Cint, ())
+
+    lock(mutex)
+    empty!(task_list)
+    unlock(mutex)
+end
+
+"""
+    Blocks until every submitted task has finished.
+    Ex : @starpu_sync_tasks begin
+                [...]
+                starpu_task_submit(task)
+                [...]
+        end
+
+    TODO : Make the macro only wait for tasks declared inside the following expression.
+            (similar mechanism as @starpu_block)
+"""
+macro starpu_sync_tasks(expr)
+    quote
+        $(esc(expr))
+        starpu_task_wait_for_all()
+    end
+end
+
+function starpu_task_destroy(task :: jl_starpu_task)
+    starpu_task_destroy(Ref(task.c_task))
+end

+ 114 - 0
julia/src/translate_headers.jl

@@ -0,0 +1,114 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using Clang
+using Clang.LibClang.LLVM_jll
+
+function starpu_translate_headers()
+    debug_print("Translating StarPU headers...")
+
+    if !isdir(joinpath(fstarpu_build_dir(), "julia/gen"))
+        mkdir(joinpath(fstarpu_build_dir(), "julia/gen"))
+    end
+
+    STARPU_BUILD_INCLUDE=joinpath(fstarpu_build_dir(), "include")
+    STARPU_SRC_INCLUDE=joinpath(fstarpu_src_dir(), "include")
+    STARPU_HEADERS = [joinpath(STARPU_BUILD_INCLUDE, header) for header in readdir(STARPU_BUILD_INCLUDE) if endswith(header, ".h")]
+    if STARPU_SRC_INCLUDE != STARPU_BUILD_INCLUDE
+        for header in readdir(STARPU_SRC_INCLUDE)
+            if endswith(header, ".h")
+                push!(STARPU_HEADERS, joinpath(STARPU_SRC_INCLUDE, header))
+            end
+        end
+    end
+
+    LIBCLANG_INCLUDE = joinpath(dirname(LLVM_jll.libclang_path), "..", "include", "clang-c") |> normpath
+
+    clang_args = ["-I", STARPU_BUILD_INCLUDE, "-I", STARPU_SRC_INCLUDE]
+
+    for header in find_std_headers()
+        push!(clang_args, "-I")
+        push!(clang_args, header)
+    end
+
+    only_select_symbols = Set(["starpu_task",
+                               "starpu_cublas_init",
+                               "starpu_codelet",
+                               "starpu_data_filter",
+                               "starpu_tag_t",
+                               "starpu_perfmodel",
+                               "starpu_perfmodel_type",
+                               "starpu_data_handle_t",
+                               "starpu_init",
+                               "starpu_data_acquire_on_node",
+                               "starpu_data_release_on_node",
+                               "starpu_data_unregister",
+                               "starpu_data_partition",
+                               "starpu_data_unpartition",
+                               "starpu_data_get_sub_data",
+                               "starpu_data_map_filters",
+                               "starpu_data_get_default_sequential_consistency_flag",
+                               "starpu_data_set_default_sequential_consistency_flag",
+                               "starpu_data_get_sequential_consistency_flag",
+                               "starpu_data_set_sequential_consistency_flag",
+                               "starpu_data_wont_use",
+                               "starpu_matrix_data_register",
+                               "starpu_block_data_register",
+                               "starpu_vector_data_register",
+                               "starpu_variable_data_register",
+                               "starpu_memory_pin",
+                               "starpu_memory_unpin",
+                               "starpu_task_end_dep_add",
+                               "starpu_task_end_dep_release",
+                               "starpu_task_init",
+                               "starpu_task_destroy",
+                               "starpu_task_submit",
+                               "starpu_task_wait",
+                               "starpu_task_wait_for_n_submitted",
+                               "starpu_tag_remove",
+                               "starpu_tag_wait",
+                               "starpu_tag_declare_deps_array",
+                               "starpu_tag_notify_from_apps",
+                               "starpu_task_declare_end_deps_array",
+                               "starpu_task_declare_deps_array",
+                               "starpu_iteration_push",
+                               "starpu_iteration_pop",
+                               "starpu_worker_get_count",
+                               "starpu_cpu_worker_get_count",
+                               "starpu_cuda_worker_get_count",
+                               "starpu_opencl_worker_get_count",
+                               "starpu_mic_worker_get_count",
+                               "STARPU_CPU",
+                               "STARPU_CUDA",
+                               "STARPU_CUDA_ASYNC",
+                               "STARPU_OPENCL",
+                               "STARPU_MAIN_RAM",
+                               "STARPU_NMAXBUFS",
+                               "STARPU_USE_CUDA"])
+
+    wc = init(; headers = STARPU_HEADERS,
+              output_file = joinpath(fstarpu_build_dir(), "julia/gen/libstarpu_api.jl"),
+              common_file = joinpath(fstarpu_build_dir(), "julia/gen/libstarpu_common.jl"),
+              clang_includes = vcat(LIBCLANG_INCLUDE, CLANG_INCLUDE),
+              clang_args = clang_args,
+              header_library = x->"starpu_wrapper_library_name",
+              clang_diagnostics = false,
+              rewriter = x->x,
+              only_select_symbols = only_select_symbols,
+              fields_align = Dict((:starpu_pthread_spinlock_t,:taken) => 16)
+              )
+
+    run(wc)
+end

+ 1 - 1
mpi/tests/abstract_sendrecv_bench.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 2
mpi/tests/abstract_sendrecv_bench.h

@@ -1,7 +1,6 @@
-
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/tests/bench_helper.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/tests/bench_helper.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 75 - 0
mpi/tests/burst.c

@@ -0,0 +1,75 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This test sends simultaneously many communications, with various configurations.
+ *
+ * Global purpose is to run with trace recording, to watch the behaviour of communications.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+#include "burst_helper.h"
+
+void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-nreqs") == 0)
+		{
+			burst_nb_requests = atoi(argv[++i]);
+		}
+		else if (strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0)
+		{
+			fprintf(stderr,"Usage: %s [-nreqs nreqs]\n", argv[0]);
+			fprintf(stderr,"Currently selected: %d requests in each burst\n", burst_nb_requests);
+			exit(EXIT_SUCCESS);
+		}
+		else
+		{
+			fprintf(stderr,"Unrecognized option %s\n", argv[i]);
+			exit(EXIT_FAILURE);
+		}
+	}
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, mpi_init, other_rank;
+
+	parse_args(argc, argv);
+
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
+
+	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+
+	burst_init_data(rank);
+
+	burst_all(rank);
+
+	/* Clear up */
+	burst_free_data(rank);
+
+	starpu_mpi_shutdown();
+	if (!mpi_init)
+		MPI_Finalize();
+
+	return 0;
+}

+ 210 - 0
mpi/tests/burst_gemm.c

@@ -0,0 +1,210 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * Program to be executed with trace recording to watch the impact of
+ * computations (or task polling) on communications.
+ */
+#include <limits.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <starpu_mpi.h>
+#include <starpu_fxt.h>
+
+#include "helper.h"
+#include "gemm_helper.h"
+#include "burst_helper.h"
+
+static int gemm_warmup = 1;
+static int gemm_warmup_wait = 0;
+
+/* Parse command-line options for the burst+GEMM test.
+ * Supported options:
+ *   -nblocks n        : number of slices per matrix dimension (size = 320*n)
+ *   -size s           : matrix dimension (must be a multiple of 320)
+ *   -check            : verify the GEMM result at the end
+ *   -nreqs N          : number of requests in each communication burst
+ *   -no-gemm-warmup   : skip the warmup GEMM pass
+ *   -gemm-warmup-wait : make all warmup GEMMs start simultaneously
+ *   -h/-help/--help   : print usage and exit
+ * Exits with EXIT_FAILURE on any unrecognized option. */
+void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
+			char *argptr;
+			nslices = strtol(argv[++i], &argptr, 10);
+			matrix_dim = 320 * nslices;
+		}
+		else if (strcmp(argv[i], "-size") == 0)
+		{
+			char *argptr;
+			unsigned matrix_dim_tmp = strtol(argv[++i], &argptr, 10);
+			if (matrix_dim_tmp % 320 != 0)
+			{
+				/* Invalid size: warn and keep the previous value. */
+				fprintf(stderr, "Matrix size has to be a multiple of 320\n");
+			}
+			else
+			{
+				matrix_dim = matrix_dim_tmp;
+				nslices = matrix_dim / 320;
+			}
+		}
+		else if (strcmp(argv[i], "-check") == 0)
+		{
+			check = 1;
+		}
+		else if (strcmp(argv[i], "-nreqs") == 0)
+		{
+			burst_nb_requests = atoi(argv[++i]);
+		}
+		else if (strcmp(argv[i], "-no-gemm-warmup") == 0)
+		{
+			gemm_warmup = 0;
+		}
+		else if (strcmp(argv[i], "-gemm-warmup-wait") == 0)
+		{
+			/* All warmup GEMMs will start at the same moment */
+			gemm_warmup_wait = 1;
+		}
+		else if (strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0)
+		{
+			fprintf(stderr,"Usage: %s [-nblocks n] [-size size] [-check] [-nreqs nreqs] [-no-gemm-warmup] [-gemm-warmup-wait]\n", argv[0]);
+			fprintf(stderr,"Currently selected: matrix size: %u - %u blocks - %d requests in each burst - gemm warmup: %d -gemm-warmup-wait: %d\n", matrix_dim, nslices, burst_nb_requests, gemm_warmup, gemm_warmup_wait);
+			exit(EXIT_SUCCESS);
+		}
+		else
+		{
+			fprintf(stderr,"Unrecognized option %s\n", argv[i]);
+			exit(EXIT_FAILURE);
+		}
+	}
+}
+
+/* Entry point: interleave communication bursts with GEMM computations in
+ * several configurations (workers polling, paused, computing, ...) so that
+ * the impact of computation on communication can be observed in traces.
+ * Requires at least 2 MPI processes; returns 77/STARPU_TEST_SKIPPED when
+ * the environment does not allow the test to run.
+ * NOTE(review): after starpu_mpi_init_conf() succeeds, ret stays 0; a later
+ * `goto enodev` therefore returns 0 (success) even though the test body was
+ * skipped — confirm whether STARPU_TEST_SKIPPED was intended there. */
+int main(int argc, char **argv)
+{
+	int ret, mpi_init, worldsize, mpi_rank;
+
+	parse_args(argc, argv);
+
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
+	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &mpi_rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
+
+	if (worldsize < 2)
+	{
+		if (mpi_rank == 0)
+			FPRINTF(stderr, "We need 2 processes.\n");
+
+		starpu_mpi_shutdown();
+		if (!mpi_init)
+			MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	gemm_alloc_data();
+	if (gemm_init_data() == -ENODEV)
+		goto enodev;
+
+	/* GEMM warmup, to really load the BLAS library */
+	if (gemm_warmup)
+	{
+		if (gemm_warmup_wait)
+		{
+			/* Pause workers so all warmup GEMMs are released at once. */
+			starpu_task_wait_for_all();
+			starpu_pause();
+		}
+
+		if(gemm_submit_tasks() == -ENODEV)
+			goto enodev;
+
+		if (gemm_warmup_wait)
+		{
+			starpu_resume();
+		}
+	}
+
+	burst_init_data(mpi_rank);
+
+	/* Wait for everything and everybody: */
+	starpu_task_wait_for_all();
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	FPRINTF(stderr, "** Burst warmup **\n");
+	burst_all(mpi_rank);
+
+	starpu_sleep(0.3); // sleep to easily distinguish different bursts in traces
+
+	FPRINTF(stderr, "** Burst while there is no task available, but workers are polling **\n");
+	burst_all(mpi_rank);
+
+	starpu_sleep(0.3); // sleep to easily distinguish different bursts in traces
+
+	FPRINTF(stderr, "** Burst while there is no task available, workers are paused **\n");
+	starpu_pause();
+	burst_all(mpi_rank);
+
+	starpu_sleep(0.3); // sleep to easily distinguish different bursts in traces
+
+	FPRINTF(stderr, "** Burst while workers are really working **\n");
+	if(gemm_submit_tasks() == -ENODEV)
+		goto enodev;
+	starpu_resume();
+
+	burst_all(mpi_rank);
+
+	FPRINTF(stderr, "Burst done, now waiting for computing tasks to finish\n");
+
+	/* Wait for everything and everybody: */
+	starpu_task_wait_for_all();
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	starpu_sleep(0.3); // sleep to easily distinguish different parts in traces
+
+	FPRINTF(stderr, "** Workers are computing, without communications **\n");
+	starpu_pause();
+	if(gemm_submit_tasks() == -ENODEV)
+		goto enodev;
+	starpu_resume();
+
+	/* Wait for everything and everybody: */
+	starpu_task_wait_for_all();
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	starpu_sleep(0.3); // sleep to easily distinguish different parts in traces
+
+	FPRINTF(stderr, "** Burst while workers are computing, but polling a moment between each task **\n");
+	starpu_pause();
+	/* Dependencies must be declared before the tagged tasks are submitted. */
+	gemm_add_polling_dependencies();
+	if(gemm_submit_tasks_with_tags(/* enable task tags */ 1) == -ENODEV)
+		goto enodev;
+	starpu_resume();
+
+	burst_all(mpi_rank);
+
+	/* Wait for everything and everybody: */
+	starpu_task_wait_for_all();
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+enodev:
+	gemm_release();
+	burst_free_data(mpi_rank);
+
+	starpu_mpi_shutdown();
+	if (!mpi_init)
+		MPI_Finalize();
+
+	return ret;
+}

+ 223 - 0
mpi/tests/burst_helper.c

@@ -0,0 +1,223 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu_mpi.h>
+
+#include "helper.h"
+#include "burst_helper.h"
+
+/* Default number of requests per burst: reduced for quick/simulated runs. */
+#if defined(STARPU_SIMGRID) || defined(STARPU_QUICK_CHECK)
+#define NB_REQUESTS 10
+#else
+#define NB_REQUESTS 50
+#endif
+/* Number of floats per exchanged vector (one 320x320 tile). */
+#define NX_ARRAY (320 * 320)
+
+/* Per-rank arrays of size burst_nb_requests, allocated by burst_init_data()
+ * and released by burst_free_data(); only ranks 0 and 1 use them. */
+static starpu_data_handle_t* recv_handles;
+static starpu_data_handle_t* send_handles;
+static float** recv_buffers;
+static float** send_buffers;
+static starpu_mpi_req* recv_reqs;
+static starpu_mpi_req* send_reqs;
+
+/* Number of requests per burst; overridable through -nreqs in the tests. */
+int burst_nb_requests = NB_REQUESTS;
+
+/* Allocate and register, on ranks 0 and 1 only, the burst_nb_requests
+ * send/receive buffers (zero-initialized vectors of NX_ARRAY floats) and the
+ * matching request arrays. Counterpart of burst_free_data().
+ * NOTE(review): malloc() results are not checked before use — acceptable for
+ * a test, but confirm this matches the test-suite convention. */
+void burst_init_data(int rank)
+{
+	if (rank == 0 || rank == 1)
+	{
+		recv_handles = malloc(burst_nb_requests * sizeof(starpu_data_handle_t));
+		send_handles = malloc(burst_nb_requests * sizeof(starpu_data_handle_t));
+		recv_buffers = malloc(burst_nb_requests * sizeof(float*));
+		send_buffers = malloc(burst_nb_requests * sizeof(float*));
+		recv_reqs = malloc(burst_nb_requests * sizeof(starpu_mpi_req));
+		send_reqs = malloc(burst_nb_requests * sizeof(starpu_mpi_req));
+
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			send_buffers[i] = malloc(NX_ARRAY * sizeof(float));
+			memset(send_buffers[i], 0, NX_ARRAY * sizeof(float));
+			starpu_vector_data_register(&send_handles[i], STARPU_MAIN_RAM, (uintptr_t) send_buffers[i], NX_ARRAY, sizeof(float));
+
+			recv_buffers[i] = malloc(NX_ARRAY * sizeof(float));
+			memset(recv_buffers[i], 0, NX_ARRAY * sizeof(float));
+			starpu_vector_data_register(&recv_handles[i], STARPU_MAIN_RAM, (uintptr_t) recv_buffers[i], NX_ARRAY, sizeof(float));
+		}
+	}
+}
+
+/* Unregister all handles and free all buffers allocated by burst_init_data().
+ * Must be called with the same rank value, after all bursts completed. */
+void burst_free_data(int rank)
+{
+	if (rank == 0 || rank == 1)
+	{
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			/* Unregister before freeing: StarPU may still hold the buffer. */
+			starpu_data_unregister(send_handles[i]);
+			free(send_buffers[i]);
+
+			starpu_data_unregister(recv_handles[i]);
+			free(recv_buffers[i]);
+		}
+
+		free(recv_handles);
+		free(send_handles);
+		free(recv_buffers);
+		free(send_buffers);
+		free(recv_reqs);
+		free(send_reqs);
+	}
+}
+
+/* Burst simultaneous from both nodes: 0 and 1 post all the recvs, synchronise, and then post all the sends */
+void burst_bidir(int rank)
+{
+	int other_rank = (rank == 0) ? 1 : 0;
+
+	FPRINTF(stderr, "Simultaneous....start (rank %d)\n", rank);
+
+	if (rank == 0 || rank == 1)
+	{
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			recv_reqs[i] = NULL;
+			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], other_rank, i, MPI_COMM_WORLD);
+		}
+	}
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	if (rank == 0 || rank == 1)
+	{
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			send_reqs[i] = NULL;
+			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
+		}
+
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			if (recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
+			if (send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
+		}
+	}
+
+	FPRINTF(stderr, "Simultaneous....end (rank %d)\n", rank);
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+}
+
+/* One-way burst: `receiver` posts all recvs, a barrier guarantees they are
+ * in place, then `sender` posts all sends (tag i, priority i). Ranks other
+ * than sender/receiver only take part in the barriers. */
+void burst_unidir(int sender, int receiver, int rank)
+{
+	FPRINTF(stderr, "%d -> %d... start (rank %d)\n", sender, receiver, rank);
+
+	if (rank == receiver)
+	{
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			recv_reqs[i] = NULL;
+			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], sender, i, MPI_COMM_WORLD);
+		}
+	}
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	if (rank == sender)
+	{
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			send_reqs[i] = NULL;
+			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], receiver, i, i, MPI_COMM_WORLD);
+		}
+	}
+
+	if (rank == sender || rank == receiver)
+	{
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			/* Inside this guard, "rank != sender" means the receiver. */
+			if (rank != sender && recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
+			if (rank == sender && send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
+		}
+	}
+
+	FPRINTF(stderr, "%d -> %d... end (rank %d)\n", sender, receiver, rank);
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+}
+
+/* Half burst from both nodes, second half burst is triggered after some requests finished. */
+/* Half burst from both nodes, second half burst is triggered after some requests finished. */
+/* Same protocol as burst_bidir(), except each side sends the first half of
+ * the requests, waits for one specific recv (index burst_nb_requests/4) to
+ * complete, then sends the second half. */
+void burst_bidir_half_postponed(int rank)
+{
+	int other_rank = (rank == 0) ? 1 : 0;
+	/* NOTE(review): `received` is set but never used — candidate for removal. */
+	int received = 0;
+
+	FPRINTF(stderr, "Half/half burst...start (rank %d)\n", rank);
+
+	if (rank == 0 || rank == 1)
+	{
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			recv_reqs[i] = NULL;
+			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], other_rank, i, MPI_COMM_WORLD);
+		}
+	}
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	if (rank == 0 || rank == 1)
+	{
+		for (int i = 0; i < (burst_nb_requests / 2); i++)
+		{
+			send_reqs[i] = NULL;
+			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
+		}
+
+		/* Trigger point: wait for one recv from the first half to finish. */
+		if (recv_reqs[burst_nb_requests / 4]) starpu_mpi_wait(&recv_reqs[burst_nb_requests / 4], MPI_STATUS_IGNORE);
+
+		for (int i = (burst_nb_requests / 2); i < burst_nb_requests; i++)
+		{
+			send_reqs[i] = NULL;
+			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
+		}
+
+		for (int i = 0; i < burst_nb_requests; i++)
+		{
+			if (recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
+			if (send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
+		}
+	}
+
+	FPRINTF(stderr, "Half/half burst...done (rank %d)\n", rank);
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+}
+
+/* Run the four burst patterns in sequence and report the total wall-clock
+ * time. starpu_timing_now() is in microseconds, hence the /1000 for ms. */
+void burst_all(int rank)
+{
+	double start, end;
+	start = starpu_timing_now();
+
+	/* Burst simultaneous from both nodes: 0 and 1 post all the recvs, synchronise, and then post all the sends */
+	burst_bidir(rank);
+
+	/* Burst from 0 to 1 : rank 1 posts all the recvs, barrier, then rank 0 posts all the sends */
+	burst_unidir(0, 1, rank);
+
+	/* Burst from 1 to 0 : rank 0 posts all the recvs, barrier, then rank 1 posts all the sends */
+	burst_unidir(1, 0, rank);
+
+	/* Half burst from both nodes, second half burst is triggered after some requests finished. */
+	burst_bidir_half_postponed(rank);
+
+	end = starpu_timing_now();
+	FPRINTF(stderr, "All bursts took %.0f ms\n", (end - start) / 1000.0);
+}

+ 29 - 0
mpi/tests/burst_helper.h

@@ -0,0 +1,29 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __MPI_TESTS_BURST_HELPER__
+#define __MPI_TESTS_BURST_HELPER__
+
+extern int burst_nb_requests;
+
+void burst_init_data(int rank);
+void burst_free_data(int rank);
+void burst_bidir(int rank);
+void burst_unidir(int sender, int receiver, int rank);
+void burst_bidir_half_postponed(int rank);
+void burst_all(int rank);
+
+#endif /* __MPI_TESTS_BURST_HELPER__ */

+ 330 - 0
mpi/tests/gemm_helper.c

@@ -0,0 +1,330 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <common/blas.h>
+#include "../../examples/mult/simple.h"
+#include "helper.h"
+#include "gemm_helper.h"
+
+
+#define CHECK_TASK_SUBMIT(ret) do {				\
+	if (ret == -ENODEV)					\
+	{							\
+		return -ENODEV;					\
+	}							\
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");	\
+} while(0)
+
+
+/* Number of slices per matrix dimension; matrix_dim = 320 * nslices. */
+unsigned nslices = 4;
+#if defined(STARPU_QUICK_CHECK) && !defined(STARPU_SIMGRID)
+unsigned matrix_dim = 256;
+#else
+unsigned matrix_dim = 320 * 4;
+#endif
+/* When non-zero, gemm_release() verifies C == A*B before freeing. */
+unsigned check = 0;
+int comm_thread_cpuid = -1;
+
+/* Full matrices (owned here) and their StarPU handles; C = A * B. */
+static TYPE *A, *B, *C;
+static starpu_data_handle_t A_handle, B_handle, C_handle;
+
+/* Verify the GEMM result in place: compute C -= A*B with the reference BLAS,
+ * then check that the absolute sum of C is below a size-relative tolerance.
+ * Destroys C. On failure, reports the largest remaining error. */
+static void check_output(void)
+{
+	/* compute C = C - AB */
+	CPU_GEMM("N", "N", matrix_dim, matrix_dim, matrix_dim, (TYPE)-1.0f, A, matrix_dim, B, matrix_dim, (TYPE)1.0f, C, matrix_dim);
+
+	/* make sure C = 0 */
+	TYPE err;
+	err = CPU_ASUM(matrix_dim*matrix_dim, C, 1);
+
+	/* Tolerance scales with the number of elements. */
+	if (err < matrix_dim*matrix_dim*0.001)
+	{
+		FPRINTF(stderr, "Results are OK\n");
+	}
+	else
+	{
+		int max;
+		max = CPU_IAMAX(matrix_dim*matrix_dim, C, 1);
+
+		FPRINTF(stderr, "There were errors ... err = %f\n", err);
+		FPRINTF(stderr, "Max error : %e\n", C[max]);
+	}
+}
+
+
+/* Register A, B, C as StarPU matrices and partition them for the blocked
+ * GEMM: A into nslices horizontal bands, B into nslices vertical bands, and
+ * C into an nslices x nslices grid (vertical then horizontal filters). */
+static void partition_mult_data(void)
+{
+	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A,
+		matrix_dim, matrix_dim, matrix_dim, sizeof(TYPE));
+	starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B,
+		matrix_dim, matrix_dim, matrix_dim, sizeof(TYPE));
+	starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C,
+		matrix_dim, matrix_dim, matrix_dim, sizeof(TYPE));
+
+	struct starpu_data_filter vert;
+	memset(&vert, 0, sizeof(vert));
+	vert.filter_func = starpu_matrix_filter_vertical_block;
+	vert.nchildren = nslices;
+
+	struct starpu_data_filter horiz;
+	memset(&horiz, 0, sizeof(horiz));
+	horiz.filter_func = starpu_matrix_filter_block;
+	horiz.nchildren = nslices;
+
+	starpu_data_partition(B_handle, &vert);
+	starpu_data_partition(A_handle, &horiz);
+
+	/* C gets both filters: sub-block (x, y) is addressed with 2 indices. */
+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
+}
+
+
+/* Codelet kernel: fill two matrix sub-blocks (descr[0] and descr[1]) with
+ * pseudo-random values. Both blocks are assumed to have the same extent as
+ * descr[0] — TODO confirm against the submitting code in gemm_init_data(). */
+static void cpu_init_matrix_random(void *descr[], void *arg)
+{
+	(void)arg;
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
+
+	for (unsigned i = 0; i < nx *ny; i++)
+	{
+		subA[i] = (TYPE) (starpu_drand48());
+		subB[i] = (TYPE) (starpu_drand48());
+	}
+}
+
+
+/* Codelet kernel: zero out one matrix sub-block (descr[0]). */
+static void cpu_init_matrix_zero(void *descr[], void *arg)
+{
+	(void)arg;
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
+
+	for (unsigned i = 0; i < nx *ny; i++)
+	{
+		subA[i] = (TYPE) (0);
+	}
+}
+
+
+/* Codelet kernel: subC = subA * subB for one (A band, B band, C block)
+ * triple, via BLAS GEMM. When run as a combined (parallel) worker, the
+ * computation is split along C's second dimension, each worker handling a
+ * block of ~nyC/worker_size columns. */
+static void cpu_mult(void *descr[], void *arg)
+{
+	(void)arg;
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+	TYPE *subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	unsigned nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	unsigned nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	unsigned nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	unsigned ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
+
+	int worker_size = starpu_combined_worker_get_size();
+
+	if (worker_size == 1)
+	{
+		/* Sequential CPU task */
+		CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
+	}
+	else
+	{
+		/* Parallel CPU task */
+		unsigned rank = starpu_combined_worker_get_rank();
+
+		/* Ceiling division; the last worker may get a smaller block. */
+		unsigned block_size = (nyC + worker_size - 1)/worker_size;
+		unsigned new_nyC = STARPU_MIN(nyC, block_size*(rank+1)) - block_size*rank;
+
+		STARPU_ASSERT(nyC == STARPU_MATRIX_GET_NY(descr[1]));
+
+		/* NOTE(review): the per-rank offset is block_size*rank elements, not
+		 * block_size*rank*ld — assumes the column offset maps to a plain
+		 * element offset in this layout; confirm against examples/mult. */
+		TYPE *new_subB = &subB[block_size*rank];
+		TYPE *new_subC = &subC[block_size*rank];
+
+		CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, subA, ldA, new_subB, ldB, (TYPE)0.0, new_subC, ldC);
+	}
+}
+
+/* History-based performance model shared with the other GEMM examples
+ * (same symbol, so calibration data is reused). */
+static struct starpu_perfmodel starpu_gemm_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = STARPU_GEMM_STR(gemm)
+};
+
+/* GEMM codelet: reads an A band and a B band, updates one C block. */
+static struct starpu_codelet cl =
+{
+	.type = STARPU_SEQ, /* changed to STARPU_SPMD if -spmd is passed */
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {cpu_mult},
+	.cpu_funcs_name = {"cpu_mult"},
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
+	.model = &starpu_gemm_model
+};
+
+/* Initialization codelet: fills an A band and a B band with random values. */
+static struct starpu_codelet cl_init_matrix_random =
+{
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {cpu_init_matrix_random},
+	.cpu_funcs_name = {"cpu_init_matrix_random"},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_W},
+	.name = "init_matrix_random",
+	.color = 0xffa500 // orange
+};
+
+/* Initialization codelet: zeroes one C block. */
+static struct starpu_codelet cl_init_matrix_zero =
+{
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {cpu_init_matrix_zero},
+	.cpu_funcs_name = {"cpu_init_matrix_zero"},
+	.nbuffers = 1,
+	.modes = {STARPU_W},
+	.name = "init_matrix_zero",
+	.color = 0x808000 // olive
+};
+
+/* Allocate and partition buffers */
+/* Allocates the three matrix_dim x matrix_dim matrices (pinned; folded under
+ * simulation) and registers/partitions them via partition_mult_data().
+ * Counterpart of gemm_release(). */
+void gemm_alloc_data()
+{
+	starpu_malloc_flags((void **)&A, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&B, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&C, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	partition_mult_data();
+}
+
+/* Submit tasks to initialize matrices: fill them with zeros or random numbers */
+/* Returns -ENODEV if no worker can run the tasks, 0 otherwise. Under
+ * STARPU_SIMGRID no task is submitted and 0 is returned directly (the
+ * actual matrix contents do not matter in simulation). */
+int gemm_init_data()
+{
+#ifndef STARPU_SIMGRID
+	int ret;
+	unsigned x, y;
+
+	for (x = 0; x < nslices; x++)
+	{
+		/* One task per slice fills the A and B bands with random data. */
+		struct starpu_task *task = starpu_task_create();
+		task->cl = &cl_init_matrix_random;
+		task->handles[0] = starpu_data_get_sub_data(A_handle, 1, x);
+		task->handles[1] = starpu_data_get_sub_data(B_handle, 1, x);
+		ret = starpu_task_submit(task);
+		CHECK_TASK_SUBMIT(ret);
+
+		for (y = 0; y < nslices; y++)
+		{
+			/* One task per C block zeroes it. */
+			task = starpu_task_create();
+			task->cl = &cl_init_matrix_zero;
+			task->handles[0] = starpu_data_get_sub_data(C_handle, 2, x, y);
+			ret = starpu_task_submit(task);
+			CHECK_TASK_SUBMIT(ret);
+		}
+	}
+#endif
+	return 0;
+}
+
+/* Submit tasks to compute the GEMM */
+/* Convenience wrapper: submit all GEMM tasks without task tags.
+ * Returns -ENODEV if no worker can execute them, 0 otherwise. */
+int gemm_submit_tasks()
+{
+	return gemm_submit_tasks_with_tags(/* by default, disable task tags */ 0);
+}
+
+/* Submit one GEMM task per C block (nslices*nslices tasks in total).
+ * When with_tags is non-zero, tasks get consecutive tags starting at 1, so
+ * gemm_add_polling_dependencies() can chain them; in that case the
+ * dependencies must have been declared beforehand.
+ * Returns -ENODEV if no worker can execute the tasks, 0 otherwise. */
+int gemm_submit_tasks_with_tags(int with_tags)
+{
+	int ret;
+	unsigned x, y;
+	starpu_tag_t task_tag = 0;
+
+	for (x = 0; x < nslices; x++)
+	for (y = 0; y < nslices; y++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		task->cl = &cl;
+		task->handles[0] = starpu_data_get_sub_data(A_handle, 1, y);
+		task->handles[1] = starpu_data_get_sub_data(B_handle, 1, x);
+		task->handles[2] = starpu_data_get_sub_data(C_handle, 2, x, y);
+		/* 2*m*n*k flops for one block product, used by the perf model. */
+		task->flops = 2ULL * (matrix_dim/nslices) * (matrix_dim/nslices) * matrix_dim;
+
+		if (with_tags)
+		{
+			/* Pre-increment: the first task gets tag 1, not 0. */
+			task->use_tag = 1;
+			task->tag_id = ++task_tag;
+		}
+
+		ret = starpu_task_submit(task);
+		CHECK_TASK_SUBMIT(ret);
+		/* Let StarPU evict the C block once the task has run. */
+		starpu_data_wont_use(starpu_data_get_sub_data(C_handle, 2, x, y));
+	}
+
+	return 0;
+}
+
+/* Add dependencies between GEMM tasks to see the impact of polling workers which will at the end get a task.
+ * The new dependency graph has the following shape:
+ * - the same number of GEMMs as the number of workers are executed in parallel on all workers ("a column of tasks")
+ * - then a GEMM waits all tasks of the previous column of tasks, and is executed on a worker
+ * - the next column of tasks waits for the previous GEMM
+ * - and so on...
+ *
+ * worker 0 |  1  |  4  |  5  |  8  |  9  |
+ * worker 1 |  2  |     |  6  |     | 10  |  ...
+ * worker 2 |  3  |     |  7  |     | 11  |
+ *
+ * This function has to be called before gemm_submit_tasks_with_tags(1).
+ */
+/* Declare tag dependencies between GEMM tasks (tags 1..nb_tasks, as assigned
+ * by gemm_submit_tasks_with_tags(1)) following the column pattern described
+ * above: every (nb_workers+1)-th tag acts as a synchronisation point that
+ * waits for the previous column and releases the next one.
+ * NOTE(review): `nb_tasks` is a signed int compared against unsigned
+ * starpu_tag_t values — fine while nslices*nslices stays positive, but an
+ * unsigned type would be cleaner; confirm. */
+void gemm_add_polling_dependencies()
+{
+	int nb_tasks = nslices * nslices;
+	unsigned nb_workers = starpu_worker_get_count();
+
+	for (starpu_tag_t synchro_tag = nb_workers+1; synchro_tag <= nb_tasks; synchro_tag += (nb_workers+1))
+	{
+		// this synchro tag depends on tasks of previous column of tasks:
+		for (starpu_tag_t previous_tag = synchro_tag - nb_workers; previous_tag < synchro_tag; previous_tag++)
+		{
+			starpu_tag_declare_deps(synchro_tag, 1, previous_tag);
+		}
+
+		// tasks of the next column of tasks depend on this synchro tag:
+		// this actually allows workers to poll for new tasks, while no task is available
+		for (starpu_tag_t next_tag = synchro_tag+1; next_tag < (synchro_tag + nb_workers + 1) && next_tag <= nb_tasks; next_tag++)
+		{
+			starpu_tag_declare_deps(next_tag, 1, synchro_tag);
+		}
+	}
+
+}
+
+/* Tear down everything set up by gemm_alloc_data(): unpartition and
+ * unregister the three matrices (gathering data back to main RAM), optionally
+ * verify the result (-check), then free the buffers with flags matching the
+ * allocation. */
+void gemm_release()
+{
+	starpu_data_unpartition(C_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(B_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
+
+	starpu_data_unregister(A_handle);
+	starpu_data_unregister(B_handle);
+	starpu_data_unregister(C_handle);
+
+	/* Check before freeing: check_output() reads (and destroys) C. */
+	if (check)
+		check_output();
+
+	starpu_free_flags(A, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_free_flags(B, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_free_flags(C, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+}
+
+

+ 70 - 0
mpi/tests/nothing.c

@@ -0,0 +1,70 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This program does nothing. It waits until it is interrupted by the user.
+ * Useful to check binding while StarPU is running.
+ */
+
+#include <starpu_mpi.h>
+#include <unistd.h>
+#include "helper.h"
+
+
+/* Entry point: bring up StarPU-MPI, pause the workers, print the hostname of
+ * each rank, then sleep forever so the user can inspect thread binding.
+ * Only terminated by an external signal; the code after the loop is
+ * unreachable and kept for symmetry with the other tests. */
+int main(int argc, char **argv)
+{
+	int ret, rank, worldsize;
+	int mpi_init;
+
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
+	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	starpu_pause(); // our program will only wait, no need to stress cores by polling workers
+
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	char hostname[65];
+	gethostname(hostname, sizeof(hostname));
+
+	printf("[rank %d on %s] ready to wait !\n", rank, hostname);
+
+	if (rank == 0)
+	{
+		printf("You can now check if thread binding is correct, for instance.\n");
+	}
+
+	/* Flush so the messages appear even though we never exit normally. */
+	fflush(stdout);
+
+	while(1)
+	{
+		sleep(1);
+	}
+
+	// TODO: maybe better handle the user interruption ?
+
+
+	starpu_resume();
+
+	starpu_mpi_shutdown();
+	if (!mpi_init)
+		MPI_Finalize();
+
+	return 0;
+}

+ 1 - 1
mpi/tests/sendrecv_parallel_tasks_bench.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 0
src/common/utils.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 8 - 8
src/core/perfmodel/perfmodel_history.c

@@ -1885,20 +1885,20 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 				STARPU_HG_DISABLE_CHECKING(entry->nsample);
 				STARPU_HG_DISABLE_CHECKING(entry->mean);
 
-				/* Do not take the first measurement into account, it is very often quite bogus */
+				/* For history-based, do not take the first measurement into account, it is very often quite bogus */
 				/* TODO: it'd be good to use a better estimation heuristic, like the median, or latest n values, etc. */
-				//entry->mean = 0;
-				//entry->sum = 0;
-
-				//entry->deviation = 0.0;
-				//entry->sum2 = 0;
+				if (model->type != STARPU_HISTORY_BASED)
+				{
+					entry->sum = measured;
+					entry->sum2 = measured*measured;
+					entry->nsample = 1;
+					entry->mean = measured;
+				}
 
 				entry->size = _starpu_job_get_data_size(model, arch, impl, j);
 				entry->flops = j->task->flops;
 
 				entry->footprint = key;
-				//entry->nsample = 0;
-				//entry->nerror = 0;
 
 				insert_history_entry(entry, list, &per_arch_model->history);
 			}

+ 5 - 0
src/datawizard/coherency.h

@@ -281,6 +281,11 @@ struct _starpu_data_state
 
 	int partition_automatic_disabled;
 
+	/** Application-provided coordinates. The maximum dimension (5) is
+	  * relatively arbitrary. */
+	unsigned dimensions;
+	int coordinates[5];
+
 	/** A generic pointer to data in the user land (could be anything and this
 	 * is not manage by StarPU) */
 	void *user_data;

+ 24 - 1
src/datawizard/interfaces/data_interface.c

@@ -1117,8 +1117,18 @@ int starpu_data_get_home_node(starpu_data_handle_t handle)
 	return handle->home_node;
 }
 
-void starpu_data_set_coordinates_array(starpu_data_handle_t handle STARPU_ATTRIBUTE_UNUSED, int dimensions STARPU_ATTRIBUTE_UNUSED, int dims[] STARPU_ATTRIBUTE_UNUSED)
+void starpu_data_set_coordinates_array(starpu_data_handle_t handle, unsigned dimensions, int dims[])
 {
+	unsigned i;
+	unsigned max_dimensions = sizeof(handle->coordinates)/sizeof(handle->coordinates[0]);
+
+	if (dimensions > max_dimensions)
+		dimensions = max_dimensions;
+
+	handle->dimensions = dimensions;
+	for (i = 0; i < dimensions; i++)
+		handle->coordinates[i] = dims[i];
+
 	_STARPU_TRACE_DATA_COORDINATES(handle, dimensions, dims);
 }
 
@@ -1135,3 +1145,16 @@ void starpu_data_set_coordinates(starpu_data_handle_t handle, unsigned dimension
 
 	starpu_data_set_coordinates_array(handle, dimensions, dims);
 }
+
+unsigned starpu_data_get_coordinates_array(starpu_data_handle_t handle, unsigned dimensions, int dims[])
+{
+	unsigned i;
+
+	if (dimensions > handle->dimensions)
+		dimensions = handle->dimensions;
+
+	for (i = 0; i < dimensions; i++)
+		dims[i] = handle->coordinates[i];
+
+	return dimensions;
+}

+ 17 - 11
src/sched_policies/component_heft.c

@@ -77,10 +77,13 @@ static int heft_progress_one(struct starpu_sched_component *component)
 		/* Estimated transfer+task termination for each child */
 		double estimated_ends_with_task[component->nchildren * ntasks];
 
-		/* Minimum transfer+task termination on all children */
-		double min_exp_end_with_task[ntasks];
-		/* Maximum transfer+task termination on all children */
-		double max_exp_end_with_task[ntasks];
+		/* estimated energy */
+		double local_energy[component->nchildren * ntasks];
+
+		/* Minimum transfer+task termination of the NTASKS tasks over all workers */
+		double min_exp_end_of_task[ntasks];
+		/* Maximum termination of the already-scheduled tasks over all workers */
+		double max_exp_end_of_workers;
 
 		unsigned suitable_components[component->nchildren * ntasks];
 
@@ -100,20 +103,23 @@ static int heft_progress_one(struct starpu_sched_component *component)
 					estimated_lengths + offset,
 					estimated_transfer_length + offset,
 					estimated_ends_with_task + offset,
-					&min_exp_end_with_task[n], &max_exp_end_with_task[n],
+					&min_exp_end_of_task[n], &max_exp_end_of_workers,
 							  suitable_components + offset, nsuitable_components[n]);
+			
+			/* Compute the energy, if provided*/
+			starpu_mct_compute_energy(component, tasks[n], local_energy + offset, suitable_components + offset, nsuitable_components[n]);
 		}
 
+		/* best_task is the task that will finish first among the ntasks, while best_benefit is its expected execution time*/
 		int best_task = 0;
-		double max_benefit = 0;
+		double best_benefit = min_exp_end_of_task[0];
 
 		/* Find the task which provides the most computation time benefit */
-		for (n = 0; n < ntasks; n++)
+		for (n = 1; n < ntasks; n++)
 		{
-			double benefit = max_exp_end_with_task[n] - min_exp_end_with_task[n];
-			if (max_benefit < benefit)
+			if (best_benefit > min_exp_end_of_task[n])
 			{
-				max_benefit = benefit;
+				best_benefit =  min_exp_end_of_task[n];
 				best_task = n;
 			}
 		}
@@ -129,7 +135,7 @@ static int heft_progress_one(struct starpu_sched_component *component)
 
 		unsigned offset = component->nchildren * best_task;
 
-		int best_icomponent = starpu_mct_get_best_component(d, tasks[best_task], estimated_lengths + offset, estimated_transfer_length + offset, estimated_ends_with_task + offset, min_exp_end_with_task[best_task], max_exp_end_with_task[best_task], suitable_components + offset, nsuitable_components[best_task]);
+		int best_icomponent = starpu_mct_get_best_component(d, tasks[best_task], estimated_lengths + offset, estimated_transfer_length + offset, estimated_ends_with_task + offset, local_energy + offset, min_exp_end_of_task[best_task], max_exp_end_of_workers, suitable_components + offset, nsuitable_components[best_task]);
 
 		STARPU_ASSERT(best_icomponent != -1);
 		best_component = component->children[best_icomponent];

+ 28 - 12
src/sched_policies/component_heteroprio.c

@@ -106,10 +106,13 @@ static int heteroprio_progress_accel(struct starpu_sched_component *component, s
 	/* Estimated transfer+task termination for each child */
 	double estimated_ends_with_task[component->nchildren];
 
-	/* Minimum transfer+task termination on all children */
-	double min_exp_end_with_task;
-	/* Maximum transfer+task termination on all children */
-	double max_exp_end_with_task;
+	/* provided local energy */
+	double local_energy[component->nchildren];
+
+	/* Minimum transfer+task termination of the task over all workers */
+	double min_exp_end_of_task;
+	/* Maximum termination of the already-scheduled tasks over all workers */
+	double max_exp_end_of_workers;
 
 	unsigned suitable_components[component->nchildren];
 	unsigned nsuitable_components;
@@ -155,16 +158,21 @@ static int heteroprio_progress_accel(struct starpu_sched_component *component, s
 			estimated_lengths,
 			estimated_transfer_length,
 			estimated_ends_with_task,
-			&min_exp_end_with_task, &max_exp_end_with_task,
+			&min_exp_end_of_task, &max_exp_end_of_workers,
 			suitable_components, nsuitable_components);
 
+	/* Compute the energy, if provided */
+	starpu_mct_compute_energy(component, task, local_energy, suitable_components, nsuitable_components);
+
 	/* And now find out which worker suits best for this task,
 	 * including data transfer */
+
 	int best_icomponent = starpu_mct_get_best_component(d, task,
 			estimated_lengths,
 			estimated_transfer_length,
 			estimated_ends_with_task,
-			min_exp_end_with_task, max_exp_end_with_task,
+                        local_energy,
+			min_exp_end_of_task, max_exp_end_of_workers,
 			suitable_components, nsuitable_components);
 
 	if (best_icomponent == -1)
@@ -236,10 +244,13 @@ static int heteroprio_progress_noaccel(struct starpu_sched_component *component,
 	/* Estimated transfer+task termination for each child */
 	double estimated_ends_with_task[component->nchildren];
 
-	/* Minimum transfer+task termination on all children */
-	double min_exp_end_with_task;
-	/* Maximum transfer+task termination on all children */
-	double max_exp_end_with_task;
+	/* estimated energy */
+	double local_energy[component->nchildren];
+
+	/* Minimum transfer+task termination of the task over all workers */
+	double min_exp_end_of_task;
+	/* Maximum termination of the already-scheduled tasks over all workers */
+	double max_exp_end_of_workers;
 
 	unsigned suitable_components[component->nchildren];
 	unsigned nsuitable_components;
@@ -264,16 +275,21 @@ static int heteroprio_progress_noaccel(struct starpu_sched_component *component,
 			estimated_lengths,
 			estimated_transfer_length,
 			estimated_ends_with_task,
-			&min_exp_end_with_task, &max_exp_end_with_task,
+			&min_exp_end_of_task, &max_exp_end_of_workers,
 			suitable_components, nsuitable_components);
 
+	/* Compute the energy, if provided */
+	starpu_mct_compute_energy(component, task, local_energy, suitable_components, nsuitable_components);
+	
 	/* And now find out which worker suits best for this task,
 	 * including data transfer */
+
 	int best_icomponent = starpu_mct_get_best_component(d, task,
 			estimated_lengths,
 			estimated_transfer_length,
 			estimated_ends_with_task,
-			min_exp_end_with_task, max_exp_end_with_task,
+                        local_energy,
+			min_exp_end_of_task, max_exp_end_of_workers,
 			suitable_components, nsuitable_components);
 
 	/* If no best component is found, it means that the perfmodel of

+ 13 - 7
src/sched_policies/component_mct.c

@@ -35,10 +35,13 @@ static int mct_push_task(struct starpu_sched_component * component, struct starp
 	/* Estimated transfer+task termination for each child */
 	double estimated_ends_with_task[component->nchildren];
 
-	/* Minimum transfer+task termination on all children */
-	double min_exp_end_with_task;
-	/* Maximum transfer+task termination on all children */
-	double max_exp_end_with_task;
+	/* estimated energy */
+	double local_energy[component->nchildren];
+
+	/* Minimum transfer+task termination of the task over all workers */
+	double min_exp_end_of_task;
+	/* Maximum termination of the already-scheduled tasks over all workers */
+	double max_exp_end_of_workers;
 
 	unsigned suitable_components[component->nchildren];
 	unsigned nsuitable_components;
@@ -58,12 +61,14 @@ static int mct_push_task(struct starpu_sched_component * component, struct starp
 	   make scheduling decisions at the same time */
 	STARPU_COMPONENT_MUTEX_LOCK(&d->scheduling_mutex);
 
-
 	starpu_mct_compute_expected_times(component, task, estimated_lengths, estimated_transfer_length,
-					  estimated_ends_with_task, &min_exp_end_with_task, &max_exp_end_with_task, suitable_components, nsuitable_components);
+					  estimated_ends_with_task, &min_exp_end_of_task, &max_exp_end_of_workers, suitable_components, nsuitable_components);
+
+	/* Compute the energy, if provided */
+	starpu_mct_compute_energy(component, task, local_energy, suitable_components, nsuitable_components);
 
 	int best_icomponent = starpu_mct_get_best_component(d, task, estimated_lengths, estimated_transfer_length,
-					  estimated_ends_with_task, min_exp_end_with_task, max_exp_end_with_task, suitable_components, nsuitable_components);
+							    estimated_ends_with_task, local_energy, min_exp_end_of_task, max_exp_end_of_workers, suitable_components, nsuitable_components);
 
 	/* If no best component is found, it means that the perfmodel of
 	 * the task had been purged since it has been pushed on the mct component.
@@ -105,6 +110,7 @@ static void mct_component_deinit_data(struct starpu_sched_component * component)
 
 int starpu_sched_component_is_mct(struct starpu_sched_component * component)
 {
+
 	return component->push_task == mct_push_task;
 }
 

+ 25 - 21
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -445,7 +445,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	int best = -1;
 
-	double best_exp_end = 0.0;
+	double best_exp_end_of_task = 0.0;
 	double model_best = 0.0;
 	double transfer_model_best = 0.0;
 
@@ -552,10 +552,10 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 
 			exp_end = exp_start + fifo->exp_len + local_length;
 
-			if (best == -1 || exp_end < best_exp_end)
+			if (best == -1 || exp_end < best_exp_end_of_task)
 			{
 				/* a better solution was found */
-				best_exp_end = exp_end;
+				best_exp_end_of_task = exp_end;
 				best = worker;
 				model_best = local_length;
 				transfer_model_best = local_penalty;
@@ -589,15 +589,15 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 						unsigned nworkers,
 						double local_task_length[nworkers][STARPU_MAXIMPLEMENTATIONS],
 						double exp_end[nworkers][STARPU_MAXIMPLEMENTATIONS],
-						double *max_exp_endp,
-						double *best_exp_endp,
+						double *max_exp_endp_of_workers,
+						double *min_exp_endp_of_task,
 						double local_data_penalty[nworkers][STARPU_MAXIMPLEMENTATIONS],
 						double local_energy[nworkers][STARPU_MAXIMPLEMENTATIONS],
 						int *forced_worker, int *forced_impl, unsigned sched_ctx_id, unsigned sorted_decision)
 {
 	int calibrating = 0;
-	double max_exp_end = DBL_MIN;
-	double best_exp_end = DBL_MAX;
+	double max_exp_end_of_workers = DBL_MIN;
+	double best_exp_end_of_task = DBL_MAX;
 	int ntasks_best = -1;
 	int nimpl_best = 0;
 	double ntasks_best_end = 0.0;
@@ -664,8 +664,8 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 			}
 
 			exp_end[worker_ctx][nimpl] = exp_start + prev_exp_len;
-			if (exp_end[worker_ctx][nimpl] > max_exp_end)
-				max_exp_end = exp_end[worker_ctx][nimpl];
+			if (exp_end[worker_ctx][nimpl] > max_exp_end_of_workers)
+				max_exp_end_of_workers = exp_end[worker_ctx][nimpl];
 
 			//_STARPU_DEBUG("Scheduler dmda: task length (%lf) workerid (%u) kernel (%u) \n", local_task_length[workerid][nimpl],workerid,nimpl);
 
@@ -742,10 +742,10 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
 			exp_end[worker_ctx][nimpl] = task_starting_time + local_task_length[worker_ctx][nimpl];
 
-			if (exp_end[worker_ctx][nimpl] < best_exp_end)
+			if (exp_end[worker_ctx][nimpl] < best_exp_end_of_task)
 			{
 				/* a better solution was found */
-				best_exp_end = exp_end[worker_ctx][nimpl];
+				best_exp_end_of_task = exp_end[worker_ctx][nimpl];
 				nimpl_best = nimpl;
 			}
 
@@ -766,8 +766,8 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 	}
 #endif
 
-	*best_exp_endp = best_exp_end;
-	*max_exp_endp = max_exp_end;
+	*min_exp_endp_of_task = best_exp_end_of_task;
+	*max_exp_endp_of_workers = max_exp_end_of_workers;
 }
 
 static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id, unsigned simulate, unsigned sorted_decision)
@@ -794,10 +794,10 @@ static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned
 	double exp_end[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
 
 	/* This is the minimum among the exp_end[] matrix */
-	double best_exp_end;
+	double min_exp_end_of_task;
 
 	/* This is the maximum termination time of already-scheduled tasks over all workers */
-	double max_exp_end = 0.0;
+	double max_exp_end_of_workers = 0.0;
 
 	double fitness[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
 
@@ -806,8 +806,8 @@ static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned
 					    nworkers_ctx,
 					    local_task_length,
 					    exp_end,
-					    &max_exp_end,
-					    &best_exp_end,
+					    &max_exp_end_of_workers,
+					    &min_exp_end_of_task,
 					    local_data_penalty,
 					    local_energy,
 					    &forced_best,
@@ -836,16 +836,18 @@ static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned
 					/* no one on that queue may execute this task */
 					continue;
 				}
-				fitness[worker_ctx][nimpl] = dt->alpha * __s_alpha__value *(exp_end[worker_ctx][nimpl] - best_exp_end)
+				fitness[worker_ctx][nimpl] = dt->alpha * __s_alpha__value *(exp_end[worker_ctx][nimpl] - min_exp_end_of_task)
 					+ dt->beta * __s_beta__value *(local_data_penalty[worker_ctx][nimpl])
 					+ dt->_gamma * __s_gamma__value *(local_energy[worker_ctx][nimpl]);
 
-				if (exp_end[worker_ctx][nimpl] > max_exp_end)
+				if (exp_end[worker_ctx][nimpl] > max_exp_end_of_workers)
 				{
 					/* This placement will make the computation
 					 * longer, take into account the idle
 					 * consumption of other cpus */
-					fitness[worker_ctx][nimpl] += dt->_gamma * __s_gamma__value * dt->idle_power * __s_idle_power__value * (exp_end[worker_ctx][nimpl] - max_exp_end) / 1000000.0;
-					fitness[worker_ctx][nimpl] += dt->_gamma * __s_gamma__value * dt->idle_power * __s_idle_power__value * (exp_end[worker_ctx][nimpl] - max_exp_end) / 1000000.0; /* Since gamma is the cost in us of one Joule, 
+																									  then  d->idle_power * (exp_end - max_exp_end_of_workers) 
+																									  must be in Joules, thus the / 1000000.0 */
 				}
 
 				if (best == -1 || fitness[worker_ctx][nimpl] < best_fitness)
@@ -856,7 +858,7 @@ static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned
 					best_in_ctx = worker_ctx;
 					selected_impl = nimpl;
 
-					//_STARPU_DEBUG("best fitness (worker %d) %e = alpha*(%e) + beta(%e) +gamma(%e)\n", worker, best_fitness, exp_end[worker][nimpl] - best_exp_end, local_data_penalty[worker][nimpl], local_energy[worker][nimpl]);
+					//_STARPU_DEBUG("best fitness (worker %d) %e = alpha*(%e) + beta(%e) +gamma(%e)\n", worker, best_fitness, exp_end[worker][nimpl] - min_exp_end_of_task, local_data_penalty[worker][nimpl], local_energy[worker][nimpl]);
 
 				}
 			}
@@ -1026,7 +1028,9 @@ static void initialize_dmda_policy(unsigned sched_ctx_id)
 
 	dt->alpha = starpu_get_env_float_default("STARPU_SCHED_ALPHA", _STARPU_SCHED_ALPHA_DEFAULT);
 	dt->beta = starpu_get_env_float_default("STARPU_SCHED_BETA", _STARPU_SCHED_BETA_DEFAULT);
+	/* data->_gamma: cost of one Joule in us. If gamma is set to 10^6, then one Joule costs 1s */
 	dt->_gamma = starpu_get_env_float_default("STARPU_SCHED_GAMMA", _STARPU_SCHED_GAMMA_DEFAULT);
+	/* data->idle_power: Idle power of the whole machine in Watt */
 	dt->idle_power = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0);
 
 	if(starpu_sched_ctx_min_priority_is_set(sched_ctx_id) != 0 && starpu_sched_ctx_max_priority_is_set(sched_ctx_id) != 0)

+ 52 - 21
src/sched_policies/helper_mct.c

@@ -36,8 +36,10 @@ struct _starpu_mct_data *starpu_mct_init_parameters(struct starpu_sched_componen
 	{
 		data->alpha = params->alpha;
 		data->beta = params->beta;
+		/* data->_gamma: cost of one Joule in us. If gamma is set to 10^6, then one Joule costs 1s */
 		data->_gamma = params->_gamma;
-		data->idle_power = params->idle_power;
+		/* data->idle_power: Idle power of the whole machine in Watt */
+		data->idle_power = params->idle_power; 
 	}
 	else
 	{
@@ -79,14 +81,21 @@ static double compute_expected_time(double now, double predicted_end, double pre
 	return predicted_end;
 }
 
-double starpu_mct_compute_fitness(struct _starpu_mct_data * d, double exp_end, double min_exp_end, double max_exp_end, double transfer_len, double local_energy)
+double starpu_mct_compute_fitness(struct _starpu_mct_data * d, double exp_end, double min_exp_end_of_task, double max_exp_end_of_workers, double transfer_len, double local_energy)
 {
 	/* Note: the expected end includes the data transfer duration, which we want to be able to tune separately */
-
-	return d->alpha * (exp_end - min_exp_end)
-		+ d->beta * transfer_len
-		+ d->_gamma * local_energy
-		+ d->_gamma * d->idle_power * (exp_end - max_exp_end);
+	
+	/* min_exp_end_of_task is the minimum end time of the task over all workers */
+	double fitness = d->alpha * (exp_end - min_exp_end_of_task) + d->beta * transfer_len + d->_gamma * local_energy;
+	
+	/* max_exp_end_of_workers is the maximum end time of the workers. If the total execution time is increased, then an
+          additional energy penalty must be considered */
+	if(exp_end > max_exp_end_of_workers)
+		fitness += d->_gamma * d->idle_power * (exp_end - max_exp_end_of_workers) / 1000000.0; /* Since gamma is the cost in us of one Joule, 
+											       then  d->idle_power * (exp_end - max_exp_end) 
+											       must be in Joules, thus the / 1000000.0 */
+
+	return fitness;
 }
 
 unsigned starpu_mct_compute_execution_times(struct starpu_sched_component *component, struct starpu_task *task,
@@ -120,12 +129,12 @@ unsigned starpu_mct_compute_execution_times(struct starpu_sched_component *compo
 
 void starpu_mct_compute_expected_times(struct starpu_sched_component *component, struct starpu_task *task STARPU_ATTRIBUTE_UNUSED,
 		double *estimated_lengths, double *estimated_transfer_length, double *estimated_ends_with_task,
-				       double *min_exp_end_with_task, double *max_exp_end_with_task, unsigned *suitable_components, unsigned nsuitable_components)
+				       double *min_exp_end_of_task, double *max_exp_end_of_workers, unsigned *suitable_components, unsigned nsuitable_components)
 {
 	unsigned i;
 	double now = starpu_timing_now();
-	*min_exp_end_with_task = DBL_MAX;
-	*max_exp_end_with_task = 0.0;
+	*min_exp_end_of_task = DBL_MAX;
+	*max_exp_end_of_workers = 0.0;
 	for(i = 0; i < nsuitable_components; i++)
 	{
 		unsigned icomponent = suitable_components[i];
@@ -138,14 +147,39 @@ void starpu_mct_compute_expected_times(struct starpu_sched_component *component,
 								    estimated_end,
 								    estimated_lengths[icomponent],
 								    estimated_transfer_length[icomponent]);
-		if(estimated_ends_with_task[icomponent] < *min_exp_end_with_task)
-			*min_exp_end_with_task = estimated_ends_with_task[icomponent];
-		if(estimated_ends_with_task[icomponent] > *max_exp_end_with_task)
-			*max_exp_end_with_task = estimated_ends_with_task[icomponent];
+		
+		/* estimated_ends_with_task[icomponent]: estimated end of execution on the worker icomponent
+		   estimated_end: estimated end of the worker
+		   min_exp_end_of_task: minimum estimated execution time of the task over all workers
+		   max_exp_end_of_workers: maximum estimated end of the already-scheduled tasks over all workers
+		*/
+		if(estimated_ends_with_task[icomponent] < *min_exp_end_of_task)
+			*min_exp_end_of_task = estimated_ends_with_task[icomponent];
+		if(estimated_end > *max_exp_end_of_workers)
+			*max_exp_end_of_workers = estimated_end;
+	}
+}
+
+/* This function retrieves the energy consumption of a task in Joules */
+void starpu_mct_compute_energy(struct starpu_sched_component *component, struct starpu_task *task , double *local_energy, unsigned *suitable_components, unsigned nsuitable_components)
+{
+	unsigned i;
+	for(i = 0; i < nsuitable_components; i++)
+	{
+		unsigned icomponent = suitable_components[i];
+		int nimpl = 0;
+		local_energy[icomponent] = starpu_task_worker_expected_energy(task, icomponent,  component->tree->sched_ctx_id, nimpl);
+		for (nimpl  = 1; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+		{
+			double e;
+			e = starpu_task_worker_expected_energy(task, icomponent,  component->tree->sched_ctx_id, nimpl);
+			if (e < local_energy[icomponent])
+				local_energy[icomponent] = e;
+		}
 	}
 }
 
-int starpu_mct_get_best_component(struct _starpu_mct_data *d, struct starpu_task *task, double *estimated_lengths, double *estimated_transfer_length, double *estimated_ends_with_task, double min_exp_end_with_task, double max_exp_end_with_task, unsigned *suitable_components, unsigned nsuitable_components)
+int starpu_mct_get_best_component(struct _starpu_mct_data *d, struct starpu_task *task, double *estimated_lengths, double *estimated_transfer_length, double *estimated_ends_with_task, double *local_energy, double min_exp_end_of_task, double max_exp_end_of_workers, unsigned *suitable_components, unsigned nsuitable_components)
 {
 	double best_fitness = DBL_MAX;
 	int best_icomponent = -1;
@@ -154,15 +188,12 @@ int starpu_mct_get_best_component(struct _starpu_mct_data *d, struct starpu_task
 	for(i = 0; i < nsuitable_components; i++)
 	{
 		int icomponent = suitable_components[i];
-#ifdef STARPU_DEVEL
-#warning FIXME: take energy consumption into account
-#endif
 		double tmp = starpu_mct_compute_fitness(d,
 					     estimated_ends_with_task[icomponent],
-					     min_exp_end_with_task,
-					     max_exp_end_with_task,
+					     min_exp_end_of_task,
+					     max_exp_end_of_workers,
 					     estimated_transfer_length[icomponent],
-					     0.0);
+					     local_energy[icomponent]);
 
 		if(tmp < best_fitness)
 		{

+ 12 - 4
src/sched_policies/helper_mct.h

@@ -39,8 +39,8 @@ void starpu_mct_compute_expected_times(struct starpu_sched_component *component,
 				       double *estimated_lengths,
 				       double *estimated_transfer_length,
 				       double *estimated_ends_with_task,
-				       double *min_exp_end_with_task,
-				       double *max_exp_end_with_task,
+				       double *min_exp_end_of_task,
+				       double *max_exp_end_of_workers,
 				       unsigned *suitable_components,
 				       unsigned nsuitable_components);
 
@@ -56,7 +56,15 @@ int starpu_mct_get_best_component(struct _starpu_mct_data *d,
 				  double *estimated_lengths,
 				  double *estimated_transfer_length,
 				  double *estimated_ends_with_task,
-				  double min_exp_end_with_task,
-				  double max_exp_end_with_task,
+				  double *local_energy,
+				  double min_exp_end_of_task,
+				  double max_exp_end_of_workers,
 				  unsigned *suitable_components,
 				  unsigned nsuitable_components);
+
+
+void starpu_mct_compute_energy(struct starpu_sched_component *component,
+			       struct starpu_task *task ,
+			       double *local_energy,
+			       unsigned *suitable_components,
+			       unsigned nsuitable_components);

+ 1 - 1
tools/starpu_env.in

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by

+ 1 - 0
tools/starpu_perfmodel_recdump.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2011-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2011       Télécom-SudParis
+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 0
tools/starpu_smpirun.in

@@ -2,6 +2,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2014-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+# Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by