
Merge branch 'master' into fpga

Nathalie Furmento, 5 years ago
Parent
Commit
dc72337184
51 changed files with 727 additions and 235 deletions
  1. + 2 - 4    doc/doxygen/Makefile.am
  2. + 2 - 4    doc/doxygen_dev/Makefile.am
  3. + 2 - 1    examples/perf_steering/perf_knobs_03.c
  4. + 2 - 1    examples/pipeline/pipeline.c
  5. + 4 - 2    examples/ppm_downscaler/yuv_downscaler.c
  6. + 2 - 1    examples/scheduler/dummy_modular_sched.c
  7. + 4 - 1    examples/spmv/spmv.c
  8. + 1 - 1    examples/tag_example/tag_example2.c
  9. + 1 - 1    examples/tag_example/tag_example3.c
  10. + 8 - 0   julia/StarPU.jl/Makefile
  11. + 31 - 15 julia/StarPU.jl/src/StarPU.jl
  12. + 22 - 0  julia/StarPU.jl/src/compiler/expressions.jl
  13. + 2 - 0   julia/StarPU.jl/src/compiler/file_generation.jl
  14. + 1 - 1   julia/StarPU.jl/src/compiler/parsing.jl
  15. + 56 - 0  julia/mandelbrot/cpu_mandelbrot.c
  16. + 35 - 0  julia/mandelbrot/makefile
  17. + 99 - 15 julia/mandelbrot/mandelbrot.jl
  18. + 13 - 14 julia/mult/makefile
  19. + 90 - 0  julia/mult/cpu_mult.c
  20. + 84 - 0  julia/mult/gpu_mult.cu
  21. + 6 - 2   mpi/src/nmad/starpu_mpi_nmad.c
  22. + 10 - 4  mpi/tests/Makefile.am
  23. + 8 - 1   mpi/tests/sendrecv_gemm_bench.c
  24. + 4 - 0   mpi/tests/sendrecv_parallel_tasks_bench.c
  25. + 2 - 1   src/common/barrier.c
  26. + 4 - 2   src/core/dependencies/data_concurrency.c
  27. + 2 - 1   src/core/dependencies/dependencies.c
  28. + 4 - 2   src/core/dependencies/implicit_data_deps.c
  29. + 6 - 3   src/core/perfmodel/perfmodel_history.c
  30. + 16 - 7  src/core/topology.c
  31. + 3 - 3   src/core/workers.c
  32. + 2 - 1   src/datawizard/coherency.c
  33. + 2 - 1   src/datawizard/interfaces/matrix_interface.c
  34. + 32 - 23 src/debug/traces/starpu_fxt.c
  35. + 16 - 16 src/sched_policies/component_best_implementation.c
  36. + 21 - 10 src/sched_policies/component_heteroprio.c
  37. + 4 - 2   src/sched_policies/component_sched.c
  38. + 2 - 1   src/sched_policies/component_work_stealing.c
  39. + 2 - 1   src/sched_policies/component_worker.c
  40. + 1 - 1   src/sched_policies/heteroprio.c
  41. + 19 - 15 src/sched_policies/modular_ez.c
  42. + 2 - 1   src/sched_policies/modular_gemm.c
  43. + 2 - 1   src/sched_policies/modular_heteroprio_heft.c
  44. + 72 - 63 src/util/openmp_runtime_support.c
  45. + 5 - 3   tests/datawizard/bcsr.c
  46. + 2 - 1   tests/microbenchs/tasks_size_overhead.c
  47. + 2 - 1   tests/parallel_tasks/parallel_kernels.c
  48. + 2 - 1   tests/parallel_tasks/parallel_kernels_spmd.c
  49. + 5 - 2   tools/starpu_perfmodel_display.c
  50. + 4 - 2   tools/starpu_perfmodel_recdump.c
  51. + 4 - 2   tools/starpu_replay.c

+ 2 - 4
doc/doxygen/Makefile.am

@@ -270,13 +270,11 @@ $(DOX_PDF): $(DOX_TAG) refman.tex
 	! < refman.log grep -v group__ | grep -v _amgrp | grep -v deprecated__ | grep "multiply defined" || exit 1 ;\
 	$(MAKEINDEX) refman.idx ;\
 	max_print_line=1000000 $(PDFLATEX) -interaction batchmode refman.tex ;\
-	done=0; repeat=5 ;\
-	while test $$done = 0 -a $$repeat -gt 0; do \
+	for i in $(shell seq 1 5); do \
            if $(EGREP) 'Rerun (LaTeX|to get cross-references right)' refman.log > /dev/null 2>&1; then \
 	       max_print_line=1000000 $(PDFLATEX) -interaction batchmode refman.tex; \
-	       repeat=`expr $$repeat - 1`; \
 	   else \
-	       done=1; \
+		break ; \
 	   fi; \
 	done
 	mv $(DOX_LATEX_DIR)/refman.pdf $(DOX_PDF)

+ 2 - 4
doc/doxygen_dev/Makefile.am

@@ -217,13 +217,11 @@ $(DOX_PDF): $(DOX_TAG) refman.tex
 	! < refman.log grep -v group__ | grep -v _amgrp | grep -v deprecated__ | grep "multiply defined" || exit 1 ;\
 	$(MAKEINDEX) refman.idx ;\
 	max_print_line=1000000 $(PDFLATEX) -interaction batchmode refman.tex ;\
-	done=0; repeat=5 ;\
-	while test $$done = 0 -a $$repeat -gt 0; do \
+	for i in $(shell seq 1 5); do \
            if $(EGREP) 'Rerun (LaTeX|to get cross-references right)' refman.log > /dev/null 2>&1; then \
 	       max_print_line=1000000 $(PDFLATEX) -interaction batchmode refman.tex; \
-	       repeat=`expr $$repeat - 1`; \
 	   else \
-	       done=1; \
+		break ; \
 	   fi; \
 	done
 	mv $(DOX_LATEX_DIR)/refman.pdf $(DOX_PDF)

+ 2 - 1
examples/perf_steering/perf_knobs_03.c

@@ -126,7 +126,8 @@ int main(int argc, char **argv)
 		const int id = starpu_perf_knob_name_to_id(scope_id, knob_name);
 		STARPU_ASSERT(starpu_perf_knob_get_type_id(id) == starpu_perf_knob_type_name_to_id(knob_type_name));
 
-		struct starpu_codelet cl = {
+		struct starpu_codelet cl =
+		{
 			.cpu_funcs = {cpu_func}
 		};
 

+ 2 - 1
examples/pipeline/pipeline.c

@@ -176,7 +176,8 @@ static struct starpu_codelet pipeline_codelet_sum =
 	.model = &pipeline_model_sum
 };
 
-static void release_sem(void *arg) {
+static void release_sem(void *arg)
+{
 	sem_post(arg);
 };
 

+ 4 - 2
examples/ppm_downscaler/yuv_downscaler.c

@@ -141,14 +141,16 @@ int main(int argc, char **argv)
 
 	/* fetch input data */
 	FILE *f_in = fopen(filename_in, "r");
-	if (!f_in) {
+	if (!f_in)
+	{
 		fprintf(stderr, "couldn't open input file %s\n", filename_in);
 		exit(EXIT_FAILURE);
 	}
 
 	/* allocate room for an output buffer */
 	FILE *f_out = fopen(filename_out, "w+");
-	if (!f_out) {
+	if (!f_out)
+	{
 		fprintf(stderr, "couldn't open output file %s\n", filename_out);
 		exit(EXIT_FAILURE);
 	}

+ 2 - 1
examples/scheduler/dummy_modular_sched.c

@@ -170,7 +170,8 @@ static void init_dummy_sched(unsigned sched_ctx_id)
 {
 	FPRINTF(stderr, "Initialising Dummy scheduler\n");
 
-	struct dummy_sched_params params = {
+	struct dummy_sched_params params =
+	{
 		.verbose = 0,
 	};
 

+ 4 - 1
examples/spmv/spmv.c

@@ -245,10 +245,13 @@ int main(int argc, char **argv)
 			vector_exp_out_ptr[row] += UPPER_BAND * vector_in_ptr[row+1];
 	}
 	for (row = 0; row < size; row++)
-		if (vector_out_ptr[row] != vector_exp_out_ptr[row]) {
+	{
+		if (vector_out_ptr[row] != vector_exp_out_ptr[row])
+		{
 			FPRINTF(stderr, "check failed at %u: %f vs expected %f\n", row, vector_out_ptr[row], vector_exp_out_ptr[row]);
 			exit(EXIT_FAILURE);
 		}
+	}
 
 	starpu_free(nzval);
 	starpu_free(colind);

+ 1 - 1
examples/tag_example/tag_example2.c

@@ -70,7 +70,7 @@ static void tag_cleanup_grid(unsigned iter)
 
 	for (i = 0; i < ni; i++)
 		starpu_tag_remove(TAG(i,iter));
-} 
+}
 
 static int create_task_grid(unsigned iter)
 {

+ 1 - 1
examples/tag_example/tag_example3.c

@@ -72,7 +72,7 @@ static void tag_cleanup_grid(unsigned iter)
 
 	for (i = 0; i < ni; i++)
 		starpu_tag_remove(TAG(i,iter));
-} 
+}
 
 static int create_task_grid(unsigned iter)
 {

+ 8 - 0
julia/StarPU.jl/Makefile

@@ -0,0 +1,8 @@
+SRCS=src/jlstarpu_task_submit.c src/jlstarpu_simple_functions.c src/jlstarpu_data_handles.c
+CC = gcc
+CFLAGS += $(shell pkg-config --cflags starpu-1.3)
+LDFLAGS += $(shell pkg-config --libs starpu-1.3)
+
+lib/libjlstarpu_c_wrapper.so: ${SRCS}
+	test -d lib || mkdir lib
+	$(CC) -O3 -shared -fPIC $(CFLAGS) $^ -o $@ $(LDFLAGS)

+ 31 - 15
julia/StarPU.jl/src/StarPU.jl

@@ -12,14 +12,14 @@ export STARPU_CUDA
 const  STARPU_CPU = 1 << 1
 const  STARPU_CUDA = 1 << 3
 
-global starpu_task_library_name="libjlstarpu_c_wrapper"
+const starpu_task_library_name="libjlstarpu_c_wrapper.so"
 global starpu_tasks_library_handle = C_NULL
 global starpu_target=STARPU_CPU
 
 include("compiler/include.jl")
 
 macro starpufunc(symbol)
-    :($symbol, "libjlstarpu_c_wrapper")
+    :($symbol, starpu_task_library_name)
 end
 
 """
@@ -27,7 +27,7 @@ end
     Works as ccall function
 """
 macro starpucall(func, ret_type, arg_types, args...)
-    return Expr(:call, :ccall, (func, "libjlstarpu_c_wrapper"), esc(ret_type), esc(arg_types), map(esc, args)...)
+    return Expr(:call, :ccall, (func, starpu_task_library_name), esc(ret_type), esc(arg_types), map(esc, args)...)
 end
 
 export @debugprint
@@ -103,13 +103,14 @@ export StarpuCodelet
 
 const jlstarpu_allocated_structures = Vector{Ptr{Cvoid}}([])
 @enum(StarpuPerfmodelType,
-    STARPU_PERFMODEL_INVALID = 0,
-	STARPU_PER_ARCH = 1,
-	STARPU_COMMON = 2,
-	STARPU_HISTORY_BASED = 3,
-	STARPU_REGRESSION_BASED = 4,
-	STARPU_NL_REGRESSION_BASED = 5,
-	STARPU_MULTIPLE_REGRESSION_BASED = 6
+      STARPU_PERFMODEL_INVALID = 0,
+      STARPU_PER_WORKER = 1,
+      STARPU_PER_ARCH = 2,
+      STARPU_COMMON = 3,
+      STARPU_HISTORY_BASED = 4,
+      STARPU_REGRESSION_BASED = 5,
+      STARPU_NL_REGRESSION_BASED = 6,
+      STARPU_MULTIPLE_REGRESSION_BASED = 7
 )
 mutable struct StarpuPerfmodel_c
 
@@ -117,6 +118,7 @@ mutable struct StarpuPerfmodel_c
 
     cost_function :: Ptr{Cvoid}
     arch_cost_function :: Ptr{Cvoid}
+    worker_cost_function :: Ptr{Cvoid}
 
     size_base :: Ptr{Cvoid}
     footprint :: Ptr{Cvoid}
@@ -583,8 +585,9 @@ function starpu_init()
             print(k,">>>>",CPU_CODELETS[k],"\n")
         end
     else
-        system("make generated_tasks.dylib")
-        global starpu_tasks_library_handle=Libdl.dlopen("generated_tasks")
+        @debugprint "generating codelet library"
+        run(`make generated_tasks.so`);
+        global starpu_tasks_library_handle=Libdl.dlopen("generated_tasks.so")
     end
     output = @starpucall jlstarpu_init Cint ()
 
@@ -798,15 +801,28 @@ function starpu_task_submit(task :: StarpuTask)
     @starpucall starpu_task_submit Cint (Ptr{Cvoid},) task.c_task
 end
 
+
+function starpu_modes(x :: Symbol)
+    if (x == Symbol("STARPU_RW"))
+        return STARPU_RW
+    elseif (x == Symbol("STARPU_R"))
+        return STARPU_R
+    else return STARPU_W
+    end
+end
+
 """
     Creates and submits an asynchronous task running cl Codelet function.
     Ex : @starpu_async_cl cl(handle1, handle2)
 """
-macro starpu_async_cl(expr)
+macro starpu_async_cl(expr,modes)
 
     if (!isa(expr, Expr) || expr.head != :call)
         error("Invalid task submit syntax")
     end
+    if (!isa(expr, Expr)||modes.head != :vect)
+        error("Invalid task submit syntax")
+    end
     perfmodel = StarpuPerfmodel(
         perf_type = STARPU_HISTORY_BASED,
         symbol = "history_perf"
@@ -817,7 +833,7 @@ macro starpu_async_cl(expr)
         #cuda_func = "matrix_mult",
         #opencl_func="ocl_matrix_mult",
         ### TODO: CORRECT !
-        modes = [STARPU_R, STARPU_R, STARPU_W],
+        modes = map((x -> starpu_modes(x)),modes.args),
         perfmodel = perfmodel
     )
     handles = Expr(:vect, expr.args[2:end]...)
@@ -1216,7 +1232,7 @@ macro starpu_noparam_function(func_name, ret_type)
 
     quote
         export $func
-        global $func() = ccall(($func_name, "libjlstarpu_c_wrapper"),
+        global $func() = ccall(($func_name, starpu_task_library_name),
                                 $ret_type, ()) :: $ret_type
     end
 end
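
Note on the StarPU.jl change above: @starpu_async_cl now takes a second argument, a vector of access-mode symbols, which the new starpu_modes helper maps onto the STARPU_R/STARPU_W/STARPU_RW constants (the previously hard-coded [STARPU_R, STARPU_R, STARPU_W] list is gone). A minimal usage sketch of the new call form, modelled on the mandelbrot.jl example further down; the codelet name kernel and the handles hA, hP are placeholders, not part of this commit:

    # assumes a @codelet function named kernel and two data handles obtained
    # from starpu_data_register inside a @starpu_block (placeholder names)
    @starpu_async_cl kernel(hA, hP) [STARPU_W, STARPU_R]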

+ 22 - 0
julia/StarPU.jl/src/compiler/expressions.jl

@@ -93,6 +93,8 @@ end
 struct StarpuExprReturn <: StarpuExpr
     value :: StarpuExpr
 end
+struct StarpuExprBreak <: StarpuExpr
+end
 struct StarpuExprVar <: StarpuExpr
     name :: Symbol
 end
@@ -717,6 +719,26 @@ function apply(func :: Function, expr :: StarpuExprRef)
 end
 
 #======================================================
+                BREAK EXPRESSION
+======================================================#
+
+function starpu_parse_break(x :: Expr)
+    if (x.head != :break)
+        error("Invalid \"break\" expression")
+    end
+
+    return StarpuExprBreak()
+end
+
+function print(io :: IO, x :: StarpuExprBreak ; indent = 0)
+    print(io, "break")
+end
+
+function apply(func :: Function, expr :: StarpuExprBreak)
+
+    return func(StarpuExprBreak())
+end
+#======================================================
                 RETURN EXPRESSION
 ======================================================#
 

+ 2 - 0
julia/StarPU.jl/src/compiler/file_generation.jl

@@ -10,6 +10,7 @@ global generated_cpu_kernel_file_name = "PRINT TO STDOUT"
 const cpu_kernel_file_start = "#include <stdio.h>
 #include <stdint.h>
 #include <starpu.h>
+#include <math.h>
 
 static inline long long jlstarpu_max(long long a, long long b)
 {
@@ -30,6 +31,7 @@ static inline long long jlstarpu_interval_size(long long start, long long step,
 const cuda_kernel_file_start = "#include <stdio.h>
 #include <stdint.h>
 #include <starpu.h>
+#include <math.h>
 
 #define THREADS_PER_BLOCK 64
 

+ 1 - 1
julia/StarPU.jl/src/compiler/parsing.jl

@@ -32,7 +32,7 @@ function starpu_parse(x :: Expr)
 
 end
 
-for kw in (:if, :call, :for, :block, :return, :function, :while, :ref)
+for kw in (:if, :call, :for, :block, :return, :function, :while, :ref, :break)
     starpu_parse_key_word_parsing_function[kw] = eval(Symbol(:starpu_parse_, kw))
 end
 

+ 56 - 0
julia/mandelbrot/cpu_mandelbrot.c

@@ -0,0 +1,56 @@
+#include <stdio.h>
+#include <starpu.h>
+#include <math.h>
+
+void cpu_mandelbrot(void *descr[], void *cl_arg)
+{
+        long long int *pixels;
+	float *params;
+
+        pixels = (long long int *)STARPU_MATRIX_GET_PTR(descr[0]);
+	params = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+
+        int width = STARPU_MATRIX_GET_NX(descr[0]);
+        int height = STARPU_MATRIX_GET_NY(descr[0]);
+        
+        int ldP = STARPU_MATRIX_GET_LD(descr[0]);
+
+        float centerr = params[0];
+        float centeri = params[1];
+        float offset = params[2];
+        float dim = params[3];
+        float zoom = width * 0.25296875;
+        float diverge = 4.0;
+        int max_iter = (width/2) * 0.049715909 * log10(zoom);
+
+        int x,y,n;
+
+        for (y = 0; y < height; y++){
+                for (x = 0; x < width; x++){
+                        float cr = centerr + (x - (dim/2))/zoom;
+                        float ci = centeri + (y+offset - (dim/2))/zoom;
+                        float zr = cr;
+                        float zi = ci;
+                        
+                        for (n = 0; n <= max_iter; n++) {
+				if (zr*zr + zi*zi>diverge) break;
+                                float tmp = zr*zr - zi*zi + cr;
+                                zi = 2*zr*zi + ci;
+                                zr = tmp;
+                        }
+			int color;
+			if (n<max_iter)
+				color = round(15.*n/max_iter);
+			else
+				color = 0;
+			pixels[x*ldP + y] = color;
+		}
+	}
+}
+
+char* CPU = "cpu_mandelbrot";
+char* GPU = "gpu_mandelbrot";
+extern char *starpu_find_function(char *name, char *device) {
+	if (!strcmp(device,"gpu")) return GPU;
+	return CPU;
+}

+ 35 - 0
julia/mandelbrot/makefile

@@ -0,0 +1,35 @@
+# GCC compiler
+CC=gcc-9
+CFLAGS += -O3 -mavx -mfma -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
+
+LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
+EXTERNLIB=extern_tasks.dylib
+GENERATEDLIB=generated_tasks.dylib
+OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+LIBPATH=${PWD}/../StarPU.jl/lib
+
+all: ${EXTERNLIB} 
+
+mult: mult.c cpu_mult.o #gpu_mult.o 
+	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)	
+
+gpu_mult.o: gpu_mult.cu
+	nvcc -c $(CFLAGS) $^ -o $@
+
+%.o: %.c
+	$(CC) -c $(CFLAGS) $^ -o $@
+
+${EXTERNLIB}: cpu_mandelbrot.o
+	$(CC) -shared -fPIC $(LDFLAGS) $^ -o $@  
+
+gpu_mult.so: gpu_mult.o
+	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+
+${GENERATEDLIB}: ${OBJECTS}
+	$(CC) -shared -fPIC $(LDFLAGS) $^ -o $@
+
+clean:
+	rm *.so *.o *.dylib c_*.genc gencuda_*.cu *.dat
+
+
+

+ 99 - 15
julia/mandelbrot/mandelbrot.jl

@@ -1,30 +1,114 @@
-function mandelbrotjl(pixels ::Matrix{Int64}, centerr ::Float64, centeri ::Float64)
-    height,width = size(pixels)
-    zoom = width * 0.25296875
-    val_diverge = 2.0
-    max_iterations = (width/2) * 0.049715909 * log10(zoom);
+import Libdl
+using StarPU
+using LinearAlgebra
 
-
-    for y = 1:height
+@target STARPU_CPU+STARPU_CUDA
+@codelet function mandelbrot(pixels ::Matrix{Int64}, params ::Matrix{Float32} ) :: Float32
+    height :: Int64 = height(pixels)
+    width :: Int64 = width(pixels)
+    zoom :: Float64 = width * 0.25296875
+    iz :: Float64 = 1. / zoom
+    diverge :: Float32 = 4.0
+    max_iterations :: Float32 = ((width/2) * 0.049715909 * log10(zoom));
+    imi :: Float32 = 1. / max_iterations
+    centerr :: Float32 = params[1,1]
+    centeri :: Float32 = params[2,1]
+    offset :: Float32 = params[3,1]
+    dim :: Float32 = params[4,1]
+    cr :: Float64 = 0.
+    zr :: Float64 = 0.
+    ci :: Float64 = 0.
+    zi :: Float64 = 0.
+    n :: Int64 = 0
+    tmp :: Float64 = 0.
+    @parallel for y = 1:height
         for x = 1:width
-            cr = centerr + (x - (width / 2))/zoom
+            cr = centerr + (x-1 - (dim / 2)) * iz
             zr = cr
-            ci = centeri + (y - (height / 2))/zoom
+            ci = centeri + (y-1+offset - (dim / 2)) * iz
             zi = ci
-
-            n = 0
-            while ((n < max_iterations) && (zr*zr + zi*zi < val_diverge*val_diverge))
+            for n = 0:max_iterations
+                if (zr*zr + zi*zi > diverge)
+                    break
+                end
                 tmp = zr*zr - zi*zi + cr
                 zi = 2*zr*zi + ci
                 zr = tmp
-                n = n+1
             end
             
             if (n < max_iterations)
-                pixels[y,x] = round(255 * n / max_iterations)
+                pixels[y,x] = round(15 * n * imi)
             else
                 pixels[y,x] = 0
             end
         end
     end
-end
+    return 0. :: Float32
+end
+
+@debugprint "starpu_init"
+starpu_init()
+
+function mandelbrot_with_starpu(A ::Matrix{Int64}, params ::Matrix{Float32}, nslicesx ::Int64)
+    horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesx)
+    @starpu_block let
+	hA, hP = starpu_data_register(A,params)
+	starpu_data_partition(hA,horiz)
+        starpu_data_partition(hP,horiz)
+        
+	@starpu_sync_tasks for taskx in (1 : nslicesx)
+                @starpu_async_cl mandelbrot(hA[taskx], hP[taskx]) [STARPU_W, STARPU_R]
+	end
+    end
+end
+
+function pixels2img(pixels ::Matrix{Int64}, width ::Int64, height ::Int64, filename ::String)
+    MAPPING = [[66,30,15],[25,7,26],[9,1,47],[4,4,73],[0,7,100],[12,44,138],[24,82,177],[57,125,209],[134,181,229],[211,236,248],[241,233,191],[248,201,95],[255,170,0],[204,128,0],[153,87,0],[106,52,3]]
+    open(filename, "w") do f
+        write(f, "P3\n$width $height\n255\n")
+        for i = 1:height
+            for j = 1:width
+                write(f,"$(MAPPING[1+pixels[i,j]][1]) $(MAPPING[1+pixels[i,j]][2]) $(MAPPING[1+pixels[i,j]][3]) ")
+            end
+            write(f, "\n")
+        end
+    end
+end
+
+function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
+    tmin=0;
+    
+    pixels ::Matrix{Int64} = zeros(dim, dim)
+    params :: Matrix{Float32} = zeros(4*nslices,1)
+    for i=0:(nslices-1)
+        params[4*i+1,1] = cr
+        params[4*i+2,1] = ci
+        params[4*i+3,1] = i*dim/nslices
+        params[4*i+4,1] = dim
+    end
+    for i = 1:10
+        t = time_ns();
+        mandelbrot_with_starpu(pixels, params, nslices)
+        t = time_ns()-t
+        if (tmin==0 || tmin>t)
+            tmin=t
+        end
+    end
+    pixels2img(pixels,dim,dim,"out$(dim).ppm")
+    return tmin
+end
+
+function display_time(cr ::Float64, ci ::Float64, start_dim ::Int64, step_dim ::Int64, stop_dim ::Int64, nslices ::Int64)
+    for dim in (start_dim : step_dim : stop_dim)
+        res = min_times(cr, ci, dim, nslices)
+        res=res/dim/dim; # time per pixel
+        println("$(dim) $(res)")
+    end
+end
+
+
+display_time(-0.800671,-0.158392,32,32,4096,4)
+
+@debugprint "starpu_shutdown"
+starpu_shutdown()
+

+ 13 - 14
julia/mult/makefile

@@ -5,19 +5,20 @@ STRIDE=72
 #CC =icc
 #CFLAGS=-restrict -unroll4 -ipo -falign-loops=256 -O3 -DSTRIDE=${STRIDE} -march=native $(shell pkg-config --cflags starpu-1.3)
 # GCC compiler
-CC=gcc-9
-CFLAGS += -O3 -DSTRIDE=${STRIDE} -mavx -mfma -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
+CC=gcc
+CFLAGS += -O3 -DSTRIDE=${STRIDE} -mavx -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
 
 LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
-EXTERNLIB=extern_tasks.dylib
-GENERATEDLIB=generated_tasks.dylib
-OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+EXTERNLIB=extern_tasks.so
+GENERATEDLIB=generated_tasks.so
+#OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+OBJECTS=$(wildcard gen*.c)
 LIBPATH=${PWD}/../StarPU.jl/lib
 
-all: ${EXTERNLIB} 
+all: ${EXTERNLIB}
 
-mult: mult.c cpu_mult.o #gpu_mult.o 
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)	
+mult: mult.c cpu_mult.o #gpu_mult.o
+	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
 
 gpu_mult.o: gpu_mult.cu
 	nvcc -c $(CFLAGS) $^ -o $@
@@ -25,8 +26,8 @@ gpu_mult.o: gpu_mult.cu
 %.o: %.c
 	$(CC) -c $(CFLAGS) $^ -o $@
 
-${EXTERNLIB}: cpu_mult.o
-	$(CC) -shared -fPIC $(LDFLAGS) $^ -o $@  
+${EXTERNLIB}: cpu_mult.c
+	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
 
 gpu_mult.so: gpu_mult.o
 	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
@@ -35,10 +36,10 @@ cpu_mult_sa: cpu_mult_sa.o
 	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
 
 ${GENERATEDLIB}: ${OBJECTS}
-	$(CC) -shared -fPIC $(LDFLAGS) $^ -o $@
+	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
 
 clean:
-	rm *.so *.o *.dylib c_*.genc gencuda_*.cu *.dat
+	rm -f mult *.so *.o c_*.genc gencuda_*.cu *.dat
 
 # Performance Tests
 cstarpu.dat: mult
@@ -51,5 +52,3 @@ julia_calllib.dat: ${EXTERNLIB}
 	LD_LIBRARY_PATH+=${LIBPATH} JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult.jl julia_calllib.dat
 
 test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat
-
-

+ 90 - 0
julia/mult/cpu_mult.c

@@ -0,0 +1,90 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2018                                     Alexis Juven
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <starpu.h>
+/*
+ * The codelet is passed 3 matrices, the "descr" union-type field gives a
+ * description of the layout of those 3 matrices in the local memory (ie. RAM
+ * in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
+ * registered data with the "matrix" data interface, we use the matrix macros.
+ */
+void cpu_mult(void *descr[], void *arg)
+{
+	(void)arg;
+	float *subA, *subB, *subC;
+	/* .blas.ptr gives a pointer to the first element of the local copy */
+	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+
+	/* .blas.nx is the number of rows (consecutive elements) and .blas.ny
+	 * is the number of lines that are separated by .blas.ld elements (ld
+	 * stands for leading dimension).
+	 * NB: in case some filters were used, the leading dimension is not
+	 * guaranteed to be the same in main memory (on the original matrix)
+	 * and on the accelerator! */
+	const uint32_t nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	const uint32_t nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	const uint32_t nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	const uint32_t ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	const uint32_t ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	const uint32_t ldC = STARPU_MATRIX_GET_LD(descr[2]);
+	/* we assume a FORTRAN-ordering! */
+	int i,j,k,ii,jj,kk;
+	for (i = 0; i < nyC*nxC; i++) subC[i] = 0;
+	//fprintf(stderr,"inside cpu_mult %dx%dx%d %d/%d on %d\n",nyC,nyA,nxC,starpu_worker_get_id(),STARPU_NMAXWORKERS,starpu_worker_get_devid(starpu_worker_get_id()));
+	for (i=0;i<nyC;i+=STRIDE) {
+		for (k=0;k<nyA;k+=STRIDE) {
+			for (j=0;j<nxC;j+=STRIDE) {
+				
+				for (ii = i; ii < i+STRIDE; ii+=2) {
+					float *sC0=subC+ii*ldC+j;
+					float *sC1=subC+ii*ldC+ldC+j;
+					for (kk = k; kk < k+STRIDE; kk+=4) {
+						float alpha00=subB[kk +  ii*ldB];
+						float alpha01=subB[kk+1+ii*ldB];
+						float alpha10=subB[kk+  ii*ldB+ldB];
+						float alpha11=subB[kk+1+ii*ldB+ldB];
+						float alpha02=subB[kk+2+ii*ldB];
+						float alpha03=subB[kk+3+ii*ldB];
+						float alpha12=subB[kk+2+ ii*ldB+ldB];
+						float alpha13=subB[kk+3+ii*ldB+ldB];
+						float *sA0=subA+kk*ldA+j;
+						float *sA1=subA+kk*ldA+ldA+j;
+						float *sA2=subA+kk*ldA+2*ldA+j;
+						float *sA3=subA+kk*ldA+3*ldA+j;
+						for (jj = 0; jj < STRIDE; jj+=1) {
+							sC0[jj] += alpha00*sA0[jj]+alpha01*sA1[jj]+alpha02*sA2[jj]+alpha03*sA3[jj];
+							sC1[jj] += alpha10*sA0[jj]+alpha11*sA1[jj]+alpha12*sA2[jj]+alpha13*sA3[jj];
+						}
+					}
+				}
+			}
+		}
+	}
+	//fprintf(stderr,"inside cpu_mult %dx%dx%d\n",nyC,nyA,nxC);
+
+}
+char* CPU = "cpu_mult";
+char* GPU = "gpu_mult";
+extern char *starpu_find_function(char *name, char *device) {
+	if (!strcmp(device,"gpu")) return GPU;
+	return CPU;
+}

+ 84 - 0
julia/mult/gpu_mult.cu

@@ -0,0 +1,84 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2018                                     Alexis Juven
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+extern "C" {
+#include <starpu_cuda.h>
+}
+
+#include <stdint.h>
+#include <stdio.h>
+
+
+__global__ void gpuMultKernel
+(
+		uint32_t nxC, uint32_t nyC, uint32_t nyA,
+		uint32_t ldA, uint32_t ldB, uint32_t ldC,
+		float * subA, float * subB, float * subC
+)
+{
+	uint32_t id, i, j, k;
+	float sum;
+
+	id = blockIdx.x * blockDim.x + threadIdx.x;
+	i = id % nxC;
+	j = id / nxC;
+
+	if (j >= nyC){
+		return;
+	}
+
+	sum = 0.;
+
+	for (k = 0 ; k < nyA ; k++){
+		sum += subA[i + k*ldA] * subB[k + j*ldB];
+	}
+
+	subC[i + j*ldC] = sum;
+
+}
+
+
+
+#define THREADS_PER_BLOCK 64
+extern "C" void gpu_mult(void * descr[], void * args)
+{
+
+	float * d_subA, * d_subB, * d_subC;
+	uint32_t nxC, nyC, nyA;
+	uint32_t ldA, ldB, ldC;
+	uint32_t nblocks;
+
+	d_subA = (float *) STARPU_MATRIX_GET_PTR(descr[0]);
+	d_subB = (float *) STARPU_MATRIX_GET_PTR(descr[1]);
+	d_subC = (float *) STARPU_MATRIX_GET_PTR(descr[2]);
+
+	nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	ldC = STARPU_MATRIX_GET_LD(descr[2]);
+
+	nblocks = (nxC * nyC + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
+
+	gpuMultKernel
+		<<< nblocks, THREADS_PER_BLOCK, 0, NULL /*starpu_cuda_get_local_stream()*/
+		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+}

+ 6 - 2
mpi/src/nmad/starpu_mpi_nmad.c

@@ -67,8 +67,8 @@ static volatile int pending_request = 0;
 
 #define REQ_FINALIZED 0x1
 
-PUK_LFSTACK_TYPE(callback,	struct _starpu_mpi_req *req;);
-static callback_lfstack_t callback_stack = NULL;
+PUK_LFSTACK_TYPE(callback, struct _starpu_mpi_req *req;);
+static callback_lfstack_t callback_stack;
 
 static starpu_sem_t callback_sem;
 
@@ -594,6 +594,8 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
 	}
 
+	callback_lfstack_init(&callback_stack);
+
 	/* Tell pioman to use a bound thread for communication progression:
 	 * share the same core as StarPU's MPI thread, the MPI thread has very low activity with NMAD backend */
 	int indexes[1] = { _starpu_mpi_thread_cpuid };
@@ -663,6 +665,8 @@ void _starpu_mpi_progress_shutdown(void **value)
 
 	STARPU_PTHREAD_JOIN(progress_thread, value);
 
+	callback_lfstack_destroy(&callback_stack);
+
         STARPU_PTHREAD_MUTEX_DESTROY(&progress_mutex);
         STARPU_PTHREAD_COND_DESTROY(&progress_cond);
 }

+ 10 - 4
mpi/tests/Makefile.am

@@ -58,9 +58,11 @@ BUILT_SOURCES =
 
 CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log
 
-EXTRA_DIST = 					\
-	user_defined_datatype_value.h		\
-	helper.h
+EXTRA_DIST = 				\
+	abstract_sendrecv_bench.h	\
+	bench_helper.h			\
+	helper.h			\
+	user_defined_datatype_value.h
 
 examplebindir = $(libdir)/starpu/examples/mpi
 
@@ -138,9 +140,13 @@ starpu_mpi_TESTS +=				\
 	user_defined_datatype			\
 	early_stuff				\
 	sendrecv_bench				\
-	sendrecv_gemm_bench			\
 	sendrecv_parallel_tasks_bench
 
+if !NO_BLAS_LIB
+starpu_mpi_TESTS +=				\
+	sendrecv_gemm_bench
+endif
+
 if !STARPU_SIMGRID
 # missing support in simgrid
 starpu_mpi_TESTS +=				\

+ 8 - 1
mpi/tests/sendrecv_gemm_bench.c

@@ -320,7 +320,13 @@ static void* comm_thread_func(void* arg)
 	return NULL;
 }
 
-
+#ifdef STARPU_USE_MPI_MPI
+int main(int argc, char **argv)
+{
+	FPRINTF(stderr, "This test does not work with the MPI backend.\n");
+	return STARPU_TEST_SKIPPED;
+}
+#else
 int main(int argc, char **argv)
 {
 	double start, end;
@@ -461,3 +467,4 @@ enodev:
 
 	return ret;
 }
+#endif

+ 4 - 0
mpi/tests/sendrecv_parallel_tasks_bench.c

@@ -32,7 +32,11 @@
 
 /* We reduce NX_MAX, since some NICs don't support exchanging simultaneously such amount of memory */
 #undef NX_MAX
+#ifdef STARPU_QUICK_CHECK
+#define NX_MAX (1024)
+#else
 #define NX_MAX (64 * 1024 * 1024)
+#endif
 
 
 void cpu_task(void* descr[], void* args)

+ 2 - 1
src/common/barrier.c

@@ -50,7 +50,8 @@ int _starpu_barrier_test(struct _starpu_barrier *barrier)
 int _starpu_barrier_destroy(struct _starpu_barrier *barrier)
 {
 	int ret;
-	do {
+	do
+	{
 		ret = _starpu_barrier_test(barrier);
 	}
 	while (ret == EBUSY);

+ 4 - 2
src/core/dependencies/data_concurrency.c

@@ -402,7 +402,8 @@ void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _sta
 {
 	unsigned buf;
 
-	if (j->task->cl) {
+	if (j->task->cl)
+	{
 		unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(j->task);
 
 		for (buf = 0; buf < nbuffers; buf++)
@@ -415,7 +416,8 @@ void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _sta
 
 		/* We need to check data availability only if sequential consistency
 		 * dependencies have not been used */
-		if (!j->sequential_consistency) {
+		if (!j->sequential_consistency)
+		{
 			for (buf = 0; buf < nbuffers; buf++)
 			{
 				starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(j->task, buf);

+ 2 - 1
src/core/dependencies/dependencies.c

@@ -41,7 +41,8 @@ void _starpu_notify_dependencies(struct _starpu_job *j)
 static starpu_notify_ready_soon_func notify_ready_soon_func;
 static void *notify_ready_soon_func_data;
 
-struct _starpu_notify_job_start_data {
+struct _starpu_notify_job_start_data
+{
 	double delay;
 };
 

+ 4 - 2
src/core/dependencies/implicit_data_deps.c

@@ -234,11 +234,13 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 
 		/* Skip tasks that are associated to a reduction phase so that
 		 * they do not interfere with the application. */
-		if (pre_sync_job->reduction_task) {
+		if (pre_sync_job->reduction_task)
+		{
 			*submit_pre_sync = 1;
 			return NULL;
 		}
-		if (post_sync_job->reduction_task) {
+		if (post_sync_job->reduction_task)
+		{
 			*submit_pre_sync = 0;
 			return NULL;
 		}

+ 6 - 3
src/core/perfmodel/perfmodel_history.c

@@ -1026,7 +1026,8 @@ void starpu_perfmodel_dump_xml(FILE *f, struct starpu_perfmodel *model)
 		for(dev = 0; dev < ndevices; dev++)
 		{
 			const char *type;
-			switch (arch_combs[comb]->devices[dev].type) {
+			switch (arch_combs[comb]->devices[dev].type)
+			{
 				case STARPU_CPU_WORKER: type = "CPU"; break;
 				case STARPU_CUDA_WORKER: type = "CUDA"; break;
 				case STARPU_OPENCL_WORKER: type = "OpenCL"; break;
@@ -1421,7 +1422,8 @@ int starpu_perfmodel_list(FILE *output)
 	else
 	{
 		int i;
-		for (i = 0; i < n; i++) {
+		for (i = 0; i < n; i++)
+		{
 			if (strcmp(list[i]->d_name, ".") && strcmp(list[i]->d_name, ".."))
 				fprintf(output, "file: <%s>\n", list[i]->d_name);
 			free(list[i]);
@@ -1772,7 +1774,8 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, s
 
 docal:
 #ifdef STARPU_SIMGRID
-	if (isnan(exp)) {
+	if (isnan(exp))
+	{
 		char archname[STR_SHORT_LENGTH];
 		starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
 

+ 16 - 7
src/core/topology.c

@@ -300,13 +300,15 @@ int _starpu_task_data_get_node_on_node(struct starpu_task *task, unsigned index,
 	int node = STARPU_SPECIFIC_NODE_LOCAL;
 	if (task->cl->specific_nodes)
 		node = STARPU_CODELET_GET_NODE(task->cl, index);
-	switch (node) {
+	switch (node)
+	{
 	case STARPU_SPECIFIC_NODE_LOCAL:
 		// TODO: rather find MCDRAM
 		node = local_node;
 		break;
 	case STARPU_SPECIFIC_NODE_CPU:
-		switch (starpu_node_get_kind(local_node)) {
+		switch (starpu_node_get_kind(local_node))
+		{
 		case STARPU_CPU_RAM:
 			node = local_node;
 			break;
@@ -321,10 +323,13 @@ int _starpu_task_data_get_node_on_node(struct starpu_task *task, unsigned index,
 		node = local_node;
 		break;
 	case STARPU_SPECIFIC_NODE_LOCAL_OR_CPU:
-		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID) {
+		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID)
+		{
 			/* It is here already, rather access it from here */
 			node = local_node;
-		} else {
+		}
+		else
+		{
 			/* It is not here already, do not bother moving it */
 			node = STARPU_MAIN_RAM;
 		}
@@ -339,7 +344,8 @@ int _starpu_task_data_get_node_on_worker(struct starpu_task *task, unsigned inde
 	int node = STARPU_SPECIFIC_NODE_LOCAL;
 	if (task->cl->specific_nodes)
 		node = STARPU_CODELET_GET_NODE(task->cl, index);
-	switch (node) {
+	switch (node)
+	{
 	case STARPU_SPECIFIC_NODE_LOCAL:
 		// TODO: rather find MCDRAM
 		node = local_node;
@@ -354,10 +360,13 @@ int _starpu_task_data_get_node_on_worker(struct starpu_task *task, unsigned inde
 		node = local_node;
 		break;
 	case STARPU_SPECIFIC_NODE_LOCAL_OR_CPU:
-		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID) {
+		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID)
+		{
 			/* It is here already, rather access it from here */
 			node = local_node;
-		} else {
+		}
+		else
+		{
 			/* It is not here already, do not bother moving it */
 			node = STARPU_MAIN_RAM;
 		}

+ 3 - 3
src/core/workers.c

@@ -434,7 +434,8 @@ static inline int _starpu_can_use_nth_implementation(enum starpu_worker_archtype
 
 /* Test if this task can be processed on this worker, regardless of the implementation */
 /* must be called with sched_mutex locked to protect state_blocked */
-static inline int _starpu_can_execute_task_any_impl(unsigned workerid, struct starpu_task *task) {
+static inline int _starpu_can_execute_task_any_impl(unsigned workerid, struct starpu_task *task)
+{
 
 	if (!_starpu_config.workers[workerid].enable_knob)
 		return 0;
@@ -446,7 +447,6 @@ static inline int _starpu_can_execute_task_any_impl(unsigned workerid, struct st
 			return 0;
 	}
 
-	
 	/* if the worker is blocked in a parallel ctx don't submit tasks on it */
 #ifdef STARPU_DEVEL
 #warning FIXME: this is very expensive, while can_execute is supposed to be not very costly so schedulers can call it a lot
@@ -457,7 +457,7 @@ static inline int _starpu_can_execute_task_any_impl(unsigned workerid, struct st
 	if (!(task->where & _starpu_config.workers[workerid].worker_mask))
 		return 0;
 
-	return 1; 
+	return 1;
 }
 
 /* must be called with sched_mutex locked to protect state_blocked_in_parallel */

+ 2 - 1
src/datawizard/coherency.c

@@ -215,7 +215,8 @@ void _starpu_update_data_state(starpu_data_handle_t handle,
 			for (node = 0; node < nnodes; node++)
 			{
 				struct _starpu_data_replicate *replicate = &handle->per_node[node];
-                               if (replicate->state != STARPU_INVALID){
+                               if (replicate->state != STARPU_INVALID)
+			       {
                                        _STARPU_TRACE_DATA_STATE_SHARED(handle, node);
 					replicate->state = STARPU_SHARED;
                                }

+ 2 - 1
src/datawizard/interfaces/matrix_interface.c

@@ -213,7 +213,8 @@ static void display_matrix_interface(starpu_data_handle_t handle, FILE *f)
 
 //#define DYNAMIC_MATRICES
 
-struct pack_matrix_header {
+struct pack_matrix_header
+{
 #ifdef DYNAMIC_MATRICES
 	/* Receiving matrices with different sizes from MPI */
 	/* FIXME: that would break alignment for O_DIRECT disk access...

+ 32 - 23
src/debug/traces/starpu_fxt.c

@@ -343,7 +343,8 @@ static void handle_papi_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *o
 	long long int value = ev->param[2];
 	//char *prefix = options->file_prefix;
 
-	if (papi_file){
+	if (papi_file)
+	{
 		char event_str[PAPI_MAX_STR_LEN];
 		PAPI_event_code_to_name(event_code, event_str);
 		fprintf(papi_file, "JobId: %lu\n", task);
@@ -2470,47 +2471,55 @@ static void handle_memnode_event(struct fxt_ev_64 *ev, struct starpu_fxt_options
 		memnode_set_state(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr);
 }
 
-static void handle_memnode_event_start_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
+static void handle_memnode_event_start_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
+{
 	unsigned memnode = ev->param[0];
-       unsigned size = ev->param[2];
-       unsigned long handle = ev->param[3];
+	unsigned size = ev->param[2];
+	unsigned long handle = ev->param[3];
 
 	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, size, memnode, options);
 }
 
-static void handle_memnode_event_start_4(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
-       unsigned memnode = ev->param[0];
-       unsigned dest = ev->param[1];
-       if(strcmp(eventstr, "rc")==0){
-               //If it is a Request Create, use dest normally
-       }else{
-               dest = memnode;
-       }
-       unsigned size = ev->param[2];
-       unsigned long handle = ev->param[3];
-       unsigned prefe = ev->param[4];
+static void handle_memnode_event_start_4(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
+{
+	unsigned memnode = ev->param[0];
+	unsigned dest = ev->param[1];
+	if(strcmp(eventstr, "rc")==0)
+	{
+		//If it is a Request Create, use dest normally
+	}
+	else
+	{
+		dest = memnode;
+	}
+	unsigned size = ev->param[2];
+	unsigned long handle = ev->param[3];
+	unsigned prefe = ev->param[4];
 
-       memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, prefe, size, dest, options);
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, prefe, size, dest, options);
 }
 
-static void handle_memnode_event_end_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
+static void handle_memnode_event_end_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
+{
 	unsigned memnode = ev->param[0];
-       unsigned long handle = ev->param[2];
-       unsigned info = ev->param[3];
+	unsigned long handle = ev->param[2];
+	unsigned info = ev->param[3];
 
 	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, info, 0, memnode, options);
 }
 
-static void handle_memnode_event_start_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
+static void handle_memnode_event_start_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
+{
 	unsigned memnode = ev->param[0];
-       unsigned long handle = ev->param[2];
+	unsigned long handle = ev->param[2];
 
 	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, memnode, options);
 }
 
-static void handle_memnode_event_end_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
+static void handle_memnode_event_end_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
+{
 	unsigned memnode = ev->param[0];
-       unsigned long handle = ev->param[2];
+	unsigned long handle = ev->param[2];
 
 	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, memnode, options);
 }

+ 16 - 16
src/sched_policies/component_best_implementation.c

@@ -38,26 +38,26 @@ static int find_best_impl(unsigned sched_ctx_id, struct starpu_task * task, int
 		len = 0.0;
 	}
 	else
-	{	
-	    struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
-	    for(impl = 0; impl < STARPU_MAXIMPLEMENTATIONS; impl++)
-	    {
-		if(starpu_worker_can_execute_task(workerid, task, impl))
+	{
+		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
+		for(impl = 0; impl < STARPU_MAXIMPLEMENTATIONS; impl++)
 		{
-			double d = starpu_task_expected_length(task, archtype, impl);
-			if(isnan(d))
-			{
-				best_impl = impl;
-				len = 0.0;
-				break;
-			}
-			if(d < len)
+			if(starpu_worker_can_execute_task(workerid, task, impl))
 			{
-				len = d;
-				best_impl = impl;
+				double d = starpu_task_expected_length(task, archtype, impl);
+				if(isnan(d))
+				{
+					best_impl = impl;
+					len = 0.0;
+					break;
+				}
+				if(d < len)
+				{
+					len = d;
+					best_impl = impl;
+				}
 			}
 		}
-	    }
 	}
 	if(best_impl == -1)
 		return 0;

+ 21 - 10
src/sched_policies/component_heteroprio.c

@@ -206,7 +206,8 @@ out:
 	//fprintf(stderr, "could not push %p to %d actually\n", task, best_icomponent);
 	/* Could not push to child actually, push that one back */
 	STARPU_COMPONENT_MUTEX_LOCK(mutex);
-	for (j = 0; j < (int) data->naccel; j++) {
+	for (j = 0; j < (int) data->naccel; j++)
+	{
 		if (acceleration == data->accel[j])
 		{
 			_starpu_prio_deque_push_front_task(data->bucket[j], task);
@@ -305,7 +306,8 @@ static int heteroprio_progress_one(struct starpu_sched_component *component)
 	task = _starpu_prio_deque_pop_task(no_accel);
 	STARPU_COMPONENT_MUTEX_UNLOCK(mutex);
 
-	if (task) {
+	if (task)
+	{
 		if (heteroprio_progress_noaccel(component, data, task))
 		{
 			/* Could not push to child actually, push that one back */
@@ -388,7 +390,8 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 			max_expected = min_arch;
 	}
 
-	if (workerid == -1) {
+	if (workerid == -1)
+	{
 		/* All archs can run it */
 		STARPU_ASSERT(!isnan(min_expected));
 		STARPU_ASSERT(!isnan(max_expected));
@@ -402,13 +405,15 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 		STARPU_COMPONENT_MUTEX_LOCK(mutex);
 		unsigned i, j;
 		/* Try to find a bucket with similar acceleration */
-		for (i = 0; i < data->naccel; i++) {
+		for (i = 0; i < data->naccel; i++)
+		{
 			if (acceleration >= data->accel[i] * (1 - APPROX) &&
 			    acceleration <= data->accel[i] * (1 + APPROX))
 				break;
 		}
 
-		if (i == data->naccel) {
+		if (i == data->naccel)
+		{
 			/* Didn't find it, add one */
 			data->naccel++;
 
@@ -418,8 +423,10 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 			_starpu_prio_deque_init(newbucket);
 			int inserted = 0;
 
-			for (j = 0; j < data->naccel-1; j++) {
-				if (!inserted && acceleration > data->accel[j]) {
+			for (j = 0; j < data->naccel-1; j++)
+			{
+				if (!inserted && acceleration > data->accel[j])
+				{
 					/* Insert the new bucket here */
 					i = j;
 					newbuckets[j] = newbucket;
@@ -429,7 +436,8 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 				newbuckets[j+inserted] = data->bucket[j];
 				newaccel[j+inserted] = data->accel[j];
 			}
-			if (!inserted) {
+			if (!inserted)
+			{
 				/* Insert it last */
 				newbuckets[data->naccel-1] = newbucket;
 				newaccel[data->naccel-1] = acceleration;
@@ -441,14 +449,17 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 		}
 #if 0
 		fprintf(stderr,"buckets:");
-		for (j = 0; j < data->naccel; j++) {
+		for (j = 0; j < data->naccel; j++)
+		{
 			fprintf(stderr, " %f", data->accel[j]);
 		}
 		fprintf(stderr,"\ninserting %p %f to %d\n", task, acceleration, i);
 #endif
 		_starpu_prio_deque_push_back_task(data->bucket[i],task);
 		STARPU_COMPONENT_MUTEX_UNLOCK(mutex);
-	} else {
+	}
+	else
+	{
 		/* Not all archs can run it, will resort to HEFT strategy */
 		acceleration = INFINITY;
 		//fprintf(stderr,"%s: some archs can't do it\n", starpu_task_get_name(task));

+ 4 - 2
src/sched_policies/component_sched.c

@@ -284,8 +284,10 @@ void _starpu_sched_component_update_workers_in_ctx(struct starpu_sched_component
 	struct starpu_bitmap * workers_in_ctx = _starpu_get_worker_mask(sched_ctx_id);
 	starpu_bitmap_unset_and(component->workers_in_ctx,component->workers, workers_in_ctx);
 	unsigned i,j;
-	for(i = starpu_worker_get_count(); i < starpu_worker_get_count() + starpu_combined_worker_get_count(); i++) {
-		if (starpu_bitmap_get(component->workers, i)) {
+	for(i = starpu_worker_get_count(); i < starpu_worker_get_count() + starpu_combined_worker_get_count(); i++)
+	{
+		if (starpu_bitmap_get(component->workers, i))
+		{
 			/* Component has this combined worker, check whether the
 			 * context has all the corresponding workers */
 			int worker_size;

+ 2 - 1
src/sched_policies/component_work_stealing.c

@@ -240,7 +240,8 @@ static int push_task(struct starpu_sched_component * component, struct starpu_ta
 
 	/* Find a child component that can execute this task */
 	i = (i+1)%component->nchildren;
-	while(1) {
+	while(1)
+	{
 		int workerid;
 		for(workerid = starpu_bitmap_first(component->children[i]->workers_in_ctx);
 		    -1 != workerid;

+ 2 - 1
src/sched_policies/component_worker.c

@@ -149,7 +149,8 @@ struct _starpu_worker_component_data
 	union
 	{
 		struct _starpu_worker * worker;
-		struct {
+		struct
+		{
 			unsigned worker_size;
 			unsigned workerids[STARPU_NMAXWORKERS];
 		} parallel_worker;

+ 1 - 1
src/sched_policies/heteroprio.c

@@ -524,7 +524,7 @@ static struct starpu_task *pop_task_heteroprio_policy(unsigned sched_ctx_id)
 				nb_added_tasks       += 1;
 				// TODO starpu_prefetch_task_input_for(task, workerid);
 			}
-		}		
+		}
 	}
 
 	struct starpu_task* task = NULL;

+ 19 - 15
src/sched_policies/modular_ez.c

@@ -262,10 +262,13 @@ void starpu_sched_component_initialize_simple_schedulers(unsigned sched_ctx_id,
 		unsigned ntasks_threshold;
 		if (starpu_sched_component_is_heft(decision_component) ||
 		    starpu_sched_component_is_mct(decision_component) ||
-		    starpu_sched_component_is_heteroprio(decision_component)) {
+		    starpu_sched_component_is_heteroprio(decision_component))
+		{
 			/* These need more queueing to allow CPUs to take some share of the work */
 			ntasks_threshold = _STARPU_SCHED_NTASKS_THRESHOLD_HEFT;
-		} else {
+		}
+		else
+		{
 			ntasks_threshold = _STARPU_SCHED_NTASKS_THRESHOLD_DEFAULT;
 		}
 		/* But let user tune it */
@@ -279,20 +282,20 @@ void starpu_sched_component_initialize_simple_schedulers(unsigned sched_ctx_id,
 		int exp = flags & STARPU_SCHED_SIMPLE_FIFOS_BELOW_EXP ? 1 : 0;
 
 		struct starpu_sched_component_prio_data prio_data =
-			{
-				.ntasks_threshold = ntasks_threshold,
-				.exp_len_threshold = exp_len_threshold,
-				.ready = ready,
-				.exp = exp,
-			};
+		{
+			.ntasks_threshold = ntasks_threshold,
+			.exp_len_threshold = exp_len_threshold,
+			.ready = ready,
+			.exp = exp,
+		};
 
 		struct starpu_sched_component_fifo_data fifo_data =
-			{
-				.ntasks_threshold = ntasks_threshold,
-				.exp_len_threshold = exp_len_threshold,
-				.ready = ready,
-				.exp = exp,
-			};
+		{
+			.ntasks_threshold = ntasks_threshold,
+			.exp_len_threshold = exp_len_threshold,
+			.ready = ready,
+			.exp = exp,
+		};
 
 		/* Create one fifo+eager component pair per choice, below scheduling decision */
 		for(i = 0; i < nbelow; i++)
@@ -334,7 +337,8 @@ void starpu_sched_component_initialize_simple_schedulers(unsigned sched_ctx_id,
 					STARPU_ABORT();
 			}
 			STARPU_ASSERT(n >= 1);
-			if (n > 1) {
+			if (n > 1)
+			{
 				/* Several workers for this choice, need to introduce
 				 * a component to distribute the work */
 				struct starpu_sched_component *distribute;

+ 2 - 1
src/sched_policies/modular_gemm.c

@@ -26,7 +26,8 @@
 
 #define MEMORY_AFFINITY
 
-struct child_data {
+struct child_data
+{
 	double expected_start;
 	double predicted;
 	double predicted_transfer;

+ 2 - 1
src/sched_policies/modular_heteroprio_heft.c

@@ -22,7 +22,8 @@
 
 static void initialize_heteroprio_heft_center_policy(unsigned sched_ctx_id)
 {
-	struct starpu_sched_component_heteroprio_data heteroprio_data = {
+	struct starpu_sched_component_heteroprio_data heteroprio_data =
+	{
 		.mct = NULL,
 		.batch = 1,
 	};

+ 72 - 63
src/util/openmp_runtime_support.c

@@ -135,7 +135,9 @@ static void wake_up_and_unlock_task(struct starpu_omp_task *task)
 		weak_task_unlock(task);
 		int ret = starpu_task_submit(task->starpu_task);
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-	} else {
+	}
+	else
+	{
 		weak_task_unlock(task);
 	}
 }
@@ -379,36 +381,37 @@ static void starpu_omp_explicit_task_entry(struct starpu_omp_task *task)
 {
 	STARPU_ASSERT(!(task->flags & STARPU_OMP_TASK_FLAGS_IMPLICIT));
 	struct _starpu_worker *starpu_worker = _starpu_get_local_worker_key();
-   /* XXX on work */
-   if (task->is_loop) {
-      starpu_omp_for_inline_first_alt(task->nb_iterations, task->chunk, starpu_omp_sched_static, 1, &task->begin_i, &task->end_i);
-   }
-   if (starpu_worker->arch == STARPU_CPU_WORKER)
-   {
-      task->cpu_f(task->starpu_buffers, task->starpu_cl_arg);
-   }
+	/* XXX on work */
+	if (task->is_loop)
+	{
+		starpu_omp_for_inline_first_alt(task->nb_iterations, task->chunk, starpu_omp_sched_static, 1, &task->begin_i, &task->end_i);
+	}
+	if (starpu_worker->arch == STARPU_CPU_WORKER)
+	{
+		task->cpu_f(task->starpu_buffers, task->starpu_cl_arg);
+	}
 #ifdef STARPU_USE_CUDA
-   else if (starpu_worker->arch == STARPU_CUDA_WORKER)
-   {
-      task->cuda_f(task->starpu_buffers, task->starpu_cl_arg);
-   }
+	else if (starpu_worker->arch == STARPU_CUDA_WORKER)
+	{
+		task->cuda_f(task->starpu_buffers, task->starpu_cl_arg);
+	}
 #endif
 #ifdef STARPU_USE_OPENCL
-   else if (starpu_worker->arch == STARPU_OPENCL_WORKER)
-   {
-      task->opencl_f(task->starpu_buffers, task->starpu_cl_arg);
-   }
+	else if (starpu_worker->arch == STARPU_OPENCL_WORKER)
+	{
+		task->opencl_f(task->starpu_buffers, task->starpu_cl_arg);
+	}
 #endif
-   else
-      _STARPU_ERROR("invalid worker architecture");
-   /**/
+	else
+		_STARPU_ERROR("invalid worker architecture");
+	/**/
 	_starpu_omp_unregister_task_handles(task);
 	_starpu_spin_lock(&task->lock);
 	task->state = starpu_omp_task_state_terminated;
 	task->transaction_pending=1;
 	_starpu_spin_unlock(&task->lock);
 	struct starpu_omp_thread *thread = _starpu_omp_get_thread();
-	/* 
+	/*
 	 * the task reached the terminated state, definitively give hand back to the worker code.
 	 *
 	 * about to run on the worker stack...
@@ -428,7 +431,7 @@ static void starpu_omp_implicit_task_entry(struct starpu_omp_task *task)
 		_starpu_omp_unregister_region_handles(task->owner_region);
 	}
 	task->state = starpu_omp_task_state_terminated;
-	/* 
+	/*
 	 * the task reached the terminated state, definitively give hand back to the worker code.
 	 *
 	 * about to run on the worker stack...
@@ -447,7 +450,7 @@ static void starpu_omp_task_preempt(void)
 	struct starpu_omp_thread *thread = _starpu_omp_get_thread();
 	task->state = starpu_omp_task_state_preempted;
 
-	/* 
+	/*
 	 * the task reached a blocked state, give hand back to the worker code.
 	 *
 	 * about to run on the worker stack...
@@ -486,7 +489,7 @@ static void starpu_omp_implicit_task_exec(void *buffers[], void *cl_arg)
 
 	task->state = starpu_omp_task_state_clear;
 
-	/* 
+	/*
 	 * start the task execution, or restore a previously preempted task.
 	 * about to run on the task stack...
 	 * */
@@ -655,7 +658,7 @@ static void starpu_omp_explicit_task_exec(void *buffers[], void *cl_arg)
 	}
 	task->state = starpu_omp_task_state_clear;
 
-	/* 
+	/*
 	 * start the task execution, or restore a previously preempted task.
 	 * about to run on the task stack...
 	 * */
@@ -694,11 +697,11 @@ static struct starpu_omp_task *create_omp_task_struct(struct starpu_omp_task *pa
 		task->flags |= STARPU_OMP_TASK_FLAGS_IMPLICIT;
 	}
 	_starpu_spin_init(&task->lock);
-	/* TODO: initialize task->data_env_icvs with proper values */ 
+	/* TODO: initialize task->data_env_icvs with proper values */
 	memset(&task->data_env_icvs, 0, sizeof(task->data_env_icvs));
 	if (is_implicit)
 	{
-	  /* TODO: initialize task->implicit_task_icvs with proper values */ 
+	  /* TODO: initialize task->implicit_task_icvs with proper values */
 		memset(&task->implicit_task_icvs, 0, sizeof(task->implicit_task_icvs));
 	}
 
@@ -1037,7 +1040,7 @@ void starpu_omp_parallel_region(const struct starpu_omp_parallel_region_attr *at
 	struct starpu_omp_task *task = _starpu_omp_get_task();
 	struct starpu_omp_region *generating_region = task->owner_region;
 	const int max_active_levels = generating_region->owner_device->icvs.max_active_levels_var;
-	struct starpu_omp_region *new_region = 
+	struct starpu_omp_region *new_region =
 		create_omp_region_struct(generating_region, _global_state.initial_device);
 	int ret;
 	int nb_threads = 1;
@@ -1166,7 +1169,7 @@ void starpu_omp_parallel_region(const struct starpu_omp_parallel_region_attr *at
 	}
 	STARPU_ASSERT(new_region->nb_threads == nb_threads);
 
-	/* 
+	/*
 	 * if task == initial_task, create a starpu task as a continuation to all the implicit
 	 * tasks of the new region, else prepare the task for preemption,
 	 * to become itself a continuation to the implicit tasks of the new region
@@ -1194,7 +1197,7 @@ void starpu_omp_parallel_region(const struct starpu_omp_parallel_region_attr *at
 	 * create the starpu tasks for the implicit omp tasks,
 	 * create explicit dependencies between these starpu tasks and the continuation starpu task
 	 */
-	for (i = 0; i < nb_threads; i++) 
+	for (i = 0; i < nb_threads; i++)
 	{
 		struct starpu_omp_task * implicit_task = new_region->implicit_task_array[i];
 		implicit_task->cl = attr->cl;
@@ -1234,7 +1237,7 @@ void starpu_omp_parallel_region(const struct starpu_omp_parallel_region_attr *at
 	/*
 	 * submit all the region implicit starpu tasks
 	 */
-	for (i = 0; i < nb_threads; i++) 
+	for (i = 0; i < nb_threads; i++)
 	{
 		struct starpu_omp_task * implicit_task = new_region->implicit_task_array[i];
 		ret = starpu_task_submit(implicit_task->starpu_task);
@@ -1292,7 +1295,7 @@ static void wake_up_barrier(struct starpu_omp_region *parallel_region)
 {
 	struct starpu_omp_task *task = _starpu_omp_get_task();
 	int i;
-	for (i = 0; i < parallel_region->nb_threads; i++) 
+	for (i = 0; i < parallel_region->nb_threads; i++)
 	{
 		struct starpu_omp_task * implicit_task = parallel_region->implicit_task_array[i];
 		if (implicit_task == task)
@@ -1343,7 +1346,7 @@ void starpu_omp_barrier(void)
 	{
 		ANNOTATE_HAPPENS_BEFORE(&parallel_region->barrier_count);
 		/* not the last task reaching the barrier
-		 * . prepare for conditional continuation 
+		 * . prepare for conditional continuation
 		 * . sleep
 		 */
 
@@ -1826,40 +1829,46 @@ void starpu_omp_taskgroup_inline_end(void)
 // XXX on work
 void starpu_omp_taskloop_inline_begin(struct starpu_omp_task_region_attr *attr)
 {
-   if (!attr->nogroup_clause)
-   {
-      starpu_omp_taskgroup_inline_begin();
-   }
-
-   int nb_subloop;
-   if (attr->num_tasks) {
-      nb_subloop = attr->num_tasks;
-   } else if (attr->grainsize) {
-      nb_subloop = attr->nb_iterations / attr->grainsize;
-   } else {
-      nb_subloop = 4;
-   }
-
-   attr->is_loop = 1;
-
-   int i;
-   int nb_iter_i = attr->nb_iterations / nb_subloop;
-   for (i = 0; i < nb_subloop; i++)
-   {
-      attr->begin_i = nb_iter_i * i;
-      attr->end_i = attr->begin_i + nb_iter_i;
-      attr->end_i += (i+1 != nb_subloop) ? 0 : (attr->nb_iterations % nb_subloop);
-      attr->chunk = attr->end_i - attr->begin_i;
-      starpu_omp_task_region(attr);
-   }
+	if (!attr->nogroup_clause)
+	{
+		starpu_omp_taskgroup_inline_begin();
+	}
+
+	int nb_subloop;
+	if (attr->num_tasks)
+	{
+		nb_subloop = attr->num_tasks;
+	}
+	else if (attr->grainsize)
+	{
+		nb_subloop = attr->nb_iterations / attr->grainsize;
+	}
+	else
+	{
+		nb_subloop = 4;
+	}
+
+	attr->is_loop = 1;
+
+	int i;
+	int nb_iter_i = attr->nb_iterations / nb_subloop;
+	for (i = 0; i < nb_subloop; i++)
+	{
+		attr->begin_i = nb_iter_i * i;
+		attr->end_i = attr->begin_i + nb_iter_i;
+		attr->end_i += (i+1 != nb_subloop) ? 0 : (attr->nb_iterations % nb_subloop);
+		attr->chunk = attr->end_i - attr->begin_i;
+		starpu_omp_task_region(attr);
+	}
 }
 
 // XXX on work
 void starpu_omp_taskloop_inline_end(const struct starpu_omp_task_region_attr *attr)
 {
-   if (!attr->nogroup_clause) {
-      starpu_omp_taskgroup_inline_end();
-   }
+	if (!attr->nogroup_clause)
+	{
+		starpu_omp_taskgroup_inline_end();
+	}
 }
 
 static inline void _starpu_omp_for_loop(struct starpu_omp_region *parallel_region, struct starpu_omp_task *task,
@@ -2170,7 +2179,7 @@ void starpu_omp_ordered_inline_end(void)
 	struct starpu_omp_region *parallel_region = task->owner_region;
 	struct starpu_omp_loop *loop = _starpu_omp_for_get_loop(parallel_region, task);
 
-	loop->ordered_iteration++;	
+	loop->ordered_iteration++;
 	condition_broadcast(&loop->ordered_cond, starpu_omp_task_wait_on_ordered);
 	_starpu_spin_unlock(&loop->ordered_lock);
 }

+ 5 - 3
tests/datawizard/bcsr.c

@@ -41,7 +41,8 @@ void cpu_show_bcsr(void *descr[], void *arg)
 
 	printf("nnz %d elemsize %d\n", nnz, elemsize);
 
-	for (i = 0; i < nrow; i++) {
+	for (i = 0; i < nrow; i++)
+	{
 		uint32_t row_start = rowptr[i] - firstentry;
 		uint32_t row_end = rowptr[i+1] - firstentry;
 
@@ -73,7 +74,7 @@ struct starpu_codelet show_cl =
 };
 
 /*
- * In this test, we use the following matrix: 
+ * In this test, we use the following matrix:
  *
  *   +----------------+
  *   |  0   1   0   0 |
@@ -129,7 +130,8 @@ int main(int argc, char **argv)
 
 	starpu_task_insert(&show_cl, STARPU_R, bcsr_handle, 0);
 
-	struct starpu_data_filter filter = {
+	struct starpu_data_filter filter =
+	{
 		.filter_func = starpu_bcsr_filter_vertical_block,
 		.nchildren = 2,
 	};

+ 2 - 1
tests/microbenchs/tasks_size_overhead.c

@@ -185,7 +185,8 @@ int main(int argc, char **argv)
 	unsetenv("STARPU_NCPU");
 #endif
 
-	if (STARPU_RUNNING_ON_VALGRIND) {
+	if (STARPU_RUNNING_ON_VALGRIND)
+	{
 		factortime *= 4;
 		cpustep *= 4;
 	}

+ 2 - 1
tests/parallel_tasks/parallel_kernels.c

@@ -89,7 +89,8 @@ int main(void)
 
 	unsigned iter, worker, n;
 	n = N;
-	if (STARPU_RUNNING_ON_VALGRIND) {
+	if (STARPU_RUNNING_ON_VALGRIND)
+	{
 		n /= 300;
 	}
 	for (iter = 0; iter < n; iter++)

+ 2 - 1
tests/parallel_tasks/parallel_kernels_spmd.c

@@ -92,7 +92,8 @@ int main(void)
 
 	unsigned iter, worker, n;
 	n = N;
-	if (STARPU_RUNNING_ON_VALGRIND) {
+	if (STARPU_RUNNING_ON_VALGRIND)
+	{
 		n /= 300;
 	}
 	for (iter = 0; iter < n; iter++)

+ 5 - 2
tools/starpu_perfmodel_display.c

@@ -175,9 +175,12 @@ int main(int argc, char **argv)
 			fprintf(stderr, "The performance model for the symbol <%s> could not be loaded\n", psymbol);
 			return 1;
 		}
-		if (xml) {
+		if (xml)
+		{
 			starpu_perfmodel_dump_xml(stdout, &model);
-		} else {
+		}
+		else
+		{
 			uint32_t *footprint = NULL;
 			if (pdisplay_specific_footprint == 1)
 			{

+ 4 - 2
tools/starpu_perfmodel_recdump.c

@@ -129,7 +129,8 @@ void print_archs(FILE* output)
 		{
 			if (starpu_worker_get_memory_node(workerid) == node)
 			{
-				if (!printed) {
+				if (!printed)
+				{
 					fprintf(output, "Workers:");
 					printed = 1;
 				}
@@ -145,7 +146,8 @@ void print_archs(FILE* output)
 	{
 		for (dst = 0; dst < starpu_memory_nodes_get_count(); dst++)
 		{
-			if (src != dst) {
+			if (src != dst)
+			{
 				fprintf(output, "MemoryNodeSrc: %d\n", src);
 				fprintf(output, "MemoryNodeDst: %d\n", dst);
 				fprintf(output, "Bandwidth: %f\n", starpu_transfer_bandwidth(src, dst));

+ 4 - 2
tools/starpu_replay.c

@@ -165,7 +165,8 @@ static void replay_data_register(starpu_data_handle_t *handleptr, starpu_data_ha
 	{
 		replay_interface_ops.interfaceid = starpu_data_interface_get_next_id();
 	}
-	struct replay_interface interface = {
+	struct replay_interface interface =
+	{
 		.id = replay_interface_ops.interfaceid,
 		.orig_handle = orig_handle,
 		.size = size,
@@ -337,7 +338,8 @@ double arch_cost_function(struct starpu_task *task, struct starpu_perfmodel_arch
 /* End of settings */
 
 static unsigned long nexecuted_tasks;
-void dumb_kernel(void *buffers[], void *args) {
+void dumb_kernel(void *buffers[], void *args)
+{
 	(void) buffers;
 	(void) args;
 	nexecuted_tasks++;