
Merge branch 'master' into fpga

Nathalie Furmento, 5 years ago
Parent
Commit
dc72337184
51 changed files with 727 additions and 235 deletions
  1. + 2 - 4    doc/doxygen/Makefile.am
  2. + 2 - 4    doc/doxygen_dev/Makefile.am
  3. + 2 - 1    examples/perf_steering/perf_knobs_03.c
  4. + 2 - 1    examples/pipeline/pipeline.c
  5. + 4 - 2    examples/ppm_downscaler/yuv_downscaler.c
  6. + 2 - 1    examples/scheduler/dummy_modular_sched.c
  7. + 4 - 1    examples/spmv/spmv.c
  8. + 1 - 1    examples/tag_example/tag_example2.c
  9. + 1 - 1    examples/tag_example/tag_example3.c
  10. + 8 - 0   julia/StarPU.jl/Makefile
  11. + 31 - 15 julia/StarPU.jl/src/StarPU.jl
  12. + 22 - 0  julia/StarPU.jl/src/compiler/expressions.jl
  13. + 2 - 0   julia/StarPU.jl/src/compiler/file_generation.jl
  14. + 1 - 1   julia/StarPU.jl/src/compiler/parsing.jl
  15. + 56 - 0  julia/mandelbrot/cpu_mandelbrot.c
  16. + 35 - 0  julia/mandelbrot/makefile
  17. + 99 - 15 julia/mandelbrot/mandelbrot.jl
  18. + 13 - 14 julia/mult/makefile
  19. + 90 - 0  julia/mult/cpu_mult.c
  20. + 84 - 0  julia/mult/gpu_mult.cu
  21. + 6 - 2   mpi/src/nmad/starpu_mpi_nmad.c
  22. + 10 - 4  mpi/tests/Makefile.am
  23. + 8 - 1   mpi/tests/sendrecv_gemm_bench.c
  24. + 4 - 0   mpi/tests/sendrecv_parallel_tasks_bench.c
  25. + 2 - 1   src/common/barrier.c
  26. + 4 - 2   src/core/dependencies/data_concurrency.c
  27. + 2 - 1   src/core/dependencies/dependencies.c
  28. + 4 - 2   src/core/dependencies/implicit_data_deps.c
  29. + 6 - 3   src/core/perfmodel/perfmodel_history.c
  30. + 16 - 7  src/core/topology.c
  31. + 3 - 3   src/core/workers.c
  32. + 2 - 1   src/datawizard/coherency.c
  33. + 2 - 1   src/datawizard/interfaces/matrix_interface.c
  34. + 32 - 23 src/debug/traces/starpu_fxt.c
  35. + 16 - 16 src/sched_policies/component_best_implementation.c
  36. + 21 - 10 src/sched_policies/component_heteroprio.c
  37. + 4 - 2   src/sched_policies/component_sched.c
  38. + 2 - 1   src/sched_policies/component_work_stealing.c
  39. + 2 - 1   src/sched_policies/component_worker.c
  40. + 1 - 1   src/sched_policies/heteroprio.c
  41. + 19 - 15 src/sched_policies/modular_ez.c
  42. + 2 - 1   src/sched_policies/modular_gemm.c
  43. + 2 - 1   src/sched_policies/modular_heteroprio_heft.c
  44. + 72 - 63 src/util/openmp_runtime_support.c
  45. + 5 - 3   tests/datawizard/bcsr.c
  46. + 2 - 1   tests/microbenchs/tasks_size_overhead.c
  47. + 2 - 1   tests/parallel_tasks/parallel_kernels.c
  48. + 2 - 1   tests/parallel_tasks/parallel_kernels_spmd.c
  49. + 5 - 2   tools/starpu_perfmodel_display.c
  50. + 4 - 2   tools/starpu_perfmodel_recdump.c
  51. + 4 - 2   tools/starpu_replay.c

+ 2 - 4
doc/doxygen/Makefile.am

@@ -270,13 +270,11 @@ $(DOX_PDF): $(DOX_TAG) refman.tex
 	! < refman.log grep -v group__ | grep -v _amgrp | grep -v deprecated__ | grep "multiply defined" || exit 1 ;\
 	$(MAKEINDEX) refman.idx ;\
 	max_print_line=1000000 $(PDFLATEX) -interaction batchmode refman.tex ;\
-	done=0; repeat=5 ;\
-	while test $$done = 0 -a $$repeat -gt 0; do \
+	for i in $(shell seq 1 5); do \
            if $(EGREP) 'Rerun (LaTeX|to get cross-references right)' refman.log > /dev/null 2>&1; then \
 	       max_print_line=1000000 $(PDFLATEX) -interaction batchmode refman.tex; \
-	       repeat=`expr $$repeat - 1`; \
 	   else \
-	       done=1; \
+		break ; \
 	   fi; \
 	done
 	mv $(DOX_LATEX_DIR)/refman.pdf $(DOX_PDF)

+ 2 - 4
doc/doxygen_dev/Makefile.am

@@ -217,13 +217,11 @@ $(DOX_PDF): $(DOX_TAG) refman.tex
 	! < refman.log grep -v group__ | grep -v _amgrp | grep -v deprecated__ | grep "multiply defined" || exit 1 ;\
 	$(MAKEINDEX) refman.idx ;\
 	max_print_line=1000000 $(PDFLATEX) -interaction batchmode refman.tex ;\
-	done=0; repeat=5 ;\
-	while test $$done = 0 -a $$repeat -gt 0; do \
+	for i in $(shell seq 1 5); do \
            if $(EGREP) 'Rerun (LaTeX|to get cross-references right)' refman.log > /dev/null 2>&1; then \
 	       max_print_line=1000000 $(PDFLATEX) -interaction batchmode refman.tex; \
-	       repeat=`expr $$repeat - 1`; \
 	   else \
-	       done=1; \
+		break ; \
 	   fi; \
 	done
 	mv $(DOX_LATEX_DIR)/refman.pdf $(DOX_PDF)

+ 2 - 1
examples/perf_steering/perf_knobs_03.c

@@ -126,7 +126,8 @@ int main(int argc, char **argv)
 		const int id = starpu_perf_knob_name_to_id(scope_id, knob_name);
 		STARPU_ASSERT(starpu_perf_knob_get_type_id(id) == starpu_perf_knob_type_name_to_id(knob_type_name));
 
-		struct starpu_codelet cl = {
+		struct starpu_codelet cl =
+		{
 			.cpu_funcs = {cpu_func}
 		};
 

+ 2 - 1
examples/pipeline/pipeline.c

@@ -176,7 +176,8 @@ static struct starpu_codelet pipeline_codelet_sum =
 	.model = &pipeline_model_sum
 };
 
-static void release_sem(void *arg) {
+static void release_sem(void *arg)
+{
 	sem_post(arg);
 };
 

+ 4 - 2
examples/ppm_downscaler/yuv_downscaler.c

@@ -141,14 +141,16 @@ int main(int argc, char **argv)
 
 	/* fetch input data */
 	FILE *f_in = fopen(filename_in, "r");
-	if (!f_in) {
+	if (!f_in)
+	{
 		fprintf(stderr, "couldn't open input file %s\n", filename_in);
 		exit(EXIT_FAILURE);
 	}
 
 	/* allocate room for an output buffer */
 	FILE *f_out = fopen(filename_out, "w+");
-	if (!f_out) {
+	if (!f_out)
+	{
 		fprintf(stderr, "couldn't open output file %s\n", filename_out);
 		exit(EXIT_FAILURE);
 	}

+ 2 - 1
examples/scheduler/dummy_modular_sched.c

@@ -170,7 +170,8 @@ static void init_dummy_sched(unsigned sched_ctx_id)
 {
 	FPRINTF(stderr, "Initialising Dummy scheduler\n");
 
-	struct dummy_sched_params params = {
+	struct dummy_sched_params params =
+	{
 		.verbose = 0,
 	};
 

+ 4 - 1
examples/spmv/spmv.c

@@ -245,10 +245,13 @@ int main(int argc, char **argv)
 			vector_exp_out_ptr[row] += UPPER_BAND * vector_in_ptr[row+1];
 	}
 	for (row = 0; row < size; row++)
-		if (vector_out_ptr[row] != vector_exp_out_ptr[row]) {
+	{
+		if (vector_out_ptr[row] != vector_exp_out_ptr[row])
+		{
 			FPRINTF(stderr, "check failed at %u: %f vs expected %f\n", row, vector_out_ptr[row], vector_exp_out_ptr[row]);
 			exit(EXIT_FAILURE);
 		}
+	}
 
 	starpu_free(nzval);
 	starpu_free(colind);

+ 1 - 1
examples/tag_example/tag_example2.c

@@ -70,7 +70,7 @@ static void tag_cleanup_grid(unsigned iter)
 
 	for (i = 0; i < ni; i++)
 		starpu_tag_remove(TAG(i,iter));
-} 
+}
 
 static int create_task_grid(unsigned iter)
 {

+ 1 - 1
examples/tag_example/tag_example3.c

@@ -72,7 +72,7 @@ static void tag_cleanup_grid(unsigned iter)
 
 	for (i = 0; i < ni; i++)
 		starpu_tag_remove(TAG(i,iter));
-} 
+}
 
 static int create_task_grid(unsigned iter)
 {

+ 8 - 0
julia/StarPU.jl/Makefile

@@ -0,0 +1,8 @@
+SRCS=src/jlstarpu_task_submit.c src/jlstarpu_simple_functions.c src/jlstarpu_data_handles.c
+CC = gcc
+CFLAGS += $(shell pkg-config --cflags starpu-1.3)
+LDFLAGS += $(shell pkg-config --libs starpu-1.3)
+
+lib/libjlstarpu_c_wrapper.so: ${SRCS}
+	test -d lib || mkdir lib
+	$(CC) -O3 -shared -fPIC $(CFLAGS) $^ -o $@ $(LDFLAGS)

+ 31 - 15
julia/StarPU.jl/src/StarPU.jl

@@ -12,14 +12,14 @@ export STARPU_CUDA
 const  STARPU_CPU = 1 << 1
 const  STARPU_CUDA = 1 << 3
 
-global starpu_task_library_name="libjlstarpu_c_wrapper"
+const starpu_task_library_name="libjlstarpu_c_wrapper.so"
 global starpu_tasks_library_handle = C_NULL
 global starpu_target=STARPU_CPU
 
 include("compiler/include.jl")
 
 macro starpufunc(symbol)
-    :($symbol, "libjlstarpu_c_wrapper")
+    :($symbol, starpu_task_library_name)
 end
 
 """
@@ -27,7 +27,7 @@ end
     Works as ccall function
 """
 macro starpucall(func, ret_type, arg_types, args...)
-    return Expr(:call, :ccall, (func, "libjlstarpu_c_wrapper"), esc(ret_type), esc(arg_types), map(esc, args)...)
+    return Expr(:call, :ccall, (func, starpu_task_library_name), esc(ret_type), esc(arg_types), map(esc, args)...)
 end
 
 export @debugprint
@@ -103,13 +103,14 @@ export StarpuCodelet
 
 const jlstarpu_allocated_structures = Vector{Ptr{Cvoid}}([])
 @enum(StarpuPerfmodelType,
-    STARPU_PERFMODEL_INVALID = 0,
-	STARPU_PER_ARCH = 1,
-	STARPU_COMMON = 2,
-	STARPU_HISTORY_BASED = 3,
-	STARPU_REGRESSION_BASED = 4,
-	STARPU_NL_REGRESSION_BASED = 5,
-	STARPU_MULTIPLE_REGRESSION_BASED = 6
+      STARPU_PERFMODEL_INVALID = 0,
+      STARPU_PER_WORKER = 1,
+      STARPU_PER_ARCH = 2,
+      STARPU_COMMON = 3,
+      STARPU_HISTORY_BASED = 4,
+      STARPU_REGRESSION_BASED = 5,
+      STARPU_NL_REGRESSION_BASED = 6,
+      STARPU_MULTIPLE_REGRESSION_BASED = 7
 )
 mutable struct StarpuPerfmodel_c
 
@@ -117,6 +118,7 @@ mutable struct StarpuPerfmodel_c
 
     cost_function :: Ptr{Cvoid}
     arch_cost_function :: Ptr{Cvoid}
+    worker_cost_function :: Ptr{Cvoid}
 
     size_base :: Ptr{Cvoid}
     footprint :: Ptr{Cvoid}
@@ -583,8 +585,9 @@ function starpu_init()
             print(k,">>>>",CPU_CODELETS[k],"\n")
         end
     else
-        system("make generated_tasks.dylib")
-        global starpu_tasks_library_handle=Libdl.dlopen("generated_tasks")
+        @debugprint "generating codelet library"
+        run(`make generated_tasks.so`);
+        global starpu_tasks_library_handle=Libdl.dlopen("generated_tasks.so")
     end
     output = @starpucall jlstarpu_init Cint ()
 
@@ -798,15 +801,28 @@ function starpu_task_submit(task :: StarpuTask)
     @starpucall starpu_task_submit Cint (Ptr{Cvoid},) task.c_task
 end
 
+
+function starpu_modes(x :: Symbol)
+    if (x == Symbol("STARPU_RW"))
+        return STARPU_RW
+    elseif (x == Symbol("STARPU_R"))
+        return STARPU_R
+    else return STARPU_W
+    end
+end
+
 """
     Creates and submits an asynchronous task running cl Codelet function.
     Ex : @starpu_async_cl cl(handle1, handle2)
 """
-macro starpu_async_cl(expr)
+macro starpu_async_cl(expr,modes)
 
     if (!isa(expr, Expr) || expr.head != :call)
         error("Invalid task submit syntax")
     end
+    if (!isa(expr, Expr)||modes.head != :vect)
+        error("Invalid task submit syntax")
+    end
     perfmodel = StarpuPerfmodel(
         perf_type = STARPU_HISTORY_BASED,
         symbol = "history_perf"
@@ -817,7 +833,7 @@ macro starpu_async_cl(expr)
         #cuda_func = "matrix_mult",
         #opencl_func="ocl_matrix_mult",
         ### TODO: CORRECT !
-        modes = [STARPU_R, STARPU_R, STARPU_W],
+        modes = map((x -> starpu_modes(x)),modes.args),
         perfmodel = perfmodel
     )
     handles = Expr(:vect, expr.args[2:end]...)
@@ -1216,7 +1232,7 @@ macro starpu_noparam_function(func_name, ret_type)
 
     quote
         export $func
-        global $func() = ccall(($func_name, "libjlstarpu_c_wrapper"),
+        global $func() = ccall(($func_name, starpu_task_library_name),
                                 $ret_type, ()) :: $ret_type
     end
 end
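
Note on the StarPU.jl change above: @starpu_async_cl now takes a second argument, a vector of access-mode symbols, which the new starpu_modes helper maps onto the STARPU_R/STARPU_W/STARPU_RW constants (the previously hard-coded [STARPU_R, STARPU_R, STARPU_W] list is gone). A minimal usage sketch of the new call form, modelled on the mandelbrot.jl example further down; the codelet name kernel and the handles hA, hP are placeholders, not part of this commit:

    # assumes a @codelet function named kernel and two data handles obtained
    # from starpu_data_register inside a @starpu_block (placeholder names)
    @starpu_async_cl kernel(hA, hP) [STARPU_W, STARPU_R]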

+ 22 - 0
julia/StarPU.jl/src/compiler/expressions.jl

@@ -93,6 +93,8 @@ end
 struct StarpuExprReturn <: StarpuExpr
     value :: StarpuExpr
 end
+struct StarpuExprBreak <: StarpuExpr
+end
 struct StarpuExprVar <: StarpuExpr
     name :: Symbol
 end
@@ -717,6 +719,26 @@ function apply(func :: Function, expr :: StarpuExprRef)
 end
 
 #======================================================
+                BREAK EXPRESSION
+======================================================#
+
+function starpu_parse_break(x :: Expr)
+    if (x.head != :break)
+        error("Invalid \"break\" expression")
+    end
+
+    return StarpuExprBreak()
+end
+
+function print(io :: IO, x :: StarpuExprBreak ; indent = 0)
+    print(io, "break")
+end
+
+function apply(func :: Function, expr :: StarpuExprBreak)
+
+    return func(StarpuExprBreak())
+end
+#======================================================
                 RETURN EXPRESSION
 ======================================================#
 

+ 2 - 0
julia/StarPU.jl/src/compiler/file_generation.jl

@@ -10,6 +10,7 @@ global generated_cpu_kernel_file_name = "PRINT TO STDOUT"
 const cpu_kernel_file_start = "#include <stdio.h>
 #include <stdint.h>
 #include <starpu.h>
+#include <math.h>
 
 static inline long long jlstarpu_max(long long a, long long b)
 {
@@ -30,6 +31,7 @@ static inline long long jlstarpu_interval_size(long long start, long long step,
 const cuda_kernel_file_start = "#include <stdio.h>
 #include <stdint.h>
 #include <starpu.h>
+#include <math.h>
 
 #define THREADS_PER_BLOCK 64
 

+ 1 - 1
julia/StarPU.jl/src/compiler/parsing.jl

@@ -32,7 +32,7 @@ function starpu_parse(x :: Expr)
 
 end
 
-for kw in (:if, :call, :for, :block, :return, :function, :while, :ref)
+for kw in (:if, :call, :for, :block, :return, :function, :while, :ref, :break)
     starpu_parse_key_word_parsing_function[kw] = eval(Symbol(:starpu_parse_, kw))
 end
 

+ 56 - 0
julia/mandelbrot/cpu_mandelbrot.c

@@ -0,0 +1,56 @@
+#include <stdio.h>
+#include <starpu.h>
+#include <math.h>
+
+void cpu_mandelbrot(void *descr[], void *cl_arg)
+{
+        long long int *pixels;
+	float *params;
+
+        pixels = (long long int *)STARPU_MATRIX_GET_PTR(descr[0]);
+	params = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+
+        int width = STARPU_MATRIX_GET_NX(descr[0]);
+        int height = STARPU_MATRIX_GET_NY(descr[0]);
+        
+        int ldP = STARPU_MATRIX_GET_LD(descr[0]);
+
+        float centerr = params[0];
+        float centeri = params[1];
+        float offset = params[2];
+        float dim = params[3];
+        float zoom = width * 0.25296875;
+        float diverge = 4.0;
+        int max_iter = (width/2) * 0.049715909 * log10(zoom);
+
+        int x,y,n;
+
+        for (y = 0; y < height; y++){
+                for (x = 0; x < width; x++){
+                        float cr = centerr + (x - (dim/2))/zoom;
+                        float ci = centeri + (y+offset - (dim/2))/zoom;
+                        float zr = cr;
+                        float zi = ci;
+                        
+                        for (n = 0; n <= max_iter; n++) {
+				if (zr*zr + zi*zi>diverge) break;
+                                float tmp = zr*zr - zi*zi + cr;
+                                zi = 2*zr*zi + ci;
+                                zr = tmp;
+                        }
+			int color;
+			if (n<max_iter)
+				color = round(15.*n/max_iter);
+			else
+				color = 0;
+			pixels[x*ldP + y] = color;
+		}
+	}
+}
+
+char* CPU = "cpu_mandelbrot";
+char* GPU = "gpu_mandelbrot";
+extern char *starpu_find_function(char *name, char *device) {
+	if (!strcmp(device,"gpu")) return GPU;
+	return CPU;
+}

+ 35 - 0
julia/mandelbrot/makefile

@@ -0,0 +1,35 @@
+# GCC compiler
+CC=gcc-9
+CFLAGS += -O3 -mavx -mfma -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
+
+LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
+EXTERNLIB=extern_tasks.dylib
+GENERATEDLIB=generated_tasks.dylib
+OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+LIBPATH=${PWD}/../StarPU.jl/lib
+
+all: ${EXTERNLIB} 
+
+mult: mult.c cpu_mult.o #gpu_mult.o 
+	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)	
+
+gpu_mult.o: gpu_mult.cu
+	nvcc -c $(CFLAGS) $^ -o $@
+
+%.o: %.c
+	$(CC) -c $(CFLAGS) $^ -o $@
+
+${EXTERNLIB}: cpu_mandelbrot.o
+	$(CC) -shared -fPIC $(LDFLAGS) $^ -o $@  
+
+gpu_mult.so: gpu_mult.o
+	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+
+${GENERATEDLIB}: ${OBJECTS}
+	$(CC) -shared -fPIC $(LDFLAGS) $^ -o $@
+
+clean:
+	rm *.so *.o *.dylib c_*.genc gencuda_*.cu *.dat
+
+
+

+ 99 - 15
julia/mandelbrot/mandelbrot.jl

@@ -1,30 +1,114 @@
-function mandelbrotjl(pixels ::Matrix{Int64}, centerr ::Float64, centeri ::Float64)
-    height,width = size(pixels)
-    zoom = width * 0.25296875
-    val_diverge = 2.0
-    max_iterations = (width/2) * 0.049715909 * log10(zoom);
+import Libdl
+using StarPU
+using LinearAlgebra
 
-
-    for y = 1:height
+@target STARPU_CPU+STARPU_CUDA
+@codelet function mandelbrot(pixels ::Matrix{Int64}, params ::Matrix{Float32} ) :: Float32
+    height :: Int64 = height(pixels)
+    width :: Int64 = width(pixels)
+    zoom :: Float64 = width * 0.25296875
+    iz :: Float64 = 1. / zoom
+    diverge :: Float32 = 4.0
+    max_iterations :: Float32 = ((width/2) * 0.049715909 * log10(zoom));
+    imi :: Float32 = 1. / max_iterations
+    centerr :: Float32 = params[1,1]
+    centeri :: Float32 = params[2,1]
+    offset :: Float32 = params[3,1]
+    dim :: Float32 = params[4,1]
+    cr :: Float64 = 0.
+    zr :: Float64 = 0.
+    ci :: Float64 = 0.
+    zi :: Float64 = 0.
+    n :: Int64 = 0
+    tmp :: Float64 = 0.
+    @parallel for y = 1:height
         for x = 1:width
-            cr = centerr + (x - (width / 2))/zoom
+            cr = centerr + (x-1 - (dim / 2)) * iz
             zr = cr
-            ci = centeri + (y - (height / 2))/zoom
+            ci = centeri + (y-1+offset - (dim / 2)) * iz
             zi = ci
-
-            n = 0
-            while ((n < max_iterations) && (zr*zr + zi*zi < val_diverge*val_diverge))
+            for n = 0:max_iterations
+                if (zr*zr + zi*zi > diverge)
+                    break
+                end
                 tmp = zr*zr - zi*zi + cr
                 zi = 2*zr*zi + ci
                 zr = tmp
-                n = n+1
             end
             
             if (n < max_iterations)
-                pixels[y,x] = round(255 * n / max_iterations)
+                pixels[y,x] = round(15 * n * imi)
             else
                 pixels[y,x] = 0
             end
         end
     end
-end
+    return 0. :: Float32
+end
+
+@debugprint "starpu_init"
+starpu_init()
+
+function mandelbrot_with_starpu(A ::Matrix{Int64}, params ::Matrix{Float32}, nslicesx ::Int64)
+    horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesx)
+    @starpu_block let
+	hA, hP = starpu_data_register(A,params)
+	starpu_data_partition(hA,horiz)
+        starpu_data_partition(hP,horiz)
+        
+	@starpu_sync_tasks for taskx in (1 : nslicesx)
+                @starpu_async_cl mandelbrot(hA[taskx], hP[taskx]) [STARPU_W, STARPU_R]
+	end
+    end
+end
+
+function pixels2img(pixels ::Matrix{Int64}, width ::Int64, height ::Int64, filename ::String)
+    MAPPING = [[66,30,15],[25,7,26],[9,1,47],[4,4,73],[0,7,100],[12,44,138],[24,82,177],[57,125,209],[134,181,229],[211,236,248],[241,233,191],[248,201,95],[255,170,0],[204,128,0],[153,87,0],[106,52,3]]
+    open(filename, "w") do f
+        write(f, "P3\n$width $height\n255\n")
+        for i = 1:height
+            for j = 1:width
+                write(f,"$(MAPPING[1+pixels[i,j]][1]) $(MAPPING[1+pixels[i,j]][2]) $(MAPPING[1+pixels[i,j]][3]) ")
+            end
+            write(f, "\n")
+        end
+    end
+end
+
+function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
+    tmin=0;
+    
+    pixels ::Matrix{Int64} = zeros(dim, dim)
+    params :: Matrix{Float32} = zeros(4*nslices,1)
+    for i=0:(nslices-1)
+        params[4*i+1,1] = cr
+        params[4*i+2,1] = ci
+        params[4*i+3,1] = i*dim/nslices
+        params[4*i+4,1] = dim
+    end
+    for i = 1:10
+        t = time_ns();
+        mandelbrot_with_starpu(pixels, params, nslices)
+        t = time_ns()-t
+        if (tmin==0 || tmin>t)
+            tmin=t
+        end
+    end
+    pixels2img(pixels,dim,dim,"out$(dim).ppm")
+    return tmin
+end
+
+function display_time(cr ::Float64, ci ::Float64, start_dim ::Int64, step_dim ::Int64, stop_dim ::Int64, nslices ::Int64)
+    for dim in (start_dim : step_dim : stop_dim)
+        res = min_times(cr, ci, dim, nslices)
+        res=res/dim/dim; # time per pixel
+        println("$(dim) $(res)")
+    end
+end
+
+
+display_time(-0.800671,-0.158392,32,32,4096,4)
+
+@debugprint "starpu_shutdown"
+starpu_shutdown()
+

+ 13 - 14
julia/mult/makefile

@@ -5,19 +5,20 @@ STRIDE=72
 #CC =icc
 #CFLAGS=-restrict -unroll4 -ipo -falign-loops=256 -O3 -DSTRIDE=${STRIDE} -march=native $(shell pkg-config --cflags starpu-1.3)
 # GCC compiler
-CC=gcc-9
-CFLAGS += -O3 -DSTRIDE=${STRIDE} -mavx -mfma -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
+CC=gcc
+CFLAGS += -O3 -DSTRIDE=${STRIDE} -mavx -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
 
 LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
-EXTERNLIB=extern_tasks.dylib
-GENERATEDLIB=generated_tasks.dylib
-OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+EXTERNLIB=extern_tasks.so
+GENERATEDLIB=generated_tasks.so
+#OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+OBJECTS=$(wildcard gen*.c)
 LIBPATH=${PWD}/../StarPU.jl/lib
 
-all: ${EXTERNLIB} 
+all: ${EXTERNLIB}
 
-mult: mult.c cpu_mult.o #gpu_mult.o 
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)	
+mult: mult.c cpu_mult.o #gpu_mult.o
+	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
 
 gpu_mult.o: gpu_mult.cu
 	nvcc -c $(CFLAGS) $^ -o $@
@@ -25,8 +26,8 @@ gpu_mult.o: gpu_mult.cu
 %.o: %.c
 	$(CC) -c $(CFLAGS) $^ -o $@
 
-${EXTERNLIB}: cpu_mult.o
-	$(CC) -shared -fPIC $(LDFLAGS) $^ -o $@  
+${EXTERNLIB}: cpu_mult.c
+	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
 
 gpu_mult.so: gpu_mult.o
 	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
@@ -35,10 +36,10 @@ cpu_mult_sa: cpu_mult_sa.o
 	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
 
 ${GENERATEDLIB}: ${OBJECTS}
-	$(CC) -shared -fPIC $(LDFLAGS) $^ -o $@
+	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
 
 clean:
-	rm *.so *.o *.dylib c_*.genc gencuda_*.cu *.dat
+	rm -f mult *.so *.o c_*.genc gencuda_*.cu *.dat
 
 # Performance Tests
 cstarpu.dat: mult
@@ -51,5 +52,3 @@ julia_calllib.dat: ${EXTERNLIB}
 	LD_LIBRARY_PATH+=${LIBPATH} JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult.jl julia_calllib.dat
 
 test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat
-
-

+ 90 - 0
julia/mult/cpu_mult.c

@@ -0,0 +1,90 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2018                                     Alexis Juven
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <starpu.h>
+/*
+ * The codelet is passed 3 matrices, the "descr" union-type field gives a
+ * description of the layout of those 3 matrices in the local memory (ie. RAM
+ * in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
+ * registered data with the "matrix" data interface, we use the matrix macros.
+ */
+void cpu_mult(void *descr[], void *arg)
+{
+	(void)arg;
+	float *subA, *subB, *subC;
+	/* .blas.ptr gives a pointer to the first element of the local copy */
+	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+
+	/* .blas.nx is the number of rows (consecutive elements) and .blas.ny
+	 * is the number of lines that are separated by .blas.ld elements (ld
+	 * stands for leading dimension).
+	 * NB: in case some filters were used, the leading dimension is not
+	 * guaranteed to be the same in main memory (on the original matrix)
+	 * and on the accelerator! */
+	const uint32_t nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	const uint32_t nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	const uint32_t nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	const uint32_t ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	const uint32_t ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	const uint32_t ldC = STARPU_MATRIX_GET_LD(descr[2]);
+	/* we assume a FORTRAN-ordering! */
+	int i,j,k,ii,jj,kk;
+	for (i = 0; i < nyC*nxC; i++) subC[i] = 0;
+	//fprintf(stderr,"inside cpu_mult %dx%dx%d %d/%d on %d\n",nyC,nyA,nxC,starpu_worker_get_id(),STARPU_NMAXWORKERS,starpu_worker_get_devid(starpu_worker_get_id()));
+	for (i=0;i<nyC;i+=STRIDE) {
+		for (k=0;k<nyA;k+=STRIDE) {
+			for (j=0;j<nxC;j+=STRIDE) {
+				
+				for (ii = i; ii < i+STRIDE; ii+=2) {
+					float *sC0=subC+ii*ldC+j;
+					float *sC1=subC+ii*ldC+ldC+j;
+					for (kk = k; kk < k+STRIDE; kk+=4) {
+						float alpha00=subB[kk +  ii*ldB];
+						float alpha01=subB[kk+1+ii*ldB];
+						float alpha10=subB[kk+  ii*ldB+ldB];
+						float alpha11=subB[kk+1+ii*ldB+ldB];
+						float alpha02=subB[kk+2+ii*ldB];
+						float alpha03=subB[kk+3+ii*ldB];
+						float alpha12=subB[kk+2+ ii*ldB+ldB];
+						float alpha13=subB[kk+3+ii*ldB+ldB];
+						float *sA0=subA+kk*ldA+j;
+						float *sA1=subA+kk*ldA+ldA+j;
+						float *sA2=subA+kk*ldA+2*ldA+j;
+						float *sA3=subA+kk*ldA+3*ldA+j;
+						for (jj = 0; jj < STRIDE; jj+=1) {
+							sC0[jj] += alpha00*sA0[jj]+alpha01*sA1[jj]+alpha02*sA2[jj]+alpha03*sA3[jj];
+							sC1[jj] += alpha10*sA0[jj]+alpha11*sA1[jj]+alpha12*sA2[jj]+alpha13*sA3[jj];
+						}
+					}
+				}
+			}
+		}
+	}
+	//fprintf(stderr,"inside cpu_mult %dx%dx%d\n",nyC,nyA,nxC);
+
+}
+char* CPU = "cpu_mult";
+char* GPU = "gpu_mult";
+extern char *starpu_find_function(char *name, char *device) {
+	if (!strcmp(device,"gpu")) return GPU;
+	return CPU;
+}

+ 84 - 0
julia/mult/gpu_mult.cu

@@ -0,0 +1,84 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2018                                     Alexis Juven
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+extern "C" {
+#include <starpu_cuda.h>
+}
+
+#include <stdint.h>
+#include <stdio.h>
+
+
+__global__ void gpuMultKernel
+(
+		uint32_t nxC, uint32_t nyC, uint32_t nyA,
+		uint32_t ldA, uint32_t ldB, uint32_t ldC,
+		float * subA, float * subB, float * subC
+)
+{
+	uint32_t id, i, j, k;
+	float sum;
+
+	id = blockIdx.x * blockDim.x + threadIdx.x;
+	i = id % nxC;
+	j = id / nxC;
+
+	if (j >= nyC){
+		return;
+	}
+
+	sum = 0.;
+
+	for (k = 0 ; k < nyA ; k++){
+		sum += subA[i + k*ldA] * subB[k + j*ldB];
+	}
+
+	subC[i + j*ldC] = sum;
+
+}
+
+
+
+#define THREADS_PER_BLOCK 64
+extern "C" void gpu_mult(void * descr[], void * args)
+{
+
+	float * d_subA, * d_subB, * d_subC;
+	uint32_t nxC, nyC, nyA;
+	uint32_t ldA, ldB, ldC;
+	uint32_t nblocks;
+
+	d_subA = (float *) STARPU_MATRIX_GET_PTR(descr[0]);
+	d_subB = (float *) STARPU_MATRIX_GET_PTR(descr[1]);
+	d_subC = (float *) STARPU_MATRIX_GET_PTR(descr[2]);
+
+	nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	ldC = STARPU_MATRIX_GET_LD(descr[2]);
+
+	nblocks = (nxC * nyC + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
+
+	gpuMultKernel
+		<<< nblocks, THREADS_PER_BLOCK, 0, NULL /*starpu_cuda_get_local_stream()*/
+		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+}

+ 6 - 2
mpi/src/nmad/starpu_mpi_nmad.c

@@ -67,8 +67,8 @@ static volatile int pending_request = 0;
 
 #define REQ_FINALIZED 0x1
 
-PUK_LFSTACK_TYPE(callback,	struct _starpu_mpi_req *req;);
-static callback_lfstack_t callback_stack = NULL;
+PUK_LFSTACK_TYPE(callback, struct _starpu_mpi_req *req;);
+static callback_lfstack_t callback_stack;
 
 static starpu_sem_t callback_sem;
 
@@ -594,6 +594,8 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
 	}
 
+	callback_lfstack_init(&callback_stack);
+
 	/* Tell pioman to use a bound thread for communication progression:
 	 * share the same core as StarPU's MPI thread, the MPI thread has very low activity with NMAD backend */
 	int indexes[1] = { _starpu_mpi_thread_cpuid };
@@ -663,6 +665,8 @@ void _starpu_mpi_progress_shutdown(void **value)
 
 	STARPU_PTHREAD_JOIN(progress_thread, value);
 
+	callback_lfstack_destroy(&callback_stack);
+
         STARPU_PTHREAD_MUTEX_DESTROY(&progress_mutex);
         STARPU_PTHREAD_COND_DESTROY(&progress_cond);
 }

+ 10 - 4
mpi/tests/Makefile.am

@@ -58,9 +58,11 @@ BUILT_SOURCES =
 
 CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log
 
-EXTRA_DIST = 					\
-	user_defined_datatype_value.h		\
-	helper.h
+EXTRA_DIST = 				\
+	abstract_sendrecv_bench.h	\
+	bench_helper.h			\
+	helper.h			\
+	user_defined_datatype_value.h
 
 examplebindir = $(libdir)/starpu/examples/mpi
 
@@ -138,9 +140,13 @@ starpu_mpi_TESTS +=				\
 	user_defined_datatype			\
 	early_stuff				\
 	sendrecv_bench				\
-	sendrecv_gemm_bench			\
 	sendrecv_parallel_tasks_bench
 
+if !NO_BLAS_LIB
+starpu_mpi_TESTS +=				\
+	sendrecv_gemm_bench
+endif
+
 if !STARPU_SIMGRID
 # missing support in simgrid
 starpu_mpi_TESTS +=				\

+ 8 - 1
mpi/tests/sendrecv_gemm_bench.c

@@ -320,7 +320,13 @@ static void* comm_thread_func(void* arg)
 	return NULL;
 }
 
-
+#ifdef STARPU_USE_MPI_MPI
+int main(int argc, char **argv)
+{
+	FPRINTF(stderr, "This test does not work with the MPI backend.\n");
+	return STARPU_TEST_SKIPPED;
+}
+#else
 int main(int argc, char **argv)
 {
 	double start, end;
@@ -461,3 +467,4 @@ enodev:
 
 	return ret;
 }
+#endif

+ 4 - 0
mpi/tests/sendrecv_parallel_tasks_bench.c

@@ -32,7 +32,11 @@
 
 /* We reduce NX_MAX, since some NICs don't support exchanging simultaneously such amount of memory */
 #undef NX_MAX
+#ifdef STARPU_QUICK_CHECK
+#define NX_MAX (1024)
+#else
 #define NX_MAX (64 * 1024 * 1024)
+#endif
 
 
 void cpu_task(void* descr[], void* args)

+ 2 - 1
src/common/barrier.c

@@ -50,7 +50,8 @@ int _starpu_barrier_test(struct _starpu_barrier *barrier)
 int _starpu_barrier_destroy(struct _starpu_barrier *barrier)
 {
 	int ret;
-	do {
+	do
+	{
 		ret = _starpu_barrier_test(barrier);
 	}
 	while (ret == EBUSY);

+ 4 - 2
src/core/dependencies/data_concurrency.c

@@ -402,7 +402,8 @@ void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _sta
 {
 	unsigned buf;
 
-	if (j->task->cl) {
+	if (j->task->cl)
+	{
 		unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(j->task);
 
 		for (buf = 0; buf < nbuffers; buf++)
@@ -415,7 +416,8 @@ void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _sta
 
 		/* We need to check data availability only if sequential consistency
 		 * dependencies have not been used */
-		if (!j->sequential_consistency) {
+		if (!j->sequential_consistency)
+		{
 			for (buf = 0; buf < nbuffers; buf++)
 			{
 				starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(j->task, buf);

+ 2 - 1
src/core/dependencies/dependencies.c

@@ -41,7 +41,8 @@ void _starpu_notify_dependencies(struct _starpu_job *j)
 static starpu_notify_ready_soon_func notify_ready_soon_func;
 static void *notify_ready_soon_func_data;
 
-struct _starpu_notify_job_start_data {
+struct _starpu_notify_job_start_data
+{
 	double delay;
 };
 

+ 4 - 2
src/core/dependencies/implicit_data_deps.c

@@ -234,11 +234,13 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 
 		/* Skip tasks that are associated to a reduction phase so that
 		 * they do not interfere with the application. */
-		if (pre_sync_job->reduction_task) {
+		if (pre_sync_job->reduction_task)
+		{
 			*submit_pre_sync = 1;
 			return NULL;
 		}
-		if (post_sync_job->reduction_task) {
+		if (post_sync_job->reduction_task)
+		{
 			*submit_pre_sync = 0;
 			return NULL;
 		}

+ 6 - 3
src/core/perfmodel/perfmodel_history.c

@@ -1026,7 +1026,8 @@ void starpu_perfmodel_dump_xml(FILE *f, struct starpu_perfmodel *model)
 		for(dev = 0; dev < ndevices; dev++)
 		{
 			const char *type;
-			switch (arch_combs[comb]->devices[dev].type) {
+			switch (arch_combs[comb]->devices[dev].type)
+			{
 				case STARPU_CPU_WORKER: type = "CPU"; break;
 				case STARPU_CUDA_WORKER: type = "CUDA"; break;
 				case STARPU_OPENCL_WORKER: type = "OpenCL"; break;
@@ -1421,7 +1422,8 @@ int starpu_perfmodel_list(FILE *output)
 	else
 	{
 		int i;
-		for (i = 0; i < n; i++) {
+		for (i = 0; i < n; i++)
+		{
 			if (strcmp(list[i]->d_name, ".") && strcmp(list[i]->d_name, ".."))
 				fprintf(output, "file: <%s>\n", list[i]->d_name);
 			free(list[i]);
@@ -1772,7 +1774,8 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, s
 
 docal:
 #ifdef STARPU_SIMGRID
-	if (isnan(exp)) {
+	if (isnan(exp))
+	{
 		char archname[STR_SHORT_LENGTH];
 		starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
 

+ 16 - 7
src/core/topology.c

@@ -300,13 +300,15 @@ int _starpu_task_data_get_node_on_node(struct starpu_task *task, unsigned index,
 	int node = STARPU_SPECIFIC_NODE_LOCAL;
 	if (task->cl->specific_nodes)
 		node = STARPU_CODELET_GET_NODE(task->cl, index);
-	switch (node) {
+	switch (node)
+	{
 	case STARPU_SPECIFIC_NODE_LOCAL:
 		// TODO: rather find MCDRAM
 		node = local_node;
 		break;
 	case STARPU_SPECIFIC_NODE_CPU:
-		switch (starpu_node_get_kind(local_node)) {
+		switch (starpu_node_get_kind(local_node))
+		{
 		case STARPU_CPU_RAM:
 			node = local_node;
 			break;
@@ -321,10 +323,13 @@ int _starpu_task_data_get_node_on_node(struct starpu_task *task, unsigned index,
 		node = local_node;
 		break;
 	case STARPU_SPECIFIC_NODE_LOCAL_OR_CPU:
-		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID) {
+		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID)
+		{
 			/* It is here already, rather access it from here */
 			node = local_node;
-		} else {
+		}
+		else
+		{
 			/* It is not here already, do not bother moving it */
 			node = STARPU_MAIN_RAM;
 		}
@@ -339,7 +344,8 @@ int _starpu_task_data_get_node_on_worker(struct starpu_task *task, unsigned inde
 	int node = STARPU_SPECIFIC_NODE_LOCAL;
 	if (task->cl->specific_nodes)
 		node = STARPU_CODELET_GET_NODE(task->cl, index);
-	switch (node) {
+	switch (node)
+	{
 	case STARPU_SPECIFIC_NODE_LOCAL:
 		// TODO: rather find MCDRAM
 		node = local_node;
@@ -354,10 +360,13 @@ int _starpu_task_data_get_node_on_worker(struct starpu_task *task, unsigned inde
 		node = local_node;
 		break;
 	case STARPU_SPECIFIC_NODE_LOCAL_OR_CPU:
-		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID) {
+		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID)
+		{
 			/* It is here already, rather access it from here */
 			node = local_node;
-		} else {
+		}
+		else
+		{
 			/* It is not here already, do not bother moving it */
 			node = STARPU_MAIN_RAM;
 		}

+ 3 - 3
src/core/workers.c

@@ -434,7 +434,8 @@ static inline int _starpu_can_use_nth_implementation(enum starpu_worker_archtype
 
 /* Test if this task can be processed on this worker, regardless of the implementation */
 /* must be called with sched_mutex locked to protect state_blocked */
-static inline int _starpu_can_execute_task_any_impl(unsigned workerid, struct starpu_task *task) {
+static inline int _starpu_can_execute_task_any_impl(unsigned workerid, struct starpu_task *task)
+{
 
 	if (!_starpu_config.workers[workerid].enable_knob)
 		return 0;
@@ -446,7 +447,6 @@ static inline int _starpu_can_execute_task_any_impl(unsigned workerid, struct st
 			return 0;
 	}
 
-	
 	/* if the worker is blocked in a parallel ctx don't submit tasks on it */
 #ifdef STARPU_DEVEL
 #warning FIXME: this is very expensive, while can_execute is supposed to be not very costly so schedulers can call it a lot
@@ -457,7 +457,7 @@ static inline int _starpu_can_execute_task_any_impl(unsigned workerid, struct st
 	if (!(task->where & _starpu_config.workers[workerid].worker_mask))
 		return 0;
 
-	return 1; 
+	return 1;
 }
 
 /* must be called with sched_mutex locked to protect state_blocked_in_parallel */

+ 2 - 1
src/datawizard/coherency.c

@@ -215,7 +215,8 @@ void _starpu_update_data_state(starpu_data_handle_t handle,
 			for (node = 0; node < nnodes; node++)
 			{
 				struct _starpu_data_replicate *replicate = &handle->per_node[node];
-                               if (replicate->state != STARPU_INVALID){
+                               if (replicate->state != STARPU_INVALID)
+			       {
                                        _STARPU_TRACE_DATA_STATE_SHARED(handle, node);
 					replicate->state = STARPU_SHARED;
                                }

+ 2 - 1
src/datawizard/interfaces/matrix_interface.c

@@ -213,7 +213,8 @@ static void display_matrix_interface(starpu_data_handle_t handle, FILE *f)
 
 //#define DYNAMIC_MATRICES
 
-struct pack_matrix_header {
+struct pack_matrix_header
+{
 #ifdef DYNAMIC_MATRICES
 	/* Receiving matrices with different sizes from MPI */
 	/* FIXME: that would break alignment for O_DIRECT disk access...

+ 32 - 23
src/debug/traces/starpu_fxt.c

@@ -343,7 +343,8 @@ static void handle_papi_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *o
 	long long int value = ev->param[2];
 	//char *prefix = options->file_prefix;
 
-	if (papi_file){
+	if (papi_file)
+	{
 		char event_str[PAPI_MAX_STR_LEN];
 		PAPI_event_code_to_name(event_code, event_str);
 		fprintf(papi_file, "JobId: %lu\n", task);
@@ -2470,47 +2471,55 @@ static void handle_memnode_event(struct fxt_ev_64 *ev, struct starpu_fxt_options
 		memnode_set_state(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr);
 }
 
-static void handle_memnode_event_start_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
+static void handle_memnode_event_start_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
+{
 	unsigned memnode = ev->param[0];
-       unsigned size = ev->param[2];
-       unsigned long handle = ev->param[3];
+	unsigned size = ev->param[2];
+	unsigned long handle = ev->param[3];
 
 	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, size, memnode, options);
 }
 
-static void handle_memnode_event_start_4(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
-       unsigned memnode = ev->param[0];
-       unsigned dest = ev->param[1];
-       if(strcmp(eventstr, "rc")==0){
-               //If it is a Request Create, use dest normally
-       }else{
-               dest = memnode;
-       }
-       unsigned size = ev->param[2];
-       unsigned long handle = ev->param[3];
-       unsigned prefe = ev->param[4];
+static void handle_memnode_event_start_4(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
+{
+	unsigned memnode = ev->param[0];
+	unsigned dest = ev->param[1];
+	if(strcmp(eventstr, "rc")==0)
+	{
+		//If it is a Request Create, use dest normally
+	}
+	else
+	{
+		dest = memnode;
+	}
+	unsigned size = ev->param[2];
+	unsigned long handle = ev->param[3];
+	unsigned prefe = ev->param[4];
 
-       memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, prefe, size, dest, options);
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, prefe, size, dest, options);
 }
 
-static void handle_memnode_event_end_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
+static void handle_memnode_event_end_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
+{
 	unsigned memnode = ev->param[0];
-       unsigned long handle = ev->param[2];
-       unsigned info = ev->param[3];
+	unsigned long handle = ev->param[2];
+	unsigned info = ev->param[3];
 
 	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, info, 0, memnode, options);
 }
 
-static void handle_memnode_event_start_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
+static void handle_memnode_event_start_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
+{
 	unsigned memnode = ev->param[0];
-       unsigned long handle = ev->param[2];
+	unsigned long handle = ev->param[2];
 
 	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, memnode, options);
 }
 
-static void handle_memnode_event_end_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
+static void handle_memnode_event_end_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
+{
 	unsigned memnode = ev->param[0];
-       unsigned long handle = ev->param[2];
+	unsigned long handle = ev->param[2];
 
 	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, memnode, options);
 }

+ 16 - 16
src/sched_policies/component_best_implementation.c

@@ -38,26 +38,26 @@ static int find_best_impl(unsigned sched_ctx_id, struct starpu_task * task, int
 		len = 0.0;
 	}
 	else
-	{	
-	    struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
-	    for(impl = 0; impl < STARPU_MAXIMPLEMENTATIONS; impl++)
-	    {
-		if(starpu_worker_can_execute_task(workerid, task, impl))
+	{
+		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
+		for(impl = 0; impl < STARPU_MAXIMPLEMENTATIONS; impl++)
 		{
-			double d = starpu_task_expected_length(task, archtype, impl);
-			if(isnan(d))
-			{
-				best_impl = impl;
-				len = 0.0;
-				break;
-			}
-			if(d < len)
+			if(starpu_worker_can_execute_task(workerid, task, impl))
 			{
-				len = d;
-				best_impl = impl;
+				double d = starpu_task_expected_length(task, archtype, impl);
+				if(isnan(d))
+				{
+					best_impl = impl;
+					len = 0.0;
+					break;
+				}
+				if(d < len)
+				{
+					len = d;
+					best_impl = impl;
+				}
 			}
 		}
-	    }
 	}
 	if(best_impl == -1)
 		return 0;

+ 21 - 10
src/sched_policies/component_heteroprio.c

@@ -206,7 +206,8 @@ out:
 	//fprintf(stderr, "could not push %p to %d actually\n", task, best_icomponent);
 	/* Could not push to child actually, push that one back */
 	STARPU_COMPONENT_MUTEX_LOCK(mutex);
-	for (j = 0; j < (int) data->naccel; j++) {
+	for (j = 0; j < (int) data->naccel; j++)
+	{
 		if (acceleration == data->accel[j])
 		{
 			_starpu_prio_deque_push_front_task(data->bucket[j], task);
@@ -305,7 +306,8 @@ static int heteroprio_progress_one(struct starpu_sched_component *component)
 	task = _starpu_prio_deque_pop_task(no_accel);
 	STARPU_COMPONENT_MUTEX_UNLOCK(mutex);
 
-	if (task) {
+	if (task)
+	{
 		if (heteroprio_progress_noaccel(component, data, task))
 		{
 			/* Could not push to child actually, push that one back */
@@ -388,7 +390,8 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 			max_expected = min_arch;
 	}
 
-	if (workerid == -1) {
+	if (workerid == -1)
+	{
 		/* All archs can run it */
 		STARPU_ASSERT(!isnan(min_expected));
 		STARPU_ASSERT(!isnan(max_expected));
@@ -402,13 +405,15 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 		STARPU_COMPONENT_MUTEX_LOCK(mutex);
 		unsigned i, j;
 		/* Try to find a bucket with similar acceleration */
-		for (i = 0; i < data->naccel; i++) {
+		for (i = 0; i < data->naccel; i++)
+		{
 			if (acceleration >= data->accel[i] * (1 - APPROX) &&
 			    acceleration <= data->accel[i] * (1 + APPROX))
 				break;
 		}
 
-		if (i == data->naccel) {
+		if (i == data->naccel)
+		{
 			/* Didn't find it, add one */
 			data->naccel++;
 
@@ -418,8 +423,10 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 			_starpu_prio_deque_init(newbucket);
 			int inserted = 0;
 
-			for (j = 0; j < data->naccel-1; j++) {
-				if (!inserted && acceleration > data->accel[j]) {
+			for (j = 0; j < data->naccel-1; j++)
+			{
+				if (!inserted && acceleration > data->accel[j])
+				{
 					/* Insert the new bucket here */
 					i = j;
 					newbuckets[j] = newbucket;
@@ -429,7 +436,8 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 				newbuckets[j+inserted] = data->bucket[j];
 				newaccel[j+inserted] = data->accel[j];
 			}
-			if (!inserted) {
+			if (!inserted)
+			{
 				/* Insert it last */
 				newbuckets[data->naccel-1] = newbucket;
 				newaccel[data->naccel-1] = acceleration;
@@ -441,14 +449,17 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 		}
 #if 0
 		fprintf(stderr,"buckets:");
-		for (j = 0; j < data->naccel; j++) {
+		for (j = 0; j < data->naccel; j++)
+		{
 			fprintf(stderr, " %f", data->accel[j]);
 		}
 		fprintf(stderr,"\ninserting %p %f to %d\n", task, acceleration, i);
 #endif
 		_starpu_prio_deque_push_back_task(data->bucket[i],task);
 		STARPU_COMPONENT_MUTEX_UNLOCK(mutex);
-	} else {
+	}
+	else
+	{
 		/* Not all archs can run it, will resort to HEFT strategy */
 		acceleration = INFINITY;
 		//fprintf(stderr,"%s: some archs can't do it\n", starpu_task_get_name(task));

+ 4 - 2
src/sched_policies/component_sched.c

@@ -284,8 +284,10 @@ void _starpu_sched_component_update_workers_in_ctx(struct starpu_sched_component
 	struct starpu_bitmap * workers_in_ctx = _starpu_get_worker_mask(sched_ctx_id);
 	starpu_bitmap_unset_and(component->workers_in_ctx,component->workers, workers_in_ctx);
 	unsigned i,j;
-	for(i = starpu_worker_get_count(); i < starpu_worker_get_count() + starpu_combined_worker_get_count(); i++) {
-		if (starpu_bitmap_get(component->workers, i)) {
+	for(i = starpu_worker_get_count(); i < starpu_worker_get_count() + starpu_combined_worker_get_count(); i++)
+	{
+		if (starpu_bitmap_get(component->workers, i))
+		{
 			/* Component has this combined worker, check whether the
 			 * context has all the corresponding workers */
 			int worker_size;

+ 2 - 1
src/sched_policies/component_work_stealing.c

@@ -240,7 +240,8 @@ static int push_task(struct starpu_sched_component * component, struct starpu_ta
 
 	/* Find a child component that can execute this task */
 	i = (i+1)%component->nchildren;
-	while(1) {
+	while(1)
+	{
 		int workerid;
 		for(workerid = starpu_bitmap_first(component->children[i]->workers_in_ctx);
 		    -1 != workerid;

+ 2 - 1
src/sched_policies/component_worker.c

@@ -149,7 +149,8 @@ struct _starpu_worker_component_data
 	union
 	{
 		struct _starpu_worker * worker;
-		struct {
+		struct
+		{
 			unsigned worker_size;
 			unsigned workerids[STARPU_NMAXWORKERS];
 		} parallel_worker;

+ 1 - 1
src/sched_policies/heteroprio.c

@@ -524,7 +524,7 @@ static struct starpu_task *pop_task_heteroprio_policy(unsigned sched_ctx_id)
 				nb_added_tasks       += 1;
 				// TODO starpu_prefetch_task_input_for(task, workerid);
 			}
-		}		
+		}
 	}
 
 	struct starpu_task* task = NULL;

+ 19 - 15
src/sched_policies/modular_ez.c

@@ -262,10 +262,13 @@ void starpu_sched_component_initialize_simple_schedulers(unsigned sched_ctx_id,
 		unsigned ntasks_threshold;
 		if (starpu_sched_component_is_heft(decision_component) ||
 		    starpu_sched_component_is_mct(decision_component) ||
-		    starpu_sched_component_is_heteroprio(decision_component)) {
+		    starpu_sched_component_is_heteroprio(decision_component))
+		{
 			/* These need more queueing to allow CPUs to take some share of the work */
 			ntasks_threshold = _STARPU_SCHED_NTASKS_THRESHOLD_HEFT;
-		} else {
+		}
+		else
+		{
 			ntasks_threshold = _STARPU_SCHED_NTASKS_THRESHOLD_DEFAULT;
 		}
 		/* But let user tune it */
@@ -279,20 +282,20 @@ void starpu_sched_component_initialize_simple_schedulers(unsigned sched_ctx_id,
 		int exp = flags & STARPU_SCHED_SIMPLE_FIFOS_BELOW_EXP ? 1 : 0;
 
 		struct starpu_sched_component_prio_data prio_data =
-			{
-				.ntasks_threshold = ntasks_threshold,
-				.exp_len_threshold = exp_len_threshold,
-				.ready = ready,
-				.exp = exp,
-			};
+		{
+			.ntasks_threshold = ntasks_threshold,
+			.exp_len_threshold = exp_len_threshold,
+			.ready = ready,
+			.exp = exp,
+		};
 
 		struct starpu_sched_component_fifo_data fifo_data =
-			{
-				.ntasks_threshold = ntasks_threshold,
-				.exp_len_threshold = exp_len_threshold,
-				.ready = ready,
-				.exp = exp,
-			};
+		{
+			.ntasks_threshold = ntasks_threshold,
+			.exp_len_threshold = exp_len_threshold,
+			.ready = ready,
+			.exp = exp,
+		};
 
 		/* Create one fifo+eager component pair per choice, below scheduling decision */
 		for(i = 0; i < nbelow; i++)
@@ -334,7 +337,8 @@ void starpu_sched_component_initialize_simple_schedulers(unsigned sched_ctx_id,
 					STARPU_ABORT();
 			}
 			STARPU_ASSERT(n >= 1);
-			if (n > 1) {
+			if (n > 1)
+			{
 				/* Several workers for this choice, need to introduce
 				 * a component to distribute the work */
 				struct starpu_sched_component *distribute;

+ 2 - 1
src/sched_policies/modular_gemm.c

@@ -26,7 +26,8 @@
 
 #define MEMORY_AFFINITY
 
-struct child_data {
+struct child_data
+{
 	double expected_start;
 	double predicted;
 	double predicted_transfer;

+ 2 - 1
src/sched_policies/modular_heteroprio_heft.c

@@ -22,7 +22,8 @@
 
 static void initialize_heteroprio_heft_center_policy(unsigned sched_ctx_id)
 {
-	struct starpu_sched_component_heteroprio_data heteroprio_data = {
+	struct starpu_sched_component_heteroprio_data heteroprio_data =
+	{
 		.mct = NULL,
 		.batch = 1,
 	};

+ 72 - 63
src/util/openmp_runtime_support.c

@@ -135,7 +135,9 @@ static void wake_up_and_unlock_task(struct starpu_omp_task *task)
 		weak_task_unlock(task);
 		int ret = starpu_task_submit(task->starpu_task);
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-	} else {
+	}
+	else
+	{
 		weak_task_unlock(task);
 	}
 }
@@ -379,36 +381,37 @@ static void starpu_omp_explicit_task_entry(struct starpu_omp_task *task)
 {
 	STARPU_ASSERT(!(task->flags & STARPU_OMP_TASK_FLAGS_IMPLICIT));
 	struct _starpu_worker *starpu_worker = _starpu_get_local_worker_key();
-   /* XXX on work */
-   if (task->is_loop) {
-      starpu_omp_for_inline_first_alt(task->nb_iterations, task->chunk, starpu_omp_sched_static, 1, &task->begin_i, &task->end_i);
-   }
-   if (starpu_worker->arch == STARPU_CPU_WORKER)
-   {
-      task->cpu_f(task->starpu_buffers, task->starpu_cl_arg);
-   }
+	/* XXX on work */
+	if (task->is_loop)
+	{
+		starpu_omp_for_inline_first_alt(task->nb_iterations, task->chunk, starpu_omp_sched_static, 1, &task->begin_i, &task->end_i);
+	}
+	if (starpu_worker->arch == STARPU_CPU_WORKER)
+	{
+		task->cpu_f(task->starpu_buffers, task->starpu_cl_arg);
+	}
 #ifdef STARPU_USE_CUDA
-   else if (starpu_worker->arch == STARPU_CUDA_WORKER)
-   {
-      task->cuda_f(task->starpu_buffers, task->starpu_cl_arg);
-   }
+	else if (starpu_worker->arch == STARPU_CUDA_WORKER)
+	{
+		task->cuda_f(task->starpu_buffers, task->starpu_cl_arg);
+	}
 #endif
 #ifdef STARPU_USE_OPENCL
-   else if (starpu_worker->arch == STARPU_OPENCL_WORKER)
-   {
-      task->opencl_f(task->starpu_buffers, task->starpu_cl_arg);
-   }
+	else if (starpu_worker->arch == STARPU_OPENCL_WORKER)
+	{
+		task->opencl_f(task->starpu_buffers, task->starpu_cl_arg);
+	}
 #endif
-   else
-      _STARPU_ERROR("invalid worker architecture");
-   /**/
+	else
+		_STARPU_ERROR("invalid worker architecture");
+	/**/
 	_starpu_omp_unregister_task_handles(task);
 	_starpu_spin_lock(&task->lock);
 	task->state = starpu_omp_task_state_terminated;
 	task->transaction_pending=1;
 	_starpu_spin_unlock(&task->lock);
 	struct starpu_omp_thread *thread = _starpu_omp_get_thread();
-	/* 
+	/*
 	 * the task reached the terminated state, definitively give hand back to the worker code.
 	 *
 	 * about to run on the worker stack...
@@ -428,7 +431,7 @@ static void starpu_omp_implicit_task_entry(struct starpu_omp_task *task)
 		_starpu_omp_unregister_region_handles(task->owner_region);
 	}
 	task->state = starpu_omp_task_state_terminated;
-	/* 
+	/*
 	 * the task reached the terminated state, definitively give hand back to the worker code.
 	 *
 	 * about to run on the worker stack...
@@ -447,7 +450,7 @@ static void starpu_omp_task_preempt(void)
 	struct starpu_omp_thread *thread = _starpu_omp_get_thread();
 	task->state = starpu_omp_task_state_preempted;
 
-	/* 
+	/*
 	 * the task reached a blocked state, give hand back to the worker code.
 	 *
 	 * about to run on the worker stack...
@@ -486,7 +489,7 @@ static void starpu_omp_implicit_task_exec(void *buffers[], void *cl_arg)
 
 	task->state = starpu_omp_task_state_clear;
 
-	/* 
+	/*
 	 * start the task execution, or restore a previously preempted task.
 	 * about to run on the task stack...
 	 * */
@@ -655,7 +658,7 @@ static void starpu_omp_explicit_task_exec(void *buffers[], void *cl_arg)
 	}
 	task->state = starpu_omp_task_state_clear;
 
-	/* 
+	/*
 	 * start the task execution, or restore a previously preempted task.
 	 * about to run on the task stack...
 	 * */
@@ -694,11 +697,11 @@ static struct starpu_omp_task *create_omp_task_struct(struct starpu_omp_task *pa
 		task->flags |= STARPU_OMP_TASK_FLAGS_IMPLICIT;
 	}
 	_starpu_spin_init(&task->lock);
-	/* TODO: initialize task->data_env_icvs with proper values */ 
+	/* TODO: initialize task->data_env_icvs with proper values */
 	memset(&task->data_env_icvs, 0, sizeof(task->data_env_icvs));
 	if (is_implicit)
 	{
-	  /* TODO: initialize task->implicit_task_icvs with proper values */ 
+	  /* TODO: initialize task->implicit_task_icvs with proper values */
 		memset(&task->implicit_task_icvs, 0, sizeof(task->implicit_task_icvs));
 	}
 
@@ -1037,7 +1040,7 @@ void starpu_omp_parallel_region(const struct starpu_omp_parallel_region_attr *at
 	struct starpu_omp_task *task = _starpu_omp_get_task();
 	struct starpu_omp_region *generating_region = task->owner_region;
 	const int max_active_levels = generating_region->owner_device->icvs.max_active_levels_var;
-	struct starpu_omp_region *new_region = 
+	struct starpu_omp_region *new_region =
 		create_omp_region_struct(generating_region, _global_state.initial_device);
 	int ret;
 	int nb_threads = 1;
@@ -1166,7 +1169,7 @@ void starpu_omp_parallel_region(const struct starpu_omp_parallel_region_attr *at
 	}
 	STARPU_ASSERT(new_region->nb_threads == nb_threads);
 
-	/* 
+	/*
 	 * if task == initial_task, create a starpu task as a continuation to all the implicit
 	 * tasks of the new region, else prepare the task for preemption,
 	 * to become itself a continuation to the implicit tasks of the new region
@@ -1194,7 +1197,7 @@ void starpu_omp_parallel_region(const struct starpu_omp_parallel_region_attr *at
 	 * create the starpu tasks for the implicit omp tasks,
 	 * create explicit dependencies between these starpu tasks and the continuation starpu task
 	 */
-	for (i = 0; i < nb_threads; i++) 
+	for (i = 0; i < nb_threads; i++)
 	{
 		struct starpu_omp_task * implicit_task = new_region->implicit_task_array[i];
 		implicit_task->cl = attr->cl;
@@ -1234,7 +1237,7 @@ void starpu_omp_parallel_region(const struct starpu_omp_parallel_region_attr *at
 	/*
 	 * submit all the region implicit starpu tasks
 	 */
-	for (i = 0; i < nb_threads; i++) 
+	for (i = 0; i < nb_threads; i++)
 	{
 		struct starpu_omp_task * implicit_task = new_region->implicit_task_array[i];
 		ret = starpu_task_submit(implicit_task->starpu_task);
@@ -1292,7 +1295,7 @@ static void wake_up_barrier(struct starpu_omp_region *parallel_region)
 {
 	struct starpu_omp_task *task = _starpu_omp_get_task();
 	int i;
-	for (i = 0; i < parallel_region->nb_threads; i++) 
+	for (i = 0; i < parallel_region->nb_threads; i++)
 	{
 		struct starpu_omp_task * implicit_task = parallel_region->implicit_task_array[i];
 		if (implicit_task == task)
@@ -1343,7 +1346,7 @@ void starpu_omp_barrier(void)
 	{
 		ANNOTATE_HAPPENS_BEFORE(&parallel_region->barrier_count);
 		/* not the last task reaching the barrier
-		 * . prepare for conditional continuation 
+		 * . prepare for conditional continuation
 		 * . sleep
 		 */
 
@@ -1826,40 +1829,46 @@ void starpu_omp_taskgroup_inline_end(void)
 // XXX on work
 void starpu_omp_taskloop_inline_begin(struct starpu_omp_task_region_attr *attr)
 {
-   if (!attr->nogroup_clause)
-   {
-      starpu_omp_taskgroup_inline_begin();
-   }
-
-   int nb_subloop;
-   if (attr->num_tasks) {
-      nb_subloop = attr->num_tasks;
-   } else if (attr->grainsize) {
-      nb_subloop = attr->nb_iterations / attr->grainsize;
-   } else {
-      nb_subloop = 4;
-   }
-
-   attr->is_loop = 1;
-
-   int i;
-   int nb_iter_i = attr->nb_iterations / nb_subloop;
-   for (i = 0; i < nb_subloop; i++)
-   {
-      attr->begin_i = nb_iter_i * i;
-      attr->end_i = attr->begin_i + nb_iter_i;
-      attr->end_i += (i+1 != nb_subloop) ? 0 : (attr->nb_iterations % nb_subloop);
-      attr->chunk = attr->end_i - attr->begin_i;
-      starpu_omp_task_region(attr);
-   }
+	if (!attr->nogroup_clause)
+	{
+		starpu_omp_taskgroup_inline_begin();
+	}
+
+	int nb_subloop;
+	if (attr->num_tasks)
+	{
+		nb_subloop = attr->num_tasks;
+	}
+	else if (attr->grainsize)
+	{
+		nb_subloop = attr->nb_iterations / attr->grainsize;
+	}
+	else
+	{
+		nb_subloop = 4;
+	}
+
+	attr->is_loop = 1;
+
+	int i;
+	int nb_iter_i = attr->nb_iterations / nb_subloop;
+	for (i = 0; i < nb_subloop; i++)
+	{
+		attr->begin_i = nb_iter_i * i;
+		attr->end_i = attr->begin_i + nb_iter_i;
+		attr->end_i += (i+1 != nb_subloop) ? 0 : (attr->nb_iterations % nb_subloop);
+		attr->chunk = attr->end_i - attr->begin_i;
+		starpu_omp_task_region(attr);
+	}
 }
 
 // XXX on work
 void starpu_omp_taskloop_inline_end(const struct starpu_omp_task_region_attr *attr)
 {
-   if (!attr->nogroup_clause) {
-      starpu_omp_taskgroup_inline_end();
-   }
+	if (!attr->nogroup_clause)
+	{
+		starpu_omp_taskgroup_inline_end();
+	}
 }
 
 static inline void _starpu_omp_for_loop(struct starpu_omp_region *parallel_region, struct starpu_omp_task *task,
@@ -2170,7 +2179,7 @@ void starpu_omp_ordered_inline_end(void)
 	struct starpu_omp_region *parallel_region = task->owner_region;
 	struct starpu_omp_loop *loop = _starpu_omp_for_get_loop(parallel_region, task);
 
-	loop->ordered_iteration++;	
+	loop->ordered_iteration++;
 	condition_broadcast(&loop->ordered_cond, starpu_omp_task_wait_on_ordered);
 	_starpu_spin_unlock(&loop->ordered_lock);
 }

+ 5 - 3
tests/datawizard/bcsr.c

@@ -41,7 +41,8 @@ void cpu_show_bcsr(void *descr[], void *arg)
 
 	printf("nnz %d elemsize %d\n", nnz, elemsize);
 
-	for (i = 0; i < nrow; i++) {
+	for (i = 0; i < nrow; i++)
+	{
 		uint32_t row_start = rowptr[i] - firstentry;
 		uint32_t row_end = rowptr[i+1] - firstentry;
 
@@ -73,7 +74,7 @@ struct starpu_codelet show_cl =
 };
 
 /*
- * In this test, we use the following matrix: 
+ * In this test, we use the following matrix:
  *
  *   +----------------+
  *   |  0   1   0   0 |
@@ -129,7 +130,8 @@ int main(int argc, char **argv)
 
 	starpu_task_insert(&show_cl, STARPU_R, bcsr_handle, 0);
 
-	struct starpu_data_filter filter = {
+	struct starpu_data_filter filter =
+	{
 		.filter_func = starpu_bcsr_filter_vertical_block,
 		.nchildren = 2,
 	};

+ 2 - 1
tests/microbenchs/tasks_size_overhead.c

@@ -185,7 +185,8 @@ int main(int argc, char **argv)
 	unsetenv("STARPU_NCPU");
 #endif
 
-	if (STARPU_RUNNING_ON_VALGRIND) {
+	if (STARPU_RUNNING_ON_VALGRIND)
+	{
 		factortime *= 4;
 		cpustep *= 4;
 	}

+ 2 - 1
tests/parallel_tasks/parallel_kernels.c

@@ -89,7 +89,8 @@ int main(void)
 
 	unsigned iter, worker, n;
 	n = N;
-	if (STARPU_RUNNING_ON_VALGRIND) {
+	if (STARPU_RUNNING_ON_VALGRIND)
+	{
 		n /= 300;
 	}
 	for (iter = 0; iter < n; iter++)

+ 2 - 1
tests/parallel_tasks/parallel_kernels_spmd.c

@@ -92,7 +92,8 @@ int main(void)
 
 	unsigned iter, worker, n;
 	n = N;
-	if (STARPU_RUNNING_ON_VALGRIND) {
+	if (STARPU_RUNNING_ON_VALGRIND)
+	{
 		n /= 300;
 	}
 	for (iter = 0; iter < n; iter++)

+ 5 - 2
tools/starpu_perfmodel_display.c

@@ -175,9 +175,12 @@ int main(int argc, char **argv)
 			fprintf(stderr, "The performance model for the symbol <%s> could not be loaded\n", psymbol);
 			return 1;
 		}
-		if (xml) {
+		if (xml)
+		{
 			starpu_perfmodel_dump_xml(stdout, &model);
-		} else {
+		}
+		else
+		{
 			uint32_t *footprint = NULL;
 			if (pdisplay_specific_footprint == 1)
 			{

+ 4 - 2
tools/starpu_perfmodel_recdump.c

@@ -129,7 +129,8 @@ void print_archs(FILE* output)
 		{
 			if (starpu_worker_get_memory_node(workerid) == node)
 			{
-				if (!printed) {
+				if (!printed)
+				{
 					fprintf(output, "Workers:");
 					printed = 1;
 				}
@@ -145,7 +146,8 @@ void print_archs(FILE* output)
 	{
 		for (dst = 0; dst < starpu_memory_nodes_get_count(); dst++)
 		{
-			if (src != dst) {
+			if (src != dst)
+			{
 				fprintf(output, "MemoryNodeSrc: %d\n", src);
 				fprintf(output, "MemoryNodeDst: %d\n", dst);
 				fprintf(output, "Bandwidth: %f\n", starpu_transfer_bandwidth(src, dst));

+ 4 - 2
tools/starpu_replay.c

@@ -165,7 +165,8 @@ static void replay_data_register(starpu_data_handle_t *handleptr, starpu_data_ha
 	{
 		replay_interface_ops.interfaceid = starpu_data_interface_get_next_id();
 	}
-	struct replay_interface interface = {
+	struct replay_interface interface =
+	{
 		.id = replay_interface_ops.interfaceid,
 		.orig_handle = orig_handle,
 		.size = size,
@@ -337,7 +338,8 @@ double arch_cost_function(struct starpu_task *task, struct starpu_perfmodel_arch
 /* End of settings */
 
 static unsigned long nexecuted_tasks;
-void dumb_kernel(void *buffers[], void *args) {
+void dumb_kernel(void *buffers[], void *args)
+{
 	(void) buffers;
 	(void) args;
 	nexecuted_tasks++;