лет назад: 5 · ec57de33fa
--- a/configure.ac
+++ b/configure.ac
@@ -3096,10 +3096,15 @@ AC_ARG_WITH([hwloc],
 
				 				if test ! -d "$withval" ; then
			
 
				 				   AC_MSG_ERROR("Directory specified for hwloc <$withval> does not exist")
			
 
				 				fi
			
 
				-				if test ! -d "$withval/lib/pkgconfig" ; then
			
 
				-				   AC_MSG_ERROR("Hwloc directory <$withval> does not have a subdirectory lib/pkgconfig")
			
 
				+				if test -d "$withval/lib64/pkgconfig" ; then
			
 
				+				   export PKG_CONFIG_PATH=$withval/lib64/pkgconfig:$PKG_CONFIG_PATH
			
 
				+			        else
			
 
				+				   if test -d "$withval/lib/pkgconfig" ; then
			
 
				+				      export PKG_CONFIG_PATH=$withval/lib/pkgconfig:$PKG_CONFIG_PATH
			
 
				+				   else
			
 
				+				      AC_MSG_ERROR("Hwloc directory <$withval> does not have a subdirectory lib/pkgconfig or lib64/pkgconfig")
			
 
				+				   fi
			
 
				 				fi
			
 
				-				export PKG_CONFIG_PATH=$withval/lib/pkgconfig:$PKG_CONFIG_PATH
			
 
				 				use_hwloc=yes
			
 
				 			fi
			
 
				 		else
			
--- a/doc/doxygen/Makefile.am
+++ b/doc/doxygen/Makefile.am
@@ -267,13 +267,11 @@ $(DOX_PDF): $(DOX_TAG) refman.tex
 
				 	! < refman.log grep -v group__ | grep -v _amgrp | grep -v deprecated__ | grep "multiply defined" || exit 1 ;\
			
 
				 	$(MAKEINDEX) refman.idx ;\
			
 
				 	max_print_line=1000000 $(PDFLATEX) -interaction batchmode refman.tex ;\
			
 
				-	done=0; repeat=5 ;\
			
 
				-	while test $$done = 0 -a $$repeat -gt 0; do \
			
 
				+	for i in $(shell seq 1 5); do \
			
 
				            if $(EGREP) 'Rerun (LaTeX|to get cross-references right)' refman.log > /dev/null 2>&1; then \
			
 
				 	       max_print_line=1000000 $(PDFLATEX) -interaction batchmode refman.tex; \
			
 
				-	       repeat=`expr $$repeat - 1`; \
			
 
				 	   else \
			
 
				-	       done=1; \
			
 
				+		break ; \
			
 
				 	   fi; \
			
 
				 	done
			
 
				 	mv $(DOX_LATEX_DIR)/refman.pdf $(DOX_PDF)
			
--- a/doc/doxygen_dev/Makefile.am
+++ b/doc/doxygen_dev/Makefile.am
@@ -217,13 +217,11 @@ $(DOX_PDF): $(DOX_TAG) refman.tex
 
				 	! < refman.log grep -v group__ | grep -v _amgrp | grep -v deprecated__ | grep "multiply defined" || exit 1 ;\
			
 
				 	$(MAKEINDEX) refman.idx ;\
			
 
				 	max_print_line=1000000 $(PDFLATEX) -interaction batchmode refman.tex ;\
			
 
				-	done=0; repeat=5 ;\
			
 
				-	while test $$done = 0 -a $$repeat -gt 0; do \
			
 
				+	for i in $(shell seq 1 5); do \
			
 
				            if $(EGREP) 'Rerun (LaTeX|to get cross-references right)' refman.log > /dev/null 2>&1; then \
			
 
				 	       max_print_line=1000000 $(PDFLATEX) -interaction batchmode refman.tex; \
			
 
				-	       repeat=`expr $$repeat - 1`; \
			
 
				 	   else \
			
 
				-	       done=1; \
			
 
				+		break ; \
			
 
				 	   fi; \
			
 
				 	done
			
 
				 	mv $(DOX_LATEX_DIR)/refman.pdf $(DOX_PDF)
			
--- a/examples/perf_steering/perf_knobs_03.c
+++ b/examples/perf_steering/perf_knobs_03.c
@@ -126,7 +126,8 @@ int main(int argc, char **argv)
 
				 		const int id = starpu_perf_knob_name_to_id(scope_id, knob_name);
			
 
				 		STARPU_ASSERT(starpu_perf_knob_get_type_id(id) == starpu_perf_knob_type_name_to_id(knob_type_name));
			
 
				 
			
 
				-		struct starpu_codelet cl = {
			
 
				+		struct starpu_codelet cl =
			
 
				+		{
			
 
				 			.cpu_funcs = {cpu_func}
			
 
				 		};
			
 
				 
			
--- a/examples/pipeline/pipeline.c
+++ b/examples/pipeline/pipeline.c
@@ -176,7 +176,8 @@ static struct starpu_codelet pipeline_codelet_sum =
 
				 	.model = &pipeline_model_sum
			
 
				 };
			
 
				 
			
 
				-static void release_sem(void *arg) {
			
 
				+static void release_sem(void *arg)
			
 
				+{
			
 
				 	sem_post(arg);
			
 
				 };
			
 
				 
			
--- a/examples/ppm_downscaler/yuv_downscaler.c
+++ b/examples/ppm_downscaler/yuv_downscaler.c
@@ -141,14 +141,16 @@ int main(int argc, char **argv)
 
				 
			
 
				 	/* fetch input data */
			
 
				 	FILE *f_in = fopen(filename_in, "r");
			
 
				-	if (!f_in) {
			
 
				+	if (!f_in)
			
 
				+	{
			
 
				 		fprintf(stderr, "couldn't open input file %s\n", filename_in);
			
 
				 		exit(EXIT_FAILURE);
			
 
				 	}
			
 
				 
			
 
				 	/* allocate room for an output buffer */
			
 
				 	FILE *f_out = fopen(filename_out, "w+");
			
 
				-	if (!f_out) {
			
 
				+	if (!f_out)
			
 
				+	{
			
 
				 		fprintf(stderr, "couldn't open output file %s\n", filename_out);
			
 
				 		exit(EXIT_FAILURE);
			
 
				 	}
			
--- a/examples/scheduler/dummy_modular_sched.c
+++ b/examples/scheduler/dummy_modular_sched.c
@@ -170,7 +170,8 @@ static void init_dummy_sched(unsigned sched_ctx_id)
 
				 {
			
 
				 	FPRINTF(stderr, "Initialising Dummy scheduler\n");
			
 
				 
			
 
				-	struct dummy_sched_params params = {
			
 
				+	struct dummy_sched_params params =
			
 
				+	{
			
 
				 		.verbose = 0,
			
 
				 	};
			
 
				 
			
--- a/examples/spmv/spmv.c
+++ b/examples/spmv/spmv.c
@@ -245,10 +245,13 @@ int main(int argc, char **argv)
 
				 			vector_exp_out_ptr[row] += UPPER_BAND * vector_in_ptr[row+1];
			
 
				 	}
			
 
				 	for (row = 0; row < size; row++)
			
 
				-		if (vector_out_ptr[row] != vector_exp_out_ptr[row]) {
			
 
				+	{
			
 
				+		if (vector_out_ptr[row] != vector_exp_out_ptr[row])
			
 
				+		{
			
 
				 			FPRINTF(stderr, "check failed at %u: %f vs expected %f\n", row, vector_out_ptr[row], vector_exp_out_ptr[row]);
			
 
				 			exit(EXIT_FAILURE);
			
 
				 		}
			
 
				+	}
			
 
				 
			
 
				 	starpu_free(nzval);
			
 
				 	starpu_free(colind);
			
--- a/examples/tag_example/tag_example2.c
+++ b/examples/tag_example/tag_example2.c
@@ -70,7 +70,7 @@ static void tag_cleanup_grid(unsigned iter)
 
				 
			
 
				 	for (i = 0; i < ni; i++)
			
 
				 		starpu_tag_remove(TAG(i,iter));
			
 
				-} 
			
 
				+}
			
 
				 
			
 
				 static int create_task_grid(unsigned iter)
			
 
				 {
			
--- a/examples/tag_example/tag_example3.c
+++ b/examples/tag_example/tag_example3.c
@@ -72,7 +72,7 @@ static void tag_cleanup_grid(unsigned iter)
 
				 
			
 
				 	for (i = 0; i < ni; i++)
			
 
				 		starpu_tag_remove(TAG(i,iter));
			
 
				-} 
			
 
				+}
			
 
				 
			
 
				 static int create_task_grid(unsigned iter)
			
 
				 {
			
--- a/julia/mandelbrot/Makefile
+++ b/julia/mandelbrot/Makefile
@@ -0,0 +1,47 @@
 
				+CC=gcc
			
 
				+CFLAGS += -Wall -Wextra -O3 -mavx -mfma -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
			
 
				+
			
 
				+LDFLAGS +=$(shell pkg-config --libs starpu-1.3) -lm
			
 
				+EXTERNLIB=extern_tasks.so
			
 
				+GENERATEDLIB=generated_tasks.so
			
 
				+#OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
			
 
				+OBJECTS=$(wildcard gen*.c)
			
 
				+LIBPATH=${PWD}/../StarPU.jl/lib
			
 
				+
			
 
				+all: ${EXTERNLIB}
			
 
				+
			
 
				+mandelbrot: mandelbrot.c cpu_mandelbrot.o #gpu_mandelbrot.o
			
 
				+	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
			
 
				+
			
 
				+gpu_mandelbrot.o: gpu_mandelbrot.cu
			
 
				+	nvcc -c $(CFLAGS) $^ -o $@
			
 
				+
			
 
				+%.o: %.c
			
 
				+	$(CC) -c $(CFLAGS) $^ -o $@
			
 
				+
			
 
				+${EXTERNLIB}: cpu_mandelbrot.c
			
 
				+	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
			
 
				+
			
 
				+gpu_mandelbrot.so: gpu_mandelbrot.o
			
 
				+	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
			
 
				+
			
 
				+cpu_mandelbrot_sa: cpu_mandelbrot_sa.o
			
 
				+	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
			
 
				+
			
 
				+${GENERATEDLIB}: ${OBJECTS}
			
 
				+	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
			
 
				+
			
 
				+clean:
			
 
				+	rm -f mandelbrot *.so *.o c_*.genc gencuda_*.cu *.dat
			
 
				+
			
 
				+# Performance Tests
			
 
				+cstarpu.dat: mandelbrot
			
 
				+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./mandelbrot -0.800671 -0.158392 32 32 4096 4 > $@
			
 
				+julia_generatedc.dat:
			
 
				+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot.jl $@
			
 
				+julia_native.dat:
			
 
				+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot_native.jl $@
			
 
				+julia_calllib.dat: ${EXTERNLIB}
			
 
				+	LD_LIBRARY_PATH+=${LIBPATH} JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot.jl julia_calllib.dat
			
 
				+
			
 
				+test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat
			
--- a/julia/mandelbrot/cpu_mandelbrot.c
+++ b/julia/mandelbrot/cpu_mandelbrot.c
@@ -4,46 +4,50 @@
 
				 
			
 
				 void cpu_mandelbrot(void *descr[], void *cl_arg)
			
 
				 {
			
 
				-        long long int *pixels;
			
 
				-	float *params;
			
 
				+        long long *pixels;
			
 
				+        float *params;
			
 
				 
			
 
				         pixels = (long long int *)STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				-	params = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
			
 
				-
			
 
				-        int width = STARPU_MATRIX_GET_NX(descr[0]);
			
 
				-        int height = STARPU_MATRIX_GET_NY(descr[0]);
			
 
				-        
			
 
				-        int ldP = STARPU_MATRIX_GET_LD(descr[0]);
			
 
				+        params = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
			
 
				 
			
 
				+        long long width = STARPU_MATRIX_GET_NY(descr[0]);
			
 
				+        long long height = STARPU_MATRIX_GET_NX(descr[0]);
			
 
				+        double zoom = width * 0.25296875;
			
 
				+        double iz = 1. / zoom;
			
 
				+        float diverge = 4.0;
			
 
				+        float max_iterations = (width/2) * 0.049715909 * log10(zoom);
			
 
				+        float imi = 1. / max_iterations;
			
 
				         float centerr = params[0];
			
 
				         float centeri = params[1];
			
 
				         float offset = params[2];
			
 
				         float dim = params[3];
			
 
				-        float zoom = width * 0.25296875;
			
 
				-        float diverge = 4.0;
			
 
				-        int max_iter = (width/2) * 0.049715909 * log10(zoom);
			
 
				+        double cr = 0;
			
 
				+        double zr = 0;
			
 
				+        double ci = 0;
			
 
				+        double zi = 0;
			
 
				+        long long n = 0;
			
 
				+        double tmp = 0;
			
 
				+        int ldP = STARPU_MATRIX_GET_LD(descr[0]);
			
 
				 
			
 
				-        int x,y,n;
			
 
				+        long long x,y;
			
 
				 
			
 
				         for (y = 0; y < height; y++){
			
 
				                 for (x = 0; x < width; x++){
			
 
				-                        float cr = centerr + (x - (dim/2))/zoom;
			
 
				-                        float ci = centeri + (y+offset - (dim/2))/zoom;
			
 
				-                        float zr = cr;
			
 
				-                        float zi = ci;
			
 
				-                        
			
 
				-                        for (n = 0; n <= max_iter; n++) {
			
 
				+                        cr = centerr + (x - (dim/2)) * iz;
			
 
				+			zr = cr;
			
 
				+                        ci = centeri + (y+offset - (dim/2)) * iz;
			
 
				+                        zi = ci;
			
 
				+
			
 
				+                        for (n = 0; n <= max_iterations; n++) {
			
 
				 				if (zr*zr + zi*zi>diverge) break;
			
 
				-                                float tmp = zr*zr - zi*zi + cr;
			
 
				+                                tmp = zr*zr - zi*zi + cr;
			
 
				                                 zi = 2*zr*zi + ci;
			
 
				                                 zr = tmp;
			
 
				                         }
			
 
				-			int color;
			
 
				-			if (n<max_iter)
			
 
				-				color = round(15.*n/max_iter);
			
 
				+			if (n<max_iterations)
			
 
				+				pixels[y +x*ldP] = round(15.*n*imi);
			
 
				 			else
			
 
				-				color = 0;
			
 
				-			pixels[x*ldP + y] = color;
			
 
				+				pixels[y +x*ldP] = 0;
			
 
				 		}
			
 
				 	}
			
 
				 }
			
--- a/julia/mandelbrot/makefile
+++ b/julia/mandelbrot/makefile
@@ -1,35 +0,0 @@
 
				-# GCC compiler
			
 
				-CC=gcc-9
			
 
				-CFLAGS += -O3 -mavx -mfma -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
			
 
				-
			
 
				-LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
			
 
				-EXTERNLIB=extern_tasks.dylib
			
 
				-GENERATEDLIB=generated_tasks.dylib
			
 
				-OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
			
 
				-LIBPATH=${PWD}/../StarPU.jl/lib
			
 
				-
			
 
				-all: ${EXTERNLIB} 
			
 
				-
			
 
				-mult: mult.c cpu_mult.o #gpu_mult.o 
			
 
				-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)	
			
 
				-
			
 
				-gpu_mult.o: gpu_mult.cu
			
 
				-	nvcc -c $(CFLAGS) $^ -o $@
			
 
				-
			
 
				-%.o: %.c
			
 
				-	$(CC) -c $(CFLAGS) $^ -o $@
			
 
				-
			
 
				-${EXTERNLIB}: cpu_mandelbrot.o
			
 
				-	$(CC) -shared -fPIC $(LDFLAGS) $^ -o $@  
			
 
				-
			
 
				-gpu_mult.so: gpu_mult.o
			
 
				-	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
			
 
				-
			
 
				-${GENERATEDLIB}: ${OBJECTS}
			
 
				-	$(CC) -shared -fPIC $(LDFLAGS) $^ -o $@
			
 
				-
			
 
				-clean:
			
 
				-	rm *.so *.o *.dylib c_*.genc gencuda_*.cu *.dat
			
 
				-
			
 
				-
			
 
				-
			
--- a/julia/mandelbrot/mandelbrot.c
+++ b/julia/mandelbrot/mandelbrot.c
@@ -16,43 +16,33 @@
 
				 #include <stdio.h>
			
 
				 #include <stdlib.h>
			
 
				 #include <starpu.h>
			
 
				-#include "../display.h"
			
 
				 
			
 
				 void cpu_mandelbrot(void **, void *);
			
 
				 void gpu_mandelbrot(void **, void *);
			
 
				 
			
 
				-struct Params
			
 
				+static struct starpu_perfmodel model =
			
 
				 {
			
 
				-	float cr;
			
 
				-	float ci;
			
 
				-	unsigned taskx;
			
 
				-	unsigned tasky;
			
 
				-	unsigned width;
			
 
				-	unsigned height;
			
 
				+		.type = STARPU_HISTORY_BASED,
			
 
				+		.symbol = "history_perf"
			
 
				 };
			
 
				 
			
 
				-
			
 
				-
			
 
				-struct starpu_codelet cl =
			
 
				+static struct starpu_codelet cl =
			
 
				 {
			
 
				 	.cpu_funcs = {cpu_mandelbrot},
			
 
				-	.cuda_funcs = {gpu_mandelbrot},
			
 
				-	.nbuffers = 1,
			
 
				-	.modes = {STARPU_RW}
			
 
				+	//.cuda_funcs = {gpu_mandelbrot},
			
 
				+	.nbuffers = 2,
			
 
				+	.modes = {STARPU_W, STARPU_R},
			
 
				+	.model = &model
			
 
				 };
			
 
				 
			
 
				 
			
 
				-void mandelbrot_with_starpu(int *pixels, float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy)
			
 
				+void mandelbrot_with_starpu(long long *pixels, float *params, long long dim, long long nslicesx)
			
 
				 {
			
 
				-	starpu_data_handle_t p_handle;
			
 
				-
			
 
				-	starpu_matrix_data_register(&p_handle, STARPU_MAIN_RAM, (uintptr_t)pixels, width, width, height, sizeof(int));
			
 
				+	starpu_data_handle_t pixels_handle;
			
 
				+	starpu_data_handle_t params_handle;
			
 
				 
			
 
				-	struct starpu_data_filter vert =
			
 
				-	{
			
 
				-		.filter_func = starpu_matrix_filter_vertical_block,
			
 
				-		.nchildren = nslicesy
			
 
				-	};
			
 
				+	starpu_matrix_data_register(&pixels_handle, STARPU_MAIN_RAM, (uintptr_t)pixels, dim, dim, dim, sizeof(long long));
			
 
				+	starpu_matrix_data_register(&params_handle, STARPU_MAIN_RAM, (uintptr_t)params, 4*nslicesx, 4*nslicesx, 1, sizeof(float));
			
 
				 
			
 
				 	struct starpu_data_filter horiz =
			
 
				 	{
			
@@ -60,179 +50,100 @@ void mandelbrot_with_starpu(int *pixels, float cr, float ci, unsigned width, uns
 
				 		.nchildren = nslicesx
			
 
				 	};
			
 
				 
			
 
				-	starpu_data_map_filters(p_handle, 2, &vert, &horiz);
			
 
				+	starpu_data_partition(pixels_handle, &horiz);
			
 
				+	starpu_data_partition(params_handle, &horiz);
			
 
				 
			
 
				-	unsigned taskx, tasky;
			
 
				-
			
 
				-	struct Params *params = malloc(nslicesx*nslicesy*sizeof(struct Params));
			
 
				+	long long taskx;
			
 
				 
			
 
				 	for (taskx = 0; taskx < nslicesx; taskx++){
			
 
				-		for (tasky = 0; tasky < nslicesy; tasky++){
			
 
				-			struct starpu_task *task = starpu_task_create();
			
 
				-			
			
 
				-			task->cl = &cl;
			
 
				-			task->handles[0] = starpu_data_get_sub_data(p_handle, 2, tasky, taskx);
			
 
				-			struct Params param = {cr, ci, taskx, tasky, width, height};
			
 
				-
			
 
				-			params[taskx + tasky*nslicesx] = param;
			
 
				-
			
 
				-			task->cl_arg = (params + taskx + tasky * nslicesx);
			
 
				-			task->cl_arg_size = sizeof(struct Params);
			
 
				-			
			
 
				-			starpu_task_submit(task);
			
 
				-		}
			
 
				+		struct starpu_task *task = starpu_task_create();
			
 
				+
			
 
				+		task->cl = &cl;
			
 
				+		task->handles[0] = starpu_data_get_child(pixels_handle, taskx);
			
 
				+		task->handles[1] = starpu_data_get_child(params_handle, taskx);
			
 
				+		if (starpu_task_submit(task)!=0) fprintf(stderr,"submit task error\n");
			
 
				 	}
			
 
				-	starpu_task_wait_for_all();
			
 
				 
			
 
				-	starpu_data_unpartition(p_handle, STARPU_MAIN_RAM);
			
 
				+	starpu_task_wait_for_all();
			
 
				 
			
 
				-	starpu_data_unregister(p_handle);
			
 
				+	starpu_data_unpartition(pixels_handle, STARPU_MAIN_RAM);
			
 
				+	starpu_data_unpartition(params_handle, STARPU_MAIN_RAM);
			
 
				 
			
 
				-	free(params);
			
 
				+	starpu_data_unregister(pixels_handle);
			
 
				+	starpu_data_unregister(params_handle);
			
 
				 }
			
 
				 
			
 
				-void init_zero(int * pixels, unsigned width, unsigned height)
			
 
				+void pixels2img(long long *pixels, long long width, long long height, const char *filename)
			
 
				 {
			
 
				-	unsigned i,j;
			
 
				-	for (i = 0; i < height; i++){
			
 
				-		for (j = 0; j < width; j++){
			
 
				-			pixels[j + i*width] = 0;
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				+  FILE *fp = fopen(filename, "w");
			
 
				+  if (!fp)
			
 
				+    return;
			
 
				 
			
 
				-void sort(double *arr, unsigned nbr_tests)
			
 
				-{
			
 
				-	unsigned j;
			
 
				-	
			
 
				-	int is_sort = 0;
			
 
				-	
			
 
				-	while (!is_sort){
			
 
				-
			
 
				-		is_sort = 1;
			
 
				-		
			
 
				-		for (j = 0; j < nbr_tests - 1; j++){
			
 
				-			if (arr[j] > arr[j+1]){
			
 
				-				is_sort = 0;
			
 
				-				double tmp = arr[j];
			
 
				-				arr[j] = arr[j+1];
			
 
				-				arr[j+1] = tmp;
			
 
				-			}
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-double median_time(float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy, unsigned nbr_tests)
			
 
				-{
			
 
				-	int *Pixels = malloc(width*height*sizeof(int));
			
 
				-	
			
 
				-	unsigned i;
			
 
				+  int MAPPING[16][3] = {{66,30,15},{25,7,26},{9,1,47},{4,4,73},{0,7,100},{12,44,138},{24,82,177},{57,125,209},{134,181,229},{211,236,248},{241,233,191},{248,201,95},{255,170,0},{204,128,0},{153,87,0},{106,52,3}};
			
 
				 
			
 
				-	double exec_times[nbr_tests];
			
 
				+  fprintf(fp, "P3\n%lld %lld\n255\n", width, height);
			
 
				+  long long i, j;
			
 
				+  for (i = 0; i < height; ++i) {
			
 
				+    for (j = 0; j < width; ++j) {
			
 
				+      fprintf(fp, "%d %d %d ", MAPPING[pixels[j*width+i]][0], MAPPING[pixels[j*width+i]][1], MAPPING[pixels[j*width+i]][2]);
			
 
				+    }
			
 
				+  }
			
 
				 
			
 
				-	double start, stop, exec_t;
			
 
				-	for (i = 0; i < nbr_tests; i++){
			
 
				-		init_zero(Pixels, width, height);
			
 
				-		
			
 
				-		start = starpu_timing_now(); // starpu_timing_now() gives the time in microseconds.
			
 
				-		mandelbrot_with_starpu(Pixels, cr, ci, width, height, nslicesx, nslicesy);
			
 
				-		stop = starpu_timing_now();
			
 
				-		
			
 
				-		exec_t = (stop-start)/1.e6;
			
 
				-		exec_times[i] = exec_t;
			
 
				-	}
			
 
				-	char filename[30];
			
 
				-	sprintf(filename, "PPM/mandelbrot%d.ppm", width);
			
 
				-	printf("%s\n", filename);
			
 
				-
			
 
				-	mandelbrot_graph(filename, Pixels, width, height);
			
 
				-
			
 
				-	free(Pixels);
			
 
				-
			
 
				-	sort(exec_times, nbr_tests);
			
 
				-
			
 
				-	return exec_times[nbr_tests/2];	
			
 
				+  fclose(fp);
			
 
				 }
			
 
				 
			
 
				-void fluctuation_time(float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy, unsigned nbr_tests, double *exec_times)
			
 
				+double min_times(double cr, double ci, long long dim, long long nslices)
			
 
				 {
			
 
				-	int *Pixels = malloc(width*height*sizeof(int));
			
 
				-	
			
 
				-	unsigned i;
			
 
				+	long long *pixels = calloc(dim*dim, sizeof(long long));
			
 
				+	float *params = calloc(4*nslices, sizeof(float));
			
 
				+
			
 
				+	double t_min = 0;
			
 
				+	long long i;
			
 
				+
			
 
				+	for (i=0; i<nslices; i++) {
			
 
				+		params[4*i+0] = cr;
			
 
				+		params[4*i+1] = ci;
			
 
				+		params[4*i+2] = i*dim/nslices;
			
 
				+		params[4*i+3] = dim;
			
 
				+	}
			
 
				 
			
 
				 	double start, stop, exec_t;
			
 
				-	for (i = 0; i < nbr_tests; i++){
			
 
				-		init_zero(Pixels, width, height);
			
 
				-		
			
 
				+	for (i = 0; i < 10; i++){
			
 
				 		start = starpu_timing_now(); // starpu_timing_now() gives the time in microseconds.
			
 
				-		mandelbrot_with_starpu(Pixels, cr, ci, width, height, nslicesx, nslicesy);
			
 
				+		mandelbrot_with_starpu(pixels, params, dim, nslices);
			
 
				 		stop = starpu_timing_now();
			
 
				-		
			
 
				-		exec_t = (stop-start)/1.e6;
			
 
				-		exec_times[i] = exec_t;
			
 
				-
			
 
				-		/* char filename[33]; */
			
 
				-		/* sprintf(filename, "../PPM/mandelbrot%d.ppm", i + 1); */
			
 
				-		/* printf("%s\n", filename); */
			
 
				-		/* mandelbrot_graph(filename, Pixels, width, height); */
			
 
				+		exec_t = (stop-start)*1.e3;
			
 
				+		if (t_min==0 || t_min>exec_t)
			
 
				+		  t_min = exec_t;
			
 
				 	}
			
 
				 
			
 
				+	char filename[64];
			
 
				+	snprintf(filename, 64, "out%lld.ppm", dim);
			
 
				+	pixels2img(pixels,dim,dim,filename);
			
 
				 
			
 
				-	free(Pixels);
			
 
				-
			
 
				-
			
 
				+	free(pixels);
			
 
				+	free(params);
			
 
				 
			
 
				-	
			
 
				+	return t_min;
			
 
				 }
			
 
				 
			
 
				-
			
 
				-void display_times(float cr, float ci, unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nslices, unsigned nbr_tests)
			
 
				+void display_times(double cr, double ci, long long start_dim, long long step_dim, long long stop_dim, long long nslices)
			
 
				 {
			
 
				-	
			
 
				-	unsigned dim;
			
 
				-
			
 
				-	FILE *myfile;
			
 
				-	myfile = fopen("DAT/mandelbrot_c_struct_times.dat", "w");
			
 
				-
			
 
				-	for (dim = start_dim; dim <= stop_dim; dim += step_dim){
			
 
				-		printf("Dimension: %u...\n", dim);
			
 
				-		double t = median_time(cr, ci, dim, dim, nslices, nslices, nbr_tests);
			
 
				-		
			
 
				-		printf("w = %u ; h = %u ; t = %f\n", dim, dim, t);
			
 
				-		
			
 
				-		fprintf(myfile, "%f\n", t);
			
 
				-		}
			
 
				-	
			
 
				-	fclose(myfile);
			
 
				-}
			
 
				 
			
 
				-void display_fluctuations(float cr, float ci, unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nslices, unsigned nbr_tests)
			
 
				-{
			
 
				-	
			
 
				-	unsigned dim;
			
 
				-
			
 
				-	FILE *myfile;
			
 
				-	myfile = fopen("DAT/mandelbrot_c_fluctuation.dat", "w");
			
 
				-
			
 
				-	double *exec_times = malloc(nbr_tests * sizeof(double));
			
 
				-	fluctuation_time(cr, ci, start_dim, start_dim, nslices, nslices, nbr_tests, exec_times);
			
 
				-		
			
 
				-	/* printf("w = %u ; h = %u ; t = %f\n", dim, dim, t); */
			
 
				-	unsigned i;
			
 
				-	for (i = 0; i < nbr_tests; i++){
			
 
				-		printf("test %u: %f seconds\n", i, exec_times[i]);
			
 
				-		fprintf(myfile, "%u %f\n", i, exec_times[i]);
			
 
				+	long long dim;
			
 
				+
			
 
				+	for (dim = start_dim; dim <= stop_dim; dim += step_dim) {
			
 
				+		printf("Dimension: %lld...\n", dim);
			
 
				+		double res = min_times(cr, ci, dim, nslices);
			
 
				+		res = res / dim / dim; // time per pixel
			
 
				+		printf("%lld %lf\n", dim, res);
			
 
				 	}
			
 
				-	
			
 
				-	fclose(myfile);
			
 
				-	free(exec_times);
			
 
				 }
			
 
				 
			
 
				-
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				-
			
 
				-	if (argc != 8){
			
 
				-		printf("Usage: %s cr ci start_dim step_dim stop_dim nslices(must divide dims) nbr_tests\n", argv[0]);
			
 
				+	if (argc != 7){
			
 
				+		printf("Usage: %s cr ci start_dim step_dim stop_dim nslices(must divide dims)\n", argv[0]);
			
 
				 		return 1;
			
 
				 	}
			
 
				 	if (starpu_init(NULL) != EXIT_SUCCESS){
			
@@ -240,24 +151,16 @@ int main(int argc, char **argv)
 
				 		return 77;
			
 
				 	}
			
 
				 
			
 
				+	double cr = (float) atof(argv[1]);
			
 
				+	double ci = (float) atof(argv[2]);
			
 
				+	long long start_dim = atoll(argv[3]);
			
 
				+	long long step_dim = atoll(argv[4]);
			
 
				+	long long stop_dim = atoll(argv[5]);
			
 
				+	long long nslices = atoll(argv[6]);
			
 
				 
			
 
				-	
			
 
				-	float cr = (float) atof(argv[1]);
			
 
				-	float ci = (float) atof(argv[2]);
			
 
				-	unsigned start_dim = (unsigned) atoi(argv[3]);
			
 
				-	unsigned step_dim = (unsigned) atoi(argv[4]);	
			
 
				-	unsigned stop_dim = (unsigned) atoi(argv[5]);
			
 
				-	unsigned nslices = (unsigned) atoi(argv[6]);
			
 
				-	unsigned nbr_tests = (unsigned) atoi(argv[7]);
			
 
				-
			
 
				-	display_times(cr, ci, start_dim, step_dim, stop_dim, nslices, nbr_tests);
			
 
				-	
			
 
				-	
			
 
				-	/* display_fluctuations(cr, ci, start_dim, step_dim, stop_dim, nslices, nbr_tests); */
			
 
				-
			
 
				+	display_times(cr, ci, start_dim, step_dim, stop_dim, nslices);
			
 
				 
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-
			
 
				 	return 0;
			
 
				 }
			
--- a/julia/mandelbrot/mandelbrot_native.jl
+++ b/julia/mandelbrot/mandelbrot_native.jl
@@ -0,0 +1,106 @@
 
				+using LinearAlgebra
			
 
				+
			
 
				+function mandelbrot(pixels, params) :: Float32
			
 
				+    height :: Int64, width :: Int64 = size(pixels)
			
 
				+    zoom :: Float64 = width * 0.25296875
			
 
				+    iz :: Float64 = 1. / zoom
			
 
				+    diverge :: Float32 = 4.0
			
 
				+    max_iterations :: Float32 = ((width/2) * 0.049715909 * log10(zoom));
			
 
				+    imi :: Float32 = 1. / max_iterations
			
 
				+    centerr :: Float32 = params[1]
			
 
				+    centeri :: Float32 = params[2]
			
 
				+    offset :: Float32 = params[3]
			
 
				+    dim :: Float32 = params[4]
			
 
				+    cr :: Float64 = 0.
			
 
				+    zr :: Float64 = 0.
			
 
				+    ci :: Float64 = 0.
			
 
				+    zi :: Float64 = 0.
			
 
				+    n :: Int64 = 0
			
 
				+    tmp :: Float64 = 0.
			
 
				+    for y = 1:height
			
 
				+        for x = 1:width
			
 
				+            cr = centerr + (x-1 - (dim / 2)) * iz
			
 
				+            zr = cr
			
 
				+            ci = centeri + (y-1+offset - (dim / 2)) * iz
			
 
				+            zi = ci
			
 
				+            for n = 0:max_iterations
			
 
				+                if (zr*zr + zi*zi > diverge)
			
 
				+                    break
			
 
				+                end
			
 
				+                tmp = zr*zr - zi*zi + cr
			
 
				+                zi = 2*zr*zi + ci
			
 
				+                zr = tmp
			
 
				+            end
			
 
				+
			
 
				+            if (n < max_iterations)
			
 
				+                pixels[y,x] = round(15 * n * imi)
			
 
				+            else
			
 
				+                pixels[y,x] = 0
			
 
				+            end
			
 
				+        end
			
 
				+    end
			
 
				+
			
 
				+    ret :: Float32 = 0.
			
 
				+    return ret
			
 
				+end
			
 
				+
			
 
				+function mandelbrot_without_starpu(A ::Matrix{Int64}, params ::Matrix{Float32}, nslicesx ::Int64)
			
 
				+    width,height = size(A)
			
 
				+    step = height / nslicesx
			
 
				+
			
 
				+    for taskx in (1 : nslicesx)
			
 
				+        start_id = floor(Int64, (taskx-1)*step+1)
			
 
				+        end_id = floor(Int64, (taskx-1)*step+step)
			
 
				+        a = view(A, start_id:end_id, :)
			
 
				+        p = view(params, (taskx-1)*4+1:(taskx-1)*4+4)
			
 
				+
			
 
				+        mandelbrot(a, p)
			
 
				+    end
			
 
				+end
			
 
				+
			
 
				+function pixels2img(pixels ::Matrix{Int64}, width ::Int64, height ::Int64, filename ::String)
			
 
				+    MAPPING = [[66,30,15],[25,7,26],[9,1,47],[4,4,73],[0,7,100],[12,44,138],[24,82,177],[57,125,209],[134,181,229],[211,236,248],[241,233,191],[248,201,95],[255,170,0],[204,128,0],[153,87,0],[106,52,3]]
			
 
				+    open(filename, "w") do f
			
 
				+        write(f, "P3\n$width $height\n255\n")
			
 
				+        for i = 1:height
			
 
				+            for j = 1:width
			
 
				+                write(f,"$(MAPPING[1+pixels[i,j]][1]) $(MAPPING[1+pixels[i,j]][2]) $(MAPPING[1+pixels[i,j]][3]) ")
			
 
				+            end
			
 
				+            write(f, "\n")
			
 
				+        end
			
 
				+    end
			
 
				+end
			
 
				+
			
 
				+function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
			
 
				+    tmin=0;
			
 
				+
			
 
				+    pixels ::Matrix{Int64} = zeros(dim, dim)
			
 
				+    params :: Matrix{Float32} = zeros(4*nslices,1)
			
 
				+    for i=0:(nslices-1)
			
 
				+        params[4*i+1,1] = cr
			
 
				+        params[4*i+2,1] = ci
			
 
				+        params[4*i+3,1] = i*dim/nslices
			
 
				+        params[4*i+4,1] = dim
			
 
				+    end
			
 
				+    for i = 1:10
			
 
				+        t = time_ns();
			
 
				+        mandelbrot_without_starpu(pixels, params, nslices)
			
 
				+        t = time_ns()-t
			
 
				+        if (tmin==0 || tmin>t)
			
 
				+            tmin=t
			
 
				+        end
			
 
				+    end
			
 
				+    pixels2img(pixels,dim,dim,"out$(dim).ppm")
			
 
				+    return tmin
			
 
				+end
			
 
				+
			
 
				+function display_time(cr ::Float64, ci ::Float64, start_dim ::Int64, step_dim ::Int64, stop_dim ::Int64, nslices ::Int64)
			
 
				+    for dim in (start_dim : step_dim : stop_dim)
			
 
				+        res = min_times(cr, ci, dim, nslices)
			
 
				+        res=res/dim/dim; # time per pixel
			
 
				+        println("$(dim) $(res)")
			
 
				+    end
			
 
				+end
			
 
				+
			
 
				+
			
 
				+display_time(-0.800671,-0.158392,32,32,4096,4)
			
--- a/mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
+++ b/mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
@@ -115,6 +115,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
			
 
				 	starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				 	start = starpu_timing_now();
			
 
				 
			
@@ -159,9 +160,9 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 
				 		starpu_iteration_pop();
			
 
				 	}
			
 
				 
			
 
				-	starpu_task_wait_for_all();
			
 
				-
			
 
				+	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
			
 
				 	starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				+
			
 
				 	end = starpu_timing_now();
			
 
				 
			
 
				 	for (m = 0; m < nblocks; m++)
			
--- a/mpi/examples/mpi_lu/pxlu.c
+++ b/mpi/examples/mpi_lu/pxlu.c
@@ -899,6 +899,8 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size, unsig
 
				 		starpu_iteration_pop();
			
 
				 	}
			
 
				 
			
 
				+	int wait_ret = starpu_mpi_wait_for_all(MPI_COMM_WORLD);
			
 
				+	STARPU_ASSERT(wait_ret == MPI_SUCCESS);
			
 
				 	int barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
			
 
				 
			
--- a/mpi/examples/mpi_lu/pxlu_implicit.c
+++ b/mpi/examples/mpi_lu/pxlu_implicit.c
@@ -57,7 +57,7 @@ static void create_task_11(unsigned k)
 
				 static void create_task_12(unsigned k, unsigned j)
			
 
				 {
			
 
				 #ifdef STARPU_DEVEL
			
 
				-#warning temporary fix 
			
 
				+#warning temporary fix
			
 
				 #endif
			
 
				 	starpu_mpi_task_insert(MPI_COMM_WORLD,
			
 
				 			       //&STARPU_PLU(cl12),
			
@@ -79,7 +79,7 @@ static void create_task_12(unsigned k, unsigned j)
 
				 static void create_task_21(unsigned k, unsigned i)
			
 
				 {
			
 
				 #ifdef STARPU_DEVEL
			
 
				-#warning temporary fix 
			
 
				+#warning temporary fix
			
 
				 #endif
			
 
				 	starpu_mpi_task_insert(MPI_COMM_WORLD,
			
 
				 			       //&STARPU_PLU(cl21),
			
@@ -114,13 +114,14 @@ static void create_task_22(unsigned k, unsigned i, unsigned j)
 
				 }
			
 
				 
			
 
				 /*
			
 
				- *	code to bootstrap the factorization 
			
 
				+ *	code to bootstrap the factorization
			
 
				  */
			
 
				 
			
 
				 double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size, unsigned _no_prio)
			
 
				 {
			
 
				 	double start;
			
 
				 	double end;
			
 
				+	int ret;
			
 
				 
			
 
				 	nblocks = _nblocks;
			
 
				 	rank = _rank;
			
@@ -130,7 +131,10 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size, unsig
 
				 	/* create all the DAG nodes */
			
 
				 	unsigned i,j,k;
			
 
				 
			
 
				-	starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				+	ret = starpu_mpi_wait_for_all(MPI_COMM_WORLD);
			
 
				+	STARPU_ASSERT(ret == MPI_SUCCESS);
			
 
				+	ret = starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				+	STARPU_ASSERT(ret == MPI_SUCCESS);
			
 
				 
			
 
				 	start = starpu_timing_now();
			
 
				 
			
@@ -170,15 +174,16 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size, unsig
 
				 		starpu_iteration_pop();
			
 
				 	}
			
 
				 
			
 
				-	starpu_task_wait_for_all();
			
 
				-
			
 
				-	starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				+	ret = starpu_mpi_wait_for_all(MPI_COMM_WORLD);
			
 
				+	STARPU_ASSERT(ret == MPI_SUCCESS);
			
 
				+	ret = starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				+	STARPU_ASSERT(ret == MPI_SUCCESS);
			
 
				 
			
 
				 	end = starpu_timing_now();
			
 
				 
			
 
				 	double timing = end - start;
			
 
				-	
			
 
				+
			
 
				 //	fprintf(stderr, "RANK %d -> took %f ms\n", rank, timing/1000);
			
 
				-	
			
 
				+
			
 
				 	return timing;
			
 
				 }
			
--- a/mpi/examples/user_datatype/user_datatype.c
+++ b/mpi/examples/user_datatype/user_datatype.c
@@ -120,8 +120,8 @@ int main(int argc, char **argv)
 
				 		starpu_mpi_isend_detached(handle0, 0, 20, MPI_COMM_WORLD, NULL, NULL);
			
 
				 	}
			
 
				 
			
 
				-	starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				 	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
			
 
				+	starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				 
			
 
				 	starpu_mpi_datatype_unregister(handle0);
			
 
				 	starpu_data_unregister(handle0);
			
--- a/mpi/examples/user_datatype/user_datatype2.c
+++ b/mpi/examples/user_datatype/user_datatype2.c
@@ -80,8 +80,8 @@ int main(int argc, char **argv)
 
				 		starpu_mpi_isend_detached(handle0, 0, 20, MPI_COMM_WORLD, NULL, NULL);
			
 
				 	}
			
 
				 
			
 
				-	starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				 	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
			
 
				+	starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				 
			
 
				 	starpu_mpi_datatype_unregister(handle0);
			
 
				 	starpu_data_unregister(handle0);
			
--- a/mpi/src/mpi/starpu_mpi_mpi.c
+++ b/mpi/src/mpi/starpu_mpi_mpi.c
@@ -96,7 +96,7 @@ starpu_pthread_queue_t _starpu_mpi_thread_dontsleep;
 
				 /* Count requests posted by the application and not yet submitted to MPI */
			
 
				 static starpu_pthread_mutex_t mutex_posted_requests;
			
 
				 static starpu_pthread_mutex_t mutex_ready_requests;
			
 
				-static int posted_requests = 0, ready_requests = 0, newer_requests, barrier_running = 0;
			
 
				+static int posted_requests = 0, ready_requests = 0, newer_requests, mpi_wait_for_all_running = 0;
			
 
				 
			
 
				 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
			
 
				 #define _STARPU_MPI_INC_READY_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_ready_requests); ready_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_ready_requests); }
			
@@ -761,16 +761,40 @@ static void _starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
 
				 int _starpu_mpi_barrier(MPI_Comm comm)
			
 
				 {
			
 
				 	struct _starpu_mpi_req *barrier_req;
			
 
				-	int ret = posted_requests+ready_requests;
			
 
				 
			
 
				+	/* Initialize the request structure */
			
 
				+	_starpu_mpi_request_init(&barrier_req);
			
 
				+	barrier_req->prio = INT_MAX;
			
 
				+	barrier_req->func = _starpu_mpi_barrier_func;
			
 
				+	barrier_req->request_type = BARRIER_REQ;
			
 
				+	barrier_req->node_tag.node.comm = comm;
			
 
				+
			
 
				+	_STARPU_MPI_INC_POSTED_REQUESTS(1);
			
 
				+	_starpu_mpi_submit_ready_request(barrier_req);
			
 
				+
			
 
				+	/* We wait for the MPI request to finish */
			
 
				+	STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->backend->req_mutex);
			
 
				+	while (!barrier_req->completed)
			
 
				+		STARPU_PTHREAD_COND_WAIT(&barrier_req->backend->req_cond, &barrier_req->backend->req_mutex);
			
 
				+	STARPU_PTHREAD_MUTEX_UNLOCK(&barrier_req->backend->req_mutex);
			
 
				+
			
 
				+	_starpu_mpi_request_destroy(barrier_req);
			
 
				+	_STARPU_MPI_LOG_OUT();
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+int _starpu_mpi_wait_for_all(MPI_Comm comm)
			
 
				+{
			
 
				+	(void) comm;
			
 
				 	_STARPU_MPI_LOG_IN();
			
 
				 
			
 
				 	/* First wait for *both* all tasks and MPI requests to finish, in case
			
 
				 	 * some tasks generate MPI requests, MPI requests generate tasks, etc.
			
 
				 	 */
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
			
 
				-	STARPU_MPI_ASSERT_MSG(!barrier_running, "Concurrent starpu_mpi_barrier is not implemented, even on different communicators");
			
 
				-	barrier_running = 1;
			
 
				+	STARPU_MPI_ASSERT_MSG(!mpi_wait_for_all_running, "Concurrent starpu_mpi_wait_for_all is not implemented, even on different communicators");
			
 
				+	mpi_wait_for_all_running = 1;
			
 
				 	do
			
 
				 	{
			
 
				 		while (posted_requests || ready_requests)
			
@@ -786,29 +810,9 @@ int _starpu_mpi_barrier(MPI_Comm comm)
 
				 		 * triggered by tasks completed and triggered tasks between
			
 
				 		 * wait_for_all finished and we take the lock */
			
 
				 	} while (posted_requests || ready_requests || newer_requests);
			
 
				-	barrier_running = 0;
			
 
				+	mpi_wait_for_all_running = 0;
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
			
 
				-
			
 
				-	/* Initialize the request structure */
			
 
				-	_starpu_mpi_request_init(&barrier_req);
			
 
				-	barrier_req->prio = INT_MAX;
			
 
				-	barrier_req->func = _starpu_mpi_barrier_func;
			
 
				-	barrier_req->request_type = BARRIER_REQ;
			
 
				-	barrier_req->node_tag.node.comm = comm;
			
 
				-
			
 
				-	_STARPU_MPI_INC_POSTED_REQUESTS(1);
			
 
				-	_starpu_mpi_submit_ready_request(barrier_req);
			
 
				-
			
 
				-	/* We wait for the MPI request to finish */
			
 
				-	STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->backend->req_mutex);
			
 
				-	while (!barrier_req->completed)
			
 
				-		STARPU_PTHREAD_COND_WAIT(&barrier_req->backend->req_cond, &barrier_req->backend->req_mutex);
			
 
				-	STARPU_PTHREAD_MUTEX_UNLOCK(&barrier_req->backend->req_mutex);
			
 
				-
			
 
				-	_starpu_mpi_request_destroy(barrier_req);
			
 
				-	_STARPU_MPI_LOG_OUT();
			
 
				-
			
 
				-	return ret;
			
 
				+	return 0;
			
 
				 }
			
 
				 
			
 
				 /********************************************************/
			
@@ -1269,7 +1273,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 			_STARPU_MPI_DEBUG(3, "NO MORE REQUESTS TO HANDLE\n");
			
 
				 			_STARPU_MPI_TRACE_SLEEP_BEGIN();
			
 
				 
			
 
				-			if (barrier_running)
			
 
				+			if (mpi_wait_for_all_running)
			
 
				 				/* Tell mpi_barrier */
			
 
				 				STARPU_PTHREAD_COND_SIGNAL(&barrier_cond);
			
 
				 			STARPU_PTHREAD_COND_WAIT(&progress_cond, &progress_mutex);
			
--- a/mpi/src/nmad/starpu_mpi_nmad.c
+++ b/mpi/src/nmad/starpu_mpi_nmad.c
@@ -59,16 +59,20 @@ static starpu_pthread_cond_t progress_cond;
 
				 static starpu_pthread_mutex_t progress_mutex;
			
 
				 static volatile int running = 0;
			
 
				 
			
 
				-extern struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count);
			
 
				+static starpu_pthread_cond_t mpi_wait_for_all_running_cond;
			
 
				+static int mpi_wait_for_all_running = 0;
			
 
				+static starpu_pthread_mutex_t mpi_wait_for_all_running_mutex;
			
 
				 
			
 
				-/* Count requests posted by the application and not yet submitted to MPI, i.e pushed into the new_requests list */
			
 
				+extern struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count);
			
 
				 
			
 
				-static volatile int pending_request = 0;
			
 
				+/* Count running requests: this counter is incremented just before StarPU
			
 
				+ * submits a MPI request, and decremented when a MPI request finishes. */
			
 
				+static volatile int nb_pending_requests = 0;
			
 
				 
			
 
				 #define REQ_FINALIZED 0x1
			
 
				 
			
 
				-PUK_LFSTACK_TYPE(callback,	struct _starpu_mpi_req *req;);
			
 
				-static callback_lfstack_t callback_stack = NULL;
			
 
				+PUK_LFSTACK_TYPE(callback, struct _starpu_mpi_req *req;);
			
 
				+static callback_lfstack_t callback_stack;
			
 
				 
			
 
				 static starpu_sem_t callback_sem;
			
 
				 
			
@@ -80,7 +84,7 @@ static starpu_sem_t callback_sem;
 
				 
			
 
				 void _starpu_mpi_req_willpost(struct _starpu_mpi_req *req STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				-	STARPU_ATOMIC_ADD( &pending_request, 1);
			
 
				+	STARPU_ATOMIC_ADD( &nb_pending_requests, 1);
			
 
				 }
			
 
				 
			
 
				 /********************************************************/
			
@@ -269,16 +273,39 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 
				 int _starpu_mpi_barrier(MPI_Comm comm)
			
 
				 {
			
 
				 	_STARPU_MPI_LOG_IN();
			
 
				-	int ret;
			
 
				-	//	STARPU_ASSERT_MSG(!barrier_running, "Concurrent starpu_mpi_barrier is not implemented, even on different communicators");
			
 
				-	ret = MPI_Barrier(comm);
			
 
				 
			
 
				+	int ret = MPI_Barrier(comm);
			
 
				 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Barrier returning %d", ret);
			
 
				 
			
 
				 	_STARPU_MPI_LOG_OUT();
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+int _starpu_mpi_wait_for_all(MPI_Comm comm)
			
 
				+{
			
 
				+	(void) comm;
			
 
				+	_STARPU_MPI_LOG_IN();
			
 
				+
			
 
				+	STARPU_PTHREAD_MUTEX_LOCK(&mpi_wait_for_all_running_mutex);
			
 
				+	STARPU_MPI_ASSERT_MSG(!mpi_wait_for_all_running, "Concurrent starpu_mpi_wait_for_all is not implemented, even on different communicators");
			
 
				+	mpi_wait_for_all_running = 1;
			
 
				+	do
			
 
				+	{
			
 
				+		while (nb_pending_requests)
			
 
				+			STARPU_PTHREAD_COND_WAIT(&mpi_wait_for_all_running_cond, &mpi_wait_for_all_running_mutex);
			
 
				+		STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_wait_for_all_running_mutex);
			
 
				+
			
 
				+		starpu_task_wait_for_all();
			
 
				+
			
 
				+		STARPU_PTHREAD_MUTEX_LOCK(&mpi_wait_for_all_running_mutex);
			
 
				+	} while (nb_pending_requests);
			
 
				+	mpi_wait_for_all_running = 0;
			
 
				+	STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_wait_for_all_running_mutex);
			
 
				+
			
 
				+	_STARPU_MPI_LOG_OUT();
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				 /********************************************************/
			
 
				 /*                                                      */
			
 
				 /*  Progression                                         */
			
@@ -353,9 +380,13 @@ void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,nm_sr_ev
 
				 			req->completed = 1;
			
 
				 			piom_cond_signal(&req->backend->req_cond, REQ_FINALIZED);
			
 
				 		}
			
 
				-		int pending_remaining = STARPU_ATOMIC_ADD(&pending_request, -1);
			
 
				-		if (!running && !pending_remaining)
			
 
				-			starpu_sem_post(&callback_sem);
			
 
				+		int pending_remaining = STARPU_ATOMIC_ADD(&nb_pending_requests, -1);
			
 
				+		if (!pending_remaining)
			
 
				+		{
			
 
				+			STARPU_PTHREAD_COND_BROADCAST(&mpi_wait_for_all_running_cond);
			
 
				+			if (!running)
			
 
				+				starpu_sem_post(&callback_sem);
			
 
				+		}
			
 
				 	}
			
 
				 	_STARPU_MPI_LOG_OUT();
			
 
				 }
			
@@ -476,24 +507,24 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 		struct callback_lfstack_cell_s* c = callback_lfstack_pop(&callback_stack);
			
 
				 		int err=0;
			
 
				 
			
 
				-		if(running || pending_request>0)
			
 
				+		if(running || nb_pending_requests>0)
			
 
				 		{
			
 
				 			/* shall we block ? */
			
 
				 			err = starpu_sem_wait(&callback_sem);
			
 
				-			//running pending_request can change while waiting
			
 
				+			//running nb_pending_requests can change while waiting
			
 
				 		}
			
 
				 		if(c==NULL)
			
 
				 		{
			
 
				 			c = callback_lfstack_pop(&callback_stack);
			
 
				 			if (c == NULL)
			
 
				 			{
			
 
				-				if(running && pending_request>0)
			
 
				+				if(running && nb_pending_requests>0)
			
 
				 				{
			
 
				 					STARPU_ASSERT_MSG(c!=NULL, "Callback thread awakened without callback ready with error %d.",err);
			
 
				 				}
			
 
				 				else
			
 
				 				{
			
 
				-					if (pending_request==0)
			
 
				+					if (nb_pending_requests==0)
			
 
				 						break;
			
 
				 				}
			
 
				 				continue;
			
@@ -511,14 +542,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 			c->req->completed=1;
			
 
				 			piom_cond_signal(&(c->req->backend->req_cond), REQ_FINALIZED);
			
 
				 		}
			
 
				-		STARPU_ATOMIC_ADD( &pending_request, -1);
			
 
				+		STARPU_ATOMIC_ADD( &nb_pending_requests, -1);
			
 
				 		/* we signal that the request is completed.*/
			
 
				 
			
 
				 		free(c);
			
 
				 
			
 
				 	}
			
 
				 	STARPU_ASSERT_MSG(callback_lfstack_pop(&callback_stack)==NULL, "List of callback not empty.");
			
 
				-	STARPU_ASSERT_MSG(pending_request==0, "Request still pending.");
			
 
				+	STARPU_ASSERT_MSG(nb_pending_requests==0, "Request still pending.");
			
 
				 
			
 
				 	if (argc_argv->initialize_mpi)
			
 
				 	{
			
@@ -580,6 +611,9 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 
				         STARPU_PTHREAD_MUTEX_INIT(&progress_mutex, NULL);
			
 
				         STARPU_PTHREAD_COND_INIT(&progress_cond, NULL);
			
 
				 
			
 
				+        STARPU_PTHREAD_MUTEX_INIT(&mpi_wait_for_all_running_mutex, NULL);
			
 
				+        STARPU_PTHREAD_COND_INIT(&mpi_wait_for_all_running_cond, NULL);
			
 
				+
			
 
				 	starpu_sem_init(&callback_sem, 0, 0);
			
 
				 	running = 0;
			
 
				 
			
@@ -594,6 +628,8 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 
				 		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
			
 
				 	}
			
 
				 
			
 
				+	callback_lfstack_init(&callback_stack);
			
 
				+
			
 
				 	/* Tell pioman to use a bound thread for communication progression:
			
 
				 	 * share the same core as StarPU's MPI thread, the MPI thread has very low activity with NMAD backend */
			
 
				 	int indexes[1] = { _starpu_mpi_thread_cpuid };
			
@@ -663,8 +699,13 @@ void _starpu_mpi_progress_shutdown(void **value)
 
				 
			
 
				 	STARPU_PTHREAD_JOIN(progress_thread, value);
			
 
				 
			
 
				+	callback_lfstack_destroy(&callback_stack);
			
 
				+
			
 
				         STARPU_PTHREAD_MUTEX_DESTROY(&progress_mutex);
			
 
				         STARPU_PTHREAD_COND_DESTROY(&progress_cond);
			
 
				+
			
 
				+        STARPU_PTHREAD_MUTEX_DESTROY(&mpi_wait_for_all_running_mutex);
			
 
				+        STARPU_PTHREAD_COND_DESTROY(&mpi_wait_for_all_running_cond);
			
 
				 }
			
 
				 
			
 
				 static int64_t _starpu_mpi_tag_max = INT64_MAX;
			
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -437,7 +437,5 @@ void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t data, int new_r
 
				 
			
 
				 int starpu_mpi_wait_for_all(MPI_Comm comm)
			
 
				 {
			
 
				-	starpu_task_wait_for_all();
			
 
				-	starpu_mpi_barrier(comm);
			
 
				-	return 0;
			
 
				+	return _starpu_mpi_wait_for_all(comm);
			
 
				 }
			
--- a/mpi/src/starpu_mpi_private.h
+++ b/mpi/src/starpu_mpi_private.h
@@ -310,7 +310,6 @@ void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req);
 
				 void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req);
			
 
				 int _starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status);
			
 
				 int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status);
			
 
				-int _starpu_mpi_barrier(MPI_Comm comm);
			
 
				 
			
 
				 struct _starpu_mpi_argc_argv
			
 
				 {
			
@@ -331,6 +330,9 @@ void _starpu_mpi_wait_for_initialization();
 
				 #endif
			
 
				 void _starpu_mpi_data_flush(starpu_data_handle_t data_handle);
			
 
				 
			
 
				+int _starpu_mpi_barrier(MPI_Comm comm);
			
 
				+int _starpu_mpi_wait_for_all(MPI_Comm comm);
			
 
				+
			
 
				 /*
			
 
				  * Specific functions to backend implementation
			
 
				  */
			
--- a/mpi/tests/Makefile.am
+++ b/mpi/tests/Makefile.am
@@ -58,9 +58,11 @@ BUILT_SOURCES =
 
				 
			
 
				 CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log
			
 
				 
			
 
				-EXTRA_DIST = 					\
			
 
				-	user_defined_datatype_value.h		\
			
 
				-	helper.h
			
 
				+EXTRA_DIST = 				\
			
 
				+	abstract_sendrecv_bench.h	\
			
 
				+	bench_helper.h			\
			
 
				+	helper.h			\
			
 
				+	user_defined_datatype_value.h
			
 
				 
			
 
				 examplebindir = $(libdir)/starpu/examples/mpi
			
 
				 
			
--- a/mpi/tests/abstract_sendrecv_bench.c
+++ b/mpi/tests/abstract_sendrecv_bench.c
@@ -25,6 +25,11 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 
				 
			
 
				 	if (mpi_rank >= 2)
			
 
				 	{
			
 
				+		if (thread_barrier != NULL)
			
 
				+		{
			
 
				+			STARPU_PTHREAD_BARRIER_WAIT(thread_barrier);
			
 
				+		}
			
 
				+
			
 
				 		for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
			
 
				 		{
			
 
				 			iterations = bench_nb_iterations(iterations, s);
			
--- a/mpi/tests/bench_helper.c
+++ b/mpi/tests/bench_helper.c
@@ -33,7 +33,7 @@ int comp_double(const void*_a, const void*_b)
 
				 
			
 
				 uint64_t bench_next_size(uint64_t len)
			
 
				 {
			
 
				-	uint64_t next = len * MULT_DEFAULT + INCR_DEFAULT;
			
 
				+	uint64_t next = len * MULT_DEFAULT;
			
 
				 
			
 
				 	if(next <= len)
			
 
				 		next++;
			
--- a/mpi/tests/bench_helper.h
+++ b/mpi/tests/bench_helper.h
@@ -18,18 +18,16 @@
 
				 #include <starpu_mpi.h>
			
 
				 #include "helper.h"
			
 
				 
			
 
				-#define NX_MAX (512 * 1024 * 1024) // kB
			
 
				 #define NX_MIN 0
			
 
				+
			
 
				 #ifdef STARPU_QUICK_CHECK
			
 
				-#define MULT_DEFAULT 4
			
 
				-#else
			
 
				-#define MULT_DEFAULT 2
			
 
				-#endif
			
 
				-#define INCR_DEFAULT 0
			
 
				-#ifdef STARPU_QUICK_CHECK
			
 
				-#define LOOPS_DEFAULT 100
			
 
				+	#define MULT_DEFAULT 4
			
 
				+	#define LOOPS_DEFAULT 100
			
 
				+	#define NX_MAX (64 * 1024 * 1024) // kB
			
 
				 #else
			
 
				-#define LOOPS_DEFAULT 100000
			
 
				+	#define MULT_DEFAULT 2
			
 
				+	#define LOOPS_DEFAULT 100000
			
 
				+	#define NX_MAX (512 * 1024 * 1024) // kB
			
 
				 #endif
			
 
				 
			
 
				 int comp_double(const void*_a, const void*_b);
			
--- a/mpi/tests/sendrecv_gemm_bench.c
+++ b/mpi/tests/sendrecv_gemm_bench.c
@@ -320,7 +320,6 @@ static void* comm_thread_func(void* arg)
 
				 	return NULL;
			
 
				 }
			
 
				 
			
 
				-
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	double start, end;
			
--- a/mpi/tests/sendrecv_parallel_tasks_bench.c
+++ b/mpi/tests/sendrecv_parallel_tasks_bench.c
@@ -32,7 +32,11 @@
 
				 
			
 
				 /* We reduce NX_MAX, since some NICs don't support exchanging simultaneously such amount of memory */
			
 
				 #undef NX_MAX
			
 
				+#ifdef STARPU_QUICK_CHECK
			
 
				+#define NX_MAX (1024)
			
 
				+#else
			
 
				 #define NX_MAX (64 * 1024 * 1024)
			
 
				+#endif
			
 
				 
			
 
				 
			
 
				 void cpu_task(void* descr[], void* args)
			
@@ -108,6 +112,8 @@ void cpu_task(void* descr[], void* args)
 
				 			current_worker, (long long) s, min_lat, bw_million_byte, bw_mbyte, d1_lat, med_lat, avg_lat, d9_lat, max_lat);
			
 
				 		fflush(stdout);
			
 
				 	}
			
 
				+
			
 
				+	free(lats);
			
 
				 }
			
 
				 
			
 
				 static struct starpu_codelet cl =
			
--- a/src/common/barrier.c
+++ b/src/common/barrier.c
@@ -50,7 +50,8 @@ int _starpu_barrier_test(struct _starpu_barrier *barrier)
 
				 int _starpu_barrier_destroy(struct _starpu_barrier *barrier)
			
 
				 {
			
 
				 	int ret;
			
 
				-	do {
			
 
				+	do
			
 
				+	{
			
 
				 		ret = _starpu_barrier_test(barrier);
			
 
				 	}
			
 
				 	while (ret == EBUSY);
			
--- a/src/core/dependencies/data_concurrency.c
+++ b/src/core/dependencies/data_concurrency.c
@@ -402,7 +402,8 @@ void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _sta
 
				 {
			
 
				 	unsigned buf;
			
 
				 
			
 
				-	if (j->task->cl) {
			
 
				+	if (j->task->cl)
			
 
				+	{
			
 
				 		unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(j->task);
			
 
				 
			
 
				 		for (buf = 0; buf < nbuffers; buf++)
			
@@ -415,7 +416,8 @@ void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _sta
 
				 
			
 
				 		/* We need to check data availability only if sequential consistency
			
 
				 		 * dependencies have not been used */
			
 
				-		if (!j->sequential_consistency) {
			
 
				+		if (!j->sequential_consistency)
			
 
				+		{
			
 
				 			for (buf = 0; buf < nbuffers; buf++)
			
 
				 			{
			
 
				 				starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(j->task, buf);
			
--- a/src/core/dependencies/dependencies.c
+++ b/src/core/dependencies/dependencies.c
@@ -41,7 +41,8 @@ void _starpu_notify_dependencies(struct _starpu_job *j)
 
				 static starpu_notify_ready_soon_func notify_ready_soon_func;
			
 
				 static void *notify_ready_soon_func_data;
			
 
				 
			
 
				-struct _starpu_notify_job_start_data {
			
 
				+struct _starpu_notify_job_start_data
			
 
				+{
			
 
				 	double delay;
			
 
				 };
			
 
				 
			
--- a/src/core/dependencies/implicit_data_deps.c
+++ b/src/core/dependencies/implicit_data_deps.c
@@ -234,11 +234,13 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 
				 
			
 
				 		/* Skip tasks that are associated to a reduction phase so that
			
 
				 		 * they do not interfere with the application. */
			
 
				-		if (pre_sync_job->reduction_task) {
			
 
				+		if (pre_sync_job->reduction_task)
			
 
				+		{
			
 
				 			*submit_pre_sync = 1;
			
 
				 			return NULL;
			
 
				 		}
			
 
				-		if (post_sync_job->reduction_task) {
			
 
				+		if (post_sync_job->reduction_task)
			
 
				+		{
			
 
				 			*submit_pre_sync = 0;
			
 
				 			return NULL;
			
 
				 		}
			
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -1026,7 +1026,8 @@ void starpu_perfmodel_dump_xml(FILE *f, struct starpu_perfmodel *model)
 
				 		for(dev = 0; dev < ndevices; dev++)
			
 
				 		{
			
 
				 			const char *type;
			
 
				-			switch (arch_combs[comb]->devices[dev].type) {
			
 
				+			switch (arch_combs[comb]->devices[dev].type)
			
 
				+			{
			
 
				 				case STARPU_CPU_WORKER: type = "CPU"; break;
			
 
				 				case STARPU_CUDA_WORKER: type = "CUDA"; break;
			
 
				 				case STARPU_OPENCL_WORKER: type = "OpenCL"; break;
			
@@ -1421,7 +1422,8 @@ int starpu_perfmodel_list(FILE *output)
 
				 	else
			
 
				 	{
			
 
				 		int i;
			
 
				-		for (i = 0; i < n; i++) {
			
 
				+		for (i = 0; i < n; i++)
			
 
				+		{
			
 
				 			if (strcmp(list[i]->d_name, ".") && strcmp(list[i]->d_name, ".."))
			
 
				 				fprintf(output, "file: <%s>\n", list[i]->d_name);
			
 
				 			free(list[i]);
			
@@ -1772,7 +1774,8 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, s
 
				 
			
 
				 docal:
			
 
				 #ifdef STARPU_SIMGRID
			
 
				-	if (isnan(exp)) {
			
 
				+	if (isnan(exp))
			
 
				+	{
			
 
				 		char archname[STR_SHORT_LENGTH];
			
 
				 		starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
			
 
				 
			
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -299,13 +299,15 @@ int _starpu_task_data_get_node_on_node(struct starpu_task *task, unsigned index,
 
				 	int node = STARPU_SPECIFIC_NODE_LOCAL;
			
 
				 	if (task->cl->specific_nodes)
			
 
				 		node = STARPU_CODELET_GET_NODE(task->cl, index);
			
 
				-	switch (node) {
			
 
				+	switch (node)
			
 
				+	{
			
 
				 	case STARPU_SPECIFIC_NODE_LOCAL:
			
 
				 		// TODO: rather find MCDRAM
			
 
				 		node = local_node;
			
 
				 		break;
			
 
				 	case STARPU_SPECIFIC_NODE_CPU:
			
 
				-		switch (starpu_node_get_kind(local_node)) {
			
 
				+		switch (starpu_node_get_kind(local_node))
			
 
				+		{
			
 
				 		case STARPU_CPU_RAM:
			
 
				 			node = local_node;
			
 
				 			break;
			
@@ -320,10 +322,13 @@ int _starpu_task_data_get_node_on_node(struct starpu_task *task, unsigned index,
 
				 		node = local_node;
			
 
				 		break;
			
 
				 	case STARPU_SPECIFIC_NODE_LOCAL_OR_CPU:
			
 
				-		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID) {
			
 
				+		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID)
			
 
				+		{
			
 
				 			/* It is here already, rather access it from here */
			
 
				 			node = local_node;
			
 
				-		} else {
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				 			/* It is not here already, do not bother moving it */
			
 
				 			node = STARPU_MAIN_RAM;
			
 
				 		}
			
@@ -338,7 +343,8 @@ int _starpu_task_data_get_node_on_worker(struct starpu_task *task, unsigned inde
 
				 	int node = STARPU_SPECIFIC_NODE_LOCAL;
			
 
				 	if (task->cl->specific_nodes)
			
 
				 		node = STARPU_CODELET_GET_NODE(task->cl, index);
			
 
				-	switch (node) {
			
 
				+	switch (node)
			
 
				+	{
			
 
				 	case STARPU_SPECIFIC_NODE_LOCAL:
			
 
				 		// TODO: rather find MCDRAM
			
 
				 		node = local_node;
			
@@ -353,10 +359,13 @@ int _starpu_task_data_get_node_on_worker(struct starpu_task *task, unsigned inde
 
				 		node = local_node;
			
 
				 		break;
			
 
				 	case STARPU_SPECIFIC_NODE_LOCAL_OR_CPU:
			
 
				-		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID) {
			
 
				+		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID)
			
 
				+		{
			
 
				 			/* It is here already, rather access it from here */
			
 
				 			node = local_node;
			
 
				-		} else {
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				 			/* It is not here already, do not bother moving it */
			
 
				 			node = STARPU_MAIN_RAM;
			
 
				 		}
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -428,7 +428,8 @@ static inline int _starpu_can_use_nth_implementation(enum starpu_worker_archtype
 
				 
			
 
				 /* Test if this task can be processed on this worker, regardless of the implementation */
			
 
				 /* must be called with sched_mutex locked to protect state_blocked */
			
 
				-static inline int _starpu_can_execute_task_any_impl(unsigned workerid, struct starpu_task *task) {
			
 
				+static inline int _starpu_can_execute_task_any_impl(unsigned workerid, struct starpu_task *task)
			
 
				+{
			
 
				 
			
 
				 	if (!_starpu_config.workers[workerid].enable_knob)
			
 
				 		return 0;
			
@@ -440,7 +441,6 @@ static inline int _starpu_can_execute_task_any_impl(unsigned workerid, struct st
 
				 			return 0;
			
 
				 	}
			
 
				 
			
 
				-	
			
 
				 	/* if the worker is blocked in a parallel ctx don't submit tasks on it */
			
 
				 #ifdef STARPU_DEVEL
			
 
				 #warning FIXME: this is very expensive, while can_execute is supposed to be not very costly so schedulers can call it a lot
			
@@ -451,7 +451,7 @@ static inline int _starpu_can_execute_task_any_impl(unsigned workerid, struct st
 
				 	if (!(task->where & _starpu_config.workers[workerid].worker_mask))
			
 
				 		return 0;
			
 
				 
			
 
				-	return 1; 
			
 
				+	return 1;
			
 
				 }
			
 
				 
			
 
				 /* must be called with sched_mutex locked to protect state_blocked_in_parallel */
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -208,7 +208,8 @@ void _starpu_update_data_state(starpu_data_handle_t handle,
 
				 			for (node = 0; node < nnodes; node++)
			
 
				 			{
			
 
				 				struct _starpu_data_replicate *replicate = &handle->per_node[node];
			
 
				-                               if (replicate->state != STARPU_INVALID){
			
 
				+                               if (replicate->state != STARPU_INVALID)
			
 
				+			       {
			
 
				                                        _STARPU_TRACE_DATA_STATE_SHARED(handle, node);
			
 
				 					replicate->state = STARPU_SHARED;
			
 
				                                }
			
--- a/src/datawizard/interfaces/matrix_interface.c
+++ b/src/datawizard/interfaces/matrix_interface.c
@@ -213,7 +213,8 @@ static void display_matrix_interface(starpu_data_handle_t handle, FILE *f)
 
				 
			
 
				 //#define DYNAMIC_MATRICES
			
 
				 
			
 
				-struct pack_matrix_header {
			
 
				+struct pack_matrix_header
			
 
				+{
			
 
				 #ifdef DYNAMIC_MATRICES
			
 
				 	/* Receiving matrices with different sizes from MPI */
			
 
				 	/* FIXME: that would break alignment for O_DIRECT disk access...
			
--- a/src/debug/traces/starpu_fxt.c
+++ b/src/debug/traces/starpu_fxt.c
@@ -343,7 +343,8 @@ static void handle_papi_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *o
 
				 	long long int value = ev->param[2];
			
 
				 	//char *prefix = options->file_prefix;
			
 
				 
			
 
				-	if (papi_file){
			
 
				+	if (papi_file)
			
 
				+	{
			
 
				 		char event_str[PAPI_MAX_STR_LEN];
			
 
				 		PAPI_event_code_to_name(event_code, event_str);
			
 
				 		fprintf(papi_file, "JobId: %lu\n", task);
			
@@ -2470,47 +2471,55 @@ static void handle_memnode_event(struct fxt_ev_64 *ev, struct starpu_fxt_options
 
				 		memnode_set_state(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr);
			
 
				 }
			
 
				 
			
 
				-static void handle_memnode_event_start_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
			
 
				+static void handle_memnode_event_start_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
			
 
				+{
			
 
				 	unsigned memnode = ev->param[0];
			
 
				-       unsigned size = ev->param[2];
			
 
				-       unsigned long handle = ev->param[3];
			
 
				+	unsigned size = ev->param[2];
			
 
				+	unsigned long handle = ev->param[3];
			
 
				 
			
 
				 	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, size, memnode, options);
			
 
				 }
			
 
				 
			
 
				-static void handle_memnode_event_start_4(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
			
 
				-       unsigned memnode = ev->param[0];
			
 
				-       unsigned dest = ev->param[1];
			
 
				-       if(strcmp(eventstr, "rc")==0){
			
 
				-               //If it is a Request Create, use dest normally
			
 
				-       }else{
			
 
				-               dest = memnode;
			
 
				-       }
			
 
				-       unsigned size = ev->param[2];
			
 
				-       unsigned long handle = ev->param[3];
			
 
				-       unsigned prefe = ev->param[4];
			
 
				+static void handle_memnode_event_start_4(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
			
 
				+{
			
 
				+	unsigned memnode = ev->param[0];
			
 
				+	unsigned dest = ev->param[1];
			
 
				+	if(strcmp(eventstr, "rc")==0)
			
 
				+	{
			
 
				+		//If it is a Request Create, use dest normally
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		dest = memnode;
			
 
				+	}
			
 
				+	unsigned size = ev->param[2];
			
 
				+	unsigned long handle = ev->param[3];
			
 
				+	unsigned prefe = ev->param[4];
			
 
				 
			
 
				-       memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, prefe, size, dest, options);
			
 
				+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, prefe, size, dest, options);
			
 
				 }
			
 
				 
			
 
				-static void handle_memnode_event_end_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
			
 
				+static void handle_memnode_event_end_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
			
 
				+{
			
 
				 	unsigned memnode = ev->param[0];
			
 
				-       unsigned long handle = ev->param[2];
			
 
				-       unsigned info = ev->param[3];
			
 
				+	unsigned long handle = ev->param[2];
			
 
				+	unsigned info = ev->param[3];
			
 
				 
			
 
				 	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, info, 0, memnode, options);
			
 
				 }
			
 
				 
			
 
				-static void handle_memnode_event_start_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
			
 
				+static void handle_memnode_event_start_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
			
 
				+{
			
 
				 	unsigned memnode = ev->param[0];
			
 
				-       unsigned long handle = ev->param[2];
			
 
				+	unsigned long handle = ev->param[2];
			
 
				 
			
 
				 	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, memnode, options);
			
 
				 }
			
 
				 
			
 
				-static void handle_memnode_event_end_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
			
 
				+static void handle_memnode_event_end_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
			
 
				+{
			
 
				 	unsigned memnode = ev->param[0];
			
 
				-       unsigned long handle = ev->param[2];
			
 
				+	unsigned long handle = ev->param[2];
			
 
				 
			
 
				 	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, memnode, options);
			
 
				 }
			
--- a/src/sched_policies/component_best_implementation.c
+++ b/src/sched_policies/component_best_implementation.c
@@ -38,26 +38,26 @@ static int find_best_impl(unsigned sched_ctx_id, struct starpu_task * task, int
 
				 		len = 0.0;
			
 
				 	}
			
 
				 	else
			
 
				-	{	
			
 
				-	    struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
			
 
				-	    for(impl = 0; impl < STARPU_MAXIMPLEMENTATIONS; impl++)
			
 
				-	    {
			
 
				-		if(starpu_worker_can_execute_task(workerid, task, impl))
			
 
				+	{
			
 
				+		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
			
 
				+		for(impl = 0; impl < STARPU_MAXIMPLEMENTATIONS; impl++)
			
 
				 		{
			
 
				-			double d = starpu_task_expected_length(task, archtype, impl);
			
 
				-			if(isnan(d))
			
 
				-			{
			
 
				-				best_impl = impl;
			
 
				-				len = 0.0;
			
 
				-				break;
			
 
				-			}
			
 
				-			if(d < len)
			
 
				+			if(starpu_worker_can_execute_task(workerid, task, impl))
			
 
				 			{
			
 
				-				len = d;
			
 
				-				best_impl = impl;
			
 
				+				double d = starpu_task_expected_length(task, archtype, impl);
			
 
				+				if(isnan(d))
			
 
				+				{
			
 
				+					best_impl = impl;
			
 
				+					len = 0.0;
			
 
				+					break;
			
 
				+				}
			
 
				+				if(d < len)
			
 
				+				{
			
 
				+					len = d;
			
 
				+					best_impl = impl;
			
 
				+				}
			
 
				 			}
			
 
				 		}
			
 
				-	    }
			
 
				 	}
			
 
				 	if(best_impl == -1)
			
 
				 		return 0;
			
--- a/src/sched_policies/component_heteroprio.c
+++ b/src/sched_policies/component_heteroprio.c
@@ -206,7 +206,8 @@ out:
 
				 	//fprintf(stderr, "could not push %p to %d actually\n", task, best_icomponent);
			
 
				 	/* Could not push to child actually, push that one back */
			
 
				 	STARPU_COMPONENT_MUTEX_LOCK(mutex);
			
 
				-	for (j = 0; j < (int) data->naccel; j++) {
			
 
				+	for (j = 0; j < (int) data->naccel; j++)
			
 
				+	{
			
 
				 		if (acceleration == data->accel[j])
			
 
				 		{
			
 
				 			_starpu_prio_deque_push_front_task(data->bucket[j], task);
			
@@ -305,7 +306,8 @@ static int heteroprio_progress_one(struct starpu_sched_component *component)
 
				 	task = _starpu_prio_deque_pop_task(no_accel);
			
 
				 	STARPU_COMPONENT_MUTEX_UNLOCK(mutex);
			
 
				 
			
 
				-	if (task) {
			
 
				+	if (task)
			
 
				+	{
			
 
				 		if (heteroprio_progress_noaccel(component, data, task))
			
 
				 		{
			
 
				 			/* Could not push to child actually, push that one back */
			
@@ -388,7 +390,8 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 
				 			max_expected = min_arch;
			
 
				 	}
			
 
				 
			
 
				-	if (workerid == -1) {
			
 
				+	if (workerid == -1)
			
 
				+	{
			
 
				 		/* All archs can run it */
			
 
				 		STARPU_ASSERT(!isnan(min_expected));
			
 
				 		STARPU_ASSERT(!isnan(max_expected));
			
@@ -402,13 +405,15 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 
				 		STARPU_COMPONENT_MUTEX_LOCK(mutex);
			
 
				 		unsigned i, j;
			
 
				 		/* Try to find a bucket with similar acceleration */
			
 
				-		for (i = 0; i < data->naccel; i++) {
			
 
				+		for (i = 0; i < data->naccel; i++)
			
 
				+		{
			
 
				 			if (acceleration >= data->accel[i] * (1 - APPROX) &&
			
 
				 			    acceleration <= data->accel[i] * (1 + APPROX))
			
 
				 				break;
			
 
				 		}
			
 
				 
			
 
				-		if (i == data->naccel) {
			
 
				+		if (i == data->naccel)
			
 
				+		{
			
 
				 			/* Didn't find it, add one */
			
 
				 			data->naccel++;
			
 
				 
			
@@ -418,8 +423,10 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 
				 			_starpu_prio_deque_init(newbucket);
			
 
				 			int inserted = 0;
			
 
				 
			
 
				-			for (j = 0; j < data->naccel-1; j++) {
			
 
				-				if (!inserted && acceleration > data->accel[j]) {
			
 
				+			for (j = 0; j < data->naccel-1; j++)
			
 
				+			{
			
 
				+				if (!inserted && acceleration > data->accel[j])
			
 
				+				{
			
 
				 					/* Insert the new bucket here */
			
 
				 					i = j;
			
 
				 					newbuckets[j] = newbucket;
			
@@ -429,7 +436,8 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 
				 				newbuckets[j+inserted] = data->bucket[j];
			
 
				 				newaccel[j+inserted] = data->accel[j];
			
 
				 			}
			
 
				-			if (!inserted) {
			
 
				+			if (!inserted)
			
 
				+			{
			
 
				 				/* Insert it last */
			
 
				 				newbuckets[data->naccel-1] = newbucket;
			
 
				 				newaccel[data->naccel-1] = acceleration;
			
@@ -441,14 +449,17 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 
				 		}
			
 
				 #if 0
			
 
				 		fprintf(stderr,"buckets:");
			
 
				-		for (j = 0; j < data->naccel; j++) {
			
 
				+		for (j = 0; j < data->naccel; j++)
			
 
				+		{
			
 
				 			fprintf(stderr, " %f", data->accel[j]);
			
 
				 		}
			
 
				 		fprintf(stderr,"\ninserting %p %f to %d\n", task, acceleration, i);
			
 
				 #endif
			
 
				 		_starpu_prio_deque_push_back_task(data->bucket[i],task);
			
 
				 		STARPU_COMPONENT_MUTEX_UNLOCK(mutex);
			
 
				-	} else {
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				 		/* Not all archs can run it, will resort to HEFT strategy */
			
 
				 		acceleration = INFINITY;
			
 
				 		//fprintf(stderr,"%s: some archs can't do it\n", starpu_task_get_name(task));
			
--- a/src/sched_policies/component_sched.c
+++ b/src/sched_policies/component_sched.c
@@ -284,8 +284,10 @@ void _starpu_sched_component_update_workers_in_ctx(struct starpu_sched_component
 
				 	struct starpu_bitmap * workers_in_ctx = _starpu_get_worker_mask(sched_ctx_id);
			
 
				 	starpu_bitmap_unset_and(component->workers_in_ctx,component->workers, workers_in_ctx);
			
 
				 	unsigned i,j;
			
 
				-	for(i = starpu_worker_get_count(); i < starpu_worker_get_count() + starpu_combined_worker_get_count(); i++) {
			
 
				-		if (starpu_bitmap_get(component->workers, i)) {
			
 
				+	for(i = starpu_worker_get_count(); i < starpu_worker_get_count() + starpu_combined_worker_get_count(); i++)
			
 
				+	{
			
 
				+		if (starpu_bitmap_get(component->workers, i))
			
 
				+		{
			
 
				 			/* Component has this combined worker, check whether the
			
 
				 			 * context has all the corresponding workers */
			
 
				 			int worker_size;
			
--- a/src/sched_policies/component_work_stealing.c
+++ b/src/sched_policies/component_work_stealing.c
@@ -240,7 +240,8 @@ static int push_task(struct starpu_sched_component * component, struct starpu_ta
 
				 
			
 
				 	/* Find a child component that can execute this task */
			
 
				 	i = (i+1)%component->nchildren;
			
 
				-	while(1) {
			
 
				+	while(1)
			
 
				+	{
			
 
				 		int workerid;
			
 
				 		for(workerid = starpu_bitmap_first(component->children[i]->workers_in_ctx);
			
 
				 		    -1 != workerid;
			
--- a/src/sched_policies/component_worker.c
+++ b/src/sched_policies/component_worker.c
@@ -149,7 +149,8 @@ struct _starpu_worker_component_data
 
				 	union
			
 
				 	{
			
 
				 		struct _starpu_worker * worker;
			
 
				-		struct {
			
 
				+		struct
			
 
				+		{
			
 
				 			unsigned worker_size;
			
 
				 			unsigned workerids[STARPU_NMAXWORKERS];
			
 
				 		} parallel_worker;
			
--- a/src/sched_policies/heteroprio.c
+++ b/src/sched_policies/heteroprio.c
@@ -524,7 +524,7 @@ static struct starpu_task *pop_task_heteroprio_policy(unsigned sched_ctx_id)
 
				 				nb_added_tasks       += 1;
			
 
				 				// TODO starpu_prefetch_task_input_for(task, workerid);
			
 
				 			}
			
 
				-		}		
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	struct starpu_task* task = NULL;
			
--- a/src/sched_policies/modular_ez.c
+++ b/src/sched_policies/modular_ez.c
@@ -262,10 +262,13 @@ void starpu_sched_component_initialize_simple_schedulers(unsigned sched_ctx_id,
 
				 		unsigned ntasks_threshold;
			
 
				 		if (starpu_sched_component_is_heft(decision_component) ||
			
 
				 		    starpu_sched_component_is_mct(decision_component) ||
			
 
				-		    starpu_sched_component_is_heteroprio(decision_component)) {
			
 
				+		    starpu_sched_component_is_heteroprio(decision_component))
			
 
				+		{
			
 
				 			/* These need more queueing to allow CPUs to take some share of the work */
			
 
				 			ntasks_threshold = _STARPU_SCHED_NTASKS_THRESHOLD_HEFT;
			
 
				-		} else {
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				 			ntasks_threshold = _STARPU_SCHED_NTASKS_THRESHOLD_DEFAULT;
			
 
				 		}
			
 
				 		/* But let user tune it */
			
@@ -279,20 +282,20 @@ void starpu_sched_component_initialize_simple_schedulers(unsigned sched_ctx_id,
 
				 		int exp = flags & STARPU_SCHED_SIMPLE_FIFOS_BELOW_EXP ? 1 : 0;
			
 
				 
			
 
				 		struct starpu_sched_component_prio_data prio_data =
			
 
				-			{
			
 
				-				.ntasks_threshold = ntasks_threshold,
			
 
				-				.exp_len_threshold = exp_len_threshold,
			
 
				-				.ready = ready,
			
 
				-				.exp = exp,
			
 
				-			};
			
 
				+		{
			
 
				+			.ntasks_threshold = ntasks_threshold,
			
 
				+			.exp_len_threshold = exp_len_threshold,
			
 
				+			.ready = ready,
			
 
				+			.exp = exp,
			
 
				+		};
			
 
				 
			
 
				 		struct starpu_sched_component_fifo_data fifo_data =
			
 
				-			{
			
 
				-				.ntasks_threshold = ntasks_threshold,
			
 
				-				.exp_len_threshold = exp_len_threshold,
			
 
				-				.ready = ready,
			
 
				-				.exp = exp,
			
 
				-			};
			
 
				+		{
			
 
				+			.ntasks_threshold = ntasks_threshold,
			
 
				+			.exp_len_threshold = exp_len_threshold,
			
 
				+			.ready = ready,
			
 
				+			.exp = exp,
			
 
				+		};
			
 
				 
			
 
				 		/* Create one fifo+eager component pair per choice, below scheduling decision */
			
 
				 		for(i = 0; i < nbelow; i++)
			
@@ -334,7 +337,8 @@ void starpu_sched_component_initialize_simple_schedulers(unsigned sched_ctx_id,
 
				 					STARPU_ABORT();
			
 
				 			}
			
 
				 			STARPU_ASSERT(n >= 1);
			
 
				-			if (n > 1) {
			
 
				+			if (n > 1)
			
 
				+			{
			
 
				 				/* Several workers for this choice, need to introduce
			
 
				 				 * a component to distribute the work */
			
 
				 				struct starpu_sched_component *distribute;
			
--- a/src/sched_policies/modular_gemm.c
+++ b/src/sched_policies/modular_gemm.c
@@ -26,7 +26,8 @@
 
				 
			
 
				 #define MEMORY_AFFINITY
			
 
				 
			
 
				-struct child_data {
			
 
				+struct child_data
			
 
				+{
			
 
				 	double expected_start;
			
 
				 	double predicted;
			
 
				 	double predicted_transfer;
			
--- a/src/sched_policies/modular_heteroprio_heft.c
+++ b/src/sched_policies/modular_heteroprio_heft.c
@@ -22,7 +22,8 @@
 
				 
			
 
				 static void initialize_heteroprio_heft_center_policy(unsigned sched_ctx_id)
			
 
				 {
			
 
				-	struct starpu_sched_component_heteroprio_data heteroprio_data = {
			
 
				+	struct starpu_sched_component_heteroprio_data heteroprio_data =
			
 
				+	{
			
 
				 		.mct = NULL,
			
 
				 		.batch = 1,
			
 
				 	};
			
--- a/src/util/openmp_runtime_support.c
+++ b/src/util/openmp_runtime_support.c
@@ -135,7 +135,9 @@ static void wake_up_and_unlock_task(struct starpu_omp_task *task)
 
				 		weak_task_unlock(task);
			
 
				 		int ret = starpu_task_submit(task->starpu_task);
			
 
				 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				-	} else {
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				 		weak_task_unlock(task);
			
 
				 	}
			
 
				 }
			
@@ -379,36 +381,37 @@ static void starpu_omp_explicit_task_entry(struct starpu_omp_task *task)
 
				 {
			
 
				 	STARPU_ASSERT(!(task->flags & STARPU_OMP_TASK_FLAGS_IMPLICIT));
			
 
				 	struct _starpu_worker *starpu_worker = _starpu_get_local_worker_key();
			
 
				-   /* XXX on work */
			
 
				-   if (task->is_loop) {
			
 
				-      starpu_omp_for_inline_first_alt(task->nb_iterations, task->chunk, starpu_omp_sched_static, 1, &task->begin_i, &task->end_i);
			
 
				-   }
			
 
				-   if (starpu_worker->arch == STARPU_CPU_WORKER)
			
 
				-   {
			
 
				-      task->cpu_f(task->starpu_buffers, task->starpu_cl_arg);
			
 
				-   }
			
 
				+	/* XXX on work */
			
 
				+	if (task->is_loop)
			
 
				+	{
			
 
				+		starpu_omp_for_inline_first_alt(task->nb_iterations, task->chunk, starpu_omp_sched_static, 1, &task->begin_i, &task->end_i);
			
 
				+	}
			
 
				+	if (starpu_worker->arch == STARPU_CPU_WORKER)
			
 
				+	{
			
 
				+		task->cpu_f(task->starpu_buffers, task->starpu_cl_arg);
			
 
				+	}
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-   else if (starpu_worker->arch == STARPU_CUDA_WORKER)
			
 
				-   {
			
 
				-      task->cuda_f(task->starpu_buffers, task->starpu_cl_arg);
			
 
				-   }
			
 
				+	else if (starpu_worker->arch == STARPU_CUDA_WORKER)
			
 
				+	{
			
 
				+		task->cuda_f(task->starpu_buffers, task->starpu_cl_arg);
			
 
				+	}
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-   else if (starpu_worker->arch == STARPU_OPENCL_WORKER)
			
 
				-   {
			
 
				-      task->opencl_f(task->starpu_buffers, task->starpu_cl_arg);
			
 
				-   }
			
 
				+	else if (starpu_worker->arch == STARPU_OPENCL_WORKER)
			
 
				+	{
			
 
				+		task->opencl_f(task->starpu_buffers, task->starpu_cl_arg);
			
 
				+	}
			
 
				 #endif
			
 
				-   else
			
 
				-      _STARPU_ERROR("invalid worker architecture");
			
 
				-   /**/
			
 
				+	else
			
 
				+		_STARPU_ERROR("invalid worker architecture");
			
 
				+	/**/
			
 
				 	_starpu_omp_unregister_task_handles(task);
			
 
				 	_starpu_spin_lock(&task->lock);
			
 
				 	task->state = starpu_omp_task_state_terminated;
			
 
				 	task->transaction_pending=1;
			
 
				 	_starpu_spin_unlock(&task->lock);
			
 
				 	struct starpu_omp_thread *thread = _starpu_omp_get_thread();
			
 
				-	/* 
			
 
				+	/*
			
 
				 	 * the task reached the terminated state, definitively give hand back to the worker code.
			
 
				 	 *
			
 
				 	 * about to run on the worker stack...
			
@@ -428,7 +431,7 @@ static void starpu_omp_implicit_task_entry(struct starpu_omp_task *task)
 
				 		_starpu_omp_unregister_region_handles(task->owner_region);
			
 
				 	}
			
 
				 	task->state = starpu_omp_task_state_terminated;
			
 
				-	/* 
			
 
				+	/*
			
 
				 	 * the task reached the terminated state, definitively give hand back to the worker code.
			
 
				 	 *
			
 
				 	 * about to run on the worker stack...
			
@@ -447,7 +450,7 @@ static void starpu_omp_task_preempt(void)
 
				 	struct starpu_omp_thread *thread = _starpu_omp_get_thread();
			
 
				 	task->state = starpu_omp_task_state_preempted;
			
 
				 
			
 
				-	/* 
			
 
				+	/*
			
 
				 	 * the task reached a blocked state, give hand back to the worker code.
			
 
				 	 *
			
 
				 	 * about to run on the worker stack...
			
@@ -486,7 +489,7 @@ static void starpu_omp_implicit_task_exec(void *buffers[], void *cl_arg)
 
				 
			
 
				 	task->state = starpu_omp_task_state_clear;
			
 
				 
			
 
				-	/* 
			
 
				+	/*
			
 
				 	 * start the task execution, or restore a previously preempted task.
			
 
				 	 * about to run on the task stack...
			
 
				 	 * */
			
@@ -655,7 +658,7 @@ static void starpu_omp_explicit_task_exec(void *buffers[], void *cl_arg)
 
				 	}
			
 
				 	task->state = starpu_omp_task_state_clear;
			
 
				 
			
 
				-	/* 
			
 
				+	/*
			
 
				 	 * start the task execution, or restore a previously preempted task.
			
 
				 	 * about to run on the task stack...
			
 
				 	 * */
			
@@ -694,11 +697,11 @@ static struct starpu_omp_task *create_omp_task_struct(struct starpu_omp_task *pa
 
				 		task->flags |= STARPU_OMP_TASK_FLAGS_IMPLICIT;
			
 
				 	}
			
 
				 	_starpu_spin_init(&task->lock);
			
 
				-	/* TODO: initialize task->data_env_icvs with proper values */ 
			
 
				+	/* TODO: initialize task->data_env_icvs with proper values */
			
 
				 	memset(&task->data_env_icvs, 0, sizeof(task->data_env_icvs));
			
 
				 	if (is_implicit)
			
 
				 	{
			
 
				-	  /* TODO: initialize task->implicit_task_icvs with proper values */ 
			
 
				+	  /* TODO: initialize task->implicit_task_icvs with proper values */
			
 
				 		memset(&task->implicit_task_icvs, 0, sizeof(task->implicit_task_icvs));
			
 
				 	}
			
 
				 
			
@@ -1037,7 +1040,7 @@ void starpu_omp_parallel_region(const struct starpu_omp_parallel_region_attr *at
 
				 	struct starpu_omp_task *task = _starpu_omp_get_task();
			
 
				 	struct starpu_omp_region *generating_region = task->owner_region;
			
 
				 	const int max_active_levels = generating_region->owner_device->icvs.max_active_levels_var;
			
 
				-	struct starpu_omp_region *new_region = 
			
 
				+	struct starpu_omp_region *new_region =
			
 
				 		create_omp_region_struct(generating_region, _global_state.initial_device);
			
 
				 	int ret;
			
 
				 	int nb_threads = 1;
			
@@ -1166,7 +1169,7 @@ void starpu_omp_parallel_region(const struct starpu_omp_parallel_region_attr *at
 
				 	}
			
 
				 	STARPU_ASSERT(new_region->nb_threads == nb_threads);
			
 
				 
			
 
				-	/* 
			
 
				+	/*
			
 
				 	 * if task == initial_task, create a starpu task as a continuation to all the implicit
			
 
				 	 * tasks of the new region, else prepare the task for preemption,
			
 
				 	 * to become itself a continuation to the implicit tasks of the new region
			
@@ -1194,7 +1197,7 @@ void starpu_omp_parallel_region(const struct starpu_omp_parallel_region_attr *at
 
				 	 * create the starpu tasks for the implicit omp tasks,
			
 
				 	 * create explicit dependencies between these starpu tasks and the continuation starpu task
			
 
				 	 */
			
 
				-	for (i = 0; i < nb_threads; i++) 
			
 
				+	for (i = 0; i < nb_threads; i++)
			
 
				 	{
			
 
				 		struct starpu_omp_task * implicit_task = new_region->implicit_task_array[i];
			
 
				 		implicit_task->cl = attr->cl;
			
@@ -1234,7 +1237,7 @@ void starpu_omp_parallel_region(const struct starpu_omp_parallel_region_attr *at
 
				 	/*
			
 
				 	 * submit all the region implicit starpu tasks
			
 
				 	 */
			
 
				-	for (i = 0; i < nb_threads; i++) 
			
 
				+	for (i = 0; i < nb_threads; i++)
			
 
				 	{
			
 
				 		struct starpu_omp_task * implicit_task = new_region->implicit_task_array[i];
			
 
				 		ret = starpu_task_submit(implicit_task->starpu_task);
			
@@ -1292,7 +1295,7 @@ static void wake_up_barrier(struct starpu_omp_region *parallel_region)
 
				 {
			
 
				 	struct starpu_omp_task *task = _starpu_omp_get_task();
			
 
				 	int i;
			
 
				-	for (i = 0; i < parallel_region->nb_threads; i++) 
			
 
				+	for (i = 0; i < parallel_region->nb_threads; i++)
			
 
				 	{
			
 
				 		struct starpu_omp_task * implicit_task = parallel_region->implicit_task_array[i];
			
 
				 		if (implicit_task == task)
			
@@ -1343,7 +1346,7 @@ void starpu_omp_barrier(void)
 
				 	{
			
 
				 		ANNOTATE_HAPPENS_BEFORE(&parallel_region->barrier_count);
			
 
				 		/* not the last task reaching the barrier
			
 
				-		 * . prepare for conditional continuation 
			
 
				+		 * . prepare for conditional continuation
			
 
				 		 * . sleep
			
 
				 		 */
			
 
				 
			
@@ -1826,40 +1829,46 @@ void starpu_omp_taskgroup_inline_end(void)
 
				 // XXX on work
			
 
				 void starpu_omp_taskloop_inline_begin(struct starpu_omp_task_region_attr *attr)
			
 
				 {
			
 
				-   if (!attr->nogroup_clause)
			
 
				-   {
			
 
				-      starpu_omp_taskgroup_inline_begin();
			
 
				-   }
			
 
				-
			
 
				-   int nb_subloop;
			
 
				-   if (attr->num_tasks) {
			
 
				-      nb_subloop = attr->num_tasks;
			
 
				-   } else if (attr->grainsize) {
			
 
				-      nb_subloop = attr->nb_iterations / attr->grainsize;
			
 
				-   } else {
			
 
				-      nb_subloop = 4;
			
 
				-   }
			
 
				-
			
 
				-   attr->is_loop = 1;
			
 
				-
			
 
				-   int i;
			
 
				-   int nb_iter_i = attr->nb_iterations / nb_subloop;
			
 
				-   for (i = 0; i < nb_subloop; i++)
			
 
				-   {
			
 
				-      attr->begin_i = nb_iter_i * i;
			
 
				-      attr->end_i = attr->begin_i + nb_iter_i;
			
 
				-      attr->end_i += (i+1 != nb_subloop) ? 0 : (attr->nb_iterations % nb_subloop);
			
 
				-      attr->chunk = attr->end_i - attr->begin_i;
			
 
				-      starpu_omp_task_region(attr);
			
 
				-   }
			
 
				+	if (!attr->nogroup_clause)
			
 
				+	{
			
 
				+		starpu_omp_taskgroup_inline_begin();
			
 
				+	}
			
 
				+
			
 
				+	int nb_subloop;
			
 
				+	if (attr->num_tasks)
			
 
				+	{
			
 
				+		nb_subloop = attr->num_tasks;
			
 
				+	}
			
 
				+	else if (attr->grainsize)
			
 
				+	{
			
 
				+		nb_subloop = attr->nb_iterations / attr->grainsize;
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		nb_subloop = 4;
			
 
				+	}
			
 
				+
			
 
				+	attr->is_loop = 1;
			
 
				+
			
 
				+	int i;
			
 
				+	int nb_iter_i = attr->nb_iterations / nb_subloop;
			
 
				+	for (i = 0; i < nb_subloop; i++)
			
 
				+	{
			
 
				+		attr->begin_i = nb_iter_i * i;
			
 
				+		attr->end_i = attr->begin_i + nb_iter_i;
			
 
				+		attr->end_i += (i+1 != nb_subloop) ? 0 : (attr->nb_iterations % nb_subloop);
			
 
				+		attr->chunk = attr->end_i - attr->begin_i;
			
 
				+		starpu_omp_task_region(attr);
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 // XXX on work
			
 
				 void starpu_omp_taskloop_inline_end(const struct starpu_omp_task_region_attr *attr)
			
 
				 {
			
 
				-   if (!attr->nogroup_clause) {
			
 
				-      starpu_omp_taskgroup_inline_end();
			
 
				-   }
			
 
				+	if (!attr->nogroup_clause)
			
 
				+	{
			
 
				+		starpu_omp_taskgroup_inline_end();
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 static inline void _starpu_omp_for_loop(struct starpu_omp_region *parallel_region, struct starpu_omp_task *task,
			
@@ -2170,7 +2179,7 @@ void starpu_omp_ordered_inline_end(void)
 
				 	struct starpu_omp_region *parallel_region = task->owner_region;
			
 
				 	struct starpu_omp_loop *loop = _starpu_omp_for_get_loop(parallel_region, task);
			
 
				 
			
 
				-	loop->ordered_iteration++;	
			
 
				+	loop->ordered_iteration++;
			
 
				 	condition_broadcast(&loop->ordered_cond, starpu_omp_task_wait_on_ordered);
			
 
				 	_starpu_spin_unlock(&loop->ordered_lock);
			
 
				 }
			
--- a/tests/datawizard/bcsr.c
+++ b/tests/datawizard/bcsr.c
@@ -41,7 +41,8 @@ void cpu_show_bcsr(void *descr[], void *arg)
 
				 
			
 
				 	printf("nnz %d elemsize %d\n", nnz, elemsize);
			
 
				 
			
 
				-	for (i = 0; i < nrow; i++) {
			
 
				+	for (i = 0; i < nrow; i++)
			
 
				+	{
			
 
				 		uint32_t row_start = rowptr[i] - firstentry;
			
 
				 		uint32_t row_end = rowptr[i+1] - firstentry;
			
 
				 
			
@@ -73,7 +74,7 @@ struct starpu_codelet show_cl =
 
				 };
			
 
				 
			
 
				 /*
			
 
				- * In this test, we use the following matrix: 
			
 
				+ * In this test, we use the following matrix:
			
 
				  *
			
 
				  *   +----------------+
			
 
				  *   |  0   1   0   0 |
			
@@ -129,7 +130,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_task_insert(&show_cl, STARPU_R, bcsr_handle, 0);
			
 
				 
			
 
				-	struct starpu_data_filter filter = {
			
 
				+	struct starpu_data_filter filter =
			
 
				+	{
			
 
				 		.filter_func = starpu_bcsr_filter_vertical_block,
			
 
				 		.nchildren = 2,
			
 
				 	};
			
--- a/tests/microbenchs/tasks_size_overhead.c
+++ b/tests/microbenchs/tasks_size_overhead.c
@@ -185,7 +185,8 @@ int main(int argc, char **argv)
 
				 	unsetenv("STARPU_NCPU");
			
 
				 #endif
			
 
				 
			
 
				-	if (STARPU_RUNNING_ON_VALGRIND) {
			
 
				+	if (STARPU_RUNNING_ON_VALGRIND)
			
 
				+	{
			
 
				 		factortime *= 4;
			
 
				 		cpustep *= 4;
			
 
				 	}
			
--- a/tests/parallel_tasks/parallel_kernels.c
+++ b/tests/parallel_tasks/parallel_kernels.c
@@ -89,7 +89,8 @@ int main(void)
 
				 
			
 
				 	unsigned iter, worker, n;
			
 
				 	n = N;
			
 
				-	if (STARPU_RUNNING_ON_VALGRIND) {
			
 
				+	if (STARPU_RUNNING_ON_VALGRIND)
			
 
				+	{
			
 
				 		n /= 300;
			
 
				 	}
			
 
				 	for (iter = 0; iter < n; iter++)
			
--- a/tests/parallel_tasks/parallel_kernels_spmd.c
+++ b/tests/parallel_tasks/parallel_kernels_spmd.c
@@ -92,7 +92,8 @@ int main(void)
 
				 
			
 
				 	unsigned iter, worker, n;
			
 
				 	n = N;
			
 
				-	if (STARPU_RUNNING_ON_VALGRIND) {
			
 
				+	if (STARPU_RUNNING_ON_VALGRIND)
			
 
				+	{
			
 
				 		n /= 300;
			
 
				 	}
			
 
				 	for (iter = 0; iter < n; iter++)
			
--- a/tools/starpu_perfmodel_display.c
+++ b/tools/starpu_perfmodel_display.c
@@ -175,9 +175,12 @@ int main(int argc, char **argv)
 
				 			fprintf(stderr, "The performance model for the symbol <%s> could not be loaded\n", psymbol);
			
 
				 			return 1;
			
 
				 		}
			
 
				-		if (xml) {
			
 
				+		if (xml)
			
 
				+		{
			
 
				 			starpu_perfmodel_dump_xml(stdout, &model);
			
 
				-		} else {
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				 			uint32_t *footprint = NULL;
			
 
				 			if (pdisplay_specific_footprint == 1)
			
 
				 			{
			
--- a/tools/starpu_perfmodel_recdump.c
+++ b/tools/starpu_perfmodel_recdump.c
@@ -129,7 +129,8 @@ void print_archs(FILE* output)
 
				 		{
			
 
				 			if (starpu_worker_get_memory_node(workerid) == node)
			
 
				 			{
			
 
				-				if (!printed) {
			
 
				+				if (!printed)
			
 
				+				{
			
 
				 					fprintf(output, "Workers:");
			
 
				 					printed = 1;
			
 
				 				}
			
@@ -145,7 +146,8 @@ void print_archs(FILE* output)
 
				 	{
			
 
				 		for (dst = 0; dst < starpu_memory_nodes_get_count(); dst++)
			
 
				 		{
			
 
				-			if (src != dst) {
			
 
				+			if (src != dst)
			
 
				+			{
			
 
				 				fprintf(output, "MemoryNodeSrc: %d\n", src);
			
 
				 				fprintf(output, "MemoryNodeDst: %d\n", dst);
			
 
				 				fprintf(output, "Bandwidth: %f\n", starpu_transfer_bandwidth(src, dst));
			
--- a/tools/starpu_replay.c
+++ b/tools/starpu_replay.c
@@ -165,7 +165,8 @@ static void replay_data_register(starpu_data_handle_t *handleptr, starpu_data_ha
 
				 	{
			
 
				 		replay_interface_ops.interfaceid = starpu_data_interface_get_next_id();
			
 
				 	}
			
 
				-	struct replay_interface interface = {
			
 
				+	struct replay_interface interface =
			
 
				+	{
			
 
				 		.id = replay_interface_ops.interfaceid,
			
 
				 		.orig_handle = orig_handle,
			
 
				 		.size = size,
			
@@ -337,7 +338,8 @@ double arch_cost_function(struct starpu_task *task, struct starpu_perfmodel_arch
 
				 /* End of settings */
			
 
				 
			
 
				 static unsigned long nexecuted_tasks;
			
 
				-void dumb_kernel(void *buffers[], void *args) {
			
 
				+void dumb_kernel(void *buffers[], void *args)
			
 
				+{
			
 
				 	(void) buffers;
			
 
				 	(void) args;
			
 
				 	nexecuted_tasks++;