浏览代码

Merge branch 'master' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu into ft_checkpoint

Romain LION 5 年之前
父节点
当前提交
ec57de33fa
共有 57 个文件被更改,包括 667 次插入499 次删除
  1. 8 3
      configure.ac
  2. 2 4
      doc/doxygen/Makefile.am
  3. 2 4
      doc/doxygen_dev/Makefile.am
  4. 2 1
      examples/perf_steering/perf_knobs_03.c
  5. 2 1
      examples/pipeline/pipeline.c
  6. 4 2
      examples/ppm_downscaler/yuv_downscaler.c
  7. 2 1
      examples/scheduler/dummy_modular_sched.c
  8. 4 1
      examples/spmv/spmv.c
  9. 1 1
      examples/tag_example/tag_example2.c
  10. 1 1
      examples/tag_example/tag_example3.c
  11. 47 0
      julia/mandelbrot/Makefile
  12. 28 24
      julia/mandelbrot/cpu_mandelbrot.c
  13. 0 35
      julia/mandelbrot/makefile
  14. 81 178
      julia/mandelbrot/mandelbrot.c
  15. 106 0
      julia/mandelbrot/mandelbrot_native.jl
  16. 3 2
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
  17. 2 0
      mpi/examples/mpi_lu/pxlu.c
  18. 14 9
      mpi/examples/mpi_lu/pxlu_implicit.c
  19. 1 1
      mpi/examples/user_datatype/user_datatype.c
  20. 1 1
      mpi/examples/user_datatype/user_datatype2.c
  21. 31 27
      mpi/src/mpi/starpu_mpi_mpi.c
  22. 59 18
      mpi/src/nmad/starpu_mpi_nmad.c
  23. 1 3
      mpi/src/starpu_mpi.c
  24. 3 1
      mpi/src/starpu_mpi_private.h
  25. 5 3
      mpi/tests/Makefile.am
  26. 5 0
      mpi/tests/abstract_sendrecv_bench.c
  27. 1 1
      mpi/tests/bench_helper.c
  28. 7 9
      mpi/tests/bench_helper.h
  29. 0 1
      mpi/tests/sendrecv_gemm_bench.c
  30. 6 0
      mpi/tests/sendrecv_parallel_tasks_bench.c
  31. 2 1
      src/common/barrier.c
  32. 4 2
      src/core/dependencies/data_concurrency.c
  33. 2 1
      src/core/dependencies/dependencies.c
  34. 4 2
      src/core/dependencies/implicit_data_deps.c
  35. 6 3
      src/core/perfmodel/perfmodel_history.c
  36. 16 7
      src/core/topology.c
  37. 3 3
      src/core/workers.c
  38. 2 1
      src/datawizard/coherency.c
  39. 2 1
      src/datawizard/interfaces/matrix_interface.c
  40. 32 23
      src/debug/traces/starpu_fxt.c
  41. 16 16
      src/sched_policies/component_best_implementation.c
  42. 21 10
      src/sched_policies/component_heteroprio.c
  43. 4 2
      src/sched_policies/component_sched.c
  44. 2 1
      src/sched_policies/component_work_stealing.c
  45. 2 1
      src/sched_policies/component_worker.c
  46. 1 1
      src/sched_policies/heteroprio.c
  47. 19 15
      src/sched_policies/modular_ez.c
  48. 2 1
      src/sched_policies/modular_gemm.c
  49. 2 1
      src/sched_policies/modular_heteroprio_heft.c
  50. 72 63
      src/util/openmp_runtime_support.c
  51. 5 3
      tests/datawizard/bcsr.c
  52. 2 1
      tests/microbenchs/tasks_size_overhead.c
  53. 2 1
      tests/parallel_tasks/parallel_kernels.c
  54. 2 1
      tests/parallel_tasks/parallel_kernels_spmd.c
  55. 5 2
      tools/starpu_perfmodel_display.c
  56. 4 2
      tools/starpu_perfmodel_recdump.c
  57. 4 2
      tools/starpu_replay.c

+ 8 - 3
configure.ac

@@ -3096,10 +3096,15 @@ AC_ARG_WITH([hwloc],
 				if test ! -d "$withval" ; then
 				   AC_MSG_ERROR("Directory specified for hwloc <$withval> does not exist")
 				fi
-				if test ! -d "$withval/lib/pkgconfig" ; then
-				   AC_MSG_ERROR("Hwloc directory <$withval> does not have a subdirectory lib/pkgconfig")
+				if test -d "$withval/lib64/pkgconfig" ; then
+				   export PKG_CONFIG_PATH=$withval/lib64/pkgconfig:$PKG_CONFIG_PATH
+			        else
+				   if test -d "$withval/lib/pkgconfig" ; then
+				      export PKG_CONFIG_PATH=$withval/lib/pkgconfig:$PKG_CONFIG_PATH
+				   else
+				      AC_MSG_ERROR("Hwloc directory <$withval> does not have a subdirectory lib/pkgconfig or lib64/pkgconfig")
+				   fi
 				fi
-				export PKG_CONFIG_PATH=$withval/lib/pkgconfig:$PKG_CONFIG_PATH
 				use_hwloc=yes
 			fi
 		else

+ 2 - 4
doc/doxygen/Makefile.am

@@ -267,13 +267,11 @@ $(DOX_PDF): $(DOX_TAG) refman.tex
 	! < refman.log grep -v group__ | grep -v _amgrp | grep -v deprecated__ | grep "multiply defined" || exit 1 ;\
 	$(MAKEINDEX) refman.idx ;\
 	max_print_line=1000000 $(PDFLATEX) -interaction batchmode refman.tex ;\
-	done=0; repeat=5 ;\
-	while test $$done = 0 -a $$repeat -gt 0; do \
+	for i in $(shell seq 1 5); do \
            if $(EGREP) 'Rerun (LaTeX|to get cross-references right)' refman.log > /dev/null 2>&1; then \
 	       max_print_line=1000000 $(PDFLATEX) -interaction batchmode refman.tex; \
-	       repeat=`expr $$repeat - 1`; \
 	   else \
-	       done=1; \
+		break ; \
 	   fi; \
 	done
 	mv $(DOX_LATEX_DIR)/refman.pdf $(DOX_PDF)

+ 2 - 4
doc/doxygen_dev/Makefile.am

@@ -217,13 +217,11 @@ $(DOX_PDF): $(DOX_TAG) refman.tex
 	! < refman.log grep -v group__ | grep -v _amgrp | grep -v deprecated__ | grep "multiply defined" || exit 1 ;\
 	$(MAKEINDEX) refman.idx ;\
 	max_print_line=1000000 $(PDFLATEX) -interaction batchmode refman.tex ;\
-	done=0; repeat=5 ;\
-	while test $$done = 0 -a $$repeat -gt 0; do \
+	for i in $(shell seq 1 5); do \
            if $(EGREP) 'Rerun (LaTeX|to get cross-references right)' refman.log > /dev/null 2>&1; then \
 	       max_print_line=1000000 $(PDFLATEX) -interaction batchmode refman.tex; \
-	       repeat=`expr $$repeat - 1`; \
 	   else \
-	       done=1; \
+		break ; \
 	   fi; \
 	done
 	mv $(DOX_LATEX_DIR)/refman.pdf $(DOX_PDF)

+ 2 - 1
examples/perf_steering/perf_knobs_03.c

@@ -126,7 +126,8 @@ int main(int argc, char **argv)
 		const int id = starpu_perf_knob_name_to_id(scope_id, knob_name);
 		STARPU_ASSERT(starpu_perf_knob_get_type_id(id) == starpu_perf_knob_type_name_to_id(knob_type_name));
 
-		struct starpu_codelet cl = {
+		struct starpu_codelet cl =
+		{
 			.cpu_funcs = {cpu_func}
 		};
 

+ 2 - 1
examples/pipeline/pipeline.c

@@ -176,7 +176,8 @@ static struct starpu_codelet pipeline_codelet_sum =
 	.model = &pipeline_model_sum
 };
 
-static void release_sem(void *arg) {
+static void release_sem(void *arg)
+{
 	sem_post(arg);
 };
 

+ 4 - 2
examples/ppm_downscaler/yuv_downscaler.c

@@ -141,14 +141,16 @@ int main(int argc, char **argv)
 
 	/* fetch input data */
 	FILE *f_in = fopen(filename_in, "r");
-	if (!f_in) {
+	if (!f_in)
+	{
 		fprintf(stderr, "couldn't open input file %s\n", filename_in);
 		exit(EXIT_FAILURE);
 	}
 
 	/* allocate room for an output buffer */
 	FILE *f_out = fopen(filename_out, "w+");
-	if (!f_out) {
+	if (!f_out)
+	{
 		fprintf(stderr, "couldn't open output file %s\n", filename_out);
 		exit(EXIT_FAILURE);
 	}

+ 2 - 1
examples/scheduler/dummy_modular_sched.c

@@ -170,7 +170,8 @@ static void init_dummy_sched(unsigned sched_ctx_id)
 {
 	FPRINTF(stderr, "Initialising Dummy scheduler\n");
 
-	struct dummy_sched_params params = {
+	struct dummy_sched_params params =
+	{
 		.verbose = 0,
 	};
 

+ 4 - 1
examples/spmv/spmv.c

@@ -245,10 +245,13 @@ int main(int argc, char **argv)
 			vector_exp_out_ptr[row] += UPPER_BAND * vector_in_ptr[row+1];
 	}
 	for (row = 0; row < size; row++)
-		if (vector_out_ptr[row] != vector_exp_out_ptr[row]) {
+	{
+		if (vector_out_ptr[row] != vector_exp_out_ptr[row])
+		{
 			FPRINTF(stderr, "check failed at %u: %f vs expected %f\n", row, vector_out_ptr[row], vector_exp_out_ptr[row]);
 			exit(EXIT_FAILURE);
 		}
+	}
 
 	starpu_free(nzval);
 	starpu_free(colind);

+ 1 - 1
examples/tag_example/tag_example2.c

@@ -70,7 +70,7 @@ static void tag_cleanup_grid(unsigned iter)
 
 	for (i = 0; i < ni; i++)
 		starpu_tag_remove(TAG(i,iter));
-} 
+}
 
 static int create_task_grid(unsigned iter)
 {

+ 1 - 1
examples/tag_example/tag_example3.c

@@ -72,7 +72,7 @@ static void tag_cleanup_grid(unsigned iter)
 
 	for (i = 0; i < ni; i++)
 		starpu_tag_remove(TAG(i,iter));
-} 
+}
 
 static int create_task_grid(unsigned iter)
 {

+ 47 - 0
julia/mandelbrot/Makefile

@@ -0,0 +1,47 @@
+CC=gcc
+CFLAGS += -Wall -Wextra -O3 -mavx -mfma -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
+
+LDFLAGS +=$(shell pkg-config --libs starpu-1.3) -lm
+EXTERNLIB=extern_tasks.so
+GENERATEDLIB=generated_tasks.so
+#OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+OBJECTS=$(wildcard gen*.c)
+LIBPATH=${PWD}/../StarPU.jl/lib
+
+all: ${EXTERNLIB}
+
+mandelbrot: mandelbrot.c cpu_mandelbrot.o #gpu_mandelbrot.o
+	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
+
+gpu_mandelbrot.o: gpu_mandelbrot.cu
+	nvcc -c $(CFLAGS) $^ -o $@
+
+%.o: %.c
+	$(CC) -c $(CFLAGS) $^ -o $@
+
+${EXTERNLIB}: cpu_mandelbrot.c
+	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+
+gpu_mandelbrot.so: gpu_mandelbrot.o
+	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+
+cpu_mandelbrot_sa: cpu_mandelbrot_sa.o
+	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
+
+${GENERATEDLIB}: ${OBJECTS}
+	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+
+clean:
+	rm -f mandelbrot *.so *.o c_*.genc gencuda_*.cu *.dat
+
+# Performance Tests
+cstarpu.dat: mandelbrot
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./mandelbrot -0.800671 -0.158392 32 32 4096 4 > $@
+julia_generatedc.dat:
+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot.jl $@
+julia_native.dat:
+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot_native.jl $@
+julia_calllib.dat: ${EXTERNLIB}
+	LD_LIBRARY_PATH+=${LIBPATH} JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot.jl julia_calllib.dat
+
+test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat

+ 28 - 24
julia/mandelbrot/cpu_mandelbrot.c

@@ -4,46 +4,50 @@
 
 void cpu_mandelbrot(void *descr[], void *cl_arg)
 {
-        long long int *pixels;
-	float *params;
+        long long *pixels;
+        float *params;
 
         pixels = (long long int *)STARPU_MATRIX_GET_PTR(descr[0]);
-	params = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
-
-        int width = STARPU_MATRIX_GET_NX(descr[0]);
-        int height = STARPU_MATRIX_GET_NY(descr[0]);
-        
-        int ldP = STARPU_MATRIX_GET_LD(descr[0]);
+        params = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
 
+        long long width = STARPU_MATRIX_GET_NY(descr[0]);
+        long long height = STARPU_MATRIX_GET_NX(descr[0]);
+        double zoom = width * 0.25296875;
+        double iz = 1. / zoom;
+        float diverge = 4.0;
+        float max_iterations = (width/2) * 0.049715909 * log10(zoom);
+        float imi = 1. / max_iterations;
         float centerr = params[0];
         float centeri = params[1];
         float offset = params[2];
         float dim = params[3];
-        float zoom = width * 0.25296875;
-        float diverge = 4.0;
-        int max_iter = (width/2) * 0.049715909 * log10(zoom);
+        double cr = 0;
+        double zr = 0;
+        double ci = 0;
+        double zi = 0;
+        long long n = 0;
+        double tmp = 0;
+        int ldP = STARPU_MATRIX_GET_LD(descr[0]);
 
-        int x,y,n;
+        long long x,y;
 
         for (y = 0; y < height; y++){
                 for (x = 0; x < width; x++){
-                        float cr = centerr + (x - (dim/2))/zoom;
-                        float ci = centeri + (y+offset - (dim/2))/zoom;
-                        float zr = cr;
-                        float zi = ci;
-                        
-                        for (n = 0; n <= max_iter; n++) {
+                        cr = centerr + (x - (dim/2)) * iz;
+			zr = cr;
+                        ci = centeri + (y+offset - (dim/2)) * iz;
+                        zi = ci;
+
+                        for (n = 0; n <= max_iterations; n++) {
 				if (zr*zr + zi*zi>diverge) break;
-                                float tmp = zr*zr - zi*zi + cr;
+                                tmp = zr*zr - zi*zi + cr;
                                 zi = 2*zr*zi + ci;
                                 zr = tmp;
                         }
-			int color;
-			if (n<max_iter)
-				color = round(15.*n/max_iter);
+			if (n<max_iterations)
+				pixels[y +x*ldP] = round(15.*n*imi);
 			else
-				color = 0;
-			pixels[x*ldP + y] = color;
+				pixels[y +x*ldP] = 0;
 		}
 	}
 }

+ 0 - 35
julia/mandelbrot/makefile

@@ -1,35 +0,0 @@
-# GCC compiler
-CC=gcc-9
-CFLAGS += -O3 -mavx -mfma -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
-
-LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
-EXTERNLIB=extern_tasks.dylib
-GENERATEDLIB=generated_tasks.dylib
-OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
-LIBPATH=${PWD}/../StarPU.jl/lib
-
-all: ${EXTERNLIB} 
-
-mult: mult.c cpu_mult.o #gpu_mult.o 
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)	
-
-gpu_mult.o: gpu_mult.cu
-	nvcc -c $(CFLAGS) $^ -o $@
-
-%.o: %.c
-	$(CC) -c $(CFLAGS) $^ -o $@
-
-${EXTERNLIB}: cpu_mandelbrot.o
-	$(CC) -shared -fPIC $(LDFLAGS) $^ -o $@  
-
-gpu_mult.so: gpu_mult.o
-	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
-
-${GENERATEDLIB}: ${OBJECTS}
-	$(CC) -shared -fPIC $(LDFLAGS) $^ -o $@
-
-clean:
-	rm *.so *.o *.dylib c_*.genc gencuda_*.cu *.dat
-
-
-

+ 81 - 178
julia/mandelbrot/mandelbrot.c

@@ -16,43 +16,33 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <starpu.h>
-#include "../display.h"
 
 void cpu_mandelbrot(void **, void *);
 void gpu_mandelbrot(void **, void *);
 
-struct Params
+static struct starpu_perfmodel model =
 {
-	float cr;
-	float ci;
-	unsigned taskx;
-	unsigned tasky;
-	unsigned width;
-	unsigned height;
+		.type = STARPU_HISTORY_BASED,
+		.symbol = "history_perf"
 };
 
-
-
-struct starpu_codelet cl =
+static struct starpu_codelet cl =
 {
 	.cpu_funcs = {cpu_mandelbrot},
-	.cuda_funcs = {gpu_mandelbrot},
-	.nbuffers = 1,
-	.modes = {STARPU_RW}
+	//.cuda_funcs = {gpu_mandelbrot},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_R},
+	.model = &model
 };
 
 
-void mandelbrot_with_starpu(int *pixels, float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy)
+void mandelbrot_with_starpu(long long *pixels, float *params, long long dim, long long nslicesx)
 {
-	starpu_data_handle_t p_handle;
-
-	starpu_matrix_data_register(&p_handle, STARPU_MAIN_RAM, (uintptr_t)pixels, width, width, height, sizeof(int));
+	starpu_data_handle_t pixels_handle;
+	starpu_data_handle_t params_handle;
 
-	struct starpu_data_filter vert =
-	{
-		.filter_func = starpu_matrix_filter_vertical_block,
-		.nchildren = nslicesy
-	};
+	starpu_matrix_data_register(&pixels_handle, STARPU_MAIN_RAM, (uintptr_t)pixels, dim, dim, dim, sizeof(long long));
+	starpu_matrix_data_register(&params_handle, STARPU_MAIN_RAM, (uintptr_t)params, 4*nslicesx, 4*nslicesx, 1, sizeof(float));
 
 	struct starpu_data_filter horiz =
 	{
@@ -60,179 +50,100 @@ void mandelbrot_with_starpu(int *pixels, float cr, float ci, unsigned width, uns
 		.nchildren = nslicesx
 	};
 
-	starpu_data_map_filters(p_handle, 2, &vert, &horiz);
+	starpu_data_partition(pixels_handle, &horiz);
+	starpu_data_partition(params_handle, &horiz);
 
-	unsigned taskx, tasky;
-
-	struct Params *params = malloc(nslicesx*nslicesy*sizeof(struct Params));
+	long long taskx;
 
 	for (taskx = 0; taskx < nslicesx; taskx++){
-		for (tasky = 0; tasky < nslicesy; tasky++){
-			struct starpu_task *task = starpu_task_create();
-			
-			task->cl = &cl;
-			task->handles[0] = starpu_data_get_sub_data(p_handle, 2, tasky, taskx);
-			struct Params param = {cr, ci, taskx, tasky, width, height};
-
-			params[taskx + tasky*nslicesx] = param;
-
-			task->cl_arg = (params + taskx + tasky * nslicesx);
-			task->cl_arg_size = sizeof(struct Params);
-			
-			starpu_task_submit(task);
-		}
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &cl;
+		task->handles[0] = starpu_data_get_child(pixels_handle, taskx);
+		task->handles[1] = starpu_data_get_child(params_handle, taskx);
+		if (starpu_task_submit(task)!=0) fprintf(stderr,"submit task error\n");
 	}
-	starpu_task_wait_for_all();
 
-	starpu_data_unpartition(p_handle, STARPU_MAIN_RAM);
+	starpu_task_wait_for_all();
 
-	starpu_data_unregister(p_handle);
+	starpu_data_unpartition(pixels_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(params_handle, STARPU_MAIN_RAM);
 
-	free(params);
+	starpu_data_unregister(pixels_handle);
+	starpu_data_unregister(params_handle);
 }
 
-void init_zero(int * pixels, unsigned width, unsigned height)
+void pixels2img(long long *pixels, long long width, long long height, const char *filename)
 {
-	unsigned i,j;
-	for (i = 0; i < height; i++){
-		for (j = 0; j < width; j++){
-			pixels[j + i*width] = 0;
-		}
-	}
-}
+  FILE *fp = fopen(filename, "w");
+  if (!fp)
+    return;
 
-void sort(double *arr, unsigned nbr_tests)
-{
-	unsigned j;
-	
-	int is_sort = 0;
-	
-	while (!is_sort){
-
-		is_sort = 1;
-		
-		for (j = 0; j < nbr_tests - 1; j++){
-			if (arr[j] > arr[j+1]){
-				is_sort = 0;
-				double tmp = arr[j];
-				arr[j] = arr[j+1];
-				arr[j+1] = tmp;
-			}
-		}
-	}
-}
-double median_time(float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy, unsigned nbr_tests)
-{
-	int *Pixels = malloc(width*height*sizeof(int));
-	
-	unsigned i;
+  int MAPPING[16][3] = {{66,30,15},{25,7,26},{9,1,47},{4,4,73},{0,7,100},{12,44,138},{24,82,177},{57,125,209},{134,181,229},{211,236,248},{241,233,191},{248,201,95},{255,170,0},{204,128,0},{153,87,0},{106,52,3}};
 
-	double exec_times[nbr_tests];
+  fprintf(fp, "P3\n%lld %lld\n255\n", width, height);
+  long long i, j;
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      fprintf(fp, "%d %d %d ", MAPPING[pixels[j*width+i]][0], MAPPING[pixels[j*width+i]][1], MAPPING[pixels[j*width+i]][2]);
+    }
+  }
 
-	double start, stop, exec_t;
-	for (i = 0; i < nbr_tests; i++){
-		init_zero(Pixels, width, height);
-		
-		start = starpu_timing_now(); // starpu_timing_now() gives the time in microseconds.
-		mandelbrot_with_starpu(Pixels, cr, ci, width, height, nslicesx, nslicesy);
-		stop = starpu_timing_now();
-		
-		exec_t = (stop-start)/1.e6;
-		exec_times[i] = exec_t;
-	}
-	char filename[30];
-	sprintf(filename, "PPM/mandelbrot%d.ppm", width);
-	printf("%s\n", filename);
-
-	mandelbrot_graph(filename, Pixels, width, height);
-
-	free(Pixels);
-
-	sort(exec_times, nbr_tests);
-
-	return exec_times[nbr_tests/2];	
+  fclose(fp);
 }
 
-void fluctuation_time(float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy, unsigned nbr_tests, double *exec_times)
+double min_times(double cr, double ci, long long dim, long long nslices)
 {
-	int *Pixels = malloc(width*height*sizeof(int));
-	
-	unsigned i;
+	long long *pixels = calloc(dim*dim, sizeof(long long));
+	float *params = calloc(4*nslices, sizeof(float));
+
+	double t_min = 0;
+	long long i;
+
+	for (i=0; i<nslices; i++) {
+		params[4*i+0] = cr;
+		params[4*i+1] = ci;
+		params[4*i+2] = i*dim/nslices;
+		params[4*i+3] = dim;
+	}
 
 	double start, stop, exec_t;
-	for (i = 0; i < nbr_tests; i++){
-		init_zero(Pixels, width, height);
-		
+	for (i = 0; i < 10; i++){
 		start = starpu_timing_now(); // starpu_timing_now() gives the time in microseconds.
-		mandelbrot_with_starpu(Pixels, cr, ci, width, height, nslicesx, nslicesy);
+		mandelbrot_with_starpu(pixels, params, dim, nslices);
 		stop = starpu_timing_now();
-		
-		exec_t = (stop-start)/1.e6;
-		exec_times[i] = exec_t;
-
-		/* char filename[33]; */
-		/* sprintf(filename, "../PPM/mandelbrot%d.ppm", i + 1); */
-		/* printf("%s\n", filename); */
-		/* mandelbrot_graph(filename, Pixels, width, height); */
+		exec_t = (stop-start)*1.e3;
+		if (t_min==0 || t_min>exec_t)
+		  t_min = exec_t;
 	}
 
+	char filename[64];
+	snprintf(filename, 64, "out%lld.ppm", dim);
+	pixels2img(pixels,dim,dim,filename);
 
-	free(Pixels);
-
-
+	free(pixels);
+	free(params);
 
-	
+	return t_min;
 }
 
-
-void display_times(float cr, float ci, unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nslices, unsigned nbr_tests)
+void display_times(double cr, double ci, long long start_dim, long long step_dim, long long stop_dim, long long nslices)
 {
-	
-	unsigned dim;
-
-	FILE *myfile;
-	myfile = fopen("DAT/mandelbrot_c_struct_times.dat", "w");
-
-	for (dim = start_dim; dim <= stop_dim; dim += step_dim){
-		printf("Dimension: %u...\n", dim);
-		double t = median_time(cr, ci, dim, dim, nslices, nslices, nbr_tests);
-		
-		printf("w = %u ; h = %u ; t = %f\n", dim, dim, t);
-		
-		fprintf(myfile, "%f\n", t);
-		}
-	
-	fclose(myfile);
-}
 
-void display_fluctuations(float cr, float ci, unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nslices, unsigned nbr_tests)
-{
-	
-	unsigned dim;
-
-	FILE *myfile;
-	myfile = fopen("DAT/mandelbrot_c_fluctuation.dat", "w");
-
-	double *exec_times = malloc(nbr_tests * sizeof(double));
-	fluctuation_time(cr, ci, start_dim, start_dim, nslices, nslices, nbr_tests, exec_times);
-		
-	/* printf("w = %u ; h = %u ; t = %f\n", dim, dim, t); */
-	unsigned i;
-	for (i = 0; i < nbr_tests; i++){
-		printf("test %u: %f seconds\n", i, exec_times[i]);
-		fprintf(myfile, "%u %f\n", i, exec_times[i]);
+	long long dim;
+
+	for (dim = start_dim; dim <= stop_dim; dim += step_dim) {
+		printf("Dimension: %lld...\n", dim);
+		double res = min_times(cr, ci, dim, nslices);
+		res = res / dim / dim; // time per pixel
+		printf("%lld %lf\n", dim, res);
 	}
-	
-	fclose(myfile);
-	free(exec_times);
 }
 
-
 int main(int argc, char **argv)
 {
-
-	if (argc != 8){
-		printf("Usage: %s cr ci start_dim step_dim stop_dim nslices(must divide dims) nbr_tests\n", argv[0]);
+	if (argc != 7){
+		printf("Usage: %s cr ci start_dim step_dim stop_dim nslices(must divide dims)\n", argv[0]);
 		return 1;
 	}
 	if (starpu_init(NULL) != EXIT_SUCCESS){
@@ -240,24 +151,16 @@ int main(int argc, char **argv)
 		return 77;
 	}
 
+	double cr = (float) atof(argv[1]);
+	double ci = (float) atof(argv[2]);
+	long long start_dim = atoll(argv[3]);
+	long long step_dim = atoll(argv[4]);
+	long long stop_dim = atoll(argv[5]);
+	long long nslices = atoll(argv[6]);
 
-	
-	float cr = (float) atof(argv[1]);
-	float ci = (float) atof(argv[2]);
-	unsigned start_dim = (unsigned) atoi(argv[3]);
-	unsigned step_dim = (unsigned) atoi(argv[4]);	
-	unsigned stop_dim = (unsigned) atoi(argv[5]);
-	unsigned nslices = (unsigned) atoi(argv[6]);
-	unsigned nbr_tests = (unsigned) atoi(argv[7]);
-
-	display_times(cr, ci, start_dim, step_dim, stop_dim, nslices, nbr_tests);
-	
-	
-	/* display_fluctuations(cr, ci, start_dim, step_dim, stop_dim, nslices, nbr_tests); */
-
+	display_times(cr, ci, start_dim, step_dim, stop_dim, nslices);
 
 	starpu_shutdown();
 
-
 	return 0;
 }

+ 106 - 0
julia/mandelbrot/mandelbrot_native.jl

@@ -0,0 +1,106 @@
+using LinearAlgebra
+
+function mandelbrot(pixels, params) :: Float32
+    height :: Int64, width :: Int64 = size(pixels)
+    zoom :: Float64 = width * 0.25296875
+    iz :: Float64 = 1. / zoom
+    diverge :: Float32 = 4.0
+    max_iterations :: Float32 = ((width/2) * 0.049715909 * log10(zoom));
+    imi :: Float32 = 1. / max_iterations
+    centerr :: Float32 = params[1]
+    centeri :: Float32 = params[2]
+    offset :: Float32 = params[3]
+    dim :: Float32 = params[4]
+    cr :: Float64 = 0.
+    zr :: Float64 = 0.
+    ci :: Float64 = 0.
+    zi :: Float64 = 0.
+    n :: Int64 = 0
+    tmp :: Float64 = 0.
+    for y = 1:height
+        for x = 1:width
+            cr = centerr + (x-1 - (dim / 2)) * iz
+            zr = cr
+            ci = centeri + (y-1+offset - (dim / 2)) * iz
+            zi = ci
+            for n = 0:max_iterations
+                if (zr*zr + zi*zi > diverge)
+                    break
+                end
+                tmp = zr*zr - zi*zi + cr
+                zi = 2*zr*zi + ci
+                zr = tmp
+            end
+
+            if (n < max_iterations)
+                pixels[y,x] = round(15 * n * imi)
+            else
+                pixels[y,x] = 0
+            end
+        end
+    end
+
+    ret :: Float32 = 0.
+    return ret
+end
+
+function mandelbrot_without_starpu(A ::Matrix{Int64}, params ::Matrix{Float32}, nslicesx ::Int64)
+    width,height = size(A)
+    step = height / nslicesx
+
+    for taskx in (1 : nslicesx)
+        start_id = floor(Int64, (taskx-1)*step+1)
+        end_id = floor(Int64, (taskx-1)*step+step)
+        a = view(A, start_id:end_id, :)
+        p = view(params, (taskx-1)*4+1:(taskx-1)*4+4)
+
+        mandelbrot(a, p)
+    end
+end
+
+function pixels2img(pixels ::Matrix{Int64}, width ::Int64, height ::Int64, filename ::String)
+    MAPPING = [[66,30,15],[25,7,26],[9,1,47],[4,4,73],[0,7,100],[12,44,138],[24,82,177],[57,125,209],[134,181,229],[211,236,248],[241,233,191],[248,201,95],[255,170,0],[204,128,0],[153,87,0],[106,52,3]]
+    open(filename, "w") do f
+        write(f, "P3\n$width $height\n255\n")
+        for i = 1:height
+            for j = 1:width
+                write(f,"$(MAPPING[1+pixels[i,j]][1]) $(MAPPING[1+pixels[i,j]][2]) $(MAPPING[1+pixels[i,j]][3]) ")
+            end
+            write(f, "\n")
+        end
+    end
+end
+
+function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
+    tmin=0;
+
+    pixels ::Matrix{Int64} = zeros(dim, dim)
+    params :: Matrix{Float32} = zeros(4*nslices,1)
+    for i=0:(nslices-1)
+        params[4*i+1,1] = cr
+        params[4*i+2,1] = ci
+        params[4*i+3,1] = i*dim/nslices
+        params[4*i+4,1] = dim
+    end
+    for i = 1:10
+        t = time_ns();
+        mandelbrot_without_starpu(pixels, params, nslices)
+        t = time_ns()-t
+        if (tmin==0 || tmin>t)
+            tmin=t
+        end
+    end
+    pixels2img(pixels,dim,dim,"out$(dim).ppm")
+    return tmin
+end
+
+function display_time(cr ::Float64, ci ::Float64, start_dim ::Int64, step_dim ::Int64, stop_dim ::Int64, nslices ::Int64)
+    for dim in (start_dim : step_dim : stop_dim)
+        res = min_times(cr, ci, dim, nslices)
+        res=res/dim/dim; # time per pixel
+        println("$(dim) $(res)")
+    end
+end
+
+
+display_time(-0.800671,-0.158392,32,32,4096,4)

+ 3 - 2
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -115,6 +115,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 		}
 	}
 
+	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	start = starpu_timing_now();
 
@@ -159,9 +160,9 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 		starpu_iteration_pop();
 	}
 
-	starpu_task_wait_for_all();
-
+	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
 	starpu_mpi_barrier(MPI_COMM_WORLD);
+
 	end = starpu_timing_now();
 
 	for (m = 0; m < nblocks; m++)

+ 2 - 0
mpi/examples/mpi_lu/pxlu.c

@@ -899,6 +899,8 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size, unsig
 		starpu_iteration_pop();
 	}
 
+	int wait_ret = starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+	STARPU_ASSERT(wait_ret == MPI_SUCCESS);
 	int barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
 

+ 14 - 9
mpi/examples/mpi_lu/pxlu_implicit.c

@@ -57,7 +57,7 @@ static void create_task_11(unsigned k)
 static void create_task_12(unsigned k, unsigned j)
 {
 #ifdef STARPU_DEVEL
-#warning temporary fix 
+#warning temporary fix
 #endif
 	starpu_mpi_task_insert(MPI_COMM_WORLD,
 			       //&STARPU_PLU(cl12),
@@ -79,7 +79,7 @@ static void create_task_12(unsigned k, unsigned j)
 static void create_task_21(unsigned k, unsigned i)
 {
 #ifdef STARPU_DEVEL
-#warning temporary fix 
+#warning temporary fix
 #endif
 	starpu_mpi_task_insert(MPI_COMM_WORLD,
 			       //&STARPU_PLU(cl21),
@@ -114,13 +114,14 @@ static void create_task_22(unsigned k, unsigned i, unsigned j)
 }
 
 /*
- *	code to bootstrap the factorization 
+ *	code to bootstrap the factorization
  */
 
 double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size, unsigned _no_prio)
 {
 	double start;
 	double end;
+	int ret;
 
 	nblocks = _nblocks;
 	rank = _rank;
@@ -130,7 +131,10 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size, unsig
 	/* create all the DAG nodes */
 	unsigned i,j,k;
 
-	starpu_mpi_barrier(MPI_COMM_WORLD);
+	ret = starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+	ret = starpu_mpi_barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
 
 	start = starpu_timing_now();
 
@@ -170,15 +174,16 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size, unsig
 		starpu_iteration_pop();
 	}
 
-	starpu_task_wait_for_all();
-
-	starpu_mpi_barrier(MPI_COMM_WORLD);
+	ret = starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+	ret = starpu_mpi_barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
 
 	end = starpu_timing_now();
 
 	double timing = end - start;
-	
+
 //	fprintf(stderr, "RANK %d -> took %f ms\n", rank, timing/1000);
-	
+
 	return timing;
 }

+ 1 - 1
mpi/examples/user_datatype/user_datatype.c

@@ -120,8 +120,8 @@ int main(int argc, char **argv)
 		starpu_mpi_isend_detached(handle0, 0, 20, MPI_COMM_WORLD, NULL, NULL);
 	}
 
-	starpu_mpi_barrier(MPI_COMM_WORLD);
 	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+	starpu_mpi_barrier(MPI_COMM_WORLD);
 
 	starpu_mpi_datatype_unregister(handle0);
 	starpu_data_unregister(handle0);

+ 1 - 1
mpi/examples/user_datatype/user_datatype2.c

@@ -80,8 +80,8 @@ int main(int argc, char **argv)
 		starpu_mpi_isend_detached(handle0, 0, 20, MPI_COMM_WORLD, NULL, NULL);
 	}
 
-	starpu_mpi_barrier(MPI_COMM_WORLD);
 	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+	starpu_mpi_barrier(MPI_COMM_WORLD);
 
 	starpu_mpi_datatype_unregister(handle0);
 	starpu_data_unregister(handle0);

+ 31 - 27
mpi/src/mpi/starpu_mpi_mpi.c

@@ -96,7 +96,7 @@ starpu_pthread_queue_t _starpu_mpi_thread_dontsleep;
 /* Count requests posted by the application and not yet submitted to MPI */
 static starpu_pthread_mutex_t mutex_posted_requests;
 static starpu_pthread_mutex_t mutex_ready_requests;
-static int posted_requests = 0, ready_requests = 0, newer_requests, barrier_running = 0;
+static int posted_requests = 0, ready_requests = 0, newer_requests, mpi_wait_for_all_running = 0;
 
 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
 #define _STARPU_MPI_INC_READY_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_ready_requests); ready_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_ready_requests); }
@@ -761,16 +761,40 @@ static void _starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
 int _starpu_mpi_barrier(MPI_Comm comm)
 {
 	struct _starpu_mpi_req *barrier_req;
-	int ret = posted_requests+ready_requests;
 
+	/* Initialize the request structure */
+	_starpu_mpi_request_init(&barrier_req);
+	barrier_req->prio = INT_MAX;
+	barrier_req->func = _starpu_mpi_barrier_func;
+	barrier_req->request_type = BARRIER_REQ;
+	barrier_req->node_tag.node.comm = comm;
+
+	_STARPU_MPI_INC_POSTED_REQUESTS(1);
+	_starpu_mpi_submit_ready_request(barrier_req);
+
+	/* We wait for the MPI request to finish */
+	STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->backend->req_mutex);
+	while (!barrier_req->completed)
+		STARPU_PTHREAD_COND_WAIT(&barrier_req->backend->req_cond, &barrier_req->backend->req_mutex);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&barrier_req->backend->req_mutex);
+
+	_starpu_mpi_request_destroy(barrier_req);
+	_STARPU_MPI_LOG_OUT();
+
+	return 0;
+}
+
+int _starpu_mpi_wait_for_all(MPI_Comm comm)
+{
+	(void) comm;
 	_STARPU_MPI_LOG_IN();
 
 	/* First wait for *both* all tasks and MPI requests to finish, in case
 	 * some tasks generate MPI requests, MPI requests generate tasks, etc.
 	 */
 	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
-	STARPU_MPI_ASSERT_MSG(!barrier_running, "Concurrent starpu_mpi_barrier is not implemented, even on different communicators");
-	barrier_running = 1;
+	STARPU_MPI_ASSERT_MSG(!mpi_wait_for_all_running, "Concurrent starpu_mpi_wait_for_all is not implemented, even on different communicators");
+	mpi_wait_for_all_running = 1;
 	do
 	{
 		while (posted_requests || ready_requests)
@@ -786,29 +810,9 @@ int _starpu_mpi_barrier(MPI_Comm comm)
 		 * triggered by tasks completed and triggered tasks between
 		 * wait_for_all finished and we take the lock */
 	} while (posted_requests || ready_requests || newer_requests);
-	barrier_running = 0;
+	mpi_wait_for_all_running = 0;
 	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
-
-	/* Initialize the request structure */
-	_starpu_mpi_request_init(&barrier_req);
-	barrier_req->prio = INT_MAX;
-	barrier_req->func = _starpu_mpi_barrier_func;
-	barrier_req->request_type = BARRIER_REQ;
-	barrier_req->node_tag.node.comm = comm;
-
-	_STARPU_MPI_INC_POSTED_REQUESTS(1);
-	_starpu_mpi_submit_ready_request(barrier_req);
-
-	/* We wait for the MPI request to finish */
-	STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->backend->req_mutex);
-	while (!barrier_req->completed)
-		STARPU_PTHREAD_COND_WAIT(&barrier_req->backend->req_cond, &barrier_req->backend->req_mutex);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&barrier_req->backend->req_mutex);
-
-	_starpu_mpi_request_destroy(barrier_req);
-	_STARPU_MPI_LOG_OUT();
-
-	return ret;
+	return 0;
 }
 
 /********************************************************/
@@ -1269,7 +1273,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 			_STARPU_MPI_DEBUG(3, "NO MORE REQUESTS TO HANDLE\n");
 			_STARPU_MPI_TRACE_SLEEP_BEGIN();
 
-			if (barrier_running)
+			if (mpi_wait_for_all_running)
 				/* Tell mpi_barrier */
 				STARPU_PTHREAD_COND_SIGNAL(&barrier_cond);
 			STARPU_PTHREAD_COND_WAIT(&progress_cond, &progress_mutex);

+ 59 - 18
mpi/src/nmad/starpu_mpi_nmad.c

@@ -59,16 +59,20 @@ static starpu_pthread_cond_t progress_cond;
 static starpu_pthread_mutex_t progress_mutex;
 static volatile int running = 0;
 
-extern struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count);
+static starpu_pthread_cond_t mpi_wait_for_all_running_cond;
+static int mpi_wait_for_all_running = 0;
+static starpu_pthread_mutex_t mpi_wait_for_all_running_mutex;
 
-/* Count requests posted by the application and not yet submitted to MPI, i.e pushed into the new_requests list */
+extern struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count);
 
-static volatile int pending_request = 0;
+/* Count running requests: this counter is incremented just before StarPU
+ * submits a MPI request, and decremented when a MPI request finishes. */
+static volatile int nb_pending_requests = 0;
 
 #define REQ_FINALIZED 0x1
 
-PUK_LFSTACK_TYPE(callback,	struct _starpu_mpi_req *req;);
-static callback_lfstack_t callback_stack = NULL;
+PUK_LFSTACK_TYPE(callback, struct _starpu_mpi_req *req;);
+static callback_lfstack_t callback_stack;
 
 static starpu_sem_t callback_sem;
 
@@ -80,7 +84,7 @@ static starpu_sem_t callback_sem;
 
 void _starpu_mpi_req_willpost(struct _starpu_mpi_req *req STARPU_ATTRIBUTE_UNUSED)
 {
-	STARPU_ATOMIC_ADD( &pending_request, 1);
+	STARPU_ATOMIC_ADD( &nb_pending_requests, 1);
 }
 
 /********************************************************/
@@ -269,16 +273,39 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 int _starpu_mpi_barrier(MPI_Comm comm)
 {
 	_STARPU_MPI_LOG_IN();
-	int ret;
-	//	STARPU_ASSERT_MSG(!barrier_running, "Concurrent starpu_mpi_barrier is not implemented, even on different communicators");
-	ret = MPI_Barrier(comm);
 
+	int ret = MPI_Barrier(comm);
 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Barrier returning %d", ret);
 
 	_STARPU_MPI_LOG_OUT();
 	return ret;
 }
 
+int _starpu_mpi_wait_for_all(MPI_Comm comm)
+{
+	(void) comm;
+	_STARPU_MPI_LOG_IN();
+
+	STARPU_PTHREAD_MUTEX_LOCK(&mpi_wait_for_all_running_mutex);
+	STARPU_MPI_ASSERT_MSG(!mpi_wait_for_all_running, "Concurrent starpu_mpi_wait_for_all is not implemented, even on different communicators");
+	mpi_wait_for_all_running = 1;
+	do
+	{
+		while (nb_pending_requests)
+			STARPU_PTHREAD_COND_WAIT(&mpi_wait_for_all_running_cond, &mpi_wait_for_all_running_mutex);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_wait_for_all_running_mutex);
+
+		starpu_task_wait_for_all();
+
+		STARPU_PTHREAD_MUTEX_LOCK(&mpi_wait_for_all_running_mutex);
+	} while (nb_pending_requests);
+	mpi_wait_for_all_running = 0;
+	STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_wait_for_all_running_mutex);
+
+	_STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
 /********************************************************/
 /*                                                      */
 /*  Progression                                         */
@@ -353,9 +380,13 @@ void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,nm_sr_ev
 			req->completed = 1;
 			piom_cond_signal(&req->backend->req_cond, REQ_FINALIZED);
 		}
-		int pending_remaining = STARPU_ATOMIC_ADD(&pending_request, -1);
-		if (!running && !pending_remaining)
-			starpu_sem_post(&callback_sem);
+		int pending_remaining = STARPU_ATOMIC_ADD(&nb_pending_requests, -1);
+		if (!pending_remaining)
+		{
+			STARPU_PTHREAD_COND_BROADCAST(&mpi_wait_for_all_running_cond);
+			if (!running)
+				starpu_sem_post(&callback_sem);
+		}
 	}
 	_STARPU_MPI_LOG_OUT();
 }
@@ -476,24 +507,24 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		struct callback_lfstack_cell_s* c = callback_lfstack_pop(&callback_stack);
 		int err=0;
 
-		if(running || pending_request>0)
+		if(running || nb_pending_requests>0)
 		{
 			/* shall we block ? */
 			err = starpu_sem_wait(&callback_sem);
-			//running pending_request can change while waiting
+			//running nb_pending_requests can change while waiting
 		}
 		if(c==NULL)
 		{
 			c = callback_lfstack_pop(&callback_stack);
 			if (c == NULL)
 			{
-				if(running && pending_request>0)
+				if(running && nb_pending_requests>0)
 				{
 					STARPU_ASSERT_MSG(c!=NULL, "Callback thread awakened without callback ready with error %d.",err);
 				}
 				else
 				{
-					if (pending_request==0)
+					if (nb_pending_requests==0)
 						break;
 				}
 				continue;
@@ -511,14 +542,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 			c->req->completed=1;
 			piom_cond_signal(&(c->req->backend->req_cond), REQ_FINALIZED);
 		}
-		STARPU_ATOMIC_ADD( &pending_request, -1);
+		STARPU_ATOMIC_ADD( &nb_pending_requests, -1);
 		/* we signal that the request is completed.*/
 
 		free(c);
 
 	}
 	STARPU_ASSERT_MSG(callback_lfstack_pop(&callback_stack)==NULL, "List of callback not empty.");
-	STARPU_ASSERT_MSG(pending_request==0, "Request still pending.");
+	STARPU_ASSERT_MSG(nb_pending_requests==0, "Request still pending.");
 
 	if (argc_argv->initialize_mpi)
 	{
@@ -580,6 +611,9 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
         STARPU_PTHREAD_MUTEX_INIT(&progress_mutex, NULL);
         STARPU_PTHREAD_COND_INIT(&progress_cond, NULL);
 
+        STARPU_PTHREAD_MUTEX_INIT(&mpi_wait_for_all_running_mutex, NULL);
+        STARPU_PTHREAD_COND_INIT(&mpi_wait_for_all_running_cond, NULL);
+
 	starpu_sem_init(&callback_sem, 0, 0);
 	running = 0;
 
@@ -594,6 +628,8 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
 	}
 
+	callback_lfstack_init(&callback_stack);
+
 	/* Tell pioman to use a bound thread for communication progression:
 	 * share the same core as StarPU's MPI thread, the MPI thread has very low activity with NMAD backend */
 	int indexes[1] = { _starpu_mpi_thread_cpuid };
@@ -663,8 +699,13 @@ void _starpu_mpi_progress_shutdown(void **value)
 
 	STARPU_PTHREAD_JOIN(progress_thread, value);
 
+	callback_lfstack_destroy(&callback_stack);
+
         STARPU_PTHREAD_MUTEX_DESTROY(&progress_mutex);
         STARPU_PTHREAD_COND_DESTROY(&progress_cond);
+
+        STARPU_PTHREAD_MUTEX_DESTROY(&mpi_wait_for_all_running_mutex);
+        STARPU_PTHREAD_COND_DESTROY(&mpi_wait_for_all_running_cond);
 }
 
 static int64_t _starpu_mpi_tag_max = INT64_MAX;

+ 1 - 3
mpi/src/starpu_mpi.c

@@ -437,7 +437,5 @@ void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t data, int new_r
 
 int starpu_mpi_wait_for_all(MPI_Comm comm)
 {
-	starpu_task_wait_for_all();
-	starpu_mpi_barrier(comm);
-	return 0;
+	return _starpu_mpi_wait_for_all(comm);
 }

+ 3 - 1
mpi/src/starpu_mpi_private.h

@@ -310,7 +310,6 @@ void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req);
 void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req);
 int _starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status);
 int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status);
-int _starpu_mpi_barrier(MPI_Comm comm);
 
 struct _starpu_mpi_argc_argv
 {
@@ -331,6 +330,9 @@ void _starpu_mpi_wait_for_initialization();
 #endif
 void _starpu_mpi_data_flush(starpu_data_handle_t data_handle);
 
+int _starpu_mpi_barrier(MPI_Comm comm);
+int _starpu_mpi_wait_for_all(MPI_Comm comm);
+
 /*
  * Specific functions to backend implementation
  */

+ 5 - 3
mpi/tests/Makefile.am

@@ -58,9 +58,11 @@ BUILT_SOURCES =
 
 CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log
 
-EXTRA_DIST = 					\
-	user_defined_datatype_value.h		\
-	helper.h
+EXTRA_DIST = 				\
+	abstract_sendrecv_bench.h	\
+	bench_helper.h			\
+	helper.h			\
+	user_defined_datatype_value.h
 
 examplebindir = $(libdir)/starpu/examples/mpi
 

+ 5 - 0
mpi/tests/abstract_sendrecv_bench.c

@@ -25,6 +25,11 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 
 	if (mpi_rank >= 2)
 	{
+		if (thread_barrier != NULL)
+		{
+			STARPU_PTHREAD_BARRIER_WAIT(thread_barrier);
+		}
+
 		for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
 		{
 			iterations = bench_nb_iterations(iterations, s);

+ 1 - 1
mpi/tests/bench_helper.c

@@ -33,7 +33,7 @@ int comp_double(const void*_a, const void*_b)
 
 uint64_t bench_next_size(uint64_t len)
 {
-	uint64_t next = len * MULT_DEFAULT + INCR_DEFAULT;
+	uint64_t next = len * MULT_DEFAULT;
 
 	if(next <= len)
 		next++;

+ 7 - 9
mpi/tests/bench_helper.h

@@ -18,18 +18,16 @@
 #include <starpu_mpi.h>
 #include "helper.h"
 
-#define NX_MAX (512 * 1024 * 1024) // kB
 #define NX_MIN 0
+
 #ifdef STARPU_QUICK_CHECK
-#define MULT_DEFAULT 4
-#else
-#define MULT_DEFAULT 2
-#endif
-#define INCR_DEFAULT 0
-#ifdef STARPU_QUICK_CHECK
-#define LOOPS_DEFAULT 100
+	#define MULT_DEFAULT 4
+	#define LOOPS_DEFAULT 100
+	#define NX_MAX (64 * 1024 * 1024) // kB
 #else
-#define LOOPS_DEFAULT 100000
+	#define MULT_DEFAULT 2
+	#define LOOPS_DEFAULT 100000
+	#define NX_MAX (512 * 1024 * 1024) // kB
 #endif
 
 int comp_double(const void*_a, const void*_b);

+ 0 - 1
mpi/tests/sendrecv_gemm_bench.c

@@ -320,7 +320,6 @@ static void* comm_thread_func(void* arg)
 	return NULL;
 }
 
-
 int main(int argc, char **argv)
 {
 	double start, end;

+ 6 - 0
mpi/tests/sendrecv_parallel_tasks_bench.c

@@ -32,7 +32,11 @@
 
 /* We reduce NX_MAX, since some NICs don't support exchanging simultaneously such amount of memory */
 #undef NX_MAX
+#ifdef STARPU_QUICK_CHECK
+#define NX_MAX (1024)
+#else
 #define NX_MAX (64 * 1024 * 1024)
+#endif
 
 
 void cpu_task(void* descr[], void* args)
@@ -108,6 +112,8 @@ void cpu_task(void* descr[], void* args)
 			current_worker, (long long) s, min_lat, bw_million_byte, bw_mbyte, d1_lat, med_lat, avg_lat, d9_lat, max_lat);
 		fflush(stdout);
 	}
+
+	free(lats);
 }
 
 static struct starpu_codelet cl =

+ 2 - 1
src/common/barrier.c

@@ -50,7 +50,8 @@ int _starpu_barrier_test(struct _starpu_barrier *barrier)
 int _starpu_barrier_destroy(struct _starpu_barrier *barrier)
 {
 	int ret;
-	do {
+	do
+	{
 		ret = _starpu_barrier_test(barrier);
 	}
 	while (ret == EBUSY);

+ 4 - 2
src/core/dependencies/data_concurrency.c

@@ -402,7 +402,8 @@ void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _sta
 {
 	unsigned buf;
 
-	if (j->task->cl) {
+	if (j->task->cl)
+	{
 		unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(j->task);
 
 		for (buf = 0; buf < nbuffers; buf++)
@@ -415,7 +416,8 @@ void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _sta
 
 		/* We need to check data availability only if sequential consistency
 		 * dependencies have not been used */
-		if (!j->sequential_consistency) {
+		if (!j->sequential_consistency)
+		{
 			for (buf = 0; buf < nbuffers; buf++)
 			{
 				starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(j->task, buf);

+ 2 - 1
src/core/dependencies/dependencies.c

@@ -41,7 +41,8 @@ void _starpu_notify_dependencies(struct _starpu_job *j)
 static starpu_notify_ready_soon_func notify_ready_soon_func;
 static void *notify_ready_soon_func_data;
 
-struct _starpu_notify_job_start_data {
+struct _starpu_notify_job_start_data
+{
 	double delay;
 };
 

+ 4 - 2
src/core/dependencies/implicit_data_deps.c

@@ -234,11 +234,13 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 
 		/* Skip tasks that are associated to a reduction phase so that
 		 * they do not interfere with the application. */
-		if (pre_sync_job->reduction_task) {
+		if (pre_sync_job->reduction_task)
+		{
 			*submit_pre_sync = 1;
 			return NULL;
 		}
-		if (post_sync_job->reduction_task) {
+		if (post_sync_job->reduction_task)
+		{
 			*submit_pre_sync = 0;
 			return NULL;
 		}

+ 6 - 3
src/core/perfmodel/perfmodel_history.c

@@ -1026,7 +1026,8 @@ void starpu_perfmodel_dump_xml(FILE *f, struct starpu_perfmodel *model)
 		for(dev = 0; dev < ndevices; dev++)
 		{
 			const char *type;
-			switch (arch_combs[comb]->devices[dev].type) {
+			switch (arch_combs[comb]->devices[dev].type)
+			{
 				case STARPU_CPU_WORKER: type = "CPU"; break;
 				case STARPU_CUDA_WORKER: type = "CUDA"; break;
 				case STARPU_OPENCL_WORKER: type = "OpenCL"; break;
@@ -1421,7 +1422,8 @@ int starpu_perfmodel_list(FILE *output)
 	else
 	{
 		int i;
-		for (i = 0; i < n; i++) {
+		for (i = 0; i < n; i++)
+		{
 			if (strcmp(list[i]->d_name, ".") && strcmp(list[i]->d_name, ".."))
 				fprintf(output, "file: <%s>\n", list[i]->d_name);
 			free(list[i]);
@@ -1772,7 +1774,8 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, s
 
 docal:
 #ifdef STARPU_SIMGRID
-	if (isnan(exp)) {
+	if (isnan(exp))
+	{
 		char archname[STR_SHORT_LENGTH];
 		starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
 

+ 16 - 7
src/core/topology.c

@@ -299,13 +299,15 @@ int _starpu_task_data_get_node_on_node(struct starpu_task *task, unsigned index,
 	int node = STARPU_SPECIFIC_NODE_LOCAL;
 	if (task->cl->specific_nodes)
 		node = STARPU_CODELET_GET_NODE(task->cl, index);
-	switch (node) {
+	switch (node)
+	{
 	case STARPU_SPECIFIC_NODE_LOCAL:
 		// TODO: rather find MCDRAM
 		node = local_node;
 		break;
 	case STARPU_SPECIFIC_NODE_CPU:
-		switch (starpu_node_get_kind(local_node)) {
+		switch (starpu_node_get_kind(local_node))
+		{
 		case STARPU_CPU_RAM:
 			node = local_node;
 			break;
@@ -320,10 +322,13 @@ int _starpu_task_data_get_node_on_node(struct starpu_task *task, unsigned index,
 		node = local_node;
 		break;
 	case STARPU_SPECIFIC_NODE_LOCAL_OR_CPU:
-		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID) {
+		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID)
+		{
 			/* It is here already, rather access it from here */
 			node = local_node;
-		} else {
+		}
+		else
+		{
 			/* It is not here already, do not bother moving it */
 			node = STARPU_MAIN_RAM;
 		}
@@ -338,7 +343,8 @@ int _starpu_task_data_get_node_on_worker(struct starpu_task *task, unsigned inde
 	int node = STARPU_SPECIFIC_NODE_LOCAL;
 	if (task->cl->specific_nodes)
 		node = STARPU_CODELET_GET_NODE(task->cl, index);
-	switch (node) {
+	switch (node)
+	{
 	case STARPU_SPECIFIC_NODE_LOCAL:
 		// TODO: rather find MCDRAM
 		node = local_node;
@@ -353,10 +359,13 @@ int _starpu_task_data_get_node_on_worker(struct starpu_task *task, unsigned inde
 		node = local_node;
 		break;
 	case STARPU_SPECIFIC_NODE_LOCAL_OR_CPU:
-		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID) {
+		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID)
+		{
 			/* It is here already, rather access it from here */
 			node = local_node;
-		} else {
+		}
+		else
+		{
 			/* It is not here already, do not bother moving it */
 			node = STARPU_MAIN_RAM;
 		}

+ 3 - 3
src/core/workers.c

@@ -428,7 +428,8 @@ static inline int _starpu_can_use_nth_implementation(enum starpu_worker_archtype
 
 /* Test if this task can be processed on this worker, regardless of the implementation */
 /* must be called with sched_mutex locked to protect state_blocked */
-static inline int _starpu_can_execute_task_any_impl(unsigned workerid, struct starpu_task *task) {
+static inline int _starpu_can_execute_task_any_impl(unsigned workerid, struct starpu_task *task)
+{
 
 	if (!_starpu_config.workers[workerid].enable_knob)
 		return 0;
@@ -440,7 +441,6 @@ static inline int _starpu_can_execute_task_any_impl(unsigned workerid, struct st
 			return 0;
 	}
 
-	
 	/* if the worker is blocked in a parallel ctx don't submit tasks on it */
 #ifdef STARPU_DEVEL
 #warning FIXME: this is very expensive, while can_execute is supposed to be not very costly so schedulers can call it a lot
@@ -451,7 +451,7 @@ static inline int _starpu_can_execute_task_any_impl(unsigned workerid, struct st
 	if (!(task->where & _starpu_config.workers[workerid].worker_mask))
 		return 0;
 
-	return 1; 
+	return 1;
 }
 
 /* must be called with sched_mutex locked to protect state_blocked_in_parallel */

+ 2 - 1
src/datawizard/coherency.c

@@ -208,7 +208,8 @@ void _starpu_update_data_state(starpu_data_handle_t handle,
 			for (node = 0; node < nnodes; node++)
 			{
 				struct _starpu_data_replicate *replicate = &handle->per_node[node];
-                               if (replicate->state != STARPU_INVALID){
+                               if (replicate->state != STARPU_INVALID)
+			       {
                                        _STARPU_TRACE_DATA_STATE_SHARED(handle, node);
 					replicate->state = STARPU_SHARED;
                                }

+ 2 - 1
src/datawizard/interfaces/matrix_interface.c

@@ -213,7 +213,8 @@ static void display_matrix_interface(starpu_data_handle_t handle, FILE *f)
 
 //#define DYNAMIC_MATRICES
 
-struct pack_matrix_header {
+struct pack_matrix_header
+{
 #ifdef DYNAMIC_MATRICES
 	/* Receiving matrices with different sizes from MPI */
 	/* FIXME: that would break alignment for O_DIRECT disk access...

+ 32 - 23
src/debug/traces/starpu_fxt.c

@@ -343,7 +343,8 @@ static void handle_papi_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *o
 	long long int value = ev->param[2];
 	//char *prefix = options->file_prefix;
 
-	if (papi_file){
+	if (papi_file)
+	{
 		char event_str[PAPI_MAX_STR_LEN];
 		PAPI_event_code_to_name(event_code, event_str);
 		fprintf(papi_file, "JobId: %lu\n", task);
@@ -2470,47 +2471,55 @@ static void handle_memnode_event(struct fxt_ev_64 *ev, struct starpu_fxt_options
 		memnode_set_state(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr);
 }
 
-static void handle_memnode_event_start_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
+static void handle_memnode_event_start_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
+{
 	unsigned memnode = ev->param[0];
-       unsigned size = ev->param[2];
-       unsigned long handle = ev->param[3];
+	unsigned size = ev->param[2];
+	unsigned long handle = ev->param[3];
 
 	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, size, memnode, options);
 }
 
-static void handle_memnode_event_start_4(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
-       unsigned memnode = ev->param[0];
-       unsigned dest = ev->param[1];
-       if(strcmp(eventstr, "rc")==0){
-               //If it is a Request Create, use dest normally
-       }else{
-               dest = memnode;
-       }
-       unsigned size = ev->param[2];
-       unsigned long handle = ev->param[3];
-       unsigned prefe = ev->param[4];
+static void handle_memnode_event_start_4(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
+{
+	unsigned memnode = ev->param[0];
+	unsigned dest = ev->param[1];
+	if(strcmp(eventstr, "rc")==0)
+	{
+		//If it is a Request Create, use dest normally
+	}
+	else
+	{
+		dest = memnode;
+	}
+	unsigned size = ev->param[2];
+	unsigned long handle = ev->param[3];
+	unsigned prefe = ev->param[4];
 
-       memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, prefe, size, dest, options);
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, prefe, size, dest, options);
 }
 
-static void handle_memnode_event_end_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
+static void handle_memnode_event_end_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
+{
 	unsigned memnode = ev->param[0];
-       unsigned long handle = ev->param[2];
-       unsigned info = ev->param[3];
+	unsigned long handle = ev->param[2];
+	unsigned info = ev->param[3];
 
 	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, info, 0, memnode, options);
 }
 
-static void handle_memnode_event_start_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
+static void handle_memnode_event_start_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
+{
 	unsigned memnode = ev->param[0];
-       unsigned long handle = ev->param[2];
+	unsigned long handle = ev->param[2];
 
 	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, memnode, options);
 }
 
-static void handle_memnode_event_end_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr){
+static void handle_memnode_event_end_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
+{
 	unsigned memnode = ev->param[0];
-       unsigned long handle = ev->param[2];
+	unsigned long handle = ev->param[2];
 
 	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, memnode, options);
 }

+ 16 - 16
src/sched_policies/component_best_implementation.c

@@ -38,26 +38,26 @@ static int find_best_impl(unsigned sched_ctx_id, struct starpu_task * task, int
 		len = 0.0;
 	}
 	else
-	{	
-	    struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
-	    for(impl = 0; impl < STARPU_MAXIMPLEMENTATIONS; impl++)
-	    {
-		if(starpu_worker_can_execute_task(workerid, task, impl))
+	{
+		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
+		for(impl = 0; impl < STARPU_MAXIMPLEMENTATIONS; impl++)
 		{
-			double d = starpu_task_expected_length(task, archtype, impl);
-			if(isnan(d))
-			{
-				best_impl = impl;
-				len = 0.0;
-				break;
-			}
-			if(d < len)
+			if(starpu_worker_can_execute_task(workerid, task, impl))
 			{
-				len = d;
-				best_impl = impl;
+				double d = starpu_task_expected_length(task, archtype, impl);
+				if(isnan(d))
+				{
+					best_impl = impl;
+					len = 0.0;
+					break;
+				}
+				if(d < len)
+				{
+					len = d;
+					best_impl = impl;
+				}
 			}
 		}
-	    }
 	}
 	if(best_impl == -1)
 		return 0;

+ 21 - 10
src/sched_policies/component_heteroprio.c

@@ -206,7 +206,8 @@ out:
 	//fprintf(stderr, "could not push %p to %d actually\n", task, best_icomponent);
 	/* Could not push to child actually, push that one back */
 	STARPU_COMPONENT_MUTEX_LOCK(mutex);
-	for (j = 0; j < (int) data->naccel; j++) {
+	for (j = 0; j < (int) data->naccel; j++)
+	{
 		if (acceleration == data->accel[j])
 		{
 			_starpu_prio_deque_push_front_task(data->bucket[j], task);
@@ -305,7 +306,8 @@ static int heteroprio_progress_one(struct starpu_sched_component *component)
 	task = _starpu_prio_deque_pop_task(no_accel);
 	STARPU_COMPONENT_MUTEX_UNLOCK(mutex);
 
-	if (task) {
+	if (task)
+	{
 		if (heteroprio_progress_noaccel(component, data, task))
 		{
 			/* Could not push to child actually, push that one back */
@@ -388,7 +390,8 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 			max_expected = min_arch;
 	}
 
-	if (workerid == -1) {
+	if (workerid == -1)
+	{
 		/* All archs can run it */
 		STARPU_ASSERT(!isnan(min_expected));
 		STARPU_ASSERT(!isnan(max_expected));
@@ -402,13 +405,15 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 		STARPU_COMPONENT_MUTEX_LOCK(mutex);
 		unsigned i, j;
 		/* Try to find a bucket with similar acceleration */
-		for (i = 0; i < data->naccel; i++) {
+		for (i = 0; i < data->naccel; i++)
+		{
 			if (acceleration >= data->accel[i] * (1 - APPROX) &&
 			    acceleration <= data->accel[i] * (1 + APPROX))
 				break;
 		}
 
-		if (i == data->naccel) {
+		if (i == data->naccel)
+		{
 			/* Didn't find it, add one */
 			data->naccel++;
 
@@ -418,8 +423,10 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 			_starpu_prio_deque_init(newbucket);
 			int inserted = 0;
 
-			for (j = 0; j < data->naccel-1; j++) {
-				if (!inserted && acceleration > data->accel[j]) {
+			for (j = 0; j < data->naccel-1; j++)
+			{
+				if (!inserted && acceleration > data->accel[j])
+				{
 					/* Insert the new bucket here */
 					i = j;
 					newbuckets[j] = newbucket;
@@ -429,7 +436,8 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 				newbuckets[j+inserted] = data->bucket[j];
 				newaccel[j+inserted] = data->accel[j];
 			}
-			if (!inserted) {
+			if (!inserted)
+			{
 				/* Insert it last */
 				newbuckets[data->naccel-1] = newbucket;
 				newaccel[data->naccel-1] = acceleration;
@@ -441,14 +449,17 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 		}
 #if 0
 		fprintf(stderr,"buckets:");
-		for (j = 0; j < data->naccel; j++) {
+		for (j = 0; j < data->naccel; j++)
+		{
 			fprintf(stderr, " %f", data->accel[j]);
 		}
 		fprintf(stderr,"\ninserting %p %f to %d\n", task, acceleration, i);
 #endif
 		_starpu_prio_deque_push_back_task(data->bucket[i],task);
 		STARPU_COMPONENT_MUTEX_UNLOCK(mutex);
-	} else {
+	}
+	else
+	{
 		/* Not all archs can run it, will resort to HEFT strategy */
 		acceleration = INFINITY;
 		//fprintf(stderr,"%s: some archs can't do it\n", starpu_task_get_name(task));

+ 4 - 2
src/sched_policies/component_sched.c

@@ -284,8 +284,10 @@ void _starpu_sched_component_update_workers_in_ctx(struct starpu_sched_component
 	struct starpu_bitmap * workers_in_ctx = _starpu_get_worker_mask(sched_ctx_id);
 	starpu_bitmap_unset_and(component->workers_in_ctx,component->workers, workers_in_ctx);
 	unsigned i,j;
-	for(i = starpu_worker_get_count(); i < starpu_worker_get_count() + starpu_combined_worker_get_count(); i++) {
-		if (starpu_bitmap_get(component->workers, i)) {
+	for(i = starpu_worker_get_count(); i < starpu_worker_get_count() + starpu_combined_worker_get_count(); i++)
+	{
+		if (starpu_bitmap_get(component->workers, i))
+		{
 			/* Component has this combined worker, check whether the
 			 * context has all the corresponding workers */
 			int worker_size;

+ 2 - 1
src/sched_policies/component_work_stealing.c

@@ -240,7 +240,8 @@ static int push_task(struct starpu_sched_component * component, struct starpu_ta
 
 	/* Find a child component that can execute this task */
 	i = (i+1)%component->nchildren;
-	while(1) {
+	while(1)
+	{
 		int workerid;
 		for(workerid = starpu_bitmap_first(component->children[i]->workers_in_ctx);
 		    -1 != workerid;

+ 2 - 1
src/sched_policies/component_worker.c

@@ -149,7 +149,8 @@ struct _starpu_worker_component_data
 	union
 	{
 		struct _starpu_worker * worker;
-		struct {
+		struct
+		{
 			unsigned worker_size;
 			unsigned workerids[STARPU_NMAXWORKERS];
 		} parallel_worker;

+ 1 - 1
src/sched_policies/heteroprio.c

@@ -524,7 +524,7 @@ static struct starpu_task *pop_task_heteroprio_policy(unsigned sched_ctx_id)
 				nb_added_tasks       += 1;
 				// TODO starpu_prefetch_task_input_for(task, workerid);
 			}
-		}		
+		}
 	}
 
 	struct starpu_task* task = NULL;

+ 19 - 15
src/sched_policies/modular_ez.c

@@ -262,10 +262,13 @@ void starpu_sched_component_initialize_simple_schedulers(unsigned sched_ctx_id,
 		unsigned ntasks_threshold;
 		if (starpu_sched_component_is_heft(decision_component) ||
 		    starpu_sched_component_is_mct(decision_component) ||
-		    starpu_sched_component_is_heteroprio(decision_component)) {
+		    starpu_sched_component_is_heteroprio(decision_component))
+		{
 			/* These need more queueing to allow CPUs to take some share of the work */
 			ntasks_threshold = _STARPU_SCHED_NTASKS_THRESHOLD_HEFT;
-		} else {
+		}
+		else
+		{
 			ntasks_threshold = _STARPU_SCHED_NTASKS_THRESHOLD_DEFAULT;
 		}
 		/* But let user tune it */
@@ -279,20 +282,20 @@ void starpu_sched_component_initialize_simple_schedulers(unsigned sched_ctx_id,
 		int exp = flags & STARPU_SCHED_SIMPLE_FIFOS_BELOW_EXP ? 1 : 0;
 
 		struct starpu_sched_component_prio_data prio_data =
-			{
-				.ntasks_threshold = ntasks_threshold,
-				.exp_len_threshold = exp_len_threshold,
-				.ready = ready,
-				.exp = exp,
-			};
+		{
+			.ntasks_threshold = ntasks_threshold,
+			.exp_len_threshold = exp_len_threshold,
+			.ready = ready,
+			.exp = exp,
+		};
 
 		struct starpu_sched_component_fifo_data fifo_data =
-			{
-				.ntasks_threshold = ntasks_threshold,
-				.exp_len_threshold = exp_len_threshold,
-				.ready = ready,
-				.exp = exp,
-			};
+		{
+			.ntasks_threshold = ntasks_threshold,
+			.exp_len_threshold = exp_len_threshold,
+			.ready = ready,
+			.exp = exp,
+		};
 
 		/* Create one fifo+eager component pair per choice, below scheduling decision */
 		for(i = 0; i < nbelow; i++)
@@ -334,7 +337,8 @@ void starpu_sched_component_initialize_simple_schedulers(unsigned sched_ctx_id,
 					STARPU_ABORT();
 			}
 			STARPU_ASSERT(n >= 1);
-			if (n > 1) {
+			if (n > 1)
+			{
 				/* Several workers for this choice, need to introduce
 				 * a component to distribute the work */
 				struct starpu_sched_component *distribute;

+ 2 - 1
src/sched_policies/modular_gemm.c

@@ -26,7 +26,8 @@
 
 #define MEMORY_AFFINITY
 
-struct child_data {
+struct child_data
+{
 	double expected_start;
 	double predicted;
 	double predicted_transfer;

+ 2 - 1
src/sched_policies/modular_heteroprio_heft.c

@@ -22,7 +22,8 @@
 
 static void initialize_heteroprio_heft_center_policy(unsigned sched_ctx_id)
 {
-	struct starpu_sched_component_heteroprio_data heteroprio_data = {
+	struct starpu_sched_component_heteroprio_data heteroprio_data =
+	{
 		.mct = NULL,
 		.batch = 1,
 	};

+ 72 - 63
src/util/openmp_runtime_support.c

@@ -135,7 +135,9 @@ static void wake_up_and_unlock_task(struct starpu_omp_task *task)
 		weak_task_unlock(task);
 		int ret = starpu_task_submit(task->starpu_task);
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-	} else {
+	}
+	else
+	{
 		weak_task_unlock(task);
 	}
 }
@@ -379,36 +381,37 @@ static void starpu_omp_explicit_task_entry(struct starpu_omp_task *task)
 {
 	STARPU_ASSERT(!(task->flags & STARPU_OMP_TASK_FLAGS_IMPLICIT));
 	struct _starpu_worker *starpu_worker = _starpu_get_local_worker_key();
-   /* XXX on work */
-   if (task->is_loop) {
-      starpu_omp_for_inline_first_alt(task->nb_iterations, task->chunk, starpu_omp_sched_static, 1, &task->begin_i, &task->end_i);
-   }
-   if (starpu_worker->arch == STARPU_CPU_WORKER)
-   {
-      task->cpu_f(task->starpu_buffers, task->starpu_cl_arg);
-   }
+	/* XXX on work */
+	if (task->is_loop)
+	{
+		starpu_omp_for_inline_first_alt(task->nb_iterations, task->chunk, starpu_omp_sched_static, 1, &task->begin_i, &task->end_i);
+	}
+	if (starpu_worker->arch == STARPU_CPU_WORKER)
+	{
+		task->cpu_f(task->starpu_buffers, task->starpu_cl_arg);
+	}
 #ifdef STARPU_USE_CUDA
-   else if (starpu_worker->arch == STARPU_CUDA_WORKER)
-   {
-      task->cuda_f(task->starpu_buffers, task->starpu_cl_arg);
-   }
+	else if (starpu_worker->arch == STARPU_CUDA_WORKER)
+	{
+		task->cuda_f(task->starpu_buffers, task->starpu_cl_arg);
+	}
 #endif
 #ifdef STARPU_USE_OPENCL
-   else if (starpu_worker->arch == STARPU_OPENCL_WORKER)
-   {
-      task->opencl_f(task->starpu_buffers, task->starpu_cl_arg);
-   }
+	else if (starpu_worker->arch == STARPU_OPENCL_WORKER)
+	{
+		task->opencl_f(task->starpu_buffers, task->starpu_cl_arg);
+	}
 #endif
-   else
-      _STARPU_ERROR("invalid worker architecture");
-   /**/
+	else
+		_STARPU_ERROR("invalid worker architecture");
+	/**/
 	_starpu_omp_unregister_task_handles(task);
 	_starpu_spin_lock(&task->lock);
 	task->state = starpu_omp_task_state_terminated;
 	task->transaction_pending=1;
 	_starpu_spin_unlock(&task->lock);
 	struct starpu_omp_thread *thread = _starpu_omp_get_thread();
-	/* 
+	/*
 	 * the task reached the terminated state, definitively give hand back to the worker code.
 	 *
 	 * about to run on the worker stack...
@@ -428,7 +431,7 @@ static void starpu_omp_implicit_task_entry(struct starpu_omp_task *task)
 		_starpu_omp_unregister_region_handles(task->owner_region);
 	}
 	task->state = starpu_omp_task_state_terminated;
-	/* 
+	/*
 	 * the task reached the terminated state, definitively give hand back to the worker code.
 	 *
 	 * about to run on the worker stack...
@@ -447,7 +450,7 @@ static void starpu_omp_task_preempt(void)
 	struct starpu_omp_thread *thread = _starpu_omp_get_thread();
 	task->state = starpu_omp_task_state_preempted;
 
-	/* 
+	/*
 	 * the task reached a blocked state, give hand back to the worker code.
 	 *
 	 * about to run on the worker stack...
@@ -486,7 +489,7 @@ static void starpu_omp_implicit_task_exec(void *buffers[], void *cl_arg)
 
 	task->state = starpu_omp_task_state_clear;
 
-	/* 
+	/*
 	 * start the task execution, or restore a previously preempted task.
 	 * about to run on the task stack...
 	 * */
@@ -655,7 +658,7 @@ static void starpu_omp_explicit_task_exec(void *buffers[], void *cl_arg)
 	}
 	task->state = starpu_omp_task_state_clear;
 
-	/* 
+	/*
 	 * start the task execution, or restore a previously preempted task.
 	 * about to run on the task stack...
 	 * */
@@ -694,11 +697,11 @@ static struct starpu_omp_task *create_omp_task_struct(struct starpu_omp_task *pa
 		task->flags |= STARPU_OMP_TASK_FLAGS_IMPLICIT;
 	}
 	_starpu_spin_init(&task->lock);
-	/* TODO: initialize task->data_env_icvs with proper values */ 
+	/* TODO: initialize task->data_env_icvs with proper values */
 	memset(&task->data_env_icvs, 0, sizeof(task->data_env_icvs));
 	if (is_implicit)
 	{
-	  /* TODO: initialize task->implicit_task_icvs with proper values */ 
+	  /* TODO: initialize task->implicit_task_icvs with proper values */
 		memset(&task->implicit_task_icvs, 0, sizeof(task->implicit_task_icvs));
 	}
 
@@ -1037,7 +1040,7 @@ void starpu_omp_parallel_region(const struct starpu_omp_parallel_region_attr *at
 	struct starpu_omp_task *task = _starpu_omp_get_task();
 	struct starpu_omp_region *generating_region = task->owner_region;
 	const int max_active_levels = generating_region->owner_device->icvs.max_active_levels_var;
-	struct starpu_omp_region *new_region = 
+	struct starpu_omp_region *new_region =
 		create_omp_region_struct(generating_region, _global_state.initial_device);
 	int ret;
 	int nb_threads = 1;
@@ -1166,7 +1169,7 @@ void starpu_omp_parallel_region(const struct starpu_omp_parallel_region_attr *at
 	}
 	STARPU_ASSERT(new_region->nb_threads == nb_threads);
 
-	/* 
+	/*
 	 * if task == initial_task, create a starpu task as a continuation to all the implicit
 	 * tasks of the new region, else prepare the task for preemption,
 	 * to become itself a continuation to the implicit tasks of the new region
@@ -1194,7 +1197,7 @@ void starpu_omp_parallel_region(const struct starpu_omp_parallel_region_attr *at
 	 * create the starpu tasks for the implicit omp tasks,
 	 * create explicit dependencies between these starpu tasks and the continuation starpu task
 	 */
-	for (i = 0; i < nb_threads; i++) 
+	for (i = 0; i < nb_threads; i++)
 	{
 		struct starpu_omp_task * implicit_task = new_region->implicit_task_array[i];
 		implicit_task->cl = attr->cl;
@@ -1234,7 +1237,7 @@ void starpu_omp_parallel_region(const struct starpu_omp_parallel_region_attr *at
 	/*
 	 * submit all the region implicit starpu tasks
 	 */
-	for (i = 0; i < nb_threads; i++) 
+	for (i = 0; i < nb_threads; i++)
 	{
 		struct starpu_omp_task * implicit_task = new_region->implicit_task_array[i];
 		ret = starpu_task_submit(implicit_task->starpu_task);
@@ -1292,7 +1295,7 @@ static void wake_up_barrier(struct starpu_omp_region *parallel_region)
 {
 	struct starpu_omp_task *task = _starpu_omp_get_task();
 	int i;
-	for (i = 0; i < parallel_region->nb_threads; i++) 
+	for (i = 0; i < parallel_region->nb_threads; i++)
 	{
 		struct starpu_omp_task * implicit_task = parallel_region->implicit_task_array[i];
 		if (implicit_task == task)
@@ -1343,7 +1346,7 @@ void starpu_omp_barrier(void)
 	{
 		ANNOTATE_HAPPENS_BEFORE(&parallel_region->barrier_count);
 		/* not the last task reaching the barrier
-		 * . prepare for conditional continuation 
+		 * . prepare for conditional continuation
 		 * . sleep
 		 */
 
@@ -1826,40 +1829,46 @@ void starpu_omp_taskgroup_inline_end(void)
 // XXX on work
 void starpu_omp_taskloop_inline_begin(struct starpu_omp_task_region_attr *attr)
 {
-   if (!attr->nogroup_clause)
-   {
-      starpu_omp_taskgroup_inline_begin();
-   }
-
-   int nb_subloop;
-   if (attr->num_tasks) {
-      nb_subloop = attr->num_tasks;
-   } else if (attr->grainsize) {
-      nb_subloop = attr->nb_iterations / attr->grainsize;
-   } else {
-      nb_subloop = 4;
-   }
-
-   attr->is_loop = 1;
-
-   int i;
-   int nb_iter_i = attr->nb_iterations / nb_subloop;
-   for (i = 0; i < nb_subloop; i++)
-   {
-      attr->begin_i = nb_iter_i * i;
-      attr->end_i = attr->begin_i + nb_iter_i;
-      attr->end_i += (i+1 != nb_subloop) ? 0 : (attr->nb_iterations % nb_subloop);
-      attr->chunk = attr->end_i - attr->begin_i;
-      starpu_omp_task_region(attr);
-   }
+	if (!attr->nogroup_clause)
+	{
+		starpu_omp_taskgroup_inline_begin();
+	}
+
+	int nb_subloop;
+	if (attr->num_tasks)
+	{
+		nb_subloop = attr->num_tasks;
+	}
+	else if (attr->grainsize)
+	{
+		nb_subloop = attr->nb_iterations / attr->grainsize;
+	}
+	else
+	{
+		nb_subloop = 4;
+	}
+
+	attr->is_loop = 1;
+
+	int i;
+	int nb_iter_i = attr->nb_iterations / nb_subloop;
+	for (i = 0; i < nb_subloop; i++)
+	{
+		attr->begin_i = nb_iter_i * i;
+		attr->end_i = attr->begin_i + nb_iter_i;
+		attr->end_i += (i+1 != nb_subloop) ? 0 : (attr->nb_iterations % nb_subloop);
+		attr->chunk = attr->end_i - attr->begin_i;
+		starpu_omp_task_region(attr);
+	}
 }
 
 // XXX on work
 void starpu_omp_taskloop_inline_end(const struct starpu_omp_task_region_attr *attr)
 {
-   if (!attr->nogroup_clause) {
-      starpu_omp_taskgroup_inline_end();
-   }
+	if (!attr->nogroup_clause)
+	{
+		starpu_omp_taskgroup_inline_end();
+	}
 }
 
 static inline void _starpu_omp_for_loop(struct starpu_omp_region *parallel_region, struct starpu_omp_task *task,
@@ -2170,7 +2179,7 @@ void starpu_omp_ordered_inline_end(void)
 	struct starpu_omp_region *parallel_region = task->owner_region;
 	struct starpu_omp_loop *loop = _starpu_omp_for_get_loop(parallel_region, task);
 
-	loop->ordered_iteration++;	
+	loop->ordered_iteration++;
 	condition_broadcast(&loop->ordered_cond, starpu_omp_task_wait_on_ordered);
 	_starpu_spin_unlock(&loop->ordered_lock);
 }

+ 5 - 3
tests/datawizard/bcsr.c

@@ -41,7 +41,8 @@ void cpu_show_bcsr(void *descr[], void *arg)
 
 	printf("nnz %d elemsize %d\n", nnz, elemsize);
 
-	for (i = 0; i < nrow; i++) {
+	for (i = 0; i < nrow; i++)
+	{
 		uint32_t row_start = rowptr[i] - firstentry;
 		uint32_t row_end = rowptr[i+1] - firstentry;
 
@@ -73,7 +74,7 @@ struct starpu_codelet show_cl =
 };
 
 /*
- * In this test, we use the following matrix: 
+ * In this test, we use the following matrix:
  *
  *   +----------------+
  *   |  0   1   0   0 |
@@ -129,7 +130,8 @@ int main(int argc, char **argv)
 
 	starpu_task_insert(&show_cl, STARPU_R, bcsr_handle, 0);
 
-	struct starpu_data_filter filter = {
+	struct starpu_data_filter filter =
+	{
 		.filter_func = starpu_bcsr_filter_vertical_block,
 		.nchildren = 2,
 	};

+ 2 - 1
tests/microbenchs/tasks_size_overhead.c

@@ -185,7 +185,8 @@ int main(int argc, char **argv)
 	unsetenv("STARPU_NCPU");
 #endif
 
-	if (STARPU_RUNNING_ON_VALGRIND) {
+	if (STARPU_RUNNING_ON_VALGRIND)
+	{
 		factortime *= 4;
 		cpustep *= 4;
 	}

+ 2 - 1
tests/parallel_tasks/parallel_kernels.c

@@ -89,7 +89,8 @@ int main(void)
 
 	unsigned iter, worker, n;
 	n = N;
-	if (STARPU_RUNNING_ON_VALGRIND) {
+	if (STARPU_RUNNING_ON_VALGRIND)
+	{
 		n /= 300;
 	}
 	for (iter = 0; iter < n; iter++)

+ 2 - 1
tests/parallel_tasks/parallel_kernels_spmd.c

@@ -92,7 +92,8 @@ int main(void)
 
 	unsigned iter, worker, n;
 	n = N;
-	if (STARPU_RUNNING_ON_VALGRIND) {
+	if (STARPU_RUNNING_ON_VALGRIND)
+	{
 		n /= 300;
 	}
 	for (iter = 0; iter < n; iter++)

+ 5 - 2
tools/starpu_perfmodel_display.c

@@ -175,9 +175,12 @@ int main(int argc, char **argv)
 			fprintf(stderr, "The performance model for the symbol <%s> could not be loaded\n", psymbol);
 			return 1;
 		}
-		if (xml) {
+		if (xml)
+		{
 			starpu_perfmodel_dump_xml(stdout, &model);
-		} else {
+		}
+		else
+		{
 			uint32_t *footprint = NULL;
 			if (pdisplay_specific_footprint == 1)
 			{

+ 4 - 2
tools/starpu_perfmodel_recdump.c

@@ -129,7 +129,8 @@ void print_archs(FILE* output)
 		{
 			if (starpu_worker_get_memory_node(workerid) == node)
 			{
-				if (!printed) {
+				if (!printed)
+				{
 					fprintf(output, "Workers:");
 					printed = 1;
 				}
@@ -145,7 +146,8 @@ void print_archs(FILE* output)
 	{
 		for (dst = 0; dst < starpu_memory_nodes_get_count(); dst++)
 		{
-			if (src != dst) {
+			if (src != dst)
+			{
 				fprintf(output, "MemoryNodeSrc: %d\n", src);
 				fprintf(output, "MemoryNodeDst: %d\n", dst);
 				fprintf(output, "Bandwidth: %f\n", starpu_transfer_bandwidth(src, dst));

+ 4 - 2
tools/starpu_replay.c

@@ -165,7 +165,8 @@ static void replay_data_register(starpu_data_handle_t *handleptr, starpu_data_ha
 	{
 		replay_interface_ops.interfaceid = starpu_data_interface_get_next_id();
 	}
-	struct replay_interface interface = {
+	struct replay_interface interface =
+	{
 		.id = replay_interface_ops.interfaceid,
 		.orig_handle = orig_handle,
 		.size = size,
@@ -337,7 +338,8 @@ double arch_cost_function(struct starpu_task *task, struct starpu_perfmodel_arch
 /* End of settings */
 
 static unsigned long nexecuted_tasks;
-void dumb_kernel(void *buffers[], void *args) {
+void dumb_kernel(void *buffers[], void *args)
+{
 	(void) buffers;
 	(void) args;
 	nexecuted_tasks++;