Просмотр исходного кода

Added benchmarks of StarPU in Julia

Mael Keryell лет назад: 6
Родитель
Сommit
8fb5d1fe16
42 измененных файлов с 4869 добавлено и 0 удалено
  1. 226 0
      julia/tst/black_scholes/black_scholes.c
  2. 81 0
      julia/tst/black_scholes/black_scholes_def.jl
  3. 35 0
      julia/tst/black_scholes/black_scholes_generated.jl
  4. 155 0
      julia/tst/black_scholes/black_scholes_with_generated.c
  5. 39 0
      julia/tst/black_scholes/cpu_black_scholes.c
  6. 124 0
      julia/tst/black_scholes/cpu_cuda_black_scholes.jl
  7. 74 0
      julia/tst/black_scholes/gpu_black_scholes.cu
  8. 196 0
      julia/tst/includes/display.c
  9. 28 0
      julia/tst/includes/display.h
  10. 52 0
      julia/tst/includes/sorting.c
  11. 12 0
      julia/tst/includes/sorting.h
  12. 58 0
      julia/tst/mandelbrot/cpu_cuda_mandelbrot.jl
  13. 74 0
      julia/tst/mandelbrot/cpu_mandelbrot.c
  14. 138 0
      julia/tst/mandelbrot/cpu_mandelbrot_between.c
  15. 95 0
      julia/tst/mandelbrot/gpu_mandelbrot.cu
  16. 113 0
      julia/tst/mandelbrot/gpu_mandelbrot_between.cu
  17. 248 0
      julia/tst/mandelbrot/mandelbrot.c
  18. 30 0
      julia/tst/mandelbrot/mandelbrot.jl
  19. 270 0
      julia/tst/mandelbrot/mandelbrot_between.c
  20. 197 0
      julia/tst/mandelbrot/mandelbrot_def.jl
  21. 48 0
      julia/tst/mandelbrot/mandelbrot_generated.jl
  22. 29 0
      julia/tst/mult/cpu_cuda_mult.jl
  23. 54 0
      julia/tst/mult/cpu_mult.c
  24. 68 0
      julia/tst/mult/gpu_mult.cu
  25. 288 0
      julia/tst/mult/mult.c
  26. 228 0
      julia/tst/mult/mult_def.jl
  27. 30 0
      julia/tst/mult/mult_extern.jl
  28. 30 0
      julia/tst/mult/mult_extern_graph.jl
  29. 46 0
      julia/tst/mult/mult_generated.jl
  30. 34 0
      julia/tst/mult/mult_generated_graph.jl
  31. 13 0
      julia/tst/mult/mult_naive.jl
  32. 55 0
      julia/tst/nbody/cpu_cuda_nbody.jl
  33. 94 0
      julia/tst/nbody/cpu_nbody.c
  34. 104 0
      julia/tst/nbody/cpu_nbody_between.c
  35. 145 0
      julia/tst/nbody/gpu_nbody.cu
  36. 151 0
      julia/tst/nbody/gpu_nbody_between.cu
  37. 296 0
      julia/tst/nbody/nbody.c
  38. 40 0
      julia/tst/nbody/nbody.jl
  39. 363 0
      julia/tst/nbody/nbody_between.c
  40. 366 0
      julia/tst/nbody/nbody_def.jl
  41. 74 0
      julia/tst/nbody/nbody_display.jl
  42. 68 0
      julia/tst/nbody/nbody_generated.jl

+ 226 - 0
julia/tst/black_scholes/black_scholes.c

@@ -0,0 +1,226 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <starpu.h>
+#include <math.h>
+#include "../includes/sorting.h"
+
+
+
+void cpu_black_scholes(void **, void *);
+void gpu_black_scholes(void **, void *);
+
+static struct starpu_codelet cl =
+{
+	.cpu_funcs = {cpu_black_scholes},
+	.cuda_funcs = {gpu_black_scholes},
+	.nbuffers = 7,
+	.modes = {STARPU_R, STARPU_R, STARPU_R, STARPU_R, STARPU_R, STARPU_W, STARPU_W}
+};
+
+void black_scholes_with_starpu(double *S, double *K, double *R, double *T, double *sig, double *call_res, double *put_res, unsigned nbr_data, unsigned nslices)
+{
+	starpu_data_handle_t S_handle, K_handle, R_handle, T_handle, SIG_handle, CRES_handle, PRES_handle;
+	
+
+	starpu_vector_data_register(&S_handle, STARPU_MAIN_RAM, (uintptr_t)S, nbr_data, sizeof(double));	
+	starpu_vector_data_register(&K_handle, STARPU_MAIN_RAM, (uintptr_t)K, nbr_data, sizeof(double));
+	starpu_vector_data_register(&R_handle, STARPU_MAIN_RAM, (uintptr_t)R, nbr_data, sizeof(double));
+	starpu_vector_data_register(&T_handle, STARPU_MAIN_RAM, (uintptr_t)T, nbr_data, sizeof(double));
+	starpu_vector_data_register(&SIG_handle, STARPU_MAIN_RAM, (uintptr_t)sig, nbr_data, sizeof(double));
+	starpu_vector_data_register(&CRES_handle, STARPU_MAIN_RAM, (uintptr_t)call_res, nbr_data, sizeof(double));
+	starpu_vector_data_register(&PRES_handle, STARPU_MAIN_RAM, (uintptr_t)put_res, nbr_data, sizeof(double));
+
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_vector_filter_block,
+		.nchildren = nslices
+	};
+	/* printf("%f %f\n", nslices, nbr_data); */
+
+	starpu_data_partition(S_handle, &f);
+	starpu_data_partition(K_handle, &f);
+	starpu_data_partition(R_handle, &f);
+	starpu_data_partition(T_handle, &f);
+	starpu_data_partition(SIG_handle, &f);
+	starpu_data_partition(CRES_handle, &f);
+	starpu_data_partition(PRES_handle, &f);
+	
+	unsigned taskid;
+
+	for (taskid = 0; taskid < nslices; taskid++){
+
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &cl;
+		task->handles[0] = starpu_data_get_sub_data(S_handle, 1, taskid);
+		task->handles[1] = starpu_data_get_sub_data(K_handle, 1, taskid);
+		task->handles[2] = starpu_data_get_sub_data(R_handle, 1, taskid);
+		task->handles[3] = starpu_data_get_sub_data(T_handle, 1, taskid);
+		task->handles[4] = starpu_data_get_sub_data(SIG_handle, 1, taskid);
+		task->handles[5] = starpu_data_get_sub_data(CRES_handle, 1, taskid);
+		task->handles[6] = starpu_data_get_sub_data(PRES_handle, 1, taskid);
+		
+		starpu_task_submit(task);
+
+	}
+
+	starpu_task_wait_for_all();
+
+	starpu_data_unpartition(S_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(K_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(R_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(T_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(SIG_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(CRES_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(PRES_handle, STARPU_MAIN_RAM);
+
+	starpu_data_unregister(S_handle);
+	starpu_data_unregister(K_handle);
+	starpu_data_unregister(R_handle);
+	starpu_data_unregister(T_handle);
+	starpu_data_unregister(SIG_handle);
+	starpu_data_unregister(CRES_handle);
+	starpu_data_unregister(PRES_handle);
+	
+
+
+}
+
+static void init_S(double *S, unsigned nbr_data)
+{
+	unsigned i;
+	for (i = 0; i < nbr_data; i++){
+		S[i] = 100. * rand() / (double) RAND_MAX;
+	}
+}
+
+static void init_K(double *K, unsigned nbr_data)
+{
+	unsigned i;
+	for (i = 0; i < nbr_data; i++){
+		K[i] = 100. * rand() / (double) RAND_MAX;
+	}
+}
+
+static void init_R(double *R, unsigned nbr_data)
+{
+	unsigned i;
+	for (i = 0; i < nbr_data; i++){
+		R[i] = rand() / (double) RAND_MAX;
+	}
+}
+
+static void init_T(double *T, unsigned nbr_data)
+{
+	unsigned i;
+	for (i = 0; i < nbr_data; i++){
+		T[i] = 10. * rand() / (double) RAND_MAX;
+	}
+}
+
+static void init_sig(double *sig, unsigned nbr_data)
+{
+	unsigned i;
+	for (i = 0; i < nbr_data; i++){
+		sig[i] = 10. * rand() / (double) RAND_MAX;
+	}
+}
+
+
+double median_time(unsigned nbr_data, unsigned nslices, unsigned nbr_tests)
+{
+	double exec_times[nbr_tests];
+	
+	double *S = malloc(nbr_data * sizeof(double));
+	double *K = malloc(nbr_data * sizeof(double));
+	double *R = malloc(nbr_data * sizeof(double));
+	double *T = malloc(nbr_data * sizeof(double));
+	double *sig = malloc(nbr_data * sizeof(double));
+
+	double *call_res = calloc(nbr_data, sizeof(double));
+	double *put_res = calloc(nbr_data, sizeof(double));
+
+	double start, stop;
+	unsigned i;
+	for (i = 0; i < nbr_tests; i++){
+
+		init_S(S,nbr_data);
+		init_K(K,nbr_data);
+		init_R(R,nbr_data);
+		init_T(T,nbr_data);
+		init_sig(sig,nbr_data);
+
+		/* S[0] = 100.; */
+		/* K[0] = 100.; */
+		/* R[0] = 0.05; */
+		/* T[0] = 1.0; */
+		/* sig[0] = 0.2; */
+		
+		start = starpu_timing_now();
+		black_scholes_with_starpu(S, K, R, T, sig, call_res, put_res, nbr_data, nslices);
+		stop = starpu_timing_now();
+	
+		exec_times[i] = (stop - start) / 1.e6;
+	}
+
+	/* printf("%f %f\n", call_res[0], put_res[0]); */
+
+	free(S);
+	free(K);
+	free(R);
+	free(T);
+	free(sig);
+	free(call_res);
+	free(put_res);
+
+	quicksort(exec_times, 0, nbr_tests - 1);
+
+	
+	return exec_times[nbr_tests/2];
+}
+	
+
+
+void display_times(unsigned start_nbr, unsigned step_nbr, unsigned stop_nbr, unsigned nslices, unsigned nbr_tests)
+{
+	FILE *myfile;
+
+	myfile = fopen("DAT/black_scholes_c_times.dat", "w");
+
+	unsigned nbr_data;
+
+	for (nbr_data = start_nbr; nbr_data <= stop_nbr; nbr_data += step_nbr){
+		double t = median_time(nbr_data, nslices, nbr_tests);
+		printf("nbr_data:\n%u\nTime:\n%f\n", nbr_data, t);
+		fprintf(myfile, "%f\n", t);
+	}
+	fclose(myfile);
+}
+
+int main(int argc, char *argv[])
+{
+	if (argc != 6){
+		printf("Usage: %s start_nbr step_nbr stop_nbr nslices nbr_tests\n", argv[0]);
+		return 1;
+	}
+
+	if (starpu_init(NULL) != EXIT_SUCCESS){
+		fprintf(stderr, "ERROR\n");
+		return 77;
+	}
+
+	unsigned start_nbr = (unsigned) atoi(argv[1]);
+	unsigned step_nbr = (unsigned) atoi(argv[2]);
+	unsigned stop_nbr = (unsigned) atoi(argv[3]);
+	unsigned nslices = (unsigned) atoi(argv[4]);
+	unsigned nbr_tests = (unsigned) atoi(argv[5]);
+
+	srand(time(NULL));
+
+	display_times(start_nbr, step_nbr, stop_nbr, nslices, nbr_tests);
+
+	starpu_shutdown();
+
+	return 0;
+}
+		

+ 81 - 0
julia/tst/black_scholes/black_scholes_def.jl

@@ -0,0 +1,81 @@
+function black_scholes_starpu(data ::Matrix{Float64}, res ::Matrix{Float64}, nslices ::Int64)
+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslices)
+
+    @starpu_block let
+        dat_handle, res_handle = starpu_data_register(data, res)
+
+        starpu_data_partition(dat_handle, vert)
+        starpu_data_partition(res_handle, vert)
+        
+        #Compute the price of call and put option in the res matrix
+        @starpu_sync_tasks for task in (1:nslices)
+            @starpu_async_cl cl(dat_handle[task], res_handle[task])
+        end
+    end
+end
+
+
+function init_data(data, data_nbr);
+    for i in 1:data_nbr
+        data[1,i] = rand(Float64) * 100
+        data[2,i] = rand(Float64) * 100
+        data[3,i] = rand(Float64)
+        data[4,i] = rand(Float64) * 10
+        data[5,i] = rand(Float64) * 10
+    end
+    return data
+end
+        
+
+
+function median_times(data_nbr, nslices, nbr_tests)
+
+    data ::Matrix{Float64} = zeros(5, data_nbr)
+    # data[1,1] = 100.0
+    # data[2,1] = 100.0
+    # data[3,1] = 0.05
+    # data[4,1] = 1.0
+    # data[5,1] = 0.2
+
+
+    res ::Matrix{Float64} = zeros(2, data_nbr)
+
+    exec_times ::Vector{Float64} = [0. for i in 1:nbr_tests]
+
+    for i = 1:nbr_tests
+        
+        init_data(data, data_nbr)
+
+        tic()
+        black_scholes_starpu(data, res, nslices);
+        t = toq()
+
+        exec_times[i] = t
+    end
+    sort!(exec_times)
+    # println(data)
+    # println(res)
+    
+    return exec_times[1 + div(nbr_tests - 1, 2)]
+end
+
+function display_times(start_nbr, step_nbr, stop_nbr, nslices, nbr_tests)
+
+    mtc = map( (x->parse(Float64,x)), open("../DAT/black_scholes_c_times.dat") do f
+                  readlines(f)
+                  end)
+
+
+    mtcgen = map( (x->parse(Float64,x)), open("../DAT/black_scholes_c_generated_times.dat") do f
+                  readlines(f)
+                  end)
+    i = 1
+    open("../DAT/black_scholes_times.dat", "w") do f 
+        for data_nbr in (start_nbr : step_nbr : stop_nbr)
+            t = median_times(data_nbr, nslices, nbr_tests)
+            println("Number of data:\n$data_nbr\nTimes:\njl: $t\nC: $(mtc[i])\nGen: $(mtcgen[i])")
+            write(f, "$data_nbr $(t) $(mtcgen[i]) $(mtc[i])\n")
+            i = i + 1
+        end
+    end
+end

+ 35 - 0
julia/tst/black_scholes/black_scholes_generated.jl

@@ -0,0 +1,35 @@
+if length(ARGS) != 5
+    println("Usage: julia prog.jl start_data_nbr step_data_nbr stop_data_nbr nslices nbr_tests")
+    quit()
+end
+
+
+if (parse(Int64,ARGS[1]) < parse(Int64,ARGS[4]))
+    println("The number of slices must be smaller than the number of data")
+    quit()
+end
+
+include("../../src/Wrapper/Julia/starpu_include.jl")
+using StarPU
+
+@debugprint "starpu_init"
+starpu_init(extern_task_path = "../build/generated_tasks_black_scholes")
+
+perfmodel = StarpuPerfmodel(
+    perf_type = STARPU_HISTORY_BASED,
+    symbol = "history_perf"
+)
+
+cl = StarpuCodelet(
+cpu_func = "black_scholes",
+gpu_func = "CUDA_black_scholes",
+modes = [STARPU_RW, STARPU_RW],
+perfmodel = perfmodel
+)
+
+include("black_scholes_def.jl")
+
+display_times(map( (x->parse(Int64, x)), ARGS)...)
+
+@debugprint "starpu_shutdown"
+starpu_shutdown()

+ 155 - 0
julia/tst/black_scholes/black_scholes_with_generated.c

@@ -0,0 +1,155 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <starpu.h>
+#include "../includes/sorting.h"
+
+void black_scholes(void **, void *);
+void CUDA_black_scholes(void **, void*);
+
+static struct starpu_perfmodel model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "history_perf"
+};
+
+struct starpu_codelet cl =
+{
+	.cpu_funcs = {black_scholes},
+	.cuda_funcs = {CUDA_black_scholes},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
+	.model = &model
+};
+
+void black_scholes_with_starpu(double *data, double *res, unsigned nslices, unsigned nbr_data)
+{
+
+	starpu_data_handle_t D_handle, RES_handle;
+
+	starpu_matrix_data_register(&D_handle, STARPU_MAIN_RAM, (uintptr_t)data, 5, 5, nbr_data, sizeof(double));
+	starpu_matrix_data_register(&RES_handle, STARPU_MAIN_RAM, (uintptr_t)res, 2, 2, nbr_data, sizeof(double));
+
+	struct starpu_data_filter vert =
+	{
+		.filter_func = starpu_matrix_filter_vertical_block,
+		.nchildren = nslices
+	};
+
+	starpu_data_partition(D_handle, &vert);
+	starpu_data_partition(RES_handle, &vert);
+
+	unsigned taskx;
+	
+	for (taskx = 0; taskx < nslices; taskx++){
+		struct starpu_task *task = starpu_task_create();
+		
+		task->cl = &cl;
+		task->handles[0] = starpu_data_get_sub_data(D_handle, 1, taskx);
+		task->handles[1] = starpu_data_get_sub_data(RES_handle, 1, taskx);
+		
+		starpu_task_submit(task);
+	}
+
+	starpu_task_wait_for_all();
+
+
+	starpu_data_unpartition(D_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(RES_handle, STARPU_MAIN_RAM);
+
+	starpu_data_unregister(D_handle);
+	starpu_data_unregister(RES_handle);
+}
+	
+
+void init_data(double *data, unsigned nbr_data)
+{
+	unsigned i;
+	for (i = 0; i < nbr_data; i++){
+
+		data[5*i] = 100. * rand() / (double) RAND_MAX;
+		data[5*i + 1] = 100. * rand() / (double) RAND_MAX;
+		data[5*i + 2] = rand() / (double) RAND_MAX;
+		data[5*i + 3] = 10. * rand() / (double) RAND_MAX;
+		data[5*i + 4] = 10. * rand() / (double) RAND_MAX;
+		
+	}
+}
+
+double median_time(unsigned nbr_data, unsigned nslices, unsigned nbr_tests)
+{
+	double *data = malloc(5 * nbr_data * sizeof(double));
+	double *res = calloc(2 * nbr_data, sizeof(double));
+	double exec_times[nbr_tests];
+	
+	/* printf("nbr_data: %u\n", nbr_data); */
+	unsigned i;
+	for (i = 0; i < nbr_tests; i++){
+		
+		init_data(data, nbr_data);
+		/* data[0] = 100.0; */
+		/* data[1] = 100.0; */
+		/* data[2] = 0.05; */
+		/* data[3] = 1.0; */
+		/* data[4] = 0.2; */
+
+		double start = starpu_timing_now();
+		black_scholes_with_starpu(data, res, nslices, nbr_data);
+		double stop = starpu_timing_now();
+		
+		exec_times[i] = (stop-start)/1.e6;
+		
+		
+	}
+
+	/* printf("RES:\n%f\n%f\n", res[0], res[1]); */
+
+	free(data);
+	free(res);
+
+	quicksort(exec_times, 0, nbr_tests - 1);
+	return exec_times[nbr_tests/2];
+}
+	
+
+
+void display_times(unsigned start_nbr, unsigned step_nbr, unsigned stop_nbr, unsigned nslices, unsigned nbr_tests){
+	
+	double t;
+	unsigned nbr_data;
+
+	FILE *myfile;
+	myfile = fopen("DAT/black_scholes_c_generated_times.dat", "w");
+
+	for (nbr_data = start_nbr; nbr_data <= stop_nbr; nbr_data+=step_nbr){
+		t = median_time(nbr_data, nslices, nbr_tests);
+		printf("Number of data: %u\nTime: %f\n", nbr_data, t);
+		fprintf(myfile, "%f\n", t);
+	}
+	fclose(myfile);
+}
+
+int main(int argc, char *argv[])
+{
+	if (argc != 6){
+		printf("Usage: %s start_nbr step_nbr stop_nbr nslices nbr_tests\n", argv[0]);
+		return 1;
+	}
+	
+	if (starpu_init(NULL) != EXIT_SUCCESS){
+		fprintf(stderr, "ERROR\n");
+		return 77;
+	}
+
+	unsigned start_nbr = (unsigned) atoi(argv[1]);
+	unsigned step_nbr = (unsigned) atoi(argv[2]);
+	unsigned stop_nbr = (unsigned) atoi(argv[3]);
+	unsigned nslices = (unsigned) atoi(argv[4]);
+	unsigned nbr_tests = (unsigned) atoi(argv[5]);
+
+
+	display_times(start_nbr, step_nbr, stop_nbr, nslices, nbr_tests);
+		
+	starpu_shutdown();
+
+	return 0;
+}

+ 39 - 0
julia/tst/black_scholes/cpu_black_scholes.c

@@ -0,0 +1,39 @@
+#include <stdint.h>
+#include <starpu.h>
+#include <math.h>
+
+
+static inline double normcdf(double x)
+{
+	
+	return (1.0 + erf(x/sqrt(2.0)))/2.0;
+}
+
+void cpu_black_scholes(void *descr[], void *arg)
+{ 
+	double *S, *K, *R, *T, *SIG, *CRES, *PRES;
+
+	uint32_t nxS;
+	
+	
+	S = (double *)STARPU_MATRIX_GET_PTR(descr[0]);
+	K = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
+	R = (double *)STARPU_MATRIX_GET_PTR(descr[2]);
+	T = (double *)STARPU_MATRIX_GET_PTR(descr[3]);
+	SIG = (double *)STARPU_MATRIX_GET_PTR(descr[4]);
+	CRES = (double *)STARPU_MATRIX_GET_PTR(descr[5]);
+	PRES = (double *)STARPU_MATRIX_GET_PTR(descr[6]);
+	
+	nxS = STARPU_MATRIX_GET_NX(descr[0]);
+
+	
+	uint32_t i;
+	for (i = 0; i < nxS; i++){
+				
+		double d1 = (log(S[i] / K[i]) + (R[i] + pow(SIG[i], 2.0) * 0.5) * T[i]) / (SIG[i] * sqrt(T[i]));
+		double d2 = (log(S[i] / K[i]) + (R[i] - pow(SIG[i], 2.0) * 0.5) * T[i]) / (SIG[i] * sqrt(T[i]));
+		
+		CRES[i] = S[i] * normcdf(d1) - K[i] * exp(-R[i] * T[i]) * normcdf(d2);
+		PRES[i] = -S[i] * normcdf(-d1) + K[i] * exp(-R[i] * T[i]) * normcdf(-d2);
+	}
+}

+ 124 - 0
julia/tst/black_scholes/cpu_cuda_black_scholes.jl

@@ -0,0 +1,124 @@
+include("../../src/Compiler/include.jl")
+
+starpu_new_cpu_kernel_file("../build/generated_cpu_black_scholes.c")
+starpu_new_cuda_kernel_file("../build/generated_cuda_black_scholes.cu")
+
+
+
+
+
+@cpu_cuda_kernel function black_scholes(data ::Matrix{Float64}, res ::Matrix{Float64}) ::Void
+    
+    widthn ::Int64 = width(data)
+        
+    # data[1,...] -> S
+    # data[2,...] -> K
+    # data[3,...] -> r
+    # data[4,...] -> T
+    # data[4,...] -> sig
+
+    p ::Float64 = 0.2316419
+    b1 ::Float64 = 0.31938153
+    b2 ::Float64 = -0.356563782
+    b3 ::Float64 = 1.781477937
+    b4 ::Float64 = -1.821255978
+    b5 ::Float64 = 1.330274428
+
+    
+    @indep for i = 1:widthn
+        
+
+        d1 ::Float64 = (log(data[1,i] / data[2,i]) + (data[3,i] + pow(data[5,i], 2.0) * 0.5) * data[4,i]) / (data[5,i] * sqrt(data[4,i]))
+        d2 ::Float64 = (log(data[1,i] / data[2,i]) + (data[3,i] - pow(data[5,i], 2.0) * 0.5) * data[4,i]) / (data[5,i] * sqrt(data[4,i]))
+        
+
+
+
+        f ::Float64 = 0
+        ff ::Float64 = 0
+        s1 ::Float64 = 0
+        s2 ::Float64 = 0
+        s3 ::Float64 = 0
+        s4 ::Float64 = 0
+        s5 ::Float64 = 0
+        sz ::Float64 = 0
+        
+
+
+        
+        ######## Compute normcdf of d1
+
+        normd1p ::Float64 = 0
+        normd1n ::Float64 = 0
+
+        boold1 ::Int64 = (d1 >= 0) + (d1 <= 0)
+        
+        if (boold1 >= 2)
+            normd1p = 0.5
+            normd1n = 0.5
+        else
+            tmp1 ::Float64 = abs(d1)
+            f = 1 / sqrt(2 * M_PI)
+            ff = exp(-pow(tmp1, 2.0) / 2) * f
+            s1 = b1 / (1 + p * tmp1)
+            s2 = b2 / pow((1 + p * tmp1), 2.0)
+            s3 = b3 / pow((1 + p * tmp1), 3.0)
+            s4 = b4 / pow((1 + p * tmp1), 4.0)
+            s5 = b5 / pow((1 + p * tmp1), 5.0)
+            sz = ff * (s1 + s2 + s3 + s4 + s5)
+        
+            if (d1 > 0)
+                normd1p = 1 - sz # normcdf(d1)
+                normd1n = sz # normcdf(-d1)
+            else
+                normd1p = sz
+                normd1n = 1 - sz
+            end    
+        end
+        ########
+        
+
+        ######## Compute normcdf of d2
+        normd2p ::Float64 = 0
+        normd2n ::Float64 = 0
+
+        boold2 ::Int64 = (d2 >= 0) + (d2 <= 0)
+        
+        if (boold2 >= 2)
+            normd2p = 0.5
+            normd2n = 0.5
+        else
+            tmp2 ::Float64 = abs(d2)
+            f = 1 / sqrt(2 * M_PI)
+            ff = exp(-pow(tmp2, 2.0) / 2) * f
+            s1 = b1 / (1 + p * tmp2)
+            s2 = b2 / pow((1 + p * tmp2), 2.0)
+            s3 = b3 / pow((1 + p * tmp2), 3.0)
+            s4 = b4 / pow((1 + p * tmp2), 4.0)
+            s5 = b5 / pow((1 + p * tmp2), 5.0)
+            sz = ff * (s1 + s2 + s3 + s4 + s5)
+        
+        
+            if (d2 > 0)
+                normd2p = 1 - sz # normcdf(d2)
+                normd2n = sz # normcdf(-d2)
+            else
+                normd2p = sz
+                normd2n = 1 - sz
+            end
+        end
+        # normd1p = (1 + erf(d1/sqrt(2.0)))/2.0
+        # normd1n = (1 + erf(-d1/sqrt(2.0)))/2.0
+        
+        # normd2p = (1 + erf(d2/sqrt(2.0)))/2.0
+        # normd2n = (1 + erf(-d2/sqrt(2.0)))/2.0
+        
+        res[1,i] = data[1,i] * (normd1p) - data[2,i]*exp(-data[3,i]*data[4,i]) * (normd2p) # S * N(d1) - r*exp(-r*T) * norm(d2)
+        res[2,i] = -data[1,i] * (normd1n) + data[2,i]*exp(-data[3,i]*data[4,i]) * (normd2n) # -S * N(-d1) + r*exp(-r*T) * norm(-d2)
+        
+    end
+end
+
+compile_cpu_kernels("../build/generated_cpu_black_scholes.so")
+compile_cuda_kernels("../build/generated_cuda_black_scholes.so")
+combine_kernel_files("../build/generated_tasks_black_scholes.so", ["../build/generated_cpu_black_scholes.so", "../build/generated_cuda_black_scholes.so"])

+ 74 - 0
julia/tst/black_scholes/gpu_black_scholes.cu

@@ -0,0 +1,74 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <math.h>
+#include <starpu.h>
+
+// __device__ inline double cndGPU(double d)
+// {
+//   const double A1 = 0.31938153f;
+//   const double A2 = -0.356563782f;
+//   const double A3 = 1.781477937f;
+//   const double A4 = -1.821255978f;
+//   const double A5 = 1.330274429f;
+//   const float RSQRT2PI = 0.39894228040143267793994605993438f;
+
+    
+//   double K = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d)));
+
+    
+//   double cnd = RSQRT2PI * __expf(- 0.5f * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));
+
+//     if (d > 0)
+//       cnd = 1.0f - cnd;
+
+//     return cnd;
+// }
+
+__device__ inline double cndGPU(double d)
+{
+  return (1.0 + erf(d/sqrt(2.0)))/2.0;
+}
+
+__global__ void gpuBlackScholesKernel(double *S, double *K, double *R, double *T, 
+				      double *SIG, double *CRES, double *PRES,
+				      uint32_t nxS)
+{
+  uint32_t i, id;
+  
+  id = blockIdx.x * blockDim.x + threadIdx.x;
+  i = id % nxS;
+  
+  double sqrtT = __fdividef(1.0F, rsqrtf(T[i]));
+  double d1 = (log(S[i] / K[i]) + (R[i] + SIG[i] * SIG[i] * 0.5) * T[i]) / (SIG[i] * sqrt(T[i]));  
+  double d2 = (log(S[i] / K[i]) + (R[i] - SIG[i] * SIG[i] * 0.5) * T[i]) / (SIG[i] * sqrt(T[i]));
+  
+  CRES[i] = S[i] * (normcdf(d1)) - K[i] * exp(-R[i] * T[i]) * normcdf(d2);
+  PRES[i] = -S[i] * (normcdf(-d1)) + K[i] * exp(-R[i] * T[i]) * normcdf(-d2);
+}
+
+#define THREADS_PER_BLOCK 64
+
+extern "C" void gpu_black_scholes(void *descr[], void *args)
+{
+  double *S, *K, *R, *T, *SIG, *CRES, *PRES;
+  uint32_t nxS;
+  uint32_t nblocks;
+
+  S = (double *) STARPU_MATRIX_GET_PTR(descr[0]);
+  K = (double *) STARPU_MATRIX_GET_PTR(descr[1]);
+  R = (double *) STARPU_MATRIX_GET_PTR(descr[2]);
+  T = (double *) STARPU_MATRIX_GET_PTR(descr[3]);
+  SIG = (double *) STARPU_MATRIX_GET_PTR(descr[4]);
+  CRES = (double *) STARPU_MATRIX_GET_PTR(descr[5]);
+  PRES = (double *) STARPU_MATRIX_GET_PTR(descr[6]);
+
+  nxS = STARPU_MATRIX_GET_NX(descr[0]);
+
+  nblocks = (nxS + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+
+  gpuBlackScholesKernel
+    <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
+    >>> (S, K, R, T, SIG, CRES, PRES, nxS);
+  
+  cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 196 - 0
julia/tst/includes/display.c

@@ -0,0 +1,196 @@
+#include "display.h"
+
+
+void mandelbrot_graph(char *filename, int *pixels, unsigned width, unsigned height)
+{
+	FILE *myfile;
+	myfile = fopen(filename, "w");
+	fprintf(myfile, "P3\n%u %u\n255\n", width, height);
+	unsigned i,j;
+	for (i = 0; i < height; i++){
+		for (j = 0; j < width; j++){
+			fprintf(myfile, "%d 0 0 ", pixels[j + i*width]);
+		}
+		fprintf(myfile, "\n");
+	}
+	fclose(myfile);
+}
+
+
+void mandelbrot_graph_transpose(char *filename, int64_t *pixels, unsigned width, unsigned height)
+{
+	FILE *myfile;
+	myfile = fopen(filename, "w");
+	fprintf(myfile, "P3\n%u %u\n255\n", width, height);
+	unsigned i,j;
+	for (i = 0; i < width; i++){
+		for (j = 0; j < height; j++){
+			fprintf(myfile, "%d 0 0 ", pixels[i + j*width]);
+		}
+		fprintf(myfile, "\n");
+	}
+	fclose(myfile);
+}
+
+
+
+
+void pixels_print(int *pixels, unsigned width, unsigned height)
+{
+	unsigned i,j;
+	for (i = 0; i < height; i++){
+		for (j = 0; j < width; j++){
+			printf("%d ", pixels[j + i * width]);
+		}
+		printf("\n");
+	}
+}
+////////////////////////// NBODY ////////////////////////
+
+
+static int is_inside(struct Position p, unsigned width, unsigned height);
+static void get_planet_pixels(struct Position *pix, struct Position p);
+static void graph_pixels(struct Pixel *pixel, unsigned width, unsigned height, struct Position *pix, unsigned index, unsigned nbr_planets);
+static void nbody_ppm(char *filename, struct Pixel *pixels, unsigned width, unsigned height);
+
+
+static int is_inside(struct Position p, unsigned width, unsigned height)
+{
+	if (p.x >= 0 && p.x < width && p.y >= 0 && p.y < height)
+		return 1;
+	else
+		return 0;
+}
+
+
+void nbody_print(double *array, unsigned nbr_planets)
+{
+	unsigned i,j;
+	for (j = 0; j < nbr_planets; j++){
+		
+		printf("Planet %u:\n", j);
+		printf("%f ; %f\n", array[j], array[j + nbr_planets]);
+		
+	}
+}
+
+
+//Returns a circle centered on the position p.
+static void get_planet_pixels(struct Position *pix, struct Position p)
+{
+	
+
+	int i,j,k;
+	k = 0;
+	for (i = p.x - 1; i <= p.x + 1; i++){
+		for (j = p.y - 3; j <= p.y + 3; j++){
+	
+			pix[k].x = i;
+			pix[k].y = j;
+			k++;
+		}
+	}
+
+	for (j = p.y - 2; j <= p.y + 2; j++){
+		
+		pix[k].x = p.x - 2;
+		pix[k].y = j;
+		k++;
+		pix[k].x = p.x + 2;
+		pix[k].y = j;
+		k++;
+	}
+
+	for (j = p.y - 1; j <= p.y + 1; j++){
+		
+		pix[k].x = p.x - 3;
+		pix[k].y = j;
+		k++;
+		pix[k].x = p.x + 3;
+		pix[k].y = j;
+		k++;
+	}
+}
+
+static void graph_pixels(struct Pixel *pixels, unsigned width, unsigned height, struct Position *pix, unsigned index, unsigned nbr_planets)
+{
+	/* printf("Planet: %u\n", index); */
+	unsigned j;
+	struct Pixel pixel = {0,0,0};
+	for (j = 0; j < 37; j++){
+		
+		/* printf("X: %d, Y: %d\n", pix[j].x, pix[j].y); */
+			
+		if ( is_inside(pix[j], width, height) ){
+
+			pixel.r = 125;
+			pixel.b = round((255. * index) / nbr_planets);
+			pixels[pix[j].x + pix[j].y * width] = pixel;
+		}
+	}
+}
+
+
+static void nbody_ppm(char *filename, struct Pixel *pixels, unsigned width, unsigned height)
+{
+	unsigned i,j;
+	FILE *myfile;
+	myfile = fopen(filename, "w");
+	fprintf(myfile, "P3\n%u %u\n255\n", width, height);
+	for (i = 0; i < height; i++){
+		for (j = 0; j < width; j++){
+			struct Pixel pixel = pixels[j + i * width];
+			fprintf(myfile, "%u %u %u ", pixel.r, pixel.g, pixel.b);
+		}
+		fprintf(myfile, "\n");
+	}
+	fclose(myfile);
+}
+
+
+void nbody_graph(char *filename, double *positions, unsigned nbr_planets, unsigned width, unsigned height, double min_val, double max_val)
+{
+	struct Position *pix = malloc(37 * sizeof(struct Position));
+	
+	struct Pixel *pixels = calloc(width * height, sizeof(struct Pixel));
+
+	unsigned i,j;
+
+	for (i = 0; i < nbr_planets; i++){
+		struct Position posi;
+		
+		posi.x = round( (positions[i] - min_val) / (max_val - min_val) * (width - 1));
+		posi.y = round( (positions[i + nbr_planets] - min_val) / (max_val - min_val) * (width - 1));
+
+		
+		get_planet_pixels(pix, posi);
+
+		graph_pixels(pixels, width, height, pix, i, nbr_planets);
+
+	}
+	nbody_ppm(filename, pixels, width, height);
+}
+
+
+void nbody_graph_transpose(char *filename, double *positions, unsigned nbr_planets, unsigned width, unsigned height, double min_val, double max_val)
+{
+	struct Position *pix = malloc(37 * sizeof(struct Position));
+	
+	struct Pixel *pixels = calloc(width * height, sizeof(struct Pixel));
+
+	unsigned i,j;
+
+	for (i = 0; i < nbr_planets; i++){
+		struct Position posi;
+		
+		posi.x = round( (positions[2 * i] - min_val) / (max_val - min_val) * (width - 1));
+		posi.y = round( (positions[2 * i + 1] - min_val) / (max_val - min_val) * (width - 1));
+
+		
+		get_planet_pixels(pix, posi);
+
+		graph_pixels(pixels, width, height, pix, i, nbr_planets);
+
+	}
+	nbody_ppm(filename, pixels, width, height);
+}

+ 28 - 0
julia/tst/includes/display.h

@@ -0,0 +1,28 @@
+#ifndef DISPLAY_H
+#define DISPLAY_H
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+
+struct Position {
+	int x;
+	int y;
+};
+
+struct Pixel {
+	unsigned r;
+	unsigned g;
+	unsigned b;
+};
+
+// Fills PPM/mandelbrot.ppm with the red values inside the pixels matrix.           
+void mandelbrot_graph(char *filename, int *pixels, unsigned width, unsigned height);
+void mandelbrot_graph_transpose(char *filename, int64_t *pixels, unsigned width, unsigned height);
+void pixels_print(int *pixels, unsigned width, unsigned height);
+void nbody_print(double *array, unsigned nbr_planets);
+
+void nbody_graph(char *filename, double *positions, unsigned nbr_planets, unsigned width, unsigned height, double min_val, double max_val);
+void nbody_graph_transpose(char *filename, double *positions, unsigned nbr_planets, unsigned width, unsigned height, double min_val, double max_val);
+
+#endif

+ 52 - 0
julia/tst/includes/sorting.c

@@ -0,0 +1,52 @@
+#include "sorting.h"
+
+unsigned chose_pivot(int first, int last)
+{
+	return ( rand() % (last - first + 1) + first );
+}
+
+int partitionning(double *arr, int first, int last, int pivot)
+{
+	double tmp;
+	int i,j;
+
+	tmp = arr[last];
+	arr[last] = arr[pivot];
+	arr[pivot] = tmp;
+
+	j = first;
+
+	for (i = first; i < last; i++)
+	{
+
+		if (arr[i] <= arr[last]){
+			tmp = arr[i];
+
+			arr[i] = arr[j];
+			arr[j] = tmp;
+
+			j++;
+		}
+	}
+
+	tmp = arr[j];
+	arr[j] = arr[last];
+	arr[last] = tmp;
+
+	return j;
+}
+
+void quicksort(double *arr, int first, int last)
+{
+
+	if (first < last){
+		int pivot = chose_pivot(first, last);
+		int j;
+
+		j = partitionning(arr, first, last, pivot);
+
+		quicksort(arr, first, j - 1);
+		quicksort(arr, j + 1, last);
+	}
+
+}

+ 12 - 0
julia/tst/includes/sorting.h

@@ -0,0 +1,12 @@
+#ifndef SORTING_H
+#define SORTING_H
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+unsigned chose_pivot(int first, int last);
+int partitionning(double *arr, int first, int last, int pivot);
+void quicksort(double *arr, int first, int last);
+
+#endif

+ 58 - 0
julia/tst/mandelbrot/cpu_cuda_mandelbrot.jl

@@ -0,0 +1,58 @@
+include("../../src/Compiler/include.jl")  
+
+starpu_new_cpu_kernel_file("../build/generated_cpu_mandelbrot.c")
+starpu_new_cuda_kernel_file("../build/generated_cuda_mandelbrot.cu")
+
+@cpu_cuda_kernel function mandelbrot(pixels ::Matrix{Int64}, params ::Vector{Float64}, slice_pos ::Vector{Int64}) :: Void
+    
+
+    local_width ::Int64 = width(pixels) 
+    local_height ::Int64 = height(pixels)
+
+    #max_iterations ::Int64 = 250
+    conv_limit ::Float64 = 2.0
+   
+    @indep for x in (1 : local_width)
+	@indep for y in (1 : local_height)
+
+            max_iterations ::Float64 = params[5]
+
+            zoom ::Float64 = params[3] * 0.25296875
+
+	    X ::Int64 = x + local_width * (slice_pos[1] - 1)
+	    Y ::Int64 = y + local_height * (slice_pos[2] - 1)
+	    
+            cr ::Float64 = params[1] + (X - params[3]/2)/zoom
+	    zr ::Float64 = cr
+
+            ci ::Float64 = params[2] + (Y - params[4]/2)/zoom
+            zi ::Float64 = ci
+
+	    n ::Int64 = 0
+
+            b1 ::Int64 = (n < max_iterations) + (zr*zr + zi*zi < conv_limit * conv_limit)
+
+	    while (b1 >= 2)#n <= max_iterations) && (z * z < conv_limit * conv_limit) #Double condition impossible!!!
+
+                tmp ::Float64 = zr*zr - zi*zi + cr
+                zi = 2*zr*zi + ci
+                zr = tmp
+
+		n = n + 1
+                b1 = (n <= max_iterations) + (zr*zr + zi*zi <= conv_limit * conv_limit)
+
+	    end 
+
+	    if (n < max_iterations)
+		pixels[y,x] = 255 * n / max_iterations
+	    else
+	        pixels[y,x] = 0
+	    end
+	end
+    end
+end
+
+
+compile_cpu_kernels("../build/generated_cpu_mandelbrot.so")
+compile_cuda_kernels("../build/generated_cuda_mandelbrot.so")
+combine_kernel_files("../build/generated_tasks_mandelbrot.so", ["../build/generated_cpu_mandelbrot.so","../build/generated_cuda_mandelbrot.so"])

+ 74 - 0
julia/tst/mandelbrot/cpu_mandelbrot.c

@@ -0,0 +1,74 @@
+#include <stdint.h>
+#include <starpu.h>
+#include <math.h>
+
+
+struct Params
+{
+	float cr;
+	float ci;
+	unsigned taskx;
+	unsigned tasky;
+	unsigned width;
+	unsigned height;
+};
+
+void cpu_mandelbrot(void *descr[], void *cl_arg)
+{
+	
+	struct Params *params = cl_arg;
+
+	int *subP;
+	uint32_t nxP, nyP;
+	uint32_t ldP;
+
+	subP = (int *)STARPU_MATRIX_GET_PTR(descr[0]);
+
+	nxP = STARPU_MATRIX_GET_NX(descr[0]);
+	nyP = STARPU_MATRIX_GET_NY(descr[0]);
+	
+	ldP = STARPU_MATRIX_GET_LD(descr[0]);
+
+	float centerr = params->cr;
+	float centeri = params->ci;
+
+	unsigned Idx = params->taskx;
+	unsigned Idy = params->tasky;
+
+	unsigned width = params->width;
+	unsigned height = params->height;
+
+	float zoom = width * 0.25296875;
+	float conv_limit = 2.0;
+	int max_iter = (width/2) * 0.049715909 * log10(zoom);
+
+	int x,y,n;
+
+	for (y = 0; y < nyP; y++){
+		for (x = 0; x < nxP; x++){
+			float X = x + Idx*nxP; //Coordinates in the whole matrice.
+			float Y = y + Idy*nyP;
+			float cr = centerr + (X - (width/2))/zoom;
+			float ci = centeri + (Y - (height/2))/zoom;
+			float zr = cr;
+			float zi = ci;
+			float m = zr * zr + zi * zi;
+			
+			for (n = 0; n <= max_iter && m < conv_limit * conv_limit; n++) {
+
+				float tmp = zr*zr - zi*zi + cr;
+				zi = 2*zr*zi + ci;
+				zr = tmp;
+				m = zr*zr + zi*zi;
+			}
+			int color;
+			if (n<max_iter)
+				color = 255.*n/max_iter;
+			else
+				color = 0;
+
+			subP[x + y*ldP] = color;
+		}
+	}
+
+}

+ 138 - 0
julia/tst/mandelbrot/cpu_mandelbrot_between.c

@@ -0,0 +1,138 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <starpu.h>
+
+struct Params
+{
+	float cr;
+	float ci;
+	unsigned taskx;
+	unsigned tasky;
+	unsigned width;
+	unsigned height;
+};
+
+static inline long long jlstarpu_max(long long a, long long b)
+{
+	return (a > b) ? a : b;
+}
+
+static inline long long jlstarpu_interval_size(long long start, long long step, long long stop)
+{
+    if (stop >= start){
+            return jlstarpu_max(0, (stop - start + 1) / step);
+    } else {
+            return jlstarpu_max(0, (stop - start - 1) / step);
+    }
+}
+
+void mandelbrot(void** buffers_86BwRM71, void* cl_arg_86BwRM71)
+{
+    uint32_t ld_o2BQqRir = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_86BwRM71[(1) - (1)]));
+    int64_t* ptr_o2BQqRir = (int64_t*) (STARPU_MATRIX_GET_PTR(buffers_86BwRM71[(1) - (1)]));
+    
+//ARRAY PAR
+    double* ptr_Ul4Ys0Mt = (double*) (STARPU_VECTOR_GET_PTR(buffers_86BwRM71[(2) - (1)]));
+    int64_t* ptr_cE3zj60d = (int64_t*) (STARPU_VECTOR_GET_PTR(buffers_86BwRM71[(3) - (1)]));
+//
+
+    int64_t local_width = (int64_t) (STARPU_MATRIX_GET_NY(buffers_86BwRM71[(1) - (1)]));
+    int64_t local_height = (int64_t) (STARPU_MATRIX_GET_NX(buffers_86BwRM71[(1) - (1)]));
+    double conv_limit = (double) (2.0);
+
+//STRUCT PAR
+    
+    /* struct Params *params = cl_arg_86BwRM71; */
+  
+
+    /* double centerr = params->cr; */
+    /* double centeri = params->ci; */
+
+    /* unsigned Idx = params->taskx; */
+    /* unsigned Idy = params->tasky; */
+
+    /* unsigned width = params->width; */
+    /* unsigned height = params->height; */
+
+    /* /\* printf("cr / ci: %f / %f\n", centerr, centeri); *\/ */
+
+    /* int64_t zoom = width * 0.25296875; */
+    /* int64_t max_iterations = (width/2) * 0.049715909 * log10(zoom); */
+
+//
+
+
+    int64_t start_qxJwMzwA = (int64_t) (1);
+    int64_t stop_qxJwMzwA = (int64_t) (local_width);
+    int64_t x;
+
+    for (x = start_qxJwMzwA ; x <= stop_qxJwMzwA ; x += 1)
+    {
+        
+        int64_t start_ekV9GHK1 = (int64_t) (1);
+        int64_t stop_ekV9GHK1 = (int64_t) (local_height);
+        int64_t y;
+
+        for (y = start_ekV9GHK1 ; y <= stop_ekV9GHK1 ; y += 1)
+        {
+	    //ARRAY PAR
+	    double max_iterations = (double) (ptr_Ul4Ys0Mt[(5) - (1)]);
+            double zoom = (double) ((ptr_Ul4Ys0Mt[(3) - (1)]) * (0.25296875));
+	    //
+
+	    //STRUCT PAR
+	    /* double X = x + Idy*local_width; */
+	    /* double Y = y + Idx*local_height; */
+
+	    /* double cr = centerr + (X - (width / 2))/zoom; */
+	    /* double ci = centeri + (Y - (height / 2))/zoom; */
+	    //
+
+	    //ARRAY PAR
+            int64_t X = (int64_t) ((x) + ((local_width) * ((ptr_cE3zj60d[(2) - (1)]) - (1))));
+            int64_t Y = (int64_t) ((y) + ((local_height) * ((ptr_cE3zj60d[(1) - (1)]) - (1))));
+
+	    double cr = (double) ((ptr_Ul4Ys0Mt[(1) - (1)]) + (((X) - ((ptr_Ul4Ys0Mt[(3) - (1)]) / (2))) / (zoom)));
+            double ci = (double) ((ptr_Ul4Ys0Mt[(2) - (1)]) + (((Y) - ((ptr_Ul4Ys0Mt[(4) - (1)]) / (2))) / (zoom)));
+	    //
+
+
+            double zi = (double) (ci);
+            double zr = (double) (cr);
+
+
+            int64_t n = (int64_t) (0);
+
+	    float m = zr * zr + zi * zi;
+            /* int64_t b1 = (int64_t) (((n) < (max_iterations)) + ((((zr) * (zr)) + ((zi) * (zi))) < ((conv_limit) * (conv_limit)))); */
+            
+            /* while ((b1) >= (2)) */
+	    /* printf("%d\n", max_iterations); */
+
+            for (n = 0; n < max_iterations && m < conv_limit * conv_limit; n++)
+	    {
+                double tmp = (double) ((((zr) * (zr)) - ((zi) * (zi))) + (cr));
+                zi = ((2) * (zr) * (zi)) + (ci);
+                zr = tmp;
+                /* n = (n) + (1); */
+		m = zr*zr + zi*zi;
+                /* b1 = ((n) <= (max_iterations)) + ((((zr) * (zr)) + ((zi) * (zi))) <= ((conv_limit) * (conv_limit))); */
+            }
+            ;
+	    
+	    /* printf("n: %d\n max_iter: %d\n", n, max_iterations); */
+            if ((n) < (max_iterations))
+            {
+		    ptr_o2BQqRir[((y) + (((x) - (1)) * (ld_o2BQqRir))) - (1)] = 255 * (1.0 * n / (max_iterations));
+            } else
+            {
+                ptr_o2BQqRir[((y) + (((x) - (1)) * (ld_o2BQqRir))) - (1)] = 0;
+            }
+            ;
+        }
+        ;
+    }
+    ;
+}
+
+

+ 95 - 0
julia/tst/mandelbrot/gpu_mandelbrot.cu

@@ -0,0 +1,95 @@
+#include <starpu.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <math.h>
+
+struct Params
+{
+  float cr;
+  float ci;
+  unsigned taskx;
+  unsigned tasky;
+  unsigned width;
+  unsigned height;
+};
+
+
+__global__ void gpuMandelbrotKernel
+(
+ uint32_t nxP, uint32_t nyP, 
+ uint32_t ldP,
+ int * subP,
+ struct Params params
+ )
+{
+  unsigned width = params.width;
+  unsigned height = params.height;
+  unsigned taskx = params.taskx;
+  unsigned tasky = params.tasky;
+
+  float centerr = params.cr;
+  float centeri = params.ci;
+
+  float zoom = width * 0.25296875;
+  int maxiter = (width/2) * 0.049715909 * log10(zoom);
+  float conv_lim = 2.0;
+
+  uint32_t id;
+  int n,i,j,x,y;
+  float zr,zi,cr,ci;
+
+  id = blockIdx.x * blockDim.x + threadIdx.x;
+  i = id % nxP;
+  j = id / nxP;
+  
+
+  if (j >= nyP){
+    return;
+  }
+
+  x = i + taskx * nxP;
+  y = j + tasky * nyP;
+  
+  zr = cr = centerr + (x - width/2.)/zoom;
+  zi = ci = centeri + (y - height/2.)/zoom;
+  float m = zr*zr + zi*zi;
+  
+  for (n = 0; n <= maxiter && m < conv_lim * conv_lim; n++) {
+    float tmp = zr * zr - zi*zi + cr;
+    zi = 2*zr*zi + ci;
+    zr = tmp;
+    m = zr*zr + zi*zi;
+  }
+  int color;
+  if (n < maxiter)
+     color = 255.*n/maxiter;
+   else
+     color = 0;
+  subP[i + j*ldP] = color;
+
+}
+
+
+#define THREADS_PER_BLOCK 64
+
+extern "C" void gpu_mandelbrot(void *descr[], void *args)
+{
+  int *d_subP;
+  uint32_t nxP, nyP;
+  uint32_t ldP;
+  uint32_t nblocks;
+  struct Params *params = (struct Params *) args;
+
+  d_subP = (int *) STARPU_MATRIX_GET_PTR(descr[0]);
+
+  nxP = STARPU_MATRIX_GET_NX(descr[0]);
+  nyP = STARPU_MATRIX_GET_NY(descr[0]);
+
+  ldP = STARPU_MATRIX_GET_LD(descr[0]);
+
+  nblocks = (nxP * nyP + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
+
+  gpuMandelbrotKernel <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream() >>> (nxP, nyP, ldP, d_subP, *params);
+
+  cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 113 - 0
julia/tst/mandelbrot/gpu_mandelbrot_between.cu

@@ -0,0 +1,113 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <starpu.h>
+
+#define THREADS_PER_BLOCK 64
+
+static inline long long jlstarpu_max(long long a, long long b)
+{
+	return (a > b) ? a : b;
+}
+
+static inline long long jlstarpu_interval_size(long long start, long long step, long long stop)
+{
+    if (stop >= start){
+            return jlstarpu_max(0, (stop - start + 1) / step);
+    } else {
+            return jlstarpu_max(0, (stop - start - 1) / step);
+    }
+}
+
+
+__device__ static inline long long jlstarpu_max__device(long long a, long long b)
+{
+	return (a > b) ? a : b;
+}
+
+__device__ static inline long long jlstarpu_interval_size__device(long long start, long long step, long long stop)
+{
+	if (stop >= start){
+		return jlstarpu_max__device(0, (stop - start + 1) / step);
+	} else {
+		return jlstarpu_max__device(0, (stop - start - 1) / step);
+	}
+}
+
+
+__global__ void mandelbrot(int64_t kernel_ids__start_1, int64_t kernel_ids__step_1, int64_t kernel_ids__dim_1, int64_t kernel_ids__start_2, 
+                           int64_t kernel_ids__step_2, int64_t kernel_ids__dim_2, double* ptr_hF6lCYyJ, int64_t local_width, 
+                           int64_t* ptr_qoUGBRtY, int64_t local_height, double conv_limit, int64_t* ptr_A5zD9sJZ, 
+                           uint32_t ld_A5zD9sJZ)
+{
+    int64_t THREAD_ID = (int64_t) ((((blockIdx).x) * ((blockDim).x)) + ((threadIdx).x));
+    
+    if ((THREAD_ID) >= (((1) * (kernel_ids__dim_2)) * (kernel_ids__dim_1)))
+    {
+        return ;
+    };
+    int64_t kernel_ids__index_1 = (int64_t) (((THREAD_ID) / ((1) * (kernel_ids__dim_2))) % (kernel_ids__dim_1));
+    int64_t kernel_ids__index_2 = (int64_t) (((THREAD_ID) / (1)) % (kernel_ids__dim_2));
+    int64_t x = (int64_t) ((kernel_ids__start_1) + ((kernel_ids__index_1) * (kernel_ids__step_1)));
+    int64_t y = (int64_t) ((kernel_ids__start_2) + ((kernel_ids__index_2) * (kernel_ids__step_2)));
+    double max_iterations = (double) (ptr_hF6lCYyJ[(5) - (1)]);
+    double zoom = (double) ((ptr_hF6lCYyJ[(3) - (1)]) * (0.25296875));
+    int64_t X = (int64_t) ((x) + ((local_width) * ((ptr_qoUGBRtY[(2) - (1)]) - (1))));
+    int64_t Y = (int64_t) ((y) + ((local_height) * ((ptr_qoUGBRtY[(1) - (1)]) - (1))));
+    double cr = (double) ((ptr_hF6lCYyJ[(1) - (1)]) + (((X) - ((ptr_hF6lCYyJ[(3) - (1)]) / (2))) / (zoom)));
+    double zr = (double) (cr);
+    double ci = (double) ((ptr_hF6lCYyJ[(2) - (1)]) + (((Y) - ((ptr_hF6lCYyJ[(4) - (1)]) / (2))) / (zoom)));
+    double zi = (double) (ci);
+    int64_t n = (int64_t) (0);
+    int64_t b1 = (int64_t) (((n) < (max_iterations)) + ((((zr) * (zr)) + ((zi) * (zi))) < ((conv_limit) * (conv_limit))));
+    
+    while ((b1) >= (2))
+    {
+        double tmp = (double) ((((zr) * (zr)) - ((zi) * (zi))) + (cr));
+        zi = ((2) * (zr) * (zi)) + (ci);
+        zr = tmp;
+        n = (n) + (1);
+        b1 = ((n) <= (max_iterations)) + ((((zr) * (zr)) + ((zi) * (zi))) <= ((conv_limit) * (conv_limit)));
+    }
+    ;
+    
+    if ((n) < (max_iterations))
+    {
+        ptr_A5zD9sJZ[((y) + (((x) - (1)) * (ld_A5zD9sJZ))) - (1)] = ((255) * (n)) / (max_iterations);
+    } else
+    {
+        ptr_A5zD9sJZ[((y) + (((x) - (1)) * (ld_A5zD9sJZ))) - (1)] = 0;
+    }
+    ;
+}
+
+
+
+extern "C" void CUDA_mandelbrot(void** buffers_uwrYFDVe, void* cl_arg_uwrYFDVe)
+{
+    uint32_t ld_A5zD9sJZ = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_uwrYFDVe[(1) - (1)]));
+    int64_t* ptr_A5zD9sJZ = (int64_t*) (STARPU_MATRIX_GET_PTR(buffers_uwrYFDVe[(1) - (1)]));
+    double* ptr_hF6lCYyJ = (double*) (STARPU_VECTOR_GET_PTR(buffers_uwrYFDVe[(2) - (1)]));
+    int64_t* ptr_qoUGBRtY = (int64_t*) (STARPU_VECTOR_GET_PTR(buffers_uwrYFDVe[(3) - (1)]));
+    int64_t local_width = (int64_t) (STARPU_MATRIX_GET_NY(buffers_uwrYFDVe[(1) - (1)]));
+    int64_t local_height = (int64_t) (STARPU_MATRIX_GET_NX(buffers_uwrYFDVe[(1) - (1)]));
+    double conv_limit = (double) (2.0);
+    int64_t kernel_ids__start_1 = (int64_t) (1);
+    int64_t kernel_ids__step_1 = (int64_t) (1);
+    int64_t kernel_ids__dim_1 = (int64_t) (jlstarpu_interval_size(kernel_ids__start_1, kernel_ids__step_1, local_width));
+    int64_t kernel_ids__start_2 = (int64_t) (1);
+    int64_t kernel_ids__step_2 = (int64_t) (1);
+    int64_t kernel_ids__dim_2 = (int64_t) (jlstarpu_interval_size(kernel_ids__start_2, kernel_ids__step_2, local_height));
+    int64_t nthreads = (int64_t) (((1) * (kernel_ids__dim_1)) * (kernel_ids__dim_2));
+    int64_t nblocks = (int64_t) ((((nthreads) + (THREADS_PER_BLOCK)) - (1)) / (THREADS_PER_BLOCK));
+    
+    mandelbrot
+        <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
+        >>> (kernel_ids__start_1, kernel_ids__step_1, kernel_ids__dim_1, kernel_ids__start_2, 
+             kernel_ids__step_2, kernel_ids__dim_2, ptr_hF6lCYyJ, local_width, 
+             ptr_qoUGBRtY, local_height, conv_limit, ptr_A5zD9sJZ, 
+             ld_A5zD9sJZ);
+    ;
+    cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+
+

+ 248 - 0
julia/tst/mandelbrot/mandelbrot.c

@@ -0,0 +1,248 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <starpu.h>
+#include "../display.h"
+
+void cpu_mandelbrot(void **, void *);
+void gpu_mandelbrot(void **, void *);
+
+struct Params
+{
+	float cr;
+	float ci;
+	unsigned taskx;
+	unsigned tasky;
+	unsigned width;
+	unsigned height;
+};
+
+
+
+struct starpu_codelet cl =
+{
+	.cpu_funcs = {cpu_mandelbrot},
+	.cuda_funcs = {gpu_mandelbrot},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+
+void mandelbrot_with_starpu(int *pixels, float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy)
+{
+	starpu_data_handle_t p_handle;
+
+	starpu_matrix_data_register(&p_handle, STARPU_MAIN_RAM, (uintptr_t)pixels, width, width, height, sizeof(int));
+
+	struct starpu_data_filter vert =
+	{
+		.filter_func = starpu_matrix_filter_vertical_block,
+		.nchildren = nslicesy
+	};
+
+	struct starpu_data_filter horiz =
+	{
+		.filter_func = starpu_matrix_filter_block,
+		.nchildren = nslicesx
+	};
+
+	starpu_data_map_filters(p_handle, 2, &vert, &horiz);
+
+	unsigned taskx, tasky;
+
+	struct Params *params = malloc(nslicesx*nslicesy*sizeof(struct Params));
+
+	for (taskx = 0; taskx < nslicesx; taskx++){
+		for (tasky = 0; tasky < nslicesy; tasky++){
+			struct starpu_task *task = starpu_task_create();
+			
+			task->cl = &cl;
+			task->handles[0] = starpu_data_get_sub_data(p_handle, 2, tasky, taskx);
+			struct Params param = {cr, ci, taskx, tasky, width, height};
+
+			params[taskx + tasky*nslicesx] = param;
+
+			task->cl_arg = (params + taskx + tasky * nslicesx);
+			task->cl_arg_size = sizeof(struct Params);
+			
+			starpu_task_submit(task);
+		}
+	}
+	starpu_task_wait_for_all();
+
+	starpu_data_unpartition(p_handle, STARPU_MAIN_RAM);
+
+	starpu_data_unregister(p_handle);
+
+	free(params);
+}
+
+void init_zero(int * pixels, unsigned width, unsigned height)
+{
+	unsigned i,j;
+	for (i = 0; i < height; i++){
+		for (j = 0; j < width; j++){
+			pixels[j + i*width] = 0;
+		}
+	}
+}
+
+void sort(double *arr, unsigned nbr_tests)
+{
+	unsigned j;
+	
+	int is_sort = 0;
+	
+	while (!is_sort){
+
+		is_sort = 1;
+		
+		for (j = 0; j < nbr_tests - 1; j++){
+			if (arr[j] > arr[j+1]){
+				is_sort = 0;
+				double tmp = arr[j];
+				arr[j] = arr[j+1];
+				arr[j+1] = tmp;
+			}
+		}
+	}
+}
+double median_time(float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy, unsigned nbr_tests)
+{
+	int *Pixels = malloc(width*height*sizeof(int));
+	
+	unsigned i;
+
+	double exec_times[nbr_tests];
+
+	double start, stop, exec_t;
+	for (i = 0; i < nbr_tests; i++){
+		init_zero(Pixels, width, height);
+		
+		start = starpu_timing_now(); // starpu_timing_now() gives the time in microseconds.
+		mandelbrot_with_starpu(Pixels, cr, ci, width, height, nslicesx, nslicesy);
+		stop = starpu_timing_now();
+		
+		exec_t = (stop-start)/1.e6;
+		exec_times[i] = exec_t;
+	}
+	char filename[30];
+	sprintf(filename, "PPM/mandelbrot%d.ppm", width);
+	printf("%s\n", filename);
+
+	mandelbrot_graph(filename, Pixels, width, height);
+
+	free(Pixels);
+
+	sort(exec_times, nbr_tests);
+
+	return exec_times[nbr_tests/2];	
+}
+
+void fluctuation_time(float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy, unsigned nbr_tests, double *exec_times)
+{
+	int *Pixels = malloc(width*height*sizeof(int));
+	
+	unsigned i;
+
+	double start, stop, exec_t;
+	for (i = 0; i < nbr_tests; i++){
+		init_zero(Pixels, width, height);
+		
+		start = starpu_timing_now(); // starpu_timing_now() gives the time in microseconds.
+		mandelbrot_with_starpu(Pixels, cr, ci, width, height, nslicesx, nslicesy);
+		stop = starpu_timing_now();
+		
+		exec_t = (stop-start)/1.e6;
+		exec_times[i] = exec_t;
+
+		/* char filename[33]; */
+		/* sprintf(filename, "../PPM/mandelbrot%d.ppm", i + 1); */
+		/* printf("%s\n", filename); */
+		/* mandelbrot_graph(filename, Pixels, width, height); */
+	}
+
+
+	free(Pixels);
+
+
+
+	
+}
+
+
+void display_times(float cr, float ci, unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nslices, unsigned nbr_tests)
+{
+	
+	unsigned dim;
+
+	FILE *myfile;
+	myfile = fopen("DAT/mandelbrot_c_struct_times.dat", "w");
+
+	for (dim = start_dim; dim <= stop_dim; dim += step_dim){
+		printf("Dimension: %u...\n", dim);
+		double t = median_time(cr, ci, dim, dim, nslices, nslices, nbr_tests);
+		
+		printf("w = %u ; h = %u ; t = %f\n", dim, dim, t);
+		
+		fprintf(myfile, "%f\n", t);
+		}
+	
+	fclose(myfile);
+}
+
+void display_fluctuations(float cr, float ci, unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nslices, unsigned nbr_tests)
+{
+	
+	unsigned dim;
+
+	FILE *myfile;
+	myfile = fopen("DAT/mandelbrot_c_fluctuation.dat", "w");
+
+	double *exec_times = malloc(nbr_tests * sizeof(double));
+	fluctuation_time(cr, ci, start_dim, start_dim, nslices, nslices, nbr_tests, exec_times);
+		
+	/* printf("w = %u ; h = %u ; t = %f\n", dim, dim, t); */
+	unsigned i;
+	for (i = 0; i < nbr_tests; i++){
+		printf("test %u: %f seconds\n", i, exec_times[i]);
+		fprintf(myfile, "%u %f\n", i, exec_times[i]);
+	}
+	
+	fclose(myfile);
+	free(exec_times);
+}
+
+
+int main(int argc, char **argv)
+{
+
+	if (argc != 8){
+		printf("Usage: %s cr ci start_dim step_dim stop_dim nslices(must divide dims) nbr_tests\n", argv[0]);
+		return 1;
+	}
+	if (starpu_init(NULL) != EXIT_SUCCESS){
+		fprintf(stderr, "ERROR\n");
+		return 77;
+	}
+
+
+	
+	float cr = (float) atof(argv[1]);
+	float ci = (float) atof(argv[2]);
+	unsigned start_dim = (unsigned) atoi(argv[3]);
+	unsigned step_dim = (unsigned) atoi(argv[4]);	
+	unsigned stop_dim = (unsigned) atoi(argv[5]);
+	unsigned nslices = (unsigned) atoi(argv[6]);
+	unsigned nbr_tests = (unsigned) atoi(argv[7]);
+
+	display_times(cr, ci, start_dim, step_dim, stop_dim, nslices, nbr_tests);
+	
+	
+	/* display_fluctuations(cr, ci, start_dim, step_dim, stop_dim, nslices, nbr_tests); */
+
+
+	starpu_shutdown();
+
+
+	return 0;
+}

+ 30 - 0
julia/tst/mandelbrot/mandelbrot.jl

@@ -0,0 +1,30 @@
+function mandelbrotjl(pixels ::Matrix{Int64}, centerr ::Float64, centeri ::Float64)
+    height,width = size(pixels)
+    zoom = width * 0.25296875
+    val_diverge = 2.0
+    max_iterations = (width/2) * 0.049715909 * log10(zoom);
+
+
+    for y = 1:height
+        for x = 1:width
+            cr = centerr + (x - (width / 2))/zoom
+            zr = cr
+            ci = centeri + (y - (height / 2))/zoom
+            zi = ci
+
+            n = 0
+            while ((n < max_iterations) && (zr*zr + zi*zi < val_diverge*val_diverge))
+                tmp = zr*zr - zi*zi + cr
+                zi = 2*zr*zi + ci
+                zr = tmp
+                n = n+1
+            end
+            
+            if (n < max_iterations)
+                pixels[y,x] = round(255 * n / max_iterations)
+            else
+                pixels[y,x] = 0
+            end
+        end
+    end
+end

+ 270 - 0
julia/tst/mandelbrot/mandelbrot_between.c

@@ -0,0 +1,270 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <starpu.h>
+#include <stdint.h>
+#include "../includes/display.h"
+
+void mandelbrot(void **, void *);
+void CUDA_mandelbrot(void **, void *);
+void test(void **, void *); /* Function used to test on my matrix, in the cpu_test_with_generated.c file. */
+
+struct Params
+{
+	float cr;
+	float ci;
+	unsigned taskx;
+	unsigned tasky;
+	unsigned width;
+	unsigned height;
+};
+
+
+
+struct starpu_codelet cl =
+{
+	/* .cpu_funcs = {test}, */ 
+	.cpu_funcs = {mandelbrot},
+	.cuda_funcs = {CUDA_mandelbrot},
+
+	//ARRAY PAR
+	.nbuffers = 3,
+	.modes = {STARPU_RW, STARPU_R, STARPU_R}
+	//
+
+	//STRUCT PAR
+	/* .nbuffers = 1, */
+	/* .modes = {STARPU_RW} */
+	//
+};
+
+
+void mandelbrot_with_starpu(int64_t *pixels, float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy, double *params)
+{
+	//ARRAY PAR
+	starpu_data_handle_t p_handle, par_handle, v_handle;
+	//
+
+	//STRUCT PAR
+	/* starpu_data_handle_t p_handle; */
+	//
+
+	starpu_matrix_data_register(&p_handle, STARPU_MAIN_RAM, (uintptr_t)pixels, width, width, height, sizeof(int64_t));
+	//ARRAY PAR
+	starpu_matrix_data_register(&par_handle, STARPU_MAIN_RAM, (uintptr_t)params, 5, 5, 1, sizeof(double));
+	//
+
+	struct starpu_data_filter vert =
+	{
+		/* .filter_func = starpu_matrix_filter_block, */
+		.filter_func = starpu_matrix_filter_vertical_block,
+		.nchildren = nslicesy
+	};
+
+	struct starpu_data_filter horiz =
+	{
+		.filter_func = starpu_matrix_filter_block,
+		/* .filter_func = starpu_matrix_filter_vertical_block, */
+		.nchildren = nslicesx
+	};
+
+	starpu_data_map_filters(p_handle, 2, &vert, &horiz);
+
+	unsigned taskx, tasky;
+
+	//ARRAY PAR
+	int64_t *V = malloc(2 * nslicesx * nslicesy * sizeof(int64_t));
+	//
+
+	//STRUCT PAR
+	/* struct Params *parameters = malloc(nslicesx * nslicesy * sizeof(struct Params)); */
+	//
+
+	for (tasky = 0; tasky < nslicesy; tasky++){
+		for (taskx = 0; taskx < nslicesx; taskx++){
+			struct starpu_task *task = starpu_task_create();
+
+			//ARRAY PAR			
+			V[2 * (taskx + nslicesx * tasky)] = taskx + 1;
+			V[2 * (taskx + nslicesx * tasky) + 1] = tasky + 1;
+			starpu_vector_data_register(&v_handle, STARPU_MAIN_RAM, (uintptr_t)&(V[2 * (taskx + nslicesx * tasky)]), 2, sizeof(int64_t));
+			//
+			
+
+			/* printf("Pre-Task%u_%u\n", taskx, tasky); */
+
+			task->cl = &cl;
+
+			
+			task->handles[0] = starpu_data_get_sub_data(p_handle, 2, tasky, taskx);
+			
+			//ARRAY PAR
+			task->handles[1] = par_handle;
+			task->handles[2] = v_handle;
+			//
+
+			//STRUCT PAR
+			/* struct Params param = {cr, ci, taskx, tasky, width, height}; */
+
+			
+
+			/* parameters[taskx + tasky * nslicesx] = param; */
+
+			/* task->cl_arg = (parameters + taskx + tasky * nslicesx); */
+			/* task->cl_arg_size = sizeof(struct Params); */
+			//
+
+
+			starpu_task_submit(task);
+
+			//ARRAY PAR
+			starpu_data_unregister_submit(v_handle);
+			//
+		}
+	}
+	starpu_task_wait_for_all();
+
+	starpu_data_unpartition(p_handle, STARPU_MAIN_RAM);
+
+	starpu_data_unregister(p_handle);
+
+	//STRUCT PAR
+	/* free(parameters); */
+	//
+
+	//ARRAY PAR
+	starpu_data_unregister(par_handle);
+	/* starpu_data_unregister(v_handle); */
+	free(V);
+	//
+}
+
+void init_zero(int64_t * pixels, unsigned width, unsigned height)
+{
+	unsigned i,j;
+	for (i = 0; i < height; i++){
+		for (j = 0; j < width; j++){
+			pixels[j + i*width] = 0;
+		}
+	}
+}
+
+void sort(double *arr, unsigned nbr_tests)
+{
+	unsigned j;
+	
+	int is_sort = 0;
+	
+	while (!is_sort){
+
+		is_sort = 1;
+		
+		for (j = 0; j < nbr_tests - 1; j++){
+			if (arr[j] > arr[j+1]){
+				is_sort = 0;
+				double tmp = arr[j];
+				arr[j] = arr[j+1];
+				arr[j+1] = tmp;
+			}
+		}
+	}
+}
+double median_time(float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy, unsigned nbr_tests)
+{
+	int64_t *Pixels = malloc(width*height*sizeof(int64_t));
+	double *params = malloc(4*sizeof(double));
+	
+	double max_iterations = (width/2) * 0.049715909 * log10(width * 0.25296875);
+
+	params[0] = (double) cr;
+	params[1] = (double) ci;
+	params[2] = (double) width;
+	params[3] = (double) height;
+	params[4] = (double) max_iterations;
+	unsigned i;
+
+	double exec_times[nbr_tests];
+
+	double start, stop, exec_t;
+	for (i = 0; i < nbr_tests; i++){
+		init_zero(Pixels, width, height);
+		
+		start = starpu_timing_now(); // starpu_timing_now() gives the time in microseconds.
+		mandelbrot_with_starpu(Pixels, cr, ci, width, height, nslicesx, nslicesy, params);
+		stop = starpu_timing_now();
+		
+		exec_t = (stop-start)/1.e6;
+		exec_times[i] = exec_t;
+	}
+
+	
+	char filename[34];
+	sprintf(filename, "PPM/mandelbrottest%d.ppm", width);
+	printf("%s\n", filename);
+
+	/* Due to Julia registering matrices differently in memory, we need to transpose the matrix we get from the Julia generated kernels */
+
+	mandelbrot_graph_transpose(filename, Pixels, width, height);
+
+
+	/* mandelbrot_graph_transpose("PPM/mandelbrottest.ppm", Pixels, width, height); */
+
+	free(Pixels);
+	free(params);
+
+	sort(exec_times, nbr_tests);
+
+	return exec_times[nbr_tests/2];	
+}
+
+
+void display_times(float cr, float ci, unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nslices, unsigned nbr_tests)
+{
+	
+	unsigned dim;
+
+	FILE *myfile;
+	myfile = fopen("DAT/mandelbrot_c_array_times.dat", "w");
+
+	for (dim = start_dim; dim <= stop_dim; dim += step_dim){
+		
+		double t = median_time(cr, ci, dim, dim, nslices, nslices, nbr_tests);
+		
+		printf("w = %u ; h = %u ; t = %f\n", dim, dim, t);
+		
+		fprintf(myfile, "%f\n", t);
+		}
+	
+	fclose(myfile);
+}
+
+int main(int argc, char **argv)
+{
+
+	if (argc != 8){
+		printf("Usage: %s cr ci start_dim step_dim stop_dim nslices(must divide dims) nbr_tests\n", argv[0]);
+		return 1;
+	}
+	if (starpu_init(NULL) != EXIT_SUCCESS){
+		fprintf(stderr, "ERROR\n");
+		return 77;
+	}
+
+
+	
+	float cr = (float) atof(argv[1]);
+	float ci = (float) atof(argv[2]);
+	unsigned start_dim = (unsigned) atoi(argv[3]);
+	unsigned step_dim = (unsigned) atoi(argv[4]);	
+	unsigned stop_dim = (unsigned) atoi(argv[5]);
+	unsigned nslices = (unsigned) atoi(argv[6]);
+	unsigned nbr_tests = (unsigned) atoi(argv[7]);
+
+	display_times(cr, ci, start_dim, step_dim, stop_dim, nslices, nbr_tests);
+
+
+
+	starpu_shutdown();
+
+
+	return 0;
+}

+ 197 - 0
julia/tst/mandelbrot/mandelbrot_def.jl

@@ -0,0 +1,197 @@
+include("mandelbrot.jl")
+
+
+function mandelbrot_with_starpu(A ::Matrix{Int64}, params ::Vector{Float64}, nslicesx ::Int64, nslicesy ::Int64) #mettre params en matrice. (pour que starpu le traite en matrice et non vecteur)
+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesy)
+    horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesx)
+    @starpu_block let
+	hA = starpu_data_register(A)
+        hP = starpu_data_register(params)
+	starpu_data_map_filters(hA, vert, horiz)
+        
+	@starpu_sync_tasks for tasky in (1:nslicesy)
+            for taskx in (1 : nslicesx)
+                @starpu_block let
+                    v = Int64[tasky, taskx] #C'est le x qu'on augmente en fonction du nombre de slicey. Si il y a trois colonnes, x sera coupé en 3. Donc on inverse dans v.
+                    hV = starpu_data_register(v)
+                    @starpu_async_cl cl(hA[tasky, taskx], hP, hV)
+	        end
+            end
+        end
+    end
+    return nothing
+end
+
+function mandelbrot_with_starpu_cpu(A ::Matrix{Int64}, params ::Vector{Float64}, nslicesx ::Int64, nslicesy ::Int64) #mettre params en matrice. (pour que starpu le traite en matrice et non vecteur)
+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesy)
+    horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesx)
+    @starpu_block let
+	hA = starpu_data_register(A)
+        hP = starpu_data_register(params)
+	starpu_data_map_filters(hA, vert, horiz)
+        
+	@starpu_sync_tasks for tasky in (1:nslicesy)
+            for taskx in (1 : nslicesx)
+                v = Int64[tasky, taskx] #C'est le x qu'on augmente en fonction du nombre de slicey. Si il y a trois colonnes, x sera coupé en 3. Donc on inverse dans v.
+                hV = starpu_data_register(v)
+                @starpu_async_cl clcpu(hA[tasky, taskx], hP, hV)
+	    end
+        end
+    end
+    return nothing
+end
+
+function mandelbrot_with_starpu_gpu(A ::Matrix{Int64}, params ::Vector{Float64}, nslicesx ::Int64, nslicesy ::Int64) #mettre params en matrice. (pour que starpu le traite en matrice et non vecteur)
+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesx)
+    horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesy)
+    @starpu_block let
+	hA = starpu_data_register(A)
+        hP = starpu_data_register(params)
+	starpu_data_map_filters(hA, vert, horiz)
+        
+	@starpu_sync_tasks for taskx in (1:nslicesx)
+            for tasky in (1 : nslicesy)
+                v = Int64[taskx, tasky] #C'est le x qu'on augmente en fonction du nombre de slicey. Si il y a trois colonnes, x sera coupé en 3. Donc on inverse dans v.
+                hV = starpu_data_register(v)
+                @starpu_async_cl clgpu(hA[taskx, tasky], hP, hV)
+	    end
+        end
+    end
+    return nothing
+end
+
+function init_zero(Pixels ::Matrix{Int64}, width ::Int64, height ::Int64)
+    for i in 1:height
+        for j in 1:width
+            Pixels[i,j] = 0
+        end
+    end
+end
+
+function graph_pixels(Pixels ::Matrix{Int64}, width ::Int64, height ::Int64, filename ::String)
+    open(filename, "w") do f
+        write(f, "P3\n$width $height\n255\n")
+        for i = 1:height
+            for j = 1:width
+                write(f, "$(Pixels[i,j]) 0 0 ")
+            end
+            write(f, "\n")
+        end
+    end
+end
+
+function median_times(nbr_tests ::Int64, cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
+
+    exec_times_st ::Vector{Float64} = [0 for i = 1:nbr_tests]
+    exec_times_cpu ::Vector{Float64} = [0 for i = 1:nbr_tests]
+    exec_times_gpu ::Vector{Float64} = [0 for i = 1:nbr_tests]
+    exec_times_jl ::Vector{Float64} = [0 for i = 1:nbr_tests]
+    
+    Pixels_st ::Matrix{Int64} = zeros(dim, dim)
+    Pixels_cpu ::Matrix{Int64} = copy(Pixels_st)
+    Pixels_gpu ::Matrix{Int64} = copy(Pixels_st)
+    Pixels_jl ::Matrix{Int64} = copy(Pixels_st)
+    
+    max_iter ::Float64 = (dim/2) * 0.049715909 * log10(dim * 0.25296875)
+
+
+    params = [cr, ci, dim, dim, max_iter]
+
+    for i = 1:nbr_tests
+        
+        tic()
+        mandelbrot_with_starpu(Pixels_st, params, nslices, nslices)
+        t = toq()
+        
+        
+        exec_times_st[i] = t
+        
+        # tic()
+        # mandelbrot_with_starpu_cpu(Pixels_cpu, params, nslices, nslices)
+        # t = toq()
+        
+
+        # exec_times_cpu[i] = t        
+
+        # tic()
+        # mandelbrot_with_starpu_gpu(Pixels_gpu, params, nslices, nslices)
+        # t = toq()
+        
+        
+        # exec_times_gpu[i] = t
+
+
+        # tic()
+        # mandelbrotjl(Pixels_jl, cr, ci)
+        # t = toq()
+        
+        
+        # exec_times_jl[i] = t
+     end
+    # graph_pixels(Pixels_st, dim, dim, "../PPM/mandelbrotst$(dim).ppm")
+    # graph_pixels(Pixels_cpu, dim, dim, "../PPM/mandelbrotcpu$(dim).ppm")
+    # graph_pixels(Pixels_gpu, dim, dim, "../PPM/mandelbrotgpu$(dim).ppm")
+    # graph_pixels(Pixels_jl, dim, dim, "../PPM/mandelbrotjl$(dim).ppm")
+
+    sort!(exec_times_st)
+    # sort!(exec_times_cpu)
+    # sort!(exec_times_gpu)
+    # sort!(exec_times_jl)
+
+    
+    results ::Vector{Float64} = [exec_times_st[1 + div(nbr_tests-1, 2)]]
+    # results ::Vector{Float64} = [exec_times_st[1 + div(nbr_tests-1, 2)]]#, exec_times_cpu[1 + div(nbr_tests-1, 2)], exec_times_gpu[1 + div(nbr_tests-1, 2)]]#, exec_times_jl[1 + div(nbr_tests-1, 2)]]
+    return results
+end
+
+function display_time(cr ::Float64, ci ::Float64, start_dim ::Int64, step_dim ::Int64, stop_dim ::Int64, nslices ::Int64, nbr_tests ::Int64)
+    # mtc = map( (x->parse(Float64,x)), open("../DAT/mandelbrot_c.dat") do f
+    #             readlines(f)
+    #             end)
+
+    # mtgen = map( (x->parse(Float64,x)), open("../DAT/mandelbrot_with_generated_times.dat") do f
+    #             readlines(f)
+    #             end)
+
+    mtjl = map( (x->parse(Float64,x)), open("../DAT/mandelbrot_jl_times.dat") do f
+                    readlines(f)
+                    end)
+
+    # mtjlcpu = map( (x->parse(Float64,x)), open("../DAT/mandelbrot_jl_cpu.dat") do f
+    #             readlines(f)
+    #             end)
+
+    mtstruct = map( (x->parse(Float64,x)), open("../DAT/mandelbrot_c_struct_times.dat") do f
+                    readlines(f)
+                    end)
+
+
+    mtarray = map( (x->parse(Float64,x)), open("../DAT/mandelbrot_c_array_times.dat") do f
+                    readlines(f)
+                    end)
+
+
+    i = 1
+    # open("../DAT/mandelbrot.dat", "w") do f
+    # open("../DAT/mandelbrot_gen_times.dat", "w") do ft
+    open("../DAT/mandelbrot_speedups.dat", "w") do f
+        for dim in (start_dim : step_dim : stop_dim)
+            println("Dimension: $dim")
+            
+            res ::Vector{Float64} = median_times(nbr_tests, cr, ci, dim, nslices)
+            # println("C: $(mtc[i])")
+            # println("C with generated: $(mtgen[i])")
+            # println("Julia with starpu: $(res[1])")
+            # println("cpu: $(res[2])")
+            println("c_struct: $(mtstruct[i])")
+            println("c_array: $(mtarray[i])")
+            println("jl_st: $(res[1])")
+            # println("cpu: $(res[1])")
+            # write(ft, "$(dim) $(res[1]) $(mtgen[i])\n")
+            # write(f, "$(dim) $(res[4]/res[1]) $(res[4]/res[2]) $(res[4]/res[3]) $(res[4]/mtc[i])\n")
+            write(f, "$(dim) $(mtjl[i]/res[1]) $(mtjl[i]/mtstruct[i]) $(mtjl[i]/mtarray[i])\n")
+            # write(f, "$(res[1])\n")
+            i = i + 1
+        end
+    end
+end

+ 48 - 0
julia/tst/mandelbrot/mandelbrot_generated.jl

@@ -0,0 +1,48 @@
+
+
+if length(ARGS) != 7
+    println("Usage : julia prog.jl cr ci start_dim step_dim stop_dim nslices nbr_tests")
+    quit()
+end
+
+if (parse(Int64,ARGS[3]) % parse(Int64,ARGS[6]) != 0)
+    println("The number of slices should divide all the dimensions.")
+    quit()
+end
+
+include("../../src/Wrapper/Julia/starpu_include.jl")
+using StarPU
+
+@debugprint "starpu_init"
+starpu_init(extern_task_path = "../build/generated_tasks_mandelbrot.so")
+
+perfmodel = StarpuPerfmodel(
+    perf_type = STARPU_HISTORY_BASED,
+    symbol = "history_perf"
+)
+
+cl = StarpuCodelet(
+    cpu_func = "mandelbrot",
+    gpu_func = "CUDA_mandelbrot",
+    modes = [STARPU_W, STARPU_R, STARPU_R],
+    perfmodel = perfmodel
+)
+
+clcpu = StarpuCodelet(
+    cpu_func = "mandelbrot",
+    modes = [STARPU_W, STARPU_R, STARPU_R],
+    perfmodel = perfmodel
+)
+
+clgpu = StarpuCodelet(
+    gpu_func = "CUDA_mandelbrot",
+    modes = [STARPU_W, STARPU_R, STARPU_R],
+    perfmodel = perfmodel
+)
+
+include("mandelbrot_def.jl")
+
+display_time(parse(Float64,ARGS[1]), parse(Float64,ARGS[2]), map((x -> parse(Int64, x)), ARGS[3:7])...)
+
+@debugprint "starpu_shutdown"
+starpu_shutdown()

+ 29 - 0
julia/tst/mult/cpu_cuda_mult.jl

@@ -0,0 +1,29 @@
+
+include("../../src/Compiler/include.jl")
+
+starpu_new_cpu_kernel_file("../build/generated_cpu_mult.c")
+starpu_new_cuda_kernel_file("../build/generated_cuda_mult.cu")
+
+@cpu_cuda_kernel function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}) :: Void
+
+    width_m2 :: Int64 = width(m2)
+    height_m1 :: Int64 = height(m1)
+    width_m1 :: Int64 = width(m1)
+    A ::Float64 = abs(-4.0)
+    @indep for j in (1 : width_m2)
+        @indep for i in (1 : height_m1)
+
+            sum :: Float32 = 0.
+
+            for k in (1 : width_m1)
+                sum = sum + m1[i, k] * m2[k, j]
+            end
+
+            m3[i, j] = sum
+        end
+    end
+end
+
+compile_cpu_kernels("../build/generated_cpu_mult.so")
+compile_cuda_kernels("../build/generated_cuda_mult.so")
+combine_kernel_files("../build/generated_tasks.so", ["../build/generated_cpu_mult.so", "../build/generated_cuda_mult.so"])

+ 54 - 0
julia/tst/mult/cpu_mult.c

@@ -0,0 +1,54 @@
+#include <stdint.h>
+#include <starpu.h>
+
+/*
+ * The codelet is passed 3 matrices, the "descr" union-type field gives a
+ * description of the layout of those 3 matrices in the local memory (ie. RAM
+ * in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
+ * registered data with the "matrix" data interface, we use the matrix macros.
+ */
+
+void cpu_mult(void *descr[], void *arg)
+{
+	(void)arg;
+	float *subA, *subB, *subC;
+	uint32_t nxC, nyC, nyA;
+	uint32_t ldA, ldB, ldC;
+
+	/* .blas.ptr gives a pointer to the first element of the local copy */
+	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+
+	/* .blas.nx is the number of rows (consecutive elements) and .blas.ny
+	 * is the number of lines that are separated by .blas.ld elements (ld
+	 * stands for leading dimension).
+	 * NB: in case some filters were used, the leading dimension is not
+	 * guaranteed to be the same in main memory (on the original matrix)
+	 * and on the accelerator! */
+	nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	ldC = STARPU_MATRIX_GET_LD(descr[2]);
+
+	/* we assume a FORTRAN-ordering! */
+	unsigned i,j,k;
+	for (i = 0; i < nyC; i++)
+	{
+		for (j = 0; j < nxC; j++)
+		{
+			float sum = 0.0;
+
+			for (k = 0; k < nyA; k++)
+			{
+				sum += subA[j+k*ldA]*subB[k+i*ldB];
+			}
+
+			subC[j + i*ldC] = sum;
+		}
+	}
+}

+ 68 - 0
julia/tst/mult/gpu_mult.cu

@@ -0,0 +1,68 @@
+#include <starpu.h>
+#include <stdint.h>
+#include <stdio.h>
+
+
+
+
+__global__ void gpuMultKernel
+(
+		uint32_t nxC, uint32_t nyC, uint32_t nyA,
+		uint32_t ldA, uint32_t ldB, uint32_t ldC,
+		float * subA, float * subB, float * subC
+)
+{
+	uint32_t id, i, j, k;
+	float sum;
+
+	id = blockIdx.x * blockDim.x + threadIdx.x;
+	i = id % nxC;
+	j = id / nxC;
+
+	if (j >= nyC){
+		return;
+	}
+
+	sum = 0.;
+
+	for (k = 0 ; k < nyA ; k++){
+		sum += subA[i + k*ldA] * subB[k + j*ldB];
+	}
+
+	subC[i + j*ldC] = sum;
+
+}
+
+
+
+#define THREADS_PER_BLOCK 64
+
+extern "C" void gpu_mult(void * descr[], void * args)
+{
+
+	float * d_subA, * d_subB, * d_subC;
+	uint32_t nxC, nyC, nyA;
+	uint32_t ldA, ldB, ldC;
+	uint32_t nblocks;
+
+	d_subA = (float *) STARPU_MATRIX_GET_PTR(descr[0]);
+	d_subB = (float *) STARPU_MATRIX_GET_PTR(descr[1]);
+	d_subC = (float *) STARPU_MATRIX_GET_PTR(descr[2]);
+
+	nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	ldC = STARPU_MATRIX_GET_LD(descr[2]);
+
+	nblocks = (nxC * nyC + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
+
+	gpuMultKernel
+		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
+		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+}

+ 288 - 0
julia/tst/mult/mult.c

@@ -0,0 +1,288 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012-2013                                Inria
+ * Copyright (C) 2009-2011,2013-2015                      Université de Bordeaux
+ * Copyright (C) 2010                                     Mehdi Juhoor
+ * Copyright (C) 2010-2013,2015,2017                      CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example shows a simple implementation of a blocked matrix
+ * multiplication. Note that this is NOT intended to be an efficient
+ * implementation of sgemm! In this example, we show:
+ *  - how to declare dense matrices (starpu_matrix_data_register)
+ *  - how to manipulate matrices within codelets (eg. descr[0].blas.ld)
+ *  - how to use filters to partition the matrices into blocks
+ *    (starpu_data_partition and starpu_data_map_filters)
+ *  - how to unpartition data (starpu_data_unpartition) and how to stop
+ *    monitoring data (starpu_data_unregister)
+ *  - how to manipulate subsets of data (starpu_data_get_sub_data)
+ *  - how to construct an autocalibrated performance model (starpu_perfmodel)
+ *  - how to submit asynchronous tasks
+ */
+
+#include <string.h>
+#include <math.h>
+#include <sys/types.h>
+#include <signal.h>
+
+#include <starpu.h>
+
+
+
+/*
+ * That program should compute C = A * B
+ *
+ *   A of size (z,y)
+ *   B of size (x,z)
+ *   C of size (x,y)
+
+              |---------------|
+            z |       B       |
+              |---------------|
+       z              x
+     |----|   |---------------|
+     |    |   |               |
+     |    |   |               |
+     | A  | y |       C       |
+     |    |   |               |
+     |    |   |               |
+     |----|   |---------------|
+
+ */
+
+
+
+
+
+void gpu_mult(void **, void *);
+void cpu_mult(void **, void *);
+
+
+
+
+
+static struct starpu_perfmodel model =
+{
+		.type = STARPU_HISTORY_BASED,
+		.symbol = "history_perf"
+};
+
+static struct starpu_codelet cl =
+{
+		.cpu_funcs = {cpu_mult},
+		.cpu_funcs_name = {"cpu_mult"},
+		.cuda_funcs = {gpu_mult},
+		.nbuffers = 3,
+		.modes = {STARPU_R, STARPU_R, STARPU_W},
+		.model = &model
+};
+
+
+
+
+
+
+void multiply_with_starpu(float *A, float *B, float *C,  unsigned xdim,  unsigned ydim,  unsigned zdim, unsigned nslicesx, unsigned nslicesy)
+{
+	starpu_data_handle_t A_handle, B_handle, C_handle;
+
+
+	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A,
+			ydim, ydim, zdim, sizeof(float));
+	starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B,
+			zdim, zdim, xdim, sizeof(float));
+	starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C,
+			ydim, ydim, xdim, sizeof(float));
+
+
+	struct starpu_data_filter vert =
+	{
+			.filter_func = starpu_matrix_filter_vertical_block,
+			.nchildren = nslicesx
+	};
+
+	struct starpu_data_filter horiz =
+	{
+			.filter_func = starpu_matrix_filter_block,
+			.nchildren = nslicesy
+	};
+
+
+	starpu_data_partition(B_handle, &vert);
+	starpu_data_partition(A_handle, &horiz);
+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
+
+	unsigned taskx, tasky;
+
+	for (taskx = 0; taskx < nslicesx; taskx++){
+		for (tasky = 0; tasky < nslicesy; tasky++){
+
+			struct starpu_task *task = starpu_task_create();
+
+			task->cl = &cl;
+			task->handles[0] = starpu_data_get_sub_data(A_handle, 1, tasky);
+			task->handles[1] = starpu_data_get_sub_data(B_handle, 1, taskx);
+			task->handles[2] = starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
+
+			starpu_task_submit(task);
+
+		}
+	}
+
+	starpu_task_wait_for_all();
+
+
+	starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(B_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(C_handle, STARPU_MAIN_RAM);
+
+	starpu_data_unregister(A_handle);
+	starpu_data_unregister(B_handle);
+	starpu_data_unregister(C_handle);
+
+}
+
+
+
+void init_rand(float * m, unsigned width, unsigned height)
+{
+	unsigned i,j;
+
+	for (j = 0 ; j < height ; j++){
+		for (i = 0 ; i < width ; i++){
+			m[j+i*height] = (float)(starpu_drand48());
+			
+		}
+	}
+}
+
+
+void init_zero(float * m, unsigned width, unsigned height)
+{
+	memset(m, 0, sizeof(float) * width * height);
+}
+
+
+
+void sort(unsigned int size, double t[])
+{
+	unsigned int j;
+
+	int is_sort = 0;
+
+	while(!is_sort){
+
+		is_sort = 1;
+
+		for (j = 0 ; j < size - 1 ; j++){
+
+			if (t[j] > t[j+1]){
+				double tmp = t[j];
+				t[j] = t[j+1];
+				t[j+1] = tmp;
+				is_sort = 0;
+			}
+		}
+	}
+
+
+}
+
+
+double median_time(unsigned nb_test, unsigned xdim, unsigned ydim, unsigned zdim, unsigned nsclicesx, unsigned nsclicesy)
+{
+	unsigned i;
+
+	float * A = (float *) valloc(zdim*ydim*sizeof(float));
+	float * B = (float *) valloc(xdim*zdim*sizeof(float));
+	float * C = (float *) valloc(xdim*ydim*sizeof(float));
+
+	double exec_times[nb_test];
+
+	for (i = 0 ; i < nb_test ; i++){
+
+		double start, stop, exec_t;
+
+		init_rand(A, zdim, ydim);
+		init_rand(B, xdim, zdim);
+		init_zero(C, xdim, ydim);
+
+		init_rand(A, zdim, ydim);
+		init_rand(B, xdim, zdim);
+		init_zero(C, xdim, ydim);
+
+		start = starpu_timing_now();
+		multiply_with_starpu(A, B, C, xdim, ydim, zdim, nsclicesx, nsclicesy);
+		stop = starpu_timing_now();
+
+		exec_t = (stop - start)/1.e6;
+		exec_times[i] = exec_t;
+	}
+
+	sort(nb_test, exec_times);
+
+	free(A);
+	free(B);
+	free(C);
+
+	/* return exec_times[nb_test/2]; */
+
+	return exec_times[0];
+}
+
+
+void display_times(unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nb_tests, unsigned nsclicesx, unsigned nsclicesy)
+{
+	unsigned dim;
+	FILE *fp;
+	fp = fopen("../DAT/mult_c.dat", "w");
+	for (dim = start_dim ; dim <= stop_dim ; dim += step_dim){
+		double t = median_time(nb_tests, dim, dim, dim, nsclicesx, nsclicesy);
+		fprintf(fp, "%f\n", t);
+		printf("%u ; %f\n", dim, t);
+	}
+	fclose(fp);
+
+}
+
+
+int main(int argc, char * argv[])
+{
+
+	if (argc != 7){
+		printf("Usage : %s start_dim step_dim stop_dim nb_tests nsclicesx nsclicesy\n", argv[0]);
+		return 1;
+	}
+
+
+	if (starpu_init(NULL) != EXIT_SUCCESS){
+		fprintf(stderr, "ERROR\n");
+		return 77;
+	}
+
+	unsigned start_dim = (unsigned) atoi(argv[1]);
+	unsigned step_dim = (unsigned) atoi(argv[2]);
+	unsigned stop_dim = (unsigned) atoi(argv[3]);
+	unsigned nb_tests = (unsigned) atoi(argv[4]);
+	unsigned nsclicesx = (unsigned) atoi(argv[5]);
+	unsigned nsclicesy = (unsigned) atoi(argv[6]);
+
+	display_times(start_dim, step_dim, stop_dim, nb_tests, nsclicesx, nsclicesy);
+
+	starpu_shutdown();
+
+	return 0;
+}
+

+ 228 - 0
julia/tst/mult/mult_def.jl

@@ -0,0 +1,228 @@
+using Base.LinAlg
+include("mult_naive.jl")
+
+#   A of size (y,z)
+#   B of size (z,x)
+#   C of size (y,x)
+
+
+#              |---------------|
+#            z |       B       |
+#              |---------------|
+#       z              x
+#     |----|   |---------------|
+#     |    |   |               |
+#     |    |   |               |
+#     | A  | y |       C       |
+#     |    |   |               |
+#     |    |   |               |
+#     |----|   |---------------|
+#
+
+
+
+
+
+function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
+
+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesx)
+    horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesy)
+
+    @starpu_block let
+
+        hA,hB,hC = starpu_data_register(A, B, C)
+
+        starpu_data_partition(hB, vert)
+        starpu_data_partition(hA, horiz)
+        starpu_data_map_filters(hC, vert, horiz)
+
+        @starpu_sync_tasks for taskx in (1 : nslicesx)
+            for tasky in (1 : nslicesy)
+                @starpu_async_cl cl(hA[tasky], hB[taskx], hC[taskx, tasky])
+            end
+        end
+    end
+
+    return nothing
+end
+
+function multiply_with_starpu_cpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
+
+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesx)
+    horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesy)
+
+    @starpu_block let
+
+        hA,hB,hC = starpu_data_register(A, B, C)
+
+        starpu_data_partition(hB, vert)
+        starpu_data_partition(hA, horiz)
+        starpu_data_map_filters(hC, vert, horiz)
+
+        @starpu_sync_tasks for taskx in (1 : nslicesx)
+            for tasky in (1 : nslicesy)
+                @starpu_async_cl clcpu(hA[tasky], hB[taskx], hC[taskx, tasky])
+            end
+        end
+    end
+
+    return nothing
+end
+
+function multiply_with_starpu_gpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
+
+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesx)
+    horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesy)
+
+    @starpu_block let
+
+        hA,hB,hC = starpu_data_register(A, B, C)
+
+        starpu_data_partition(hB, vert)
+        starpu_data_partition(hA, horiz)
+        starpu_data_map_filters(hC, vert, horiz)
+
+        @starpu_sync_tasks for taskx in (1 : nslicesx)
+            for tasky in (1 : nslicesy)
+                @starpu_async_cl clgpu(hA[tasky], hB[taskx], hC[taskx, tasky])
+            end
+        end
+    end
+
+    return nothing
+end
+
+
+
+function approximately_equals(
+    A :: Matrix{Cfloat},
+    B :: Matrix{Cfloat},
+    eps = 1e-2
+)
+    (height, width) = size(A)
+
+    for j in (1 : width)
+        for i in (1 : height)
+            if (abs(A[i,j] - B[i,j]) > eps * max(abs(B[i,j]), abs(A[i,j])))
+                println("A[$i,$j] : $(A[i,j]), B[$i,$j] : $(B[i,j])")
+                return false
+            end
+        end
+    end
+
+    return true
+end
+
+
+
+function median_times(nb_tests, xdim, zdim, ydim, nslicesx, nslicesy)
+
+    exec_times_st ::Vector{Float64} = [0 for i = 1:nb_tests]
+    exec_times_cpu ::Vector{Float64} = [0 for i = 1:nb_tests]
+    exec_times_gpu ::Vector{Float64} = [0 for i = 1:nb_tests]
+    exec_times_jl ::Vector{Float64} = [0 for i = 1:nb_tests]
+
+    A = Array(rand(Cfloat, ydim, zdim))
+    B = Array(rand(Cfloat, zdim, xdim))
+    C = zeros(Float32, ydim, xdim)
+    D  = A * B
+
+    for i in (1 : nb_tests)
+        
+        # tic()
+        # multiply_with_starpu(A, B, C, nslicesx, nslicesy)
+        # t = toq()
+
+        # if (!approximately_equals(D, C))
+        #     error("Invalid st result")
+        # end
+        # exec_times_st[i] = t
+
+
+
+        # tic()
+        # multiply_with_starpu_cpu(A, B, C, nslicesx, nslicesy)
+        # tcpu = toq()
+
+        # if (!approximately_equals(D, C))
+        #     error("Invalid cpu result")
+        # end
+        # exec_times_cpu[i] = tcpu
+
+
+
+        # tic()
+        # multiply_with_starpu_gpu(A, B, C, nslicesx, nslicesy)
+        # tgpu = toq()
+
+        # if (!approximately_equals(D, C))
+        #     error("Invalid gpu result")
+        # end
+        # exec_times_gpu[i] = tgpu
+
+        al ::Float32 = 1.0
+        be ::Float32 = 0.0 
+
+        tic()
+        # multjl(A, B, C)
+        BLAS.gemm!('N','N', al, A, B, be, C)
+        # C = BLAS.gemm!('N', 'N', 1.0, A, B)
+        tjl = toq()
+
+        if (!approximately_equals(D, C))
+            error("Invalid jl result")
+        end
+
+        exec_times_jl[i] = tjl
+
+    end
+
+  
+    # sort!(exec_times_st)
+    # sort!(exec_times_cpu)
+    # sort!(exec_times_gpu)
+    sort!(exec_times_jl)
+  
+    results ::Vector{Float64} = [exec_times_jl[1 + div(nb_tests-1, 2)]]#, exec_times_cpu[1 + div(nb_tests-1, 2)], exec_times_gpu[1 + div(nb_tests-1, 2)], exec_times_jl[1 + div(nb_tests-1, 2)]]
+
+    return results
+end
+
+
+
+function display_times(start_dim, step_dim, stop_dim, nb_tests, nslicesx, nslicesy)
+    # mtc = map( (x->parse(Float64,x)), open("DAT/mult_c.dat") do f
+    #              readlines(f)
+    #              end)
+
+    # mtext = map( (x->parse(Float64,x)), open("DAT/mult_ext.dat") do f
+    #              readlines(f)
+    #              end)
+
+    # mtjl = map( (x->parse(Float64,x)), open("DAT/mult_jl.dat") do f
+    #             readlines(f)
+    #             end)
+    
+
+    # open("../DAT/mult_ext.dat", "w") do f    
+    # open("../DAT/mult_jl.dat", "w") do f
+    open("../DAT/mult_jl_times.dat", "w") do ft
+        # open("DAT/mult.dat", "w") do f
+            # i = 1
+        for dim in (start_dim : step_dim : stop_dim)
+            println("Dimension: $dim")
+            # println("C: $(mtc[i])")
+            res ::Vector{Float64} = median_times(nb_tests, dim, dim, dim, nslicesx, nslicesy)
+            println("jl: $(res[1])")
+            # println("jlcpu: $(res[2])")
+            # println("jlgpu: $(res[3])")
+            # println("jl: $(res[4])")
+            # write(f, "$(dim) $(res[4]/res[1]) $(res[4]/res[2]) $(res[4]/res[3]) $(res[4]/mtc[i])\n")
+            # write(f, "$dim $(mtjl[i]/res[1]) $(mtjl[i]/mtext[i]) $(mtjl[i]/mtc[i])\n")
+            # write(ft, "$(res[1]) $(mtc[i]) $(mtext[i]) $(mtjl[i])\n")
+            write(ft, "$(res[1])\n")
+            # i = i + 1
+            # end
+        end
+    end
+end

+ 30 - 0
julia/tst/mult/mult_extern.jl

@@ -0,0 +1,30 @@
+
+if length(ARGS) != 6
+    println("Usage : julia prog.jl start_dim step_dim stop_dim nb_tests nslicesx nslicesy")
+    quit()
+end
+
+include("../../src/Wrapper/Julia/starpu_include.jl")
+using StarPU
+
+@debugprint "starpu_init"
+starpu_init(extern_task_path = "../build/extern_tasks.so")
+
+perfmodel = StarpuPerfmodel(
+    perf_type = STARPU_HISTORY_BASED,
+    symbol = "history_perf"
+)
+
+cl = StarpuCodelet(
+    cpu_func = "cpu_mult",
+    gpu_func = "gpu_mult",
+    modes = [STARPU_R, STARPU_R, STARPU_W],
+    perfmodel = perfmodel
+)
+
+include("mult_def.jl")
+
+display_times(map((x -> parse(Int64,x)), ARGS)...)
+
+@debugprint "starpu_shutdown"
+starpu_shutdown()

+ 30 - 0
julia/tst/mult/mult_extern_graph.jl

@@ -0,0 +1,30 @@
+
+if length(ARGS) != 6
+    println("Usage : julia prog.jl start_dim step_dim stop_dim nb_tests nslicesx nslicesy")
+    quit()
+end
+
+include("../../src/Wrapper/Julia/starpu_include.jl")
+using StarPU
+
+@debugprint "starpu_init"
+starpu_init(extern_task_path = "../build/extern_tasks.so")
+
+perfmodel = StarpuPerfmodel(
+    perf_type = STARPU_HISTORY_BASED,
+    symbol = "history_perf"
+)
+
+cl = StarpuCodelet(
+    cpu_func = "cpu_mult",
+    gpu_func = "gpu_mult",
+    modes = [STARPU_R, STARPU_R, STARPU_W],
+    perfmodel = perfmodel
+)
+
+include("mult_def.jl")
+
+display_times(map((x -> parse(Int64,x)), ARGS)..., "../mult_extern.dat")
+
+@debugprint "starpu_shutdown"
+starpu_shutdown()

+ 46 - 0
julia/tst/mult/mult_generated.jl

@@ -0,0 +1,46 @@
+
+if length(ARGS) != 6
+    println("Usage : julia prog.jl start_dim step_dim stop_dim nb_tests nslicesx nslicesy")
+    quit()
+end
+
+
+include("../../src/Wrapper/Julia/starpu_include.jl")
+using StarPU
+
+
+
+
+@debugprint "starpu_init"
+starpu_init(extern_task_path = "../build/generated_tasks.so")
+
+perfmodel = StarpuPerfmodel(
+    perf_type = STARPU_HISTORY_BASED,
+    symbol = "history_perf"
+)
+
+cl = StarpuCodelet(
+    cpu_func = "matrix_mult",
+    gpu_func = "CUDA_matrix_mult",
+    modes = [STARPU_R, STARPU_R, STARPU_W],
+    perfmodel = perfmodel
+)
+
+clcpu = StarpuCodelet(
+    cpu_func = "matrix_mult",
+    modes = [STARPU_R, STARPU_R, STARPU_W],
+    perfmodel = perfmodel
+)
+
+clgpu = StarpuCodelet(
+    gpu_func = "CUDA_matrix_mult",
+    modes = [STARPU_R, STARPU_R, STARPU_W],
+    perfmodel = perfmodel
+)
+
+include("mult_def.jl")
+
+display_times(map( (x -> parse(Int64,x)) , ARGS)...)
+
+@debugprint "starpu_shutdown"
+starpu_shutdown()

+ 34 - 0
julia/tst/mult/mult_generated_graph.jl

@@ -0,0 +1,34 @@
+
+if length(ARGS) != 6
+    println("Usage : julia prog.jl start_dim step_dim stop_dim nb_tests nslicesx nslicesy")
+    quit()
+end
+
+
+include("../../src/Wrapper/Julia/starpu_include.jl")
+using StarPU
+
+
+
+
+@debugprint "starpu_init"
+starpu_init(extern_task_path = "../build/generated_tasks.so")
+
+perfmodel = StarpuPerfmodel(
+    perf_type = STARPU_HISTORY_BASED,
+    symbol = "history_perf"
+)
+
+cl = StarpuCodelet(
+    cpu_func = "matrix_mult",
+    gpu_func = "CUDA_matrix_mult",
+    modes = [STARPU_R, STARPU_R, STARPU_W],
+    perfmodel = perfmodel
+)
+
+include("mult_def.jl")
+
+display_times(map( (x -> parse(Int64,x)) , ARGS)..., "../mult_generated.dat")
+
+@debugprint "starpu_shutdown"
+starpu_shutdown()

+ 13 - 0
julia/tst/mult/mult_naive.jl

@@ -0,0 +1,13 @@
+function multjl(A ::Matrix{Float32}, B ::Matrix{Float32}, C ::Matrix{Float32})
+    heightC, widthC = size(C)
+    widthA = size(A)[2]
+    for i = 1:heightC
+        for j = 1:widthC
+            sum = 0
+            for k = 1:widthA
+                sum = sum + A[i, k] * B[k, j]
+            end
+            C[i,j] = sum
+        end
+    end
+end

+ 55 - 0
julia/tst/nbody/cpu_cuda_nbody.jl

@@ -0,0 +1,55 @@
+include("../../src/Compiler/include.jl")
+
+starpu_new_cpu_kernel_file("../build/generated_cpu_nbody.c")
+starpu_new_cuda_kernel_file("../build/generated_cuda_nbody.cu")
+
+@cpu_cuda_kernel function nbody_acc(positions ::Matrix{Float64}, accelerations ::Matrix{Float64}, masses ::Vector{Float64}, parameters ::Vector{Float64}, sliceID ::Vector{Int64}) ::Void
+    widthp ::Int64 = width(positions)
+    widtha ::Int64 = width(accelerations)
+    
+    @indep for plan = 1:widtha
+
+        sumaccx ::Float64 = 0
+        sumaccy ::Float64 = 0
+
+        for oplan = 1:widthp
+            eps ::Float64 = parameters[3]
+            Id ::Int64 = sliceID[1]*widtha
+            G ::Float64 = parameters[1]
+
+            b ::Int64 = ((plan + Id) >= oplan) + ((plan + Id) <= oplan)
+            if (b < 2)
+
+                dx ::Float64 = positions[1, oplan] - positions[1, plan + Id]
+                dy ::Float64 = positions[2, oplan] - positions[2, plan + Id]
+                modul ::Float64= sqrt(dx *dx + dy * dy)
+
+                sumaccx = sumaccx + (G * masses[oplan] * dx) / ((modul + eps) * (modul + eps) * (modul + eps)) 
+                sumaccy = sumaccy + (G * masses[oplan] * dy) / ((modul + eps) * (modul + eps) * (modul + eps))
+
+                # sumaccx = sumaccx + (G * masses[oplan]) * (dx / sqrt(dx * dx + dy * dy)) / (dx * dx + dy * dy + eps)
+                # sumaccy = sumaccy + (G * masses[oplan]) * (dy / sqrt(dx * dx + dy * dy)) / (dy * dy + dx * dx + eps)
+            end
+        end
+        accelerations[1, plan] = sumaccx
+        accelerations[2, plan] = sumaccy
+    end
+end
+
+@cpu_cuda_kernel function nbody_updt(positions ::Matrix{Float64}, velocities ::Matrix{Float64}, accelerations ::Matrix{Float64}, parameters ::Vector{Float64}) ::Void
+    widthp ::Int64 = width(positions)
+
+    @indep for i = 1:widthp
+
+        velocities[1, i] = velocities[1, i] + accelerations[1, i] * parameters[2]
+        velocities[2, i] = velocities[2, i] + accelerations[2, i] * parameters[2]
+
+        positions[1, i] = positions[1, i] + velocities[1, i] * parameters[2]
+        positions[2, i] = positions[2, i] + velocities[2, i] * parameters[2]
+    end
+
+end
+
+compile_cpu_kernels("../build/generated_cpu_nbody.so")
+compile_cuda_kernels("../build/generated_cuda_nbody.so")
+combine_kernel_files("../build/generated_tasks_nbody.so", ["../build/generated_cpu_nbody.so", "../build/generated_cuda_nbody.so"])

+ 94 - 0
julia/tst/nbody/cpu_nbody.c

@@ -0,0 +1,94 @@
+#include <stdint.h>
+#include <starpu.h>
+
+struct Param {
+	unsigned taskx;
+	double epsilon;
+};
+
+void cpu_nbody(void *descr[], void *arg)
+{
+	struct Param *params = arg;
+
+	double *P;
+	double *subA;
+	double *M;
+
+	uint32_t nxP, nxA, nxM;
+	uint32_t ldP, ldA, ldM;
+
+	P = (double *)STARPU_MATRIX_GET_PTR(descr[0]);
+	subA = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
+	M = (double *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	nxP = STARPU_MATRIX_GET_NX(descr[0]);
+	nxA = STARPU_MATRIX_GET_NX(descr[1]);
+	nxM = STARPU_MATRIX_GET_NX(descr[2]);
+
+	ldP = STARPU_MATRIX_GET_LD(descr[0]);
+	ldA = STARPU_MATRIX_GET_LD(descr[1]);
+	ldM = STARPU_MATRIX_GET_LD(descr[2]);
+
+	double epsilon = params->epsilon;
+
+	unsigned id = nxA * params->taskx;
+
+	uint32_t i,j;
+	
+	for (i = 0; i < nxA; i++){
+		double sumaccx = 0;
+		double sumaccy = 0;
+		
+		for (j = 0; j < nxP; j++){
+			
+			if (j != i + id){
+				
+				double dx = P[j] - P[i + id];
+				double dy = P[j + ldP] - P[i + id + ldP];
+
+				double modul = sqrt(dx * dx + dy * dy);
+
+				sumaccx = sumaccx + 6.67e-11 * M[j] * dx / pow(modul + epsilon, 3);
+				sumaccy = sumaccy + 6.67e-11 * M[j] * dy / pow(modul + epsilon, 3);
+			}
+
+		}
+		subA[i] = sumaccx;
+		subA[i + ldA] = sumaccy;
+	}
+}
+
+void cpu_nbody2(void *descr[], void *arg)
+{
+	double *subP;
+	double *subV;
+	double *subA;
+
+	uint32_t nxP, nxV, nxA;
+	uint32_t ldP, ldV, ldA;
+
+	subP = (double *)STARPU_MATRIX_GET_PTR(descr[0]);
+	subV = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
+	subA = (double *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	nxP = STARPU_MATRIX_GET_NX(descr[0]);
+	nxV = STARPU_MATRIX_GET_NX(descr[1]);
+	nxA = STARPU_MATRIX_GET_NX(descr[2]);
+	
+	ldP = STARPU_MATRIX_GET_LD(descr[0]);
+	ldV = STARPU_MATRIX_GET_LD(descr[1]);
+	ldA = STARPU_MATRIX_GET_LD(descr[2]);
+	
+	
+	unsigned i,dt;
+	dt = 3600;
+	for (i = 0; i < nxP; i++){
+	
+		subV[i] = subV[i] + dt*subA[i];
+		subV[i + ldV] = subV[i + ldV] + dt*subA[i + ldA];
+
+		subP[i] = subP[i] + dt*subV[i];
+		subP[i + ldP] = subP[i + ldP] + dt*subV[i + ldV];
+	}
+}
+	      

+ 104 - 0
julia/tst/nbody/cpu_nbody_between.c

@@ -0,0 +1,104 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <starpu.h>
+
+static inline long long jlstarpu_max(long long a, long long b)
+{
+	return (a > b) ? a : b;
+}
+
+static inline long long jlstarpu_interval_size(long long start, long long step, long long stop)
+{
+    if (stop >= start){
+            return jlstarpu_max(0, (stop - start + 1) / step);
+    } else {
+            return jlstarpu_max(0, (stop - start - 1) / step);
+    }
+}
+
+void nbody_acc(void** buffers_NJVQ1U4V, void* cl_arg_NJVQ1U4V)
+{
+    uint32_t ld_xIZ5HaKV = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_NJVQ1U4V[(1) - (1)]));
+    double* ptr_xIZ5HaKV = (double*) (STARPU_MATRIX_GET_PTR(buffers_NJVQ1U4V[(1) - (1)]));
+    uint32_t ld_QZvmSRYk = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_NJVQ1U4V[(2) - (1)]));
+    double* ptr_QZvmSRYk = (double*) (STARPU_MATRIX_GET_PTR(buffers_NJVQ1U4V[(2) - (1)]));
+    double* ptr_U7pwlAjr = (double*) (STARPU_VECTOR_GET_PTR(buffers_NJVQ1U4V[(3) - (1)]));
+    double* ptr_AQKcQq1a = (double*) (STARPU_VECTOR_GET_PTR(buffers_NJVQ1U4V[(4) - (1)]));
+    int64_t* ptr_qQoIJuDP = (int64_t*) (STARPU_VECTOR_GET_PTR(buffers_NJVQ1U4V[(5) - (1)]));
+
+    int64_t widthp = (int64_t) (STARPU_MATRIX_GET_NY(buffers_NJVQ1U4V[(1) - (1)]));
+    int64_t widtha = (int64_t) (STARPU_MATRIX_GET_NY(buffers_NJVQ1U4V[(2) - (1)]));
+    
+    int64_t start_nj3HDzsW = (int64_t) (1);
+    int64_t stop_nj3HDzsW = (int64_t) (widtha);
+    int64_t plan;
+
+    for (plan = start_nj3HDzsW ; plan <= stop_nj3HDzsW ; plan += 1)
+    {
+        double sumaccx = (double) (0);
+        double sumaccy = (double) (0);
+        
+        int64_t start_TzfU6QY7 = (int64_t) (1);
+        int64_t stop_TzfU6QY7 = (int64_t) (widthp);
+        int64_t oplan;
+
+        for (oplan = start_TzfU6QY7 ; oplan <= stop_TzfU6QY7 ; oplan += 1)
+        {
+            double eps = (double) (ptr_AQKcQq1a[(3) - (1)]);
+            int64_t Id = (int64_t) ((ptr_qQoIJuDP[(1) - (1)]) * (widtha));
+            double G = (double) (ptr_AQKcQq1a[(1) - (1)]);
+            int64_t b = (int64_t) ((((plan) + (Id)) >= (oplan)) + (((plan) + (Id)) <= (oplan)));
+            
+            if ((b) < (2))
+            {
+                double dx = (double) ((ptr_xIZ5HaKV[((1) + (((oplan) - (1)) * (ld_xIZ5HaKV))) - (1)]) - (ptr_xIZ5HaKV[((1) + ((((plan) + (Id)) - (1)) * (ld_xIZ5HaKV))) - (1)]));
+                double dy = (double) ((ptr_xIZ5HaKV[((2) + (((oplan) - (1)) * (ld_xIZ5HaKV))) - (1)]) - (ptr_xIZ5HaKV[((2) + ((((plan) + (Id)) - (1)) * (ld_xIZ5HaKV))) - (1)]));
+                double modul = (double) (sqrt(((dx) * (dx)) + ((dy) * (dy))));
+                sumaccx = (sumaccx) + (((G) * (ptr_U7pwlAjr[(oplan) - (1)]) * (dx)) / (((modul) + (eps)) * ((modul) + (eps)) * ((modul) + (eps))));
+                sumaccy = (sumaccy) + (((G) * (ptr_U7pwlAjr[(oplan) - (1)]) * (dy)) / (((modul) + (eps)) * ((modul) + (eps)) * ((modul) + (eps))));
+            };
+	    
+        }
+        ;
+
+	
+        ptr_QZvmSRYk[((1) + (((plan) - (1)) * (ld_QZvmSRYk))) - (1)] = sumaccx;
+        ptr_QZvmSRYk[((2) + (((plan) - (1)) * (ld_QZvmSRYk))) - (1)] = sumaccy;
+
+        /* ptr_QZvmSRYk[((1) + (((plan) - (1)) * (ld_QZvmSRYk))) - (1)] = ptr_qQoIJuDP[(1) - (1)]; */
+
+        /* ptr_QZvmSRYk[((2) + (((plan) - (1)) * (ld_QZvmSRYk))) - (1)] = ptr_qQoIJuDP[(1) - (1)]; */
+	
+	
+
+    }
+    ;
+}
+
+
+void nbody_updt(void** buffers_kCJlJluA, void* cl_arg_kCJlJluA)
+{
+    uint32_t ld_tlJ0FAub = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_kCJlJluA[(1) - (1)]));
+    double* ptr_tlJ0FAub = (double*) (STARPU_MATRIX_GET_PTR(buffers_kCJlJluA[(1) - (1)]));
+    uint32_t ld_CwAKodfw = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_kCJlJluA[(2) - (1)]));
+    double* ptr_CwAKodfw = (double*) (STARPU_MATRIX_GET_PTR(buffers_kCJlJluA[(2) - (1)]));
+    uint32_t ld_9CU1xW4b = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_kCJlJluA[(3) - (1)]));
+    double* ptr_9CU1xW4b = (double*) (STARPU_MATRIX_GET_PTR(buffers_kCJlJluA[(3) - (1)]));
+    double* ptr_D81NeTCr = (double*) (STARPU_VECTOR_GET_PTR(buffers_kCJlJluA[(4) - (1)]));
+    int64_t widthp = (int64_t) (STARPU_MATRIX_GET_NY(buffers_kCJlJluA[(1) - (1)]));
+    
+    int64_t start_FQShiJ9y = (int64_t) (1);
+    int64_t stop_FQShiJ9y = (int64_t) (widthp);
+    int64_t i;
+
+    for (i = start_FQShiJ9y ; i <= stop_FQShiJ9y ; i += 1)
+    {
+        ptr_CwAKodfw[((1) + (((i) - (1)) * (ld_CwAKodfw))) - (1)] = (ptr_CwAKodfw[((1) + (((i) - (1)) * (ld_CwAKodfw))) - (1)]) + ((ptr_9CU1xW4b[((1) + (((i) - (1)) * (ld_9CU1xW4b))) - (1)]) * (ptr_D81NeTCr[(2) - (1)]));
+        ptr_CwAKodfw[((2) + (((i) - (1)) * (ld_CwAKodfw))) - (1)] = (ptr_CwAKodfw[((2) + (((i) - (1)) * (ld_CwAKodfw))) - (1)]) + ((ptr_9CU1xW4b[((2) + (((i) - (1)) * (ld_9CU1xW4b))) - (1)]) * (ptr_D81NeTCr[(2) - (1)]));
+        ptr_tlJ0FAub[((1) + (((i) - (1)) * (ld_tlJ0FAub))) - (1)] = (ptr_tlJ0FAub[((1) + (((i) - (1)) * (ld_tlJ0FAub))) - (1)]) + ((ptr_CwAKodfw[((1) + (((i) - (1)) * (ld_CwAKodfw))) - (1)]) * (ptr_D81NeTCr[(2) - (1)]));
+        ptr_tlJ0FAub[((2) + (((i) - (1)) * (ld_tlJ0FAub))) - (1)] = (ptr_tlJ0FAub[((2) + (((i) - (1)) * (ld_tlJ0FAub))) - (1)]) + ((ptr_CwAKodfw[((2) + (((i) - (1)) * (ld_CwAKodfw))) - (1)]) * (ptr_D81NeTCr[(2) - (1)]));
+    }
+    ;
+}
+
+

+ 145 - 0
julia/tst/nbody/gpu_nbody.cu

@@ -0,0 +1,145 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <math.h>
+#include <starpu.h>
+
+struct Params
+{
+  unsigned taskx;
+  unsigned epsilon;
+};
+
+__global__ void gpuNbodyKernel(double *P, double *subA, double *M,
+			     uint32_t nxP, uint32_t nxA, uint32_t nxM,
+			     uint32_t ldP, uint32_t ldA,
+			     struct Params params)
+{
+  uint32_t id, i, j, k;
+  double dx, dy, modul;
+
+  id = blockIdx.x * blockDim.x + threadIdx.x;
+  i = id % nxA;
+  j = id / nxA;
+
+  if (j >= 1){
+    return;
+  }
+
+  double sumaccx;
+  double sumaccy;
+  
+  for (k = 0; k < nxP; k++){
+    if (k != id + nxA*params.taskx){
+      dx = P[k] - P[id + nxA*params.taskx];
+      dy = P[k + ldP] - P[id + nxA*params.taskx + ldP];
+      
+      modul = dx * dx + dy * dy;
+
+      sumaccx = 6.674e-11 * M[k] * dx / pow(modul + params.epsilon, 3);
+      sumaccy = 6.674e-11 * M[k] * dy / pow(modul + params.epsilon, 3);
+    }
+  }
+ 
+  subA[i] = sumaccx;
+  subA[i + ldA] = sumaccy;
+
+  // P[id + nxA * params.taskx] = subA[i];
+
+  // subA[i] = 0;
+  // subA[i + ldA] = 1;
+  
+}
+
+#define THREADS_PER_BLOCK 64
+
+extern "C" void gpu_nbody(void * descr[], void * args)
+{
+
+  double *d_P, *d_subA, *d_M;
+  uint32_t nxP, nxA, nxM;
+  uint32_t ldA, ldP;
+  uint32_t nblocks;
+
+  struct Params *params = (struct Params *) args;
+
+  d_P = (double *) STARPU_MATRIX_GET_PTR(descr[0]);
+  d_subA = (double *) STARPU_MATRIX_GET_PTR(descr[1]);
+  d_M = (double *) STARPU_MATRIX_GET_PTR(descr[2]);
+
+  nxP = STARPU_MATRIX_GET_NX(descr[0]);
+  nxA = STARPU_MATRIX_GET_NX(descr[1]);
+  nxM = STARPU_MATRIX_GET_NX(descr[2]);
+
+  ldP = STARPU_MATRIX_GET_LD(descr[0]);
+  ldA = STARPU_MATRIX_GET_LD(descr[1]);
+
+  nblocks = (nxA + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+
+  gpuNbodyKernel
+    <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
+    >>> (d_P,  d_subA, d_M, nxP, nxA, nxM, ldP, ldA, *params);
+
+  cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+}
+
+
+
+
+
+
+
+__global__ void gpuNbody2Kernel(double *d_subP, double *d_subV, double *d_subA,
+			      uint32_t nxP, uint32_t nxV, uint32_t nxA,
+			      uint32_t ldP, uint32_t ldV, uint32_t ldA,
+			      struct Params params)
+{
+
+  uint32_t id, i, j;
+
+  id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  i = id % nxP;
+  j = id / nxP;
+
+  if (j >= 1){
+    return;
+  }
+
+  d_subV[i] = d_subV[i] + 3600*d_subA[i];
+  d_subV[i + ldV] = d_subV[i + ldV] + 3600*d_subA[i + ldA];
+
+  d_subP[i] = d_subP[i] + 3600*d_subV[i];
+  d_subP[i + ldP] = d_subP[i + ldP] + 3600*d_subV[i + ldV];
+}
+
+
+extern "C" void gpu_nbody2(void * descr[], void *args)
+{
+  double *d_subP, *d_subV, *d_subA;
+  uint32_t nxP, nxV, nxA;
+  uint32_t ldP, ldV, ldA;
+  uint32_t nblocks;
+
+  struct Params *params = (struct Params *) args;
+
+  d_subP = (double *) STARPU_MATRIX_GET_PTR(descr[0]);
+  d_subV = (double *) STARPU_MATRIX_GET_PTR(descr[1]);
+  d_subA = (double *) STARPU_MATRIX_GET_PTR(descr[2]);
+
+  nxP = STARPU_MATRIX_GET_NX(descr[0]);
+  nxV = STARPU_MATRIX_GET_NX(descr[1]);
+  nxA = STARPU_MATRIX_GET_NX(descr[2]);
+
+  ldP = STARPU_MATRIX_GET_LD(descr[0]);
+  ldV = STARPU_MATRIX_GET_LD(descr[1]);
+  ldA = STARPU_MATRIX_GET_LD(descr[2]);
+
+  nblocks = (nxA + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+
+  gpuNbody2Kernel
+    <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
+    >>> (d_subP, d_subV, d_subA, nxP, nxV, nxA, ldP, ldV, ldA, *params);
+
+  cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 151 - 0
julia/tst/nbody/gpu_nbody_between.cu

@@ -0,0 +1,151 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <starpu.h>
+
+#define THREADS_PER_BLOCK 64
+
+static inline long long jlstarpu_max(long long a, long long b)
+{
+	return (a > b) ? a : b;
+}
+
+static inline long long jlstarpu_interval_size(long long start, long long step, long long stop)
+{
+    if (stop >= start){
+            return jlstarpu_max(0, (stop - start + 1) / step);
+    } else {
+            return jlstarpu_max(0, (stop - start - 1) / step);
+    }
+}
+
+
+__device__ static inline long long jlstarpu_max__device(long long a, long long b)
+{
+	return (a > b) ? a : b;
+}
+
+__device__ static inline long long jlstarpu_interval_size__device(long long start, long long step, long long stop)
+{
+	if (stop >= start){
+		return jlstarpu_max__device(0, (stop - start + 1) / step);
+	} else {
+		return jlstarpu_max__device(0, (stop - start - 1) / step);
+	}
+}
+
+
+__global__ void nbody_acc(int64_t kernel_ids__start_1, int64_t kernel_ids__step_1, int64_t kernel_ids__dim_1, int64_t widthp, 
+                          double* ptr_OhLp7E87, int64_t* ptr_DvYAWLG1, int64_t widtha, double* ptr_Xi5IjQJ9, 
+                          uint32_t ld_Xi5IjQJ9, double* ptr_t4YHT0eY, double* ptr_mfUSUHkf, uint32_t ld_mfUSUHkf)
+{
+    int64_t THREAD_ID = (int64_t) ((((blockIdx).x) * ((blockDim).x)) + ((threadIdx).x));
+    
+    if ((THREAD_ID) >= ((1) * (kernel_ids__dim_1)))
+    {
+        return ;
+    };
+    int64_t kernel_ids__index_1 = (int64_t) (((THREAD_ID) / (1)) % (kernel_ids__dim_1));
+    int64_t plan = (int64_t) ((kernel_ids__start_1) + ((kernel_ids__index_1) * (kernel_ids__step_1)));
+    double sumaccx = (double) (0);
+    double sumaccy = (double) (0);
+    
+    int64_t start_TzfU6QY7 = (int64_t) (1);
+    int64_t stop_TzfU6QY7 = (int64_t) (widthp);
+    int64_t oplan;
+
+    for (oplan = start_TzfU6QY7 ; oplan <= stop_TzfU6QY7 ; oplan += 1)
+    {
+        double eps = (double) (ptr_OhLp7E87[(3) - (1)]);
+        int64_t Id = (int64_t) ((ptr_DvYAWLG1[(1) - (1)]) * (widtha));
+        double G = (double) (ptr_OhLp7E87[(1) - (1)]);
+        int64_t b = (int64_t) ((((plan) + (Id)) >= (oplan)) + (((plan) + (Id)) <= (oplan)));
+        
+        if ((b) < (2))
+        {
+            double dx = (double) ((ptr_Xi5IjQJ9[((1) + (((oplan) - (1)) * (ld_Xi5IjQJ9))) - (1)]) - (ptr_Xi5IjQJ9[((1) + ((((plan) + (Id)) - (1)) * (ld_Xi5IjQJ9))) - (1)]));
+            double dy = (double) ((ptr_Xi5IjQJ9[((2) + (((oplan) - (1)) * (ld_Xi5IjQJ9))) - (1)]) - (ptr_Xi5IjQJ9[((2) + ((((plan) + (Id)) - (1)) * (ld_Xi5IjQJ9))) - (1)]));
+            double modul = (double) (sqrt(((dx) * (dx)) + ((dy) * (dy))));
+            sumaccx = (sumaccx) + (((G) * (ptr_t4YHT0eY[(oplan) - (1)]) * (dx)) / (((modul) + (eps)) * ((modul) + (eps)) * ((modul) + (eps))));
+            sumaccy = (sumaccy) + (((G) * (ptr_t4YHT0eY[(oplan) - (1)]) * (dy)) / (((modul) + (eps)) * ((modul) + (eps)) * ((modul) + (eps))));
+        };
+    }
+    ;
+    ptr_mfUSUHkf[((1) + (((plan) - (1)) * (ld_mfUSUHkf))) - (1)] = sumaccx;
+    ptr_mfUSUHkf[((2) + (((plan) - (1)) * (ld_mfUSUHkf))) - (1)] = sumaccy;
+}
+
+
+
+extern "C" void CUDA_nbody_acc(void** buffers_qd9i9yfK, void* cl_arg_qd9i9yfK)
+{
+    uint32_t ld_Xi5IjQJ9 = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_qd9i9yfK[(1) - (1)]));
+    double* ptr_Xi5IjQJ9 = (double*) (STARPU_MATRIX_GET_PTR(buffers_qd9i9yfK[(1) - (1)]));
+    uint32_t ld_mfUSUHkf = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_qd9i9yfK[(2) - (1)]));
+    double* ptr_mfUSUHkf = (double*) (STARPU_MATRIX_GET_PTR(buffers_qd9i9yfK[(2) - (1)]));
+    double* ptr_t4YHT0eY = (double*) (STARPU_VECTOR_GET_PTR(buffers_qd9i9yfK[(3) - (1)]));
+    double* ptr_OhLp7E87 = (double*) (STARPU_VECTOR_GET_PTR(buffers_qd9i9yfK[(4) - (1)]));
+    int64_t* ptr_DvYAWLG1 = (int64_t*) (STARPU_VECTOR_GET_PTR(buffers_qd9i9yfK[(5) - (1)]));
+    int64_t widthp = (int64_t) (STARPU_MATRIX_GET_NY(buffers_qd9i9yfK[(1) - (1)]));
+    int64_t widtha = (int64_t) (STARPU_MATRIX_GET_NY(buffers_qd9i9yfK[(2) - (1)]));
+    int64_t kernel_ids__start_1 = (int64_t) (1);
+    int64_t kernel_ids__step_1 = (int64_t) (1);
+    int64_t kernel_ids__dim_1 = (int64_t) (jlstarpu_interval_size(kernel_ids__start_1, kernel_ids__step_1, widtha));
+    int64_t nthreads = (int64_t) ((1) * (kernel_ids__dim_1));
+    int64_t nblocks = (int64_t) ((((nthreads) + (THREADS_PER_BLOCK)) - (1)) / (THREADS_PER_BLOCK));
+    
+    nbody_acc
+        <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
+        >>> (kernel_ids__start_1, kernel_ids__step_1, kernel_ids__dim_1, widthp, 
+             ptr_OhLp7E87, ptr_DvYAWLG1, widtha, ptr_Xi5IjQJ9, 
+             ld_Xi5IjQJ9, ptr_t4YHT0eY, ptr_mfUSUHkf, ld_mfUSUHkf);
+    ;
+    cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+
+
+__global__ void nbody_updt(int64_t kernel_ids__start_1, int64_t kernel_ids__step_1, int64_t kernel_ids__dim_1, double* ptr_jJ5f8wMA, 
+                           uint32_t ld_jJ5f8wMA, double* ptr_piPvdbTs, uint32_t ld_piPvdbTs, double* ptr_JBaPgPiT, 
+                           double* ptr_0STm2S4k, uint32_t ld_0STm2S4k)
+{
+    int64_t THREAD_ID = (int64_t) ((((blockIdx).x) * ((blockDim).x)) + ((threadIdx).x));
+    
+    if ((THREAD_ID) >= ((1) * (kernel_ids__dim_1)))
+    {
+        return ;
+    };
+    int64_t kernel_ids__index_1 = (int64_t) (((THREAD_ID) / (1)) % (kernel_ids__dim_1));
+    int64_t i = (int64_t) ((kernel_ids__start_1) + ((kernel_ids__index_1) * (kernel_ids__step_1)));
+    ptr_jJ5f8wMA[((1) + (((i) - (1)) * (ld_jJ5f8wMA))) - (1)] = (ptr_jJ5f8wMA[((1) + (((i) - (1)) * (ld_jJ5f8wMA))) - (1)]) + ((ptr_piPvdbTs[((1) + (((i) - (1)) * (ld_piPvdbTs))) - (1)]) * (ptr_JBaPgPiT[(2) - (1)]));
+    ptr_jJ5f8wMA[((2) + (((i) - (1)) * (ld_jJ5f8wMA))) - (1)] = (ptr_jJ5f8wMA[((2) + (((i) - (1)) * (ld_jJ5f8wMA))) - (1)]) + ((ptr_piPvdbTs[((2) + (((i) - (1)) * (ld_piPvdbTs))) - (1)]) * (ptr_JBaPgPiT[(2) - (1)]));
+    ptr_0STm2S4k[((1) + (((i) - (1)) * (ld_0STm2S4k))) - (1)] = (ptr_0STm2S4k[((1) + (((i) - (1)) * (ld_0STm2S4k))) - (1)]) + ((ptr_jJ5f8wMA[((1) + (((i) - (1)) * (ld_jJ5f8wMA))) - (1)]) * (ptr_JBaPgPiT[(2) - (1)]));
+    ptr_0STm2S4k[((2) + (((i) - (1)) * (ld_0STm2S4k))) - (1)] = (ptr_0STm2S4k[((2) + (((i) - (1)) * (ld_0STm2S4k))) - (1)]) + ((ptr_jJ5f8wMA[((2) + (((i) - (1)) * (ld_jJ5f8wMA))) - (1)]) * (ptr_JBaPgPiT[(2) - (1)]));
+}
+
+
+
+extern "C" void CUDA_nbody_updt(void** buffers_gj6UYWT4, void* cl_arg_gj6UYWT4)
+{
+    uint32_t ld_0STm2S4k = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_gj6UYWT4[(1) - (1)]));
+    double* ptr_0STm2S4k = (double*) (STARPU_MATRIX_GET_PTR(buffers_gj6UYWT4[(1) - (1)]));
+    uint32_t ld_jJ5f8wMA = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_gj6UYWT4[(2) - (1)]));
+    double* ptr_jJ5f8wMA = (double*) (STARPU_MATRIX_GET_PTR(buffers_gj6UYWT4[(2) - (1)]));
+    uint32_t ld_piPvdbTs = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_gj6UYWT4[(3) - (1)]));
+    double* ptr_piPvdbTs = (double*) (STARPU_MATRIX_GET_PTR(buffers_gj6UYWT4[(3) - (1)]));
+    double* ptr_JBaPgPiT = (double*) (STARPU_VECTOR_GET_PTR(buffers_gj6UYWT4[(4) - (1)]));
+    int64_t widthp = (int64_t) (STARPU_MATRIX_GET_NY(buffers_gj6UYWT4[(1) - (1)]));
+    int64_t kernel_ids__start_1 = (int64_t) (1);
+    int64_t kernel_ids__step_1 = (int64_t) (1);
+    int64_t kernel_ids__dim_1 = (int64_t) (jlstarpu_interval_size(kernel_ids__start_1, kernel_ids__step_1, widthp));
+    int64_t nthreads = (int64_t) ((1) * (kernel_ids__dim_1));
+    int64_t nblocks = (int64_t) ((((nthreads) + (THREADS_PER_BLOCK)) - (1)) / (THREADS_PER_BLOCK));
+    
+    nbody_updt
+        <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
+        >>> (kernel_ids__start_1, kernel_ids__step_1, kernel_ids__dim_1, ptr_jJ5f8wMA, 
+             ld_jJ5f8wMA, ptr_piPvdbTs, ld_piPvdbTs, ptr_JBaPgPiT, 
+             ptr_0STm2S4k, ld_0STm2S4k);
+    ;
+    cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+
+

+ 296 - 0
julia/tst/nbody/nbody.c

@@ -0,0 +1,296 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <starpu.h>
+#include <math.h>
+#include "../includes/display.h"
+#include "../includes/sorting.h"
+
+struct Param {
+	unsigned taskx;
+	double epsilon;
+};
+
+void cpu_nbody(void **, void *);
+void cpu_nbody2(void **, void *);
+
+void gpu_nbody(void **, void *);
+void gpu_nbody2(void **, void *);
+
+static struct starpu_perfmodel model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "history_perf"
+};
+
+static struct starpu_codelet cl =
+{
+	.cpu_funcs = {cpu_nbody},
+	.cuda_funcs = {gpu_nbody},
+	.nbuffers = 3,
+	.modes = {STARPU_RW, STARPU_RW, STARPU_RW},
+	.model = &model
+};
+
+
+static struct starpu_codelet cl2 =
+{
+	.cpu_funcs = {cpu_nbody2},
+	.cuda_funcs = {gpu_nbody2},
+	.nbuffers = 3,
+	.modes = {STARPU_RW, STARPU_RW, STARPU_R},
+	.model = &model
+};
+
+void nbody_with_starpu(double *positions, double *velocities, double *accelerations, double *masses, unsigned nbr_planets, unsigned nbr_simulations, unsigned nslices)
+{
+
+
+
+	starpu_data_handle_t P_handle, V_handle, A_handle, M_handle;
+
+       
+	starpu_matrix_data_register(&P_handle, STARPU_MAIN_RAM, (uintptr_t)positions, nbr_planets, nbr_planets, 2, sizeof(double));
+	starpu_matrix_data_register(&V_handle, STARPU_MAIN_RAM, (uintptr_t)velocities, nbr_planets, nbr_planets, 2, sizeof(double));
+	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)accelerations, nbr_planets, nbr_planets, 2, sizeof(double));
+	starpu_matrix_data_register(&M_handle, STARPU_MAIN_RAM, (uintptr_t)masses, nbr_planets, nbr_planets, 1, sizeof(double));
+	
+	struct starpu_data_filter horiz = 
+		{
+			.filter_func = starpu_matrix_filter_block,
+			.nchildren = nslices
+		};
+	
+	unsigned i;
+		
+	for (i = 0; i < nbr_simulations; i++){
+		
+		/* printf("Simulation %d: \n", i); */
+		
+		starpu_data_partition(A_handle, &horiz);
+		
+		struct Param *params = malloc(nslices * sizeof(struct Param));
+		
+		
+		unsigned epsilon = 2.5e8;
+		
+		unsigned task1, task2;
+		
+		
+		
+		for (task1 = 0; task1 < nslices; task1++){
+			
+			struct starpu_task *task = starpu_task_create();
+			
+			struct Param param = {task1, epsilon};
+			
+			params[task1] = param;
+			
+			task->cl = &cl;
+			task->handles[0] = P_handle;
+			task->handles[1] = starpu_data_get_sub_data(A_handle, 1, task1);
+			task->handles[2] = M_handle;
+			
+			task->cl_arg = &(params[task1]);
+			task->cl_arg_size = sizeof(struct Param);
+			
+			starpu_task_submit(task);
+		}
+		starpu_task_wait_for_all();
+
+////////////////////////
+
+		starpu_data_partition(P_handle, &horiz);
+		starpu_data_partition(V_handle, &horiz);
+		
+		for (task2 = 0; task2 < nslices; task2++){
+			
+			struct starpu_task *task = starpu_task_create();
+			
+			struct Param param = {task1, epsilon};
+			
+			params[task2] = param;
+			
+			task->cl = &cl2;
+			task->handles[0] = starpu_data_get_sub_data(P_handle, 1, task2);
+			task->handles[1] = starpu_data_get_sub_data(V_handle, 1, task2);
+			task->handles[2] = starpu_data_get_sub_data(A_handle, 1, task2);
+			
+			task->cl_arg = &(params[task2]);
+			task->cl_arg_size = sizeof(struct Param);
+			
+			starpu_task_submit(task);
+		}
+		
+		starpu_task_wait_for_all();
+		
+		starpu_data_unpartition(P_handle, STARPU_MAIN_RAM);
+		starpu_data_unpartition(V_handle, STARPU_MAIN_RAM);
+		starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
+
+		/* char filename[41]; */
+	
+		/* sprintf(filename, "../PPM/nbody%d_%d.ppm", nbr_planets, i + 1); */
+	
+		/* nbody_graph(filename, positions, nbr_planets, 1000, 1000, -4e8, 4e8); */
+
+	}
+	
+		starpu_data_unregister(P_handle);
+		starpu_data_unregister(V_handle);
+		starpu_data_unregister(A_handle);
+		starpu_data_unregister(M_handle);
+
+
+}
+
+void init_positions(double *positions, unsigned nbr_planets)
+{
+	unsigned i;
+	double qiX, qiY;
+	for (i = 0; i < nbr_planets; i++){
+		double angle = ((RAND_MAX - rand()) / (double) (RAND_MAX)) * 2.0 * M_PI;
+		double distToCenter = ((RAND_MAX - rand()) / (double) (RAND_MAX)) * 1.0e8 + 1.0e8;
+
+		qiX = cos(angle) * distToCenter;
+		qiY = sin(angle) * distToCenter;
+
+		positions[i] = qiX;
+		positions[i + nbr_planets] = qiY;
+	}
+}
+
+void init_velocities(double *positions, double *velocities, unsigned nbr_planets)
+{
+	unsigned i;
+	for (i = 0; i < nbr_planets; i++){
+		double viX = positions[i + nbr_planets] * 4.0e-6;
+		double viY = -positions[i] * 4.0e-6;
+		velocities[i] = viX;
+		velocities[i + nbr_planets] = viY;
+	}
+}
+
+void init_masses(double *masses, unsigned nbr_planets)
+{
+	unsigned i;
+	for (i = 0; i < nbr_planets; i++){
+		double mi = (rand() / (double) RAND_MAX) * 5e25;
+		masses[i] = mi;
+	}
+}
+
+
+	
+
+double median_times(unsigned nbr_planets, unsigned nbr_simulations, unsigned nslices, unsigned nbr_tests)
+{
+	double exec_times[nbr_tests];
+
+	double *positions = malloc(nbr_planets * 2 * sizeof(double));
+	double *velocities = malloc(nbr_planets * 2 * sizeof(double));
+	double *accelerations = calloc(nbr_planets * 2, sizeof(double));
+	double *masses = malloc(nbr_planets * sizeof(double));
+
+	init_positions(positions, nbr_planets);
+	init_velocities(positions, velocities, nbr_planets);
+	init_masses(masses, nbr_planets);
+
+
+	/* unsigned i; */
+
+	/* for (i = 0; i < nbr_planets; i++){ */
+	/* 	accelerations[i] = i; */
+	/* } */
+
+	
+	
+	/* nbody_graph("PPM/bug0.ppm", positions, nbr_planets, 1000, 1000, -4e8, 4e8); */
+
+	/* unsigned k; */
+       
+	/* for (k = 1; k <= nbr_simulations; k++){ */
+	
+	double start, stop, exec_t;
+	unsigned i;
+	
+	for (i = 0; i < nbr_tests; i++){
+
+		start = starpu_timing_now();
+
+		nbody_with_starpu(positions, velocities, accelerations, masses, nbr_planets, nbr_simulations, nslices);
+		
+		stop = starpu_timing_now();
+
+		exec_t = (stop - start) / 1.e6;
+
+		exec_times[i] = exec_t;
+	}
+	/* printf("\n\nSIMULATION %d:\n\n", k); */
+	
+	
+	/* char filename[23]; */
+	
+	/* sprintf(filename, "PPM/bug%d.ppm", k); */
+	
+	/* nbody_graph(filename, positions, nbr_planets, 1000, 1000, -4e8, 4e8); */
+	/* } */
+	
+	free(positions);
+	free(velocities);
+	free(accelerations);
+	free(masses);
+
+	quicksort(exec_times, 0, nbr_tests - 1);
+
+	return exec_times[nbr_tests / 2];
+}
+
+
+
+void display_times(unsigned start_nbr, unsigned step_nbr, unsigned stop_nbr, unsigned nbr_simulations, unsigned nslices, unsigned nbr_tests)
+{
+
+	FILE *myfile;
+	
+	myfile = fopen("DAT/nbody_c_struct_times.dat", "w");
+
+	unsigned nbr_planets;
+	
+	for (nbr_planets = start_nbr; nbr_planets <= stop_nbr; nbr_planets += step_nbr){
+		double t = median_times(nbr_planets, nbr_simulations, nslices, nbr_tests);
+		printf("STRUCT: %u planets: %f seconds\n", nbr_planets, t);
+		fprintf(myfile, "%f\n", t);
+	}
+	fclose(myfile);
+}
+
+int main(int argc, char * argv[])
+{
+
+	if (argc != 7){
+		printf("Usage: %s start_nbr step_nbr stop_nbr nbr_simulations nslices nbr_tests\n", argv[0]);
+		return 1;
+	}
+
+
+	if (starpu_init(NULL) != EXIT_SUCCESS){
+		fprintf(stderr, "ERROR\n");
+		return 77;
+	}
+
+
+	unsigned start_nbr = (unsigned) atoi(argv[1]);
+	unsigned step_nbr = (unsigned) atoi(argv[2]);
+	unsigned stop_nbr = (unsigned) atoi(argv[3]);
+	unsigned nbr_simulations = (unsigned) atoi(argv[4]);
+	unsigned nslices = (unsigned) atoi(argv[5]);
+	unsigned nbr_tests = (unsigned) atoi(argv[6]);
+
+	srand(time(NULL));
+	
+	display_times(start_nbr, step_nbr, stop_nbr, nbr_simulations, nslices, nbr_tests);
+
+	starpu_shutdown();
+	
+	return 0;
+}

+ 40 - 0
julia/tst/nbody/nbody.jl

@@ -0,0 +1,40 @@
+include("nbody_display.jl")
+function mod2(v ::Vector{Float64})
+    return sqrt(v[1]^2 + v[2]^2)
+end
+
+function compute_accelerations(positions ::Vector{Vector{Float64}}, accelerations ::Vector{Vector{Float64}}, masses ::Vector{Float64}, eps ::Float64)
+    G ::Float64 = 6.67E-11
+    n ::Int64 = length(accelerations)
+    for i = 1:n
+        sumacc ::Vector{Float64} = [0,0]
+        for j = 1:n
+            if i != j
+                dv ::Vector{Float64} = positions[j] - positions[i]
+                
+                sumacc = sumacc + G * masses[j] * dv / (mod2(dv) + eps)^3
+            end
+        end
+        accelerations[i] = sumacc
+    end    
+end
+
+function update_pos_vel(positions ::Vector{Vector{Float64}}, velocities ::Vector{Vector{Float64}}, accelerations ::Vector{Vector{Float64}}, dt ::Float64)
+    n ::Int64 = length(positions)
+    for i = 1:n
+        velocities[i] = velocities[i] + accelerations[i] * dt
+        positions[i] = positions[i] + velocities[i] * dt
+    end
+end
+
+
+
+
+function nbody_jl(positions ::Vector{Vector{Float64}}, velocities ::Vector{Vector{Float64}}, accelerations ::Vector{Vector{Float64}}, masses ::Vector{Float64}, nbr_simulations ::Int64, eps ::Float64, dt ::Float64)
+    for i = 1:nbr_simulations
+        compute_accelerations(positions, accelerations, masses, eps)
+        update_pos_vel(positions, velocities, accelerations,dt)
+        # pixels ::Array{Array{Int64}} = [[0,0,0] for i = 1:1000*1000]
+        # graph_planets(pixels, positions, -4E8, 4E8, 1000, 1000, "nbody$i.ppm")
+    end
+end

+ 363 - 0
julia/tst/nbody/nbody_between.c

@@ -0,0 +1,363 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <starpu.h>
+#include <math.h>
+#include <stdint.h>
+#include "../includes/display.h"
+#include "../includes/sorting.h"
+
+struct Param {
+	unsigned taskx;
+	double epsilon;
+};
+
+void nbody_acc(void **, void *);
+void nbody_updt(void **, void *);
+
+void CUDA_nbody_acc(void **, void *);
+void CUDA_nbody_updt(void **, void *);
+
+static struct starpu_perfmodel model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "history_perf"
+};
+
+static struct starpu_codelet cl =
+{
+	.cpu_funcs = {nbody_acc},
+	.cuda_funcs = {CUDA_nbody_acc},
+	//STRUCT PARAM
+	/* .nbuffers = 3, */
+	/* .modes = {STARPU_RW, STARPU_RW, STARPU_RW}, */
+	//
+	
+	//ARRAY PARAM
+	.nbuffers = 5,
+	.modes = {STARPU_RW, STARPU_RW, STARPU_RW, STARPU_RW, STARPU_RW},
+	//
+	.model = &model
+};
+
+
+static struct starpu_codelet cl2 =
+{
+	.cpu_funcs = {nbody_updt},
+	.cuda_funcs = {CUDA_nbody_updt},
+	//STRUCT PARAM
+	/* .nbuffers = 3, */
+	/* .modes = {STARPU_RW, STARPU_RW, STARPU_R}, */
+	//
+	
+	//ARRAY PARAM
+	.nbuffers = 4,
+	.modes = {STARPU_RW, STARPU_RW, STARPU_R, STARPU_R},
+	//
+	.model = &model
+};
+
+void nbody_with_starpu(double *positions, double *velocities, double *accelerations, double *masses, double *parameters, unsigned nbr_planets, unsigned nbr_simulations, unsigned nslices)
+{
+
+
+
+	starpu_data_handle_t P_handle, V_handle, A_handle, M_handle, Par_handle, Id_handle;
+
+       
+	starpu_matrix_data_register(&P_handle, STARPU_MAIN_RAM, (uintptr_t)positions, 2, 2, nbr_planets, sizeof(double));
+	starpu_matrix_data_register(&V_handle, STARPU_MAIN_RAM, (uintptr_t)velocities, 2, 2, nbr_planets, sizeof(double));
+	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)accelerations, 2, 2, nbr_planets, sizeof(double));
+	starpu_vector_data_register(&M_handle, STARPU_MAIN_RAM, (uintptr_t)masses, nbr_planets, sizeof(double));
+	starpu_vector_data_register(&Par_handle, STARPU_MAIN_RAM, (uintptr_t)parameters, 3, sizeof(double));
+	struct starpu_data_filter vert = 
+		{
+			.filter_func = starpu_matrix_filter_vertical_block,
+			.nchildren = nslices
+		};
+	
+	unsigned i;
+
+
+	int64_t *Id = malloc(nslices * sizeof(int64_t));
+		
+	for (i = 0; i < nbr_simulations; i++){
+		
+		/* printf("Simulation %d: \n", i); */
+		
+		starpu_data_partition(A_handle, &vert);
+
+		//STRUCT PARAM
+		/* struct Param *params = malloc(nslices * sizeof(struct Param)); */
+		
+		
+		/* unsigned epsilon = 2.5e8; */
+		//
+
+		int64_t task1, task2;
+		
+		
+		for (task1 = 0; task1 < nslices; task1++){
+			
+			
+			struct starpu_task *task = starpu_task_create();
+			
+			Id[task1] = task1;
+			starpu_vector_data_register(&Id_handle, STARPU_MAIN_RAM, (uintptr_t)&(Id[task1]), 1, sizeof(int64_t));
+			//STRUCT PARAM
+			/* struct Param param = {task1, epsilon}; */
+			
+			/* params[task1] = param; */
+			//
+			
+			task->cl = &cl;
+			task->handles[0] = P_handle;
+			task->handles[1] = starpu_data_get_sub_data(A_handle, 1, task1);
+			task->handles[2] = M_handle;
+			task->handles[3] = Par_handle;
+			task->handles[4] = Id_handle;
+			//STRUCT PARAM
+			/* task->cl_arg = &(params[task1]); */
+			/* task->cl_arg_size = sizeof(struct Param); */
+			//
+			
+			starpu_task_submit(task);
+			starpu_data_unregister_submit(Id_handle);
+		}
+		starpu_task_wait_for_all();
+
+////////////////////////
+
+		starpu_data_partition(P_handle, &vert);
+		starpu_data_partition(V_handle, &vert);
+		
+		for (task2 = 0; task2 < nslices; task2++){
+			
+			struct starpu_task *task = starpu_task_create();
+
+			//STRUCT PARAM
+			/* struct Param param = {task1, epsilon}; */
+			
+			/* params[task2] = param; */
+			//
+
+			task->cl = &cl2;
+			task->handles[0] = starpu_data_get_sub_data(P_handle, 1, task2);
+			task->handles[1] = starpu_data_get_sub_data(V_handle, 1, task2);
+			task->handles[2] = starpu_data_get_sub_data(A_handle, 1, task2);
+			task->handles[3] = Par_handle;
+			//STRUCT PARAM
+			/* task->cl_arg = &(params[task2]); */
+			/* task->cl_arg_size = sizeof(struct Param); */
+			//
+			starpu_task_submit(task);
+		}
+		
+		starpu_task_wait_for_all();
+		
+		starpu_data_unpartition(P_handle, STARPU_MAIN_RAM);
+		starpu_data_unpartition(V_handle, STARPU_MAIN_RAM);
+		starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
+		
+		/* char filename[38]; */
+	
+		/* sprintf(filename, "PPM/nbody%d_%d.ppm", nbr_planets, i + 1); */
+	
+		/* nbody_graph_transpose(filename, positions, nbr_planets, 1000, 1000, -4e8, 4e8); */
+
+	}
+
+	
+	starpu_data_unregister(P_handle);
+	starpu_data_unregister(V_handle);
+	starpu_data_unregister(A_handle);
+	starpu_data_unregister(M_handle);
+	starpu_data_unregister(Par_handle);
+
+		/* char filename[36]; */
+	
+		/* sprintf(filename, "PPM/bug%d_%d.ppm", nbr_planets, i); */
+	
+		/* nbody_graph(filename, positions, nbr_planets, 1000, 1000, -4e8, 4e8); */
+
+}
+
+void init_positions(double *positions, unsigned nbr_planets)
+{
+	unsigned i;
+	double qiX, qiY;
+	for (i = 0; i < nbr_planets; i++){
+		double angle = ((RAND_MAX - rand()) / (double) (RAND_MAX)) * 2.0 * M_PI;
+		double distToCenter = ((RAND_MAX - rand()) / (double) (RAND_MAX)) * 1.0e8 + 1.0e8;
+
+		qiX = cos(angle) * distToCenter;
+		qiY = sin(angle) * distToCenter;
+
+		positions[2*i] = qiX;
+		positions[2*i + 1] = qiY;
+
+
+	       
+	}
+}
+
+void init_velocities(double *positions, double *velocities, unsigned nbr_planets)
+{
+	unsigned i;
+	for (i = 0; i < nbr_planets; i++){
+		double viX = positions[2*i+1] * 4.0e-6;
+		double viY = -positions[2*i] * 4.0e-6;
+		velocities[2*i] = viX;
+		velocities[2*i + 1] = viY;
+		
+	}
+}
+
+void init_masses(double *masses, unsigned nbr_planets)
+{
+	unsigned i;
+	for (i = 0; i < nbr_planets; i++){
+		double mi = (rand() / (double) RAND_MAX) * 1e22;
+		masses[i] = mi;
+	}
+}
+
+
+	
+
+double median_times(unsigned nbr_planets, unsigned nbr_simulations, unsigned nslices, unsigned nbr_tests)
+{
+	double exec_times[nbr_tests];
+
+	/* double *positions = malloc(4 * sizeof(double)); */
+	/* double *velocities = calloc(4, sizeof(double)); */
+	/* double *accelerations = calloc(4, sizeof(double)); */
+	double *positions = malloc(nbr_planets * 2 * sizeof(double));
+	double *velocities = malloc(nbr_planets * 2 * sizeof(double));
+	double *accelerations = calloc(nbr_planets * 2, sizeof(double));
+	double *masses = malloc(nbr_planets * sizeof(double));
+	double *parameters = malloc(3 * sizeof(double));
+
+	double G = 6.67408e-11;
+	/* double dt = 36000; */
+	double dt = 3600;
+	double epsilon = 2.5e8;
+
+	parameters[0] = G;
+	parameters[1] = dt;
+	parameters[2] = epsilon;
+
+	init_positions(positions, nbr_planets);
+	init_velocities(positions, velocities, nbr_planets);
+	init_masses(masses, nbr_planets);
+	/* positions[0] = 0; */
+	/* positions[1] = 300000; */
+	/* positions[2] = 600000; */
+	/* positions[3] = 300000; */
+	/* masses[0] = 5.9e24; */
+	/* masses[1] = 5.9e24; */
+
+	/* unsigned i; */
+
+	/* for (i = 0; i < nbr_planets; i++){ */
+	/* 	accelerations[i] = i; */
+	/* } */
+
+	
+	
+	/* nbody_graph("PPM/bug0.ppm", positions, nbr_planets, 1000, 1000, -4e8, 4e8); */
+
+	/* unsigned k; */
+       
+	/* for (k = 1; k <= nbr_simulations; k++){ */
+	
+	double start, stop, exec_t;
+	unsigned i;
+
+
+	
+	for (i = 0; i < nbr_tests; i++){
+
+		start = starpu_timing_now();
+
+		nbody_with_starpu(positions, velocities, accelerations, masses, parameters, nbr_planets, nbr_simulations, nslices);
+		
+		stop = starpu_timing_now();
+
+		exec_t = (stop - start) / 1.e6;
+
+		exec_times[i] = exec_t;
+	}
+	/* printf("\n\nSIMULATION %d:\n\n", k); */
+	
+	
+	/* char filename[23]; */
+	
+	/* sprintf(filename, "PPM/bug%d.ppm", k); */
+	
+	/* nbody_graph(filename, positions, nbr_planets, 1000, 1000, -4e8, 4e8); */
+	/* } */
+
+	/* for (i = 0; i < nbr_planets; i++){ */
+	/* 	printf("%f %f\n", accelerations[2*i], accelerations[2*i + 1]); */
+	/* } */
+	     
+	
+	free(positions);
+	free(velocities);
+	free(accelerations);
+	free(masses);
+
+	quicksort(exec_times, 0, nbr_tests - 1);
+
+	return exec_times[nbr_tests / 2];
+}
+
+
+
+void display_times(unsigned start_nbr, unsigned step_nbr, unsigned stop_nbr, unsigned nbr_simulations, unsigned nslices, unsigned nbr_tests)
+{
+
+	FILE *myfile;
+	
+	myfile = fopen("DAT/nbody_c_array_times.dat", "w");
+
+	unsigned nbr_planets;
+	
+	for (nbr_planets = start_nbr; nbr_planets <= stop_nbr; nbr_planets += step_nbr){
+		double t = median_times(nbr_planets, nbr_simulations, nslices, nbr_tests);
+		printf("ARRAY: %u planets: %f seconds\n", nbr_planets, t);
+		fprintf(myfile, "%f\n", t);
+	}
+	fclose(myfile);
+}
+
+int main(int argc, char * argv[])
+{
+
+	if (argc != 7){
+		printf("Usage: %s start_nbr step_nbr stop_nbr nbr_simulations nslices nbr_tests\n", argv[0]);
+		return 1;
+	}
+
+
+	if (starpu_init(NULL) != EXIT_SUCCESS){
+		fprintf(stderr, "ERROR\n");
+		return 77;
+	}
+
+
+	unsigned start_nbr = (unsigned) atoi(argv[1]);
+	unsigned step_nbr = (unsigned) atoi(argv[2]);
+	unsigned stop_nbr = (unsigned) atoi(argv[3]);
+	unsigned nbr_simulations = (unsigned) atoi(argv[4]);
+	unsigned nslices = (unsigned) atoi(argv[5]);
+	unsigned nbr_tests = (unsigned) atoi(argv[6]);
+
+	srand(time(NULL));
+	
+	display_times(start_nbr, step_nbr, stop_nbr, nbr_simulations, nslices, nbr_tests);
+
+	starpu_shutdown();
+	
+	return 0;
+}

+ 366 - 0
julia/tst/nbody/nbody_def.jl

@@ -0,0 +1,366 @@
+#include("./nbody_display.jl")
+include("nbody.jl")
+function nbody_with_starpu(positions ::Matrix{Float64}, velocities ::Matrix{Float64}, accelerations ::Matrix{Float64}, masses ::Vector{Float64}, parameters ::Vector{Float64}, nbr_simulations ::Int64, nslices ::Int64, nbr_planets ::Int64)
+    
+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslices)
+
+    for i = 1:nbr_simulations
+ 
+        @starpu_block let
+            hPOS, hVEL, hACC, hPAR, hMA = starpu_data_register(positions,velocities,accelerations,parameters,masses)
+            
+            starpu_data_partition(hACC,vert)
+            @starpu_sync_tasks for task in (1:nslices)
+                @starpu_block let
+                    id = Int64[task-1]
+                    hID = starpu_data_register(id)
+                    @starpu_async_cl claccst(hPOS,hACC[task],hMA,hPAR,hID)
+                end
+            end
+            
+            starpu_data_partition(hPOS,vert)
+            starpu_data_partition(hVEL,vert)
+            
+            @starpu_sync_tasks for task in (1:nslices)
+                @starpu_async_cl clupdtst(hPOS[task],hVEL[task],hACC[task],hPAR)
+            end
+            
+        end
+        # pixels ::Array{Array{Int64}} = [[0,0,0] for i = 1:1000*1000]
+        # graph_planets(pixels, positions, -4E8, 4E8, 1000, 1000, "PPM/nbody_st$(nbr_planets)_$i.ppm")
+    end
+    return nothing
+end
+
+
+
+function nbody_with_starpu_cpu(positions ::Matrix{Float64}, velocities ::Matrix{Float64}, accelerations ::Matrix{Float64}, masses ::Vector{Float64}, parameters ::Vector{Float64}, nbr_simulations ::Int64, nslices ::Int64)
+
+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslices)
+ 
+    for i = 1:nbr_simulations
+
+        @starpu_block let
+            hPOS, hVEL, hACC, hPAR, hMA = starpu_data_register(positions,velocities,accelerations,parameters,masses)
+            
+            starpu_data_partition(hACC,vert)
+            @starpu_sync_tasks for task in (1:nslices)
+                id = Int64[task-1]
+                hID = starpu_data_register(id)
+                @starpu_async_cl clacccpu(hPOS,hACC[task],hMA,hPAR,hID)
+            end
+            
+            starpu_data_partition(hPOS,vert)
+            starpu_data_partition(hVEL,vert)
+            
+            @starpu_sync_tasks for task in (1:nslices)
+                @starpu_async_cl clupdtcpu(hPOS[task],hVEL[task],hACC[task],hPAR)
+            end
+            
+        end
+        
+        # pixels ::Array{Array{Int64}} = [[0,0,0] for i = 1:1000*1000]
+        # graph_planets(pixels, positions, -4E8, 4E8, 1000, 1000, "PPM/nbody_cpu$i.ppm")
+    end
+    return nothing
+end
+
+
+
+function nbody_with_starpu_gpu(positions ::Matrix{Float64}, velocities ::Matrix{Float64}, accelerations ::Matrix{Float64}, masses ::Vector{Float64}, parameters ::Vector{Float64}, nbr_simulations ::Int64, nslices ::Int64)
+
+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslices)
+
+    for i = 1:nbr_simulations
+
+        @starpu_block let
+            hPOS, hVEL, hACC, hPAR, hMA = starpu_data_register(positions,velocities,accelerations,parameters,masses)
+            
+            starpu_data_partition(hACC,vert)
+            @starpu_sync_tasks for task in (1:nslices)
+                id = Int64[task-1]
+                hID = starpu_data_register(id)
+                @starpu_async_cl claccgpu(hPOS,hACC[task],hMA,hPAR,hID)
+            end
+            
+            starpu_data_partition(hPOS,vert)
+            starpu_data_partition(hVEL,vert)
+            
+            @starpu_sync_tasks for task in (1:nslices)
+                @starpu_async_cl clupdtgpu(hPOS[task],hVEL[task],hACC[task],hPAR)
+            end
+        end       
+    
+        # pixels ::Array{Array{Int64}} = [[0,0,0] for i = 1:1000*1000]
+        # graph_planets(pixels, positions, -4E8, 4E8, 1000, 1000, "PPM/nbody_gpu$i.ppm")
+    end
+    return nothing
+end
+
+
+
+# function display_times(starting_nbr_planets::Int64, step_nbr ::Int64, nbr_steps ::Int64, nbr_simulations::Int64, nb_tests ::Int64, nslices::Int64)
+#     width = 1000
+#     height = 1000
+
+#     times_starpu ::Vector{Float64} = [0 for i = 1:nbr_steps+1]
+#     times_julia ::Vector{Float64} = [0 for i = 1:nbr_steps+1]
+    
+#     for k = 0:nbr_steps
+#         nbr_planets ::Int64 = starting_nbr_planets + k * step_nbr
+#         println("Number of planets: $nbr_planets")
+
+#         epsilon ::Float64 = 2.5E8
+#         dt ::Float64 = 36000
+#         G ::Float64 = 6.67408E-11
+#         parameters ::Vector{Float64} = [G,dt,epsilon]
+        
+#         maxcoord ::Int64 = 2E8
+#         mincoord ::Int64 = -2E8
+        
+#         positions ::Matrix{Float64} = zeros(2, nbr_planets)
+#         velocities ::Matrix{Float64} = zeros(2, nbr_planets)
+#         accelerations ::Matrix{Float64} = zeros(2,nbr_planets)
+        
+#         positionsjl ::Vector{Vector{Float64}} = [[0, 0] for i = 1:nbr_planets]
+#         velocitiesjl ::Vector{Vector{Float64}} = [[0, 0] for i = 1:nbr_planets]
+#         accelerationsjl ::Vector{Vector{Float64}} = [[0, 0] for i = 1:nbr_planets]
+        
+        
+#         masses ::Vector{Float64} = [0 for i = 1:nbr_planets]
+        
+        
+        
+
+#         for i = 1:nbr_planets
+#             mi ::Float64 = rand() * 5E22
+#             ri = mi * 2.5E-15
+            
+#             angle ::Float64 = rand() * 2 * pi
+#             distToCenter ::Float64 = rand() * 1.0E8 + 1.0E8
+            
+#             qix ::Float64 = cos(angle) * distToCenter
+#             qiy ::Float64 = sin(angle) * distToCenter
+            
+#             vix ::Float64 = qiy * 4.0E-6
+#             viy ::Float64 = -qix * 4.0E-6
+            
+#             masses[i] = mi
+        
+#             positions[1,i] = qix
+#             positions[2,i] = qiy
+            
+#             positionsjl[i] = [qix, qiy]
+            
+#             velocities[1,i] = vix
+#             velocities[2,i] = viy
+            
+#             velocitiesjl[i] = [vix, viy]
+#         end
+        
+#         max_value = 4E8
+#         min_value = -4E8
+        
+#         println("Starpu...")
+#         tic()
+#         for i = 1:nbr_simulations
+#             nbody_with_starpu(positions, velocities, accelerations, masses, parameters, nslices)
+#         end
+#         t_starpu = toq()
+        
+#         println("No Starpu...")
+#         tic()
+#         for i = 1:nbr_simulations
+#             nbody_jl(positionsjl, velocitiesjl, accelerationsjl, masses, epsilon, dt)
+#         end
+#         t_julia = toq()
+        
+        
+        
+#         times_starpu[k+1] = t_starpu
+#         times_julia[k+1] = t_julia
+#     end
+#     open("./DAT/nbody.dat", "w") do f
+#         for i = 0:nbr_steps
+#             write(f, "$(starting_nbr_planets + i*step_nbr)")
+#             write(f, " $(times_starpu[i+1])")
+#             write(f, " $(times_julia[i+1])\n")
+#         end
+#     end
+# end
+
+
+function set_to_zero(A ::Array{<:Real,2})
+    height,width = size(A)
+    for i = 1:height
+        for j = 1:width
+            A[i,j] = 0
+        end
+    end
+end
+
+function median_times(nbr_tests ::Int64, nbr_planets ::Int64, nbr_simulations ::Int64, nslices ::Int64)
+
+######################### INITIALIZATION #########################
+
+    width ::Int64 = 1000
+    height ::Int64 = 1000
+
+    epsilon ::Float64 = 2.5E8
+    dt ::Float64 = 3600
+    G ::Float64 = 6.67408E-11
+    parameters ::Vector{Float64} = [G,dt,epsilon]
+
+    # Coordinate interval for the final display screen.
+    maxcoord ::Int64 = 2E8
+    mincoord ::Int64 = -2E8
+
+    exec_times_st ::Vector{Float64} = [0 for i = 1:nbr_tests]
+    exec_times_cpu ::Vector{Float64} = [0 for i = 1:nbr_tests]
+    exec_times_gpu ::Vector{Float64} = [0 for i = 1:nbr_tests]
+    # exec_times_jl ::Vector{Float64} = [0 for i = 1:nbr_tests]
+
+    # Arrays used for each of the starpu-using functions. 
+    positions ::Matrix{Float64} = zeros(2, nbr_planets)
+    velocities ::Matrix{Float64} = zeros(2, nbr_planets)
+    accelerations_st ::Matrix{Float64} = zeros(2, nbr_planets)    
+    
+    # Arrays used for the naive julia function.
+    # positions_jl ::Vector{Vector{Float64}} = [[0, 0] for i = 1:nbr_planets]
+    # velocities_jl ::Vector{Vector{Float64}} = [[0, 0] for i = 1:nbr_planets]
+    # accelerations_jl ::Vector{Vector{Float64}} = [[0, 0] for i = 1:nbr_planets]
+    
+    
+    masses ::Vector{Float64} = [0 for i = 1:nbr_planets]
+    
+
+    for k = 1:nbr_tests
+        println("Test $k...")
+        # Initializing the starpu and naive julia arrays with the same values.
+        for i = 1:nbr_planets
+            mi ::Float64 = rand() * 5E22
+            ri = mi * 2.5E-15
+            
+            angle ::Float64 = rand() * 2 * pi
+            distToCenter ::Float64 = rand() * 1.0E8 + 1.0E8
+            
+            qix ::Float64 = cos(angle) * distToCenter
+            qiy ::Float64 = sin(angle) * distToCenter
+            
+            vix ::Float64 = qiy * 4.0E-6
+            viy ::Float64 = -qix * 4.0E-6
+            
+            masses[i] = mi
+            
+            positions[1,i] = qix
+            positions[2,i] = qiy
+            
+            #positions_jl[i] = [qix, qiy]
+            
+            velocities[1,i] = vix
+            velocities[2,i] = viy
+            
+            #velocities_jl[i] = [vix, viy]
+        end
+
+
+######################### SIMULATION #########################
+
+        # Using new arrays for the starpu functions, so we can keep in memory the initial values.
+        positions_st ::Matrix{Float64} = copy(positions)
+        velocities_st ::Matrix{Float64} = copy(velocities)
+        set_to_zero(accelerations_st)
+
+        tic()
+        nbody_with_starpu(positions_st, velocities_st, accelerations_st, masses, parameters, nbr_simulations, nslices, nbr_planets)
+        t_st = toq()
+
+
+
+        # positions_st = copy(positions)
+        # velocities_st = copy(velocities)
+        # set_to_zero(accelerations_st)
+                
+        # tic()
+        # nbody_with_starpu_cpu(positions_st, velocities_st, accelerations_st, masses, parameters, nbr_simulations, nslices)
+        # t_cpu = toq()
+        
+
+
+        # positions_st = copy(positions)
+        # velocities_st = copy(velocities)
+        # set_to_zero(accelerations_st)
+
+        # tic()
+        # nbody_with_starpu_gpu(positions_st, velocities_st, accelerations_st, masses, parameters, nbr_simulations, nslices)
+        # t_gpu = toq()
+
+        
+        #tic()
+        #nbody_jl(positions_jl, velocities_jl, accelerations_jl, masses, nbr_simulations, epsilon, dt)
+        #t_jl = toq()
+        
+        exec_times_st[k] = t_st
+        # exec_times_cpu[k] = t_cpu
+        # exec_times_gpu[k] = t_gpu
+        #exec_times_jl[k] = t_jl
+        
+    end
+
+    sort!(exec_times_st)
+    # sort!(exec_times_cpu)
+    # sort!(exec_times_gpu)
+    #sort!(exec_times_jl)
+
+    res ::Vector{Float64} = [exec_times_st[1 + div(nbr_tests-1,2)]]#, exec_times_cpu[1 + div(nbr_tests-1,2)], exec_times_gpu[1 + div(nbr_tests-1,2)]]#, exec_times_jl[1 + div(nbr_tests-1,2)]]
+    
+    return res
+end
+# Adds the median times of each function inside a .DAT file.
+function display_times(start_nbr ::Int64, step_nbr ::Int64, stop_nbr ::Int64, nbr_simulations ::Int64, nslices ::Int64, nbr_tests ::Int64)
+
+
+    # mtc = map( (x->parse(Float64,x)), open("DAT/nbody_c_times.dat") do f
+    #         readlines(f)
+    #         end)
+
+    mtjl = map( (x->parse(Float64,x)), open("../DAT/nbody_jl.dat") do f
+            readlines(f)
+            end)
+
+    mtcstr = map( (x->parse(Float64,x)), open("../DAT/nbody_c_struct_times.dat") do f
+                  readlines(f)
+                  end)
+
+    mtcarr = map( (x->parse(Float64,x)), open("../DAT/nbody_c_array_times.dat") do f
+                  readlines(f)
+                  end)
+
+
+    i = 1;
+
+    #open("./DAT/nbody_jl.dat", "w") do fjl
+    open("../DAT/nbody_jl_array_times.dat", "w") do ft
+        open("../DAT/nbody_speedups.dat", "w") do f
+            for nbr_planets in (start_nbr : step_nbr : stop_nbr)
+                println("$(nbr_planets) planets:")
+                mt ::Vector{Float64} = median_times(nbr_tests, nbr_planets, nbr_simulations, nslices)
+                println("C struct time: $(mtcstr[i])")
+                println("C array time: $(mtcarr[i])")
+                println("Starpujl time: $(mt[1])")
+                # println("CPUjl time: $(mt[2])")
+                # println("GPUjl time: $(mt[3])")
+                println("Julia time: $(mtjl[i])")
+                write(f, "$(nbr_planets) $(mtjl[i]/mt[1]) $(mtjl[i]/mtcstr[i]) $(mtjl[i]/mtcarr[i])\n") 
+                write(ft, "$(mt[1])\n")
+                #write(fjl, "$(mt[4])\n")
+                i = i + 1
+                
+            end
+        end
+    end
+    #end
+end
+
+
+        

+ 74 - 0
julia/tst/nbody/nbody_display.jl

@@ -0,0 +1,74 @@
+function get_planet_pixels(X ::Int64, Y ::Int64)
+    pix ::Array{Tuple{Int64,Int64}} = []
+    for i = X-1:X+1
+        for j = Y-3:Y+3
+            push!(pix,(i,j))
+        end
+    end
+    for i = Y-2:Y+2
+        push!(pix,(X-2,i))
+        push!(pix,(X+2,i))
+    end
+    for i = Y-1:Y+1
+        push!(pix,(X-3,i))
+        push!(pix,(X+3,i))
+    end
+    return pix
+end
+
+
+function is_inside(t ::Tuple{Int64,Int64}, width ::Int64, height ::Int64)
+    if (t[1] > 0 && t[1] <= width && t[2] > 0 && t[2] <= height)
+        return true
+    else
+        return false
+    end
+end
+
+function graph_planets(pixels ::Array{Array{Int64}}, positions ::Matrix{Float64}, min_value ::Float64, max_value ::Float64, width ::Int64, height ::Int64, file_name ::String)
+    n = size(positions)[2]
+    for i = 1:n
+        X ::Int64= round( ( (positions[1, i] - min_value) / (max_value - min_value) ) * (width - 1) ) + 1
+        Y ::Int64= round( (1 - ( (positions[2, i] - min_value) / (max_value - min_value) ) ) * (height - 1) ) + 1
+        pix ::Array{Tuple{Int64,Int64}} = get_planet_pixels(X,Y)
+        for pixel in pix
+            if is_inside(pixel, width, height)
+                pixels[(pixel[2] - 1)*width + pixel[1]] = [125, round(255*i/n)]
+            end
+        end
+    end
+
+    open(file_name,"w") do f
+        write(f, "P3\n$width $height\n255\n")
+        for he = 1:height
+            for wi = 1:width
+                write(f, "$(pixels[(he - 1)*width + wi][1]) 0 $(pixels[(he - 1)*width + wi][2]) ")
+            end
+            write(f,"\n")
+        end
+    end
+end
+
+# function graph_planets(pixels ::Array{Array{Int64}}, positions ::Vector{Vector{Float64}}, min_value ::Float64, max_value ::Float64, width ::Int64, height ::Int64, file_name ::String)
+#     n = length(positions)
+#     for i = 1:n
+#         X ::Int64= round( ( (positions[i][1] - min_value) / (max_value - min_value) ) * (width - 1) ) + 1
+#         Y ::Int64= round( (1 - ( (positions[i][2] - min_value) / (max_value - min_value) ) ) * (height - 1) ) + 1
+#         pix ::Array{Tuple{Int64,Int64}} = get_planet_pixels(X,Y)
+#         for pixel in pix
+#             if is_inside(pixel, width, height)
+#                 pixels[(pixel[2] - 1)*width + pixel[1]] = [125, round(255*i/n)]
+#             end
+#         end
+#     end
+
+#     open(file_name,"w") do f
+#         write(f, "P3\n$width $height\n255\n")
+#         for he = 1:height
+#             for wi = 1:width
+#                 write(f, "$(pixels[(he - 1)*width + wi][1]) 0 $(pixels[(he - 1)*width + wi][2]) ")
+#             end
+#             write(f,"\n")
+#         end
+#     end
+# end

+ 68 - 0
julia/tst/nbody/nbody_generated.jl

@@ -0,0 +1,68 @@
+if length(ARGS) != 6
+    println("Usage: julia nbody_generated.jl start_nbr step_nbr stop_nbr nbr_simulations nbr_slices nbr_tests")
+    quit()
+end
+
+if parse(Int64, ARGS[1]) % parse(Int64, ARGS[5]) != 0
+    println("The number of slices must divide the number of planets.")
+    quit()
+end
+
+include("../../src/Wrapper/Julia/starpu_include.jl")
+using StarPU
+
+@debugprint "starpu_init"
+starpu_init(extern_task_path = "../build/generated_tasks_nbody.so")
+
+perfmodel = StarpuPerfmodel(
+    perf_type = STARPU_HISTORY_BASED,
+    symbol = "history_perf"
+)
+
+# Normal starpu codelets
+claccst = StarpuCodelet(
+    cpu_func = "nbody_acc",
+    gpu_func = "CUDA_nbody_acc",
+    modes = [STARPU_R, STARPU_RW, STARPU_R, STARPU_R, STARPU_R],
+    perfmodel = perfmodel
+)
+
+clupdtst = StarpuCodelet(
+    cpu_func = "nbody_updt",
+    gpu_func = "CUDA_nbody_updt",
+    modes = [STARPU_RW, STARPU_RW, STARPU_R, STARPU_R],
+    perfmodel = perfmodel
+)
+
+# CPU_only codelets
+clacccpu = StarpuCodelet(
+    cpu_func = "nbody_acc",
+    modes = [STARPU_R, STARPU_RW, STARPU_R, STARPU_R, STARPU_R],
+    perfmodel = perfmodel
+)
+
+clupdtcpu = StarpuCodelet(
+    cpu_func = "nbody_updt",
+    modes = [STARPU_RW, STARPU_RW, STARPU_R,STARPU_R],
+    perfmodel = perfmodel
+)
+
+# GPU_only codelets
+claccgpu = StarpuCodelet(
+    gpu_func = "CUDA_nbody_acc",
+    modes = [STARPU_R, STARPU_RW, STARPU_R, STARPU_R, STARPU_R],
+    perfmodel = perfmodel
+)
+
+clupdtgpu = StarpuCodelet(
+    gpu_func = "CUDA_nbody_updt",
+    modes = [STARPU_RW, STARPU_RW, STARPU_R,STARPU_R],
+    perfmodel = perfmodel
+)
+
+include("nbody_def.jl")
+
+display_times(map((x -> parse(Int64, x)), ARGS)...)
+
+@debugprint "starpu_shutdown"
+starpu_shutdown()