лет назад: 6 · 8fb5d1fe16
--- a/julia/tst/black_scholes/black_scholes.c
+++ b/julia/tst/black_scholes/black_scholes.c
@@ -0,0 +1,226 @@
 
				+#include <stdlib.h>
			
 
				+#include <stdio.h>
			
 
				+#include <starpu.h>
			
 
				+#include <math.h>
			
 
				+#include "../includes/sorting.h"
			
 
				+
			
 
				+
			
 
				+
			
 
				+void cpu_black_scholes(void **, void *);
			
 
				+void gpu_black_scholes(void **, void *);
			
 
				+
			
 
				+static struct starpu_codelet cl =
			
 
				+{
			
 
				+	.cpu_funcs = {cpu_black_scholes},
			
 
				+	.cuda_funcs = {gpu_black_scholes},
			
 
				+	.nbuffers = 7,
			
 
				+	.modes = {STARPU_R, STARPU_R, STARPU_R, STARPU_R, STARPU_R, STARPU_W, STARPU_W}
			
 
				+};
			
 
				+
			
 
				+void black_scholes_with_starpu(double *S, double *K, double *R, double *T, double *sig, double *call_res, double *put_res, unsigned nbr_data, unsigned nslices)
			
 
				+{
			
 
				+	starpu_data_handle_t S_handle, K_handle, R_handle, T_handle, SIG_handle, CRES_handle, PRES_handle;
			
 
				+	
			
 
				+
			
 
				+	starpu_vector_data_register(&S_handle, STARPU_MAIN_RAM, (uintptr_t)S, nbr_data, sizeof(double));	
			
 
				+	starpu_vector_data_register(&K_handle, STARPU_MAIN_RAM, (uintptr_t)K, nbr_data, sizeof(double));
			
 
				+	starpu_vector_data_register(&R_handle, STARPU_MAIN_RAM, (uintptr_t)R, nbr_data, sizeof(double));
			
 
				+	starpu_vector_data_register(&T_handle, STARPU_MAIN_RAM, (uintptr_t)T, nbr_data, sizeof(double));
			
 
				+	starpu_vector_data_register(&SIG_handle, STARPU_MAIN_RAM, (uintptr_t)sig, nbr_data, sizeof(double));
			
 
				+	starpu_vector_data_register(&CRES_handle, STARPU_MAIN_RAM, (uintptr_t)call_res, nbr_data, sizeof(double));
			
 
				+	starpu_vector_data_register(&PRES_handle, STARPU_MAIN_RAM, (uintptr_t)put_res, nbr_data, sizeof(double));
			
 
				+
			
 
				+	struct starpu_data_filter f =
			
 
				+	{
			
 
				+		.filter_func = starpu_vector_filter_block,
			
 
				+		.nchildren = nslices
			
 
				+	};
			
 
				+	/* printf("%f %f\n", nslices, nbr_data); */
			
 
				+
			
 
				+	starpu_data_partition(S_handle, &f);
			
 
				+	starpu_data_partition(K_handle, &f);
			
 
				+	starpu_data_partition(R_handle, &f);
			
 
				+	starpu_data_partition(T_handle, &f);
			
 
				+	starpu_data_partition(SIG_handle, &f);
			
 
				+	starpu_data_partition(CRES_handle, &f);
			
 
				+	starpu_data_partition(PRES_handle, &f);
			
 
				+	
			
 
				+	unsigned taskid;
			
 
				+
			
 
				+	for (taskid = 0; taskid < nslices; taskid++){
			
 
				+
			
 
				+		struct starpu_task *task = starpu_task_create();
			
 
				+
			
 
				+		task->cl = &cl;
			
 
				+		task->handles[0] = starpu_data_get_sub_data(S_handle, 1, taskid);
			
 
				+		task->handles[1] = starpu_data_get_sub_data(K_handle, 1, taskid);
			
 
				+		task->handles[2] = starpu_data_get_sub_data(R_handle, 1, taskid);
			
 
				+		task->handles[3] = starpu_data_get_sub_data(T_handle, 1, taskid);
			
 
				+		task->handles[4] = starpu_data_get_sub_data(SIG_handle, 1, taskid);
			
 
				+		task->handles[5] = starpu_data_get_sub_data(CRES_handle, 1, taskid);
			
 
				+		task->handles[6] = starpu_data_get_sub_data(PRES_handle, 1, taskid);
			
 
				+		
			
 
				+		starpu_task_submit(task);
			
 
				+
			
 
				+	}
			
 
				+
			
 
				+	starpu_task_wait_for_all();
			
 
				+
			
 
				+	starpu_data_unpartition(S_handle, STARPU_MAIN_RAM);
			
 
				+	starpu_data_unpartition(K_handle, STARPU_MAIN_RAM);
			
 
				+	starpu_data_unpartition(R_handle, STARPU_MAIN_RAM);
			
 
				+	starpu_data_unpartition(T_handle, STARPU_MAIN_RAM);
			
 
				+	starpu_data_unpartition(SIG_handle, STARPU_MAIN_RAM);
			
 
				+	starpu_data_unpartition(CRES_handle, STARPU_MAIN_RAM);
			
 
				+	starpu_data_unpartition(PRES_handle, STARPU_MAIN_RAM);
			
 
				+
			
 
				+	starpu_data_unregister(S_handle);
			
 
				+	starpu_data_unregister(K_handle);
			
 
				+	starpu_data_unregister(R_handle);
			
 
				+	starpu_data_unregister(T_handle);
			
 
				+	starpu_data_unregister(SIG_handle);
			
 
				+	starpu_data_unregister(CRES_handle);
			
 
				+	starpu_data_unregister(PRES_handle);
			
 
				+	
			
 
				+
			
 
				+
			
 
				+}
			
 
				+
			
 
				+static void init_S(double *S, unsigned nbr_data)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < nbr_data; i++){
			
 
				+		S[i] = 100. * rand() / (double) RAND_MAX;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void init_K(double *K, unsigned nbr_data)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < nbr_data; i++){
			
 
				+		K[i] = 100. * rand() / (double) RAND_MAX;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void init_R(double *R, unsigned nbr_data)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < nbr_data; i++){
			
 
				+		R[i] = rand() / (double) RAND_MAX;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void init_T(double *T, unsigned nbr_data)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < nbr_data; i++){
			
 
				+		T[i] = 10. * rand() / (double) RAND_MAX;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void init_sig(double *sig, unsigned nbr_data)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < nbr_data; i++){
			
 
				+		sig[i] = 10. * rand() / (double) RAND_MAX;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+
			
 
				+double median_time(unsigned nbr_data, unsigned nslices, unsigned nbr_tests)
			
 
				+{
			
 
				+	double exec_times[nbr_tests];
			
 
				+	
			
 
				+	double *S = malloc(nbr_data * sizeof(double));
			
 
				+	double *K = malloc(nbr_data * sizeof(double));
			
 
				+	double *R = malloc(nbr_data * sizeof(double));
			
 
				+	double *T = malloc(nbr_data * sizeof(double));
			
 
				+	double *sig = malloc(nbr_data * sizeof(double));
			
 
				+
			
 
				+	double *call_res = calloc(nbr_data, sizeof(double));
			
 
				+	double *put_res = calloc(nbr_data, sizeof(double));
			
 
				+
			
 
				+	double start, stop;
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < nbr_tests; i++){
			
 
				+
			
 
				+		init_S(S,nbr_data);
			
 
				+		init_K(K,nbr_data);
			
 
				+		init_R(R,nbr_data);
			
 
				+		init_T(T,nbr_data);
			
 
				+		init_sig(sig,nbr_data);
			
 
				+
			
 
				+		/* S[0] = 100.; */
			
 
				+		/* K[0] = 100.; */
			
 
				+		/* R[0] = 0.05; */
			
 
				+		/* T[0] = 1.0; */
			
 
				+		/* sig[0] = 0.2; */
			
 
				+		
			
 
				+		start = starpu_timing_now();
			
 
				+		black_scholes_with_starpu(S, K, R, T, sig, call_res, put_res, nbr_data, nslices);
			
 
				+		stop = starpu_timing_now();
			
 
				+	
			
 
				+		exec_times[i] = (stop - start) / 1.e6;
			
 
				+	}
			
 
				+
			
 
				+	/* printf("%f %f\n", call_res[0], put_res[0]); */
			
 
				+
			
 
				+	free(S);
			
 
				+	free(K);
			
 
				+	free(R);
			
 
				+	free(T);
			
 
				+	free(sig);
			
 
				+	free(call_res);
			
 
				+	free(put_res);
			
 
				+
			
 
				+	quicksort(exec_times, 0, nbr_tests - 1);
			
 
				+
			
 
				+	
			
 
				+	return exec_times[nbr_tests/2];
			
 
				+}
			
 
				+	
			
 
				+
			
 
				+
			
 
				+void display_times(unsigned start_nbr, unsigned step_nbr, unsigned stop_nbr, unsigned nslices, unsigned nbr_tests)
			
 
				+{
			
 
				+	FILE *myfile;
			
 
				+
			
 
				+	myfile = fopen("DAT/black_scholes_c_times.dat", "w");
			
 
				+
			
 
				+	unsigned nbr_data;
			
 
				+
			
 
				+	for (nbr_data = start_nbr; nbr_data <= stop_nbr; nbr_data += step_nbr){
			
 
				+		double t = median_time(nbr_data, nslices, nbr_tests);
			
 
				+		printf("nbr_data:\n%u\nTime:\n%f\n", nbr_data, t);
			
 
				+		fprintf(myfile, "%f\n", t);
			
 
				+	}
			
 
				+	fclose(myfile);
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	if (argc != 6){
			
 
				+		printf("Usage: %s start_nbr step_nbr stop_nbr nslices nbr_tests\n", argv[0]);
			
 
				+		return 1;
			
 
				+	}
			
 
				+
			
 
				+	if (starpu_init(NULL) != EXIT_SUCCESS){
			
 
				+		fprintf(stderr, "ERROR\n");
			
 
				+		return 77;
			
 
				+	}
			
 
				+
			
 
				+	unsigned start_nbr = (unsigned) atoi(argv[1]);
			
 
				+	unsigned step_nbr = (unsigned) atoi(argv[2]);
			
 
				+	unsigned stop_nbr = (unsigned) atoi(argv[3]);
			
 
				+	unsigned nslices = (unsigned) atoi(argv[4]);
			
 
				+	unsigned nbr_tests = (unsigned) atoi(argv[5]);
			
 
				+
			
 
				+	srand(time(NULL));
			
 
				+
			
 
				+	display_times(start_nbr, step_nbr, stop_nbr, nslices, nbr_tests);
			
 
				+
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+		
			
--- a/julia/tst/black_scholes/black_scholes_def.jl
+++ b/julia/tst/black_scholes/black_scholes_def.jl
@@ -0,0 +1,81 @@
 
				+function black_scholes_starpu(data ::Matrix{Float64}, res ::Matrix{Float64}, nslices ::Int64)
			
 
				+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslices)
			
 
				+
			
 
				+    @starpu_block let
			
 
				+        dat_handle, res_handle = starpu_data_register(data, res)
			
 
				+
			
 
				+        starpu_data_partition(dat_handle, vert)
			
 
				+        starpu_data_partition(res_handle, vert)
			
 
				+        
			
 
				+        #Compute the price of call and put option in the res matrix
			
 
				+        @starpu_sync_tasks for task in (1:nslices)
			
 
				+            @starpu_async_cl cl(dat_handle[task], res_handle[task])
			
 
				+        end
			
 
				+    end
			
 
				+end
			
 
				+
			
 
				+
			
 
				+function init_data(data, data_nbr);
			
 
				+    for i in 1:data_nbr
			
 
				+        data[1,i] = rand(Float64) * 100
			
 
				+        data[2,i] = rand(Float64) * 100
			
 
				+        data[3,i] = rand(Float64)
			
 
				+        data[4,i] = rand(Float64) * 10
			
 
				+        data[5,i] = rand(Float64) * 10
			
 
				+    end
			
 
				+    return data
			
 
				+end
			
 
				+        
			
 
				+
			
 
				+
			
 
				+function median_times(data_nbr, nslices, nbr_tests)
			
 
				+
			
 
				+    data ::Matrix{Float64} = zeros(5, data_nbr)
			
 
				+    # data[1,1] = 100.0
			
 
				+    # data[2,1] = 100.0
			
 
				+    # data[3,1] = 0.05
			
 
				+    # data[4,1] = 1.0
			
 
				+    # data[5,1] = 0.2
			
 
				+
			
 
				+
			
 
				+    res ::Matrix{Float64} = zeros(2, data_nbr)
			
 
				+
			
 
				+    exec_times ::Vector{Float64} = [0. for i in 1:nbr_tests]
			
 
				+
			
 
				+    for i = 1:nbr_tests
			
 
				+        
			
 
				+        init_data(data, data_nbr)
			
 
				+
			
 
				+        tic()
			
 
				+        black_scholes_starpu(data, res, nslices);
			
 
				+        t = toq()
			
 
				+
			
 
				+        exec_times[i] = t
			
 
				+    end
			
 
				+    sort!(exec_times)
			
 
				+    # println(data)
			
 
				+    # println(res)
			
 
				+    
			
 
				+    return exec_times[1 + div(nbr_tests - 1, 2)]
			
 
				+end
			
 
				+
			
 
				+function display_times(start_nbr, step_nbr, stop_nbr, nslices, nbr_tests)
			
 
				+
			
 
				+    mtc = map( (x->parse(Float64,x)), open("../DAT/black_scholes_c_times.dat") do f
			
 
				+                  readlines(f)
			
 
				+                  end)
			
 
				+
			
 
				+
			
 
				+    mtcgen = map( (x->parse(Float64,x)), open("../DAT/black_scholes_c_generated_times.dat") do f
			
 
				+                  readlines(f)
			
 
				+                  end)
			
 
				+    i = 1
			
 
				+    open("../DAT/black_scholes_times.dat", "w") do f 
			
 
				+        for data_nbr in (start_nbr : step_nbr : stop_nbr)
			
 
				+            t = median_times(data_nbr, nslices, nbr_tests)
			
 
				+            println("Number of data:\n$data_nbr\nTimes:\njl: $t\nC: $(mtc[i])\nGen: $(mtcgen[i])")
			
 
				+            write(f, "$data_nbr $(t) $(mtcgen[i]) $(mtc[i])\n")
			
 
				+            i = i + 1
			
 
				+        end
			
 
				+    end
			
 
				+end
			
--- a/julia/tst/black_scholes/black_scholes_generated.jl
+++ b/julia/tst/black_scholes/black_scholes_generated.jl
@@ -0,0 +1,35 @@
 
				+if length(ARGS) != 5
			
 
				+    println("Usage: julia prog.jl start_data_nbr step_data_nbr stop_data_nbr nslices nbr_tests")
			
 
				+    quit()
			
 
				+end
			
 
				+
			
 
				+
			
 
				+if (parse(Int64,ARGS[1]) < parse(Int64,ARGS[4]))
			
 
				+    println("The number of slices must be smaller than the number of data")
			
 
				+    quit()
			
 
				+end
			
 
				+
			
 
				+include("../../src/Wrapper/Julia/starpu_include.jl")
			
 
				+using StarPU
			
 
				+
			
 
				+@debugprint "starpu_init"
			
 
				+starpu_init(extern_task_path = "../build/generated_tasks_black_scholes")
			
 
				+
			
 
				+perfmodel = StarpuPerfmodel(
			
 
				+    perf_type = STARPU_HISTORY_BASED,
			
 
				+    symbol = "history_perf"
			
 
				+)
			
 
				+
			
 
				+cl = StarpuCodelet(
			
 
				+cpu_func = "black_scholes",
			
 
				+gpu_func = "CUDA_black_scholes",
			
 
				+modes = [STARPU_RW, STARPU_RW],
			
 
				+perfmodel = perfmodel
			
 
				+)
			
 
				+
			
 
				+include("black_scholes_def.jl")
			
 
				+
			
 
				+display_times(map( (x->parse(Int64, x)), ARGS)...)
			
 
				+
			
 
				+@debugprint "starpu_shutdown"
			
 
				+starpu_shutdown()
			
--- a/julia/tst/black_scholes/black_scholes_with_generated.c
+++ b/julia/tst/black_scholes/black_scholes_with_generated.c
@@ -0,0 +1,155 @@
 
				+#include <stdio.h>
			
 
				+#include <stdlib.h>
			
 
				+#include <starpu.h>
			
 
				+#include "../includes/sorting.h"
			
 
				+
			
 
				+void black_scholes(void **, void *);
			
 
				+void CUDA_black_scholes(void **, void*);
			
 
				+
			
 
				+static struct starpu_perfmodel model =
			
 
				+{
			
 
				+	.type = STARPU_HISTORY_BASED,
			
 
				+	.symbol = "history_perf"
			
 
				+};
			
 
				+
			
 
				+struct starpu_codelet cl =
			
 
				+{
			
 
				+	.cpu_funcs = {black_scholes},
			
 
				+	.cuda_funcs = {CUDA_black_scholes},
			
 
				+	.nbuffers = 2,
			
 
				+	.modes = {STARPU_R, STARPU_RW},
			
 
				+	.model = &model
			
 
				+};
			
 
				+
			
 
				+void black_scholes_with_starpu(double *data, double *res, unsigned nslices, unsigned nbr_data)
			
 
				+{
			
 
				+
			
 
				+	starpu_data_handle_t D_handle, RES_handle;
			
 
				+
			
 
				+	starpu_matrix_data_register(&D_handle, STARPU_MAIN_RAM, (uintptr_t)data, 5, 5, nbr_data, sizeof(double));
			
 
				+	starpu_matrix_data_register(&RES_handle, STARPU_MAIN_RAM, (uintptr_t)res, 2, 2, nbr_data, sizeof(double));
			
 
				+
			
 
				+	struct starpu_data_filter vert =
			
 
				+	{
			
 
				+		.filter_func = starpu_matrix_filter_vertical_block,
			
 
				+		.nchildren = nslices
			
 
				+	};
			
 
				+
			
 
				+	starpu_data_partition(D_handle, &vert);
			
 
				+	starpu_data_partition(RES_handle, &vert);
			
 
				+
			
 
				+	unsigned taskx;
			
 
				+	
			
 
				+	for (taskx = 0; taskx < nslices; taskx++){
			
 
				+		struct starpu_task *task = starpu_task_create();
			
 
				+		
			
 
				+		task->cl = &cl;
			
 
				+		task->handles[0] = starpu_data_get_sub_data(D_handle, 1, taskx);
			
 
				+		task->handles[1] = starpu_data_get_sub_data(RES_handle, 1, taskx);
			
 
				+		
			
 
				+		starpu_task_submit(task);
			
 
				+	}
			
 
				+
			
 
				+	starpu_task_wait_for_all();
			
 
				+
			
 
				+
			
 
				+	starpu_data_unpartition(D_handle, STARPU_MAIN_RAM);
			
 
				+	starpu_data_unpartition(RES_handle, STARPU_MAIN_RAM);
			
 
				+
			
 
				+	starpu_data_unregister(D_handle);
			
 
				+	starpu_data_unregister(RES_handle);
			
 
				+}
			
 
				+	
			
 
				+
			
 
				+void init_data(double *data, unsigned nbr_data)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < nbr_data; i++){
			
 
				+
			
 
				+		data[5*i] = 100. * rand() / (double) RAND_MAX;
			
 
				+		data[5*i + 1] = 100. * rand() / (double) RAND_MAX;
			
 
				+		data[5*i + 2] = rand() / (double) RAND_MAX;
			
 
				+		data[5*i + 3] = 10. * rand() / (double) RAND_MAX;
			
 
				+		data[5*i + 4] = 10. * rand() / (double) RAND_MAX;
			
 
				+		
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+double median_time(unsigned nbr_data, unsigned nslices, unsigned nbr_tests)
			
 
				+{
			
 
				+	double *data = malloc(5 * nbr_data * sizeof(double));
			
 
				+	double *res = calloc(2 * nbr_data, sizeof(double));
			
 
				+	double exec_times[nbr_tests];
			
 
				+	
			
 
				+	/* printf("nbr_data: %u\n", nbr_data); */
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < nbr_tests; i++){
			
 
				+		
			
 
				+		init_data(data, nbr_data);
			
 
				+		/* data[0] = 100.0; */
			
 
				+		/* data[1] = 100.0; */
			
 
				+		/* data[2] = 0.05; */
			
 
				+		/* data[3] = 1.0; */
			
 
				+		/* data[4] = 0.2; */
			
 
				+
			
 
				+		double start = starpu_timing_now();
			
 
				+		black_scholes_with_starpu(data, res, nslices, nbr_data);
			
 
				+		double stop = starpu_timing_now();
			
 
				+		
			
 
				+		exec_times[i] = (stop-start)/1.e6;
			
 
				+		
			
 
				+		
			
 
				+	}
			
 
				+
			
 
				+	/* printf("RES:\n%f\n%f\n", res[0], res[1]); */
			
 
				+
			
 
				+	free(data);
			
 
				+	free(res);
			
 
				+
			
 
				+	quicksort(exec_times, 0, nbr_tests - 1);
			
 
				+	return exec_times[nbr_tests/2];
			
 
				+}
			
 
				+	
			
 
				+
			
 
				+
			
 
				+void display_times(unsigned start_nbr, unsigned step_nbr, unsigned stop_nbr, unsigned nslices, unsigned nbr_tests){
			
 
				+	
			
 
				+	double t;
			
 
				+	unsigned nbr_data;
			
 
				+
			
 
				+	FILE *myfile;
			
 
				+	myfile = fopen("DAT/black_scholes_c_generated_times.dat", "w");
			
 
				+
			
 
				+	for (nbr_data = start_nbr; nbr_data <= stop_nbr; nbr_data+=step_nbr){
			
 
				+		t = median_time(nbr_data, nslices, nbr_tests);
			
 
				+		printf("Number of data: %u\nTime: %f\n", nbr_data, t);
			
 
				+		fprintf(myfile, "%f\n", t);
			
 
				+	}
			
 
				+	fclose(myfile);
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	if (argc != 6){
			
 
				+		printf("Usage: %s start_nbr step_nbr stop_nbr nslices nbr_tests\n", argv[0]);
			
 
				+		return 1;
			
 
				+	}
			
 
				+	
			
 
				+	if (starpu_init(NULL) != EXIT_SUCCESS){
			
 
				+		fprintf(stderr, "ERROR\n");
			
 
				+		return 77;
			
 
				+	}
			
 
				+
			
 
				+	unsigned start_nbr = (unsigned) atoi(argv[1]);
			
 
				+	unsigned step_nbr = (unsigned) atoi(argv[2]);
			
 
				+	unsigned stop_nbr = (unsigned) atoi(argv[3]);
			
 
				+	unsigned nslices = (unsigned) atoi(argv[4]);
			
 
				+	unsigned nbr_tests = (unsigned) atoi(argv[5]);
			
 
				+
			
 
				+
			
 
				+	display_times(start_nbr, step_nbr, stop_nbr, nslices, nbr_tests);
			
 
				+		
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
--- a/julia/tst/black_scholes/cpu_black_scholes.c
+++ b/julia/tst/black_scholes/cpu_black_scholes.c
@@ -0,0 +1,39 @@
 
				+#include <stdint.h>
			
 
				+#include <starpu.h>
			
 
				+#include <math.h>
			
 
				+
			
 
				+
			
 
				+static inline double normcdf(double x)
			
 
				+{
			
 
				+	
			
 
				+	return (1.0 + erf(x/sqrt(2.0)))/2.0;
			
 
				+}
			
 
				+
			
 
				+void cpu_black_scholes(void *descr[], void *arg)
			
 
				+{ 
			
 
				+	double *S, *K, *R, *T, *SIG, *CRES, *PRES;
			
 
				+
			
 
				+	uint32_t nxS;
			
 
				+	
			
 
				+	
			
 
				+	S = (double *)STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				+	K = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
			
 
				+	R = (double *)STARPU_MATRIX_GET_PTR(descr[2]);
			
 
				+	T = (double *)STARPU_MATRIX_GET_PTR(descr[3]);
			
 
				+	SIG = (double *)STARPU_MATRIX_GET_PTR(descr[4]);
			
 
				+	CRES = (double *)STARPU_MATRIX_GET_PTR(descr[5]);
			
 
				+	PRES = (double *)STARPU_MATRIX_GET_PTR(descr[6]);
			
 
				+	
			
 
				+	nxS = STARPU_MATRIX_GET_NX(descr[0]);
			
 
				+
			
 
				+	
			
 
				+	uint32_t i;
			
 
				+	for (i = 0; i < nxS; i++){
			
 
				+				
			
 
				+		double d1 = (log(S[i] / K[i]) + (R[i] + pow(SIG[i], 2.0) * 0.5) * T[i]) / (SIG[i] * sqrt(T[i]));
			
 
				+		double d2 = (log(S[i] / K[i]) + (R[i] - pow(SIG[i], 2.0) * 0.5) * T[i]) / (SIG[i] * sqrt(T[i]));
			
 
				+		
			
 
				+		CRES[i] = S[i] * normcdf(d1) - K[i] * exp(-R[i] * T[i]) * normcdf(d2);
			
 
				+		PRES[i] = -S[i] * normcdf(-d1) + K[i] * exp(-R[i] * T[i]) * normcdf(-d2);
			
 
				+	}
			
 
				+}
			
--- a/julia/tst/black_scholes/cpu_cuda_black_scholes.jl
+++ b/julia/tst/black_scholes/cpu_cuda_black_scholes.jl
@@ -0,0 +1,124 @@
 
				+include("../../src/Compiler/include.jl")
			
 
				+
			
 
				+starpu_new_cpu_kernel_file("../build/generated_cpu_black_scholes.c")
			
 
				+starpu_new_cuda_kernel_file("../build/generated_cuda_black_scholes.cu")
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+@cpu_cuda_kernel function black_scholes(data ::Matrix{Float64}, res ::Matrix{Float64}) ::Void
			
 
				+    
			
 
				+    widthn ::Int64 = width(data)
			
 
				+        
			
 
				+    # data[1,...] -> S
			
 
				+    # data[2,...] -> K
			
 
				+    # data[3,...] -> r
			
 
				+    # data[4,...] -> T
			
 
				+    # data[4,...] -> sig
			
 
				+
			
 
				+    p ::Float64 = 0.2316419
			
 
				+    b1 ::Float64 = 0.31938153
			
 
				+    b2 ::Float64 = -0.356563782
			
 
				+    b3 ::Float64 = 1.781477937
			
 
				+    b4 ::Float64 = -1.821255978
			
 
				+    b5 ::Float64 = 1.330274428
			
 
				+
			
 
				+    
			
 
				+    @indep for i = 1:widthn
			
 
				+        
			
 
				+
			
 
				+        d1 ::Float64 = (log(data[1,i] / data[2,i]) + (data[3,i] + pow(data[5,i], 2.0) * 0.5) * data[4,i]) / (data[5,i] * sqrt(data[4,i]))
			
 
				+        d2 ::Float64 = (log(data[1,i] / data[2,i]) + (data[3,i] - pow(data[5,i], 2.0) * 0.5) * data[4,i]) / (data[5,i] * sqrt(data[4,i]))
			
 
				+        
			
 
				+
			
 
				+
			
 
				+
			
 
				+        f ::Float64 = 0
			
 
				+        ff ::Float64 = 0
			
 
				+        s1 ::Float64 = 0
			
 
				+        s2 ::Float64 = 0
			
 
				+        s3 ::Float64 = 0
			
 
				+        s4 ::Float64 = 0
			
 
				+        s5 ::Float64 = 0
			
 
				+        sz ::Float64 = 0
			
 
				+        
			
 
				+
			
 
				+
			
 
				+        
			
 
				+        ######## Compute normcdf of d1
			
 
				+
			
 
				+        normd1p ::Float64 = 0
			
 
				+        normd1n ::Float64 = 0
			
 
				+
			
 
				+        boold1 ::Int64 = (d1 >= 0) + (d1 <= 0)
			
 
				+        
			
 
				+        if (boold1 >= 2)
			
 
				+            normd1p = 0.5
			
 
				+            normd1n = 0.5
			
 
				+        else
			
 
				+            tmp1 ::Float64 = abs(d1)
			
 
				+            f = 1 / sqrt(2 * M_PI)
			
 
				+            ff = exp(-pow(tmp1, 2.0) / 2) * f
			
 
				+            s1 = b1 / (1 + p * tmp1)
			
 
				+            s2 = b2 / pow((1 + p * tmp1), 2.0)
			
 
				+            s3 = b3 / pow((1 + p * tmp1), 3.0)
			
 
				+            s4 = b4 / pow((1 + p * tmp1), 4.0)
			
 
				+            s5 = b5 / pow((1 + p * tmp1), 5.0)
			
 
				+            sz = ff * (s1 + s2 + s3 + s4 + s5)
			
 
				+        
			
 
				+            if (d1 > 0)
			
 
				+                normd1p = 1 - sz # normcdf(d1)
			
 
				+                normd1n = sz # normcdf(-d1)
			
 
				+            else
			
 
				+                normd1p = sz
			
 
				+                normd1n = 1 - sz
			
 
				+            end    
			
 
				+        end
			
 
				+        ########
			
 
				+        
			
 
				+
			
 
				+        ######## Compute normcdf of d2
			
 
				+        normd2p ::Float64 = 0
			
 
				+        normd2n ::Float64 = 0
			
 
				+
			
 
				+        boold2 ::Int64 = (d2 >= 0) + (d2 <= 0)
			
 
				+        
			
 
				+        if (boold2 >= 2)
			
 
				+            normd2p = 0.5
			
 
				+            normd2n = 0.5
			
 
				+        else
			
 
				+            tmp2 ::Float64 = abs(d2)
			
 
				+            f = 1 / sqrt(2 * M_PI)
			
 
				+            ff = exp(-pow(tmp2, 2.0) / 2) * f
			
 
				+            s1 = b1 / (1 + p * tmp2)
			
 
				+            s2 = b2 / pow((1 + p * tmp2), 2.0)
			
 
				+            s3 = b3 / pow((1 + p * tmp2), 3.0)
			
 
				+            s4 = b4 / pow((1 + p * tmp2), 4.0)
			
 
				+            s5 = b5 / pow((1 + p * tmp2), 5.0)
			
 
				+            sz = ff * (s1 + s2 + s3 + s4 + s5)
			
 
				+        
			
 
				+        
			
 
				+            if (d2 > 0)
			
 
				+                normd2p = 1 - sz # normcdf(d2)
			
 
				+                normd2n = sz # normcdf(-d2)
			
 
				+            else
			
 
				+                normd2p = sz
			
 
				+                normd2n = 1 - sz
			
 
				+            end
			
 
				+        end
			
 
				+        # normd1p = (1 + erf(d1/sqrt(2.0)))/2.0
			
 
				+        # normd1n = (1 + erf(-d1/sqrt(2.0)))/2.0
			
 
				+        
			
 
				+        # normd2p = (1 + erf(d2/sqrt(2.0)))/2.0
			
 
				+        # normd2n = (1 + erf(-d2/sqrt(2.0)))/2.0
			
 
				+        
			
 
				+        res[1,i] = data[1,i] * (normd1p) - data[2,i]*exp(-data[3,i]*data[4,i]) * (normd2p) # S * N(d1) - r*exp(-r*T) * norm(d2)
			
 
				+        res[2,i] = -data[1,i] * (normd1n) + data[2,i]*exp(-data[3,i]*data[4,i]) * (normd2n) # -S * N(-d1) + r*exp(-r*T) * norm(-d2)
			
 
				+        
			
 
				+    end
			
 
				+end
			
 
				+
			
 
				+compile_cpu_kernels("../build/generated_cpu_black_scholes.so")
			
 
				+compile_cuda_kernels("../build/generated_cuda_black_scholes.so")
			
 
				+combine_kernel_files("../build/generated_tasks_black_scholes.so", ["../build/generated_cpu_black_scholes.so", "../build/generated_cuda_black_scholes.so"])
			
--- a/julia/tst/black_scholes/gpu_black_scholes.cu
+++ b/julia/tst/black_scholes/gpu_black_scholes.cu
@@ -0,0 +1,74 @@
 
				+#include <stdio.h>
			
 
				+#include <stdint.h>
			
 
				+#include <math.h>
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+// __device__ inline double cndGPU(double d)
			
 
				+// {
			
 
				+//   const double A1 = 0.31938153f;
			
 
				+//   const double A2 = -0.356563782f;
			
 
				+//   const double A3 = 1.781477937f;
			
 
				+//   const double A4 = -1.821255978f;
			
 
				+//   const double A5 = 1.330274429f;
			
 
				+//   const float RSQRT2PI = 0.39894228040143267793994605993438f;
			
 
				+
			
 
				+    
			
 
				+//   double K = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d)));
			
 
				+
			
 
				+    
			
 
				+//   double cnd = RSQRT2PI * __expf(- 0.5f * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));
			
 
				+
			
 
				+//     if (d > 0)
			
 
				+//       cnd = 1.0f - cnd;
			
 
				+
			
 
				+//     return cnd;
			
 
				+// }
			
 
				+
			
 
				+__device__ inline double cndGPU(double d)
			
 
				+{
			
 
				+  return (1.0 + erf(d/sqrt(2.0)))/2.0;
			
 
				+}
			
 
				+
			
 
				+__global__ void gpuBlackScholesKernel(double *S, double *K, double *R, double *T, 
			
 
				+				      double *SIG, double *CRES, double *PRES,
			
 
				+				      uint32_t nxS)
			
 
				+{
			
 
				+  uint32_t i, id;
			
 
				+  
			
 
				+  id = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				+  i = id % nxS;
			
 
				+  
			
 
				+  double sqrtT = __fdividef(1.0F, rsqrtf(T[i]));
			
 
				+  double d1 = (log(S[i] / K[i]) + (R[i] + SIG[i] * SIG[i] * 0.5) * T[i]) / (SIG[i] * sqrt(T[i]));  
			
 
				+  double d2 = (log(S[i] / K[i]) + (R[i] - SIG[i] * SIG[i] * 0.5) * T[i]) / (SIG[i] * sqrt(T[i]));
			
 
				+  
			
 
				+  CRES[i] = S[i] * (normcdf(d1)) - K[i] * exp(-R[i] * T[i]) * normcdf(d2);
			
 
				+  PRES[i] = -S[i] * (normcdf(-d1)) + K[i] * exp(-R[i] * T[i]) * normcdf(-d2);
			
 
				+}
			
 
				+
			
 
				+#define THREADS_PER_BLOCK 64
			
 
				+
			
 
				+extern "C" void gpu_black_scholes(void *descr[], void *args)
			
 
				+{
			
 
				+  double *S, *K, *R, *T, *SIG, *CRES, *PRES;
			
 
				+  uint32_t nxS;
			
 
				+  uint32_t nblocks;
			
 
				+
			
 
				+  S = (double *) STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				+  K = (double *) STARPU_MATRIX_GET_PTR(descr[1]);
			
 
				+  R = (double *) STARPU_MATRIX_GET_PTR(descr[2]);
			
 
				+  T = (double *) STARPU_MATRIX_GET_PTR(descr[3]);
			
 
				+  SIG = (double *) STARPU_MATRIX_GET_PTR(descr[4]);
			
 
				+  CRES = (double *) STARPU_MATRIX_GET_PTR(descr[5]);
			
 
				+  PRES = (double *) STARPU_MATRIX_GET_PTR(descr[6]);
			
 
				+
			
 
				+  nxS = STARPU_MATRIX_GET_NX(descr[0]);
			
 
				+
			
 
				+  nblocks = (nxS + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
			
 
				+
			
 
				+  gpuBlackScholesKernel
			
 
				+    <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
			
 
				+    >>> (S, K, R, T, SIG, CRES, PRES, nxS);
			
 
				+  
			
 
				+  cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				+}
			
--- a/julia/tst/includes/display.c
+++ b/julia/tst/includes/display.c
@@ -0,0 +1,196 @@
 
				+#include "display.h"
			
 
				+
			
 
				+
			
 
				+void mandelbrot_graph(char *filename, int *pixels, unsigned width, unsigned height)
			
 
				+{
			
 
				+	FILE *myfile;
			
 
				+	myfile = fopen(filename, "w");
			
 
				+	fprintf(myfile, "P3\n%u %u\n255\n", width, height);
			
 
				+	unsigned i,j;
			
 
				+	for (i = 0; i < height; i++){
			
 
				+		for (j = 0; j < width; j++){
			
 
				+			fprintf(myfile, "%d 0 0 ", pixels[j + i*width]);
			
 
				+		}
			
 
				+		fprintf(myfile, "\n");
			
 
				+	}
			
 
				+	fclose(myfile);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void mandelbrot_graph_transpose(char *filename, int64_t *pixels, unsigned width, unsigned height)
			
 
				+{
			
 
				+	FILE *myfile;
			
 
				+	myfile = fopen(filename, "w");
			
 
				+	fprintf(myfile, "P3\n%u %u\n255\n", width, height);
			
 
				+	unsigned i,j;
			
 
				+	for (i = 0; i < width; i++){
			
 
				+		for (j = 0; j < height; j++){
			
 
				+			fprintf(myfile, "%d 0 0 ", pixels[i + j*width]);
			
 
				+		}
			
 
				+		fprintf(myfile, "\n");
			
 
				+	}
			
 
				+	fclose(myfile);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+void pixels_print(int *pixels, unsigned width, unsigned height)
			
 
				+{
			
 
				+	unsigned i,j;
			
 
				+	for (i = 0; i < height; i++){
			
 
				+		for (j = 0; j < width; j++){
			
 
				+			printf("%d ", pixels[j + i * width]);
			
 
				+		}
			
 
				+		printf("\n");
			
 
				+	}
			
 
				+}
			
 
				+////////////////////////// NBODY ////////////////////////
			
 
				+
			
 
				+
			
 
				+static int is_inside(struct Position p, unsigned width, unsigned height);
			
 
				+static void get_planet_pixels(struct Position *pix, struct Position p);
			
 
				+static void graph_pixels(struct Pixel *pixel, unsigned width, unsigned height, struct Position *pix, unsigned index, unsigned nbr_planets);
			
 
				+static void nbody_ppm(char *filename, struct Pixel *pixels, unsigned width, unsigned height);
			
 
				+
			
 
				+
			
 
				+static int is_inside(struct Position p, unsigned width, unsigned height)
			
 
				+{
			
 
				+	if (p.x >= 0 && p.x < width && p.y >= 0 && p.y < height)
			
 
				+		return 1;
			
 
				+	else
			
 
				+		return 0;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void nbody_print(double *array, unsigned nbr_planets)
			
 
				+{
			
 
				+	unsigned i,j;
			
 
				+	for (j = 0; j < nbr_planets; j++){
			
 
				+		
			
 
				+		printf("Planet %u:\n", j);
			
 
				+		printf("%f ; %f\n", array[j], array[j + nbr_planets]);
			
 
				+		
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+
			
 
				+//Returns a circle centered on the position p.
			
 
				+static void get_planet_pixels(struct Position *pix, struct Position p)
			
 
				+{
			
 
				+	
			
 
				+
			
 
				+	int i,j,k;
			
 
				+	k = 0;
			
 
				+	for (i = p.x - 1; i <= p.x + 1; i++){
			
 
				+		for (j = p.y - 3; j <= p.y + 3; j++){
			
 
				+	
			
 
				+			pix[k].x = i;
			
 
				+			pix[k].y = j;
			
 
				+			k++;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	for (j = p.y - 2; j <= p.y + 2; j++){
			
 
				+		
			
 
				+		pix[k].x = p.x - 2;
			
 
				+		pix[k].y = j;
			
 
				+		k++;
			
 
				+		pix[k].x = p.x + 2;
			
 
				+		pix[k].y = j;
			
 
				+		k++;
			
 
				+	}
			
 
				+
			
 
				+	for (j = p.y - 1; j <= p.y + 1; j++){
			
 
				+		
			
 
				+		pix[k].x = p.x - 3;
			
 
				+		pix[k].y = j;
			
 
				+		k++;
			
 
				+		pix[k].x = p.x + 3;
			
 
				+		pix[k].y = j;
			
 
				+		k++;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void graph_pixels(struct Pixel *pixels, unsigned width, unsigned height, struct Position *pix, unsigned index, unsigned nbr_planets)
			
 
				+{
			
 
				+	/* printf("Planet: %u\n", index); */
			
 
				+	unsigned j;
			
 
				+	struct Pixel pixel = {0,0,0};
			
 
				+	for (j = 0; j < 37; j++){
			
 
				+		
			
 
				+		/* printf("X: %d, Y: %d\n", pix[j].x, pix[j].y); */
			
 
				+			
			
 
				+		if ( is_inside(pix[j], width, height) ){
			
 
				+
			
 
				+			pixel.r = 125;
			
 
				+			pixel.b = round((255. * index) / nbr_planets);
			
 
				+			pixels[pix[j].x + pix[j].y * width] = pixel;
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+
			
 
				+static void nbody_ppm(char *filename, struct Pixel *pixels, unsigned width, unsigned height)
			
 
				+{
			
 
				+	unsigned i,j;
			
 
				+	FILE *myfile;
			
 
				+	myfile = fopen(filename, "w");
			
 
				+	fprintf(myfile, "P3\n%u %u\n255\n", width, height);
			
 
				+	for (i = 0; i < height; i++){
			
 
				+		for (j = 0; j < width; j++){
			
 
				+			struct Pixel pixel = pixels[j + i * width];
			
 
				+			fprintf(myfile, "%u %u %u ", pixel.r, pixel.g, pixel.b);
			
 
				+		}
			
 
				+		fprintf(myfile, "\n");
			
 
				+	}
			
 
				+	fclose(myfile);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void nbody_graph(char *filename, double *positions, unsigned nbr_planets, unsigned width, unsigned height, double min_val, double max_val)
			
 
				+{
			
 
				+	struct Position *pix = malloc(37 * sizeof(struct Position));
			
 
				+	
			
 
				+	struct Pixel *pixels = calloc(width * height, sizeof(struct Pixel));
			
 
				+
			
 
				+	unsigned i,j;
			
 
				+
			
 
				+	for (i = 0; i < nbr_planets; i++){
			
 
				+		struct Position posi;
			
 
				+		
			
 
				+		posi.x = round( (positions[i] - min_val) / (max_val - min_val) * (width - 1));
			
 
				+		posi.y = round( (positions[i + nbr_planets] - min_val) / (max_val - min_val) * (width - 1));
			
 
				+
			
 
				+		
			
 
				+		get_planet_pixels(pix, posi);
			
 
				+
			
 
				+		graph_pixels(pixels, width, height, pix, i, nbr_planets);
			
 
				+
			
 
				+	}
			
 
				+	nbody_ppm(filename, pixels, width, height);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void nbody_graph_transpose(char *filename, double *positions, unsigned nbr_planets, unsigned width, unsigned height, double min_val, double max_val)
			
 
				+{
			
 
				+	struct Position *pix = malloc(37 * sizeof(struct Position));
			
 
				+	
			
 
				+	struct Pixel *pixels = calloc(width * height, sizeof(struct Pixel));
			
 
				+
			
 
				+	unsigned i,j;
			
 
				+
			
 
				+	for (i = 0; i < nbr_planets; i++){
			
 
				+		struct Position posi;
			
 
				+		
			
 
				+		posi.x = round( (positions[2 * i] - min_val) / (max_val - min_val) * (width - 1));
			
 
				+		posi.y = round( (positions[2 * i + 1] - min_val) / (max_val - min_val) * (width - 1));
			
 
				+
			
 
				+		
			
 
				+		get_planet_pixels(pix, posi);
			
 
				+
			
 
				+		graph_pixels(pixels, width, height, pix, i, nbr_planets);
			
 
				+
			
 
				+	}
			
 
				+	nbody_ppm(filename, pixels, width, height);
			
 
				+}
			
--- a/julia/tst/includes/display.h
+++ b/julia/tst/includes/display.h
@@ -0,0 +1,28 @@
 
				+#ifndef DISPLAY_H
			
 
				+#define DISPLAY_H
			
 
				+#include <stdio.h>
			
 
				+#include <stdlib.h>
			
 
				+#include <math.h>
			
 
				+#include <stdint.h>
			
 
				+
			
 
				+struct Position {
			
 
				+	int x;
			
 
				+	int y;
			
 
				+};
			
 
				+
			
 
				+struct Pixel {
			
 
				+	unsigned r;
			
 
				+	unsigned g;
			
 
				+	unsigned b;
			
 
				+};
			
 
				+
			
 
				+// Fills PPM/mandelbrot.ppm with the red values inside the pixels matrix.           
			
 
				+void mandelbrot_graph(char *filename, int *pixels, unsigned width, unsigned height);
			
 
				+void mandelbrot_graph_transpose(char *filename, int64_t *pixels, unsigned width, unsigned height);
			
 
				+void pixels_print(int *pixels, unsigned width, unsigned height);
			
 
				+void nbody_print(double *array, unsigned nbr_planets);
			
 
				+
			
 
				+void nbody_graph(char *filename, double *positions, unsigned nbr_planets, unsigned width, unsigned height, double min_val, double max_val);
			
 
				+void nbody_graph_transpose(char *filename, double *positions, unsigned nbr_planets, unsigned width, unsigned height, double min_val, double max_val);
			
 
				+
			
 
				+#endif
			
--- a/julia/tst/includes/sorting.c
+++ b/julia/tst/includes/sorting.c
@@ -0,0 +1,52 @@
 
				+#include "sorting.h"
			
 
				+
			
 
				+unsigned chose_pivot(int first, int last)
			
 
				+{
			
 
				+	return ( rand() % (last - first + 1) + first );
			
 
				+}
			
 
				+
			
 
				+int partitionning(double *arr, int first, int last, int pivot)
			
 
				+{
			
 
				+	double tmp;
			
 
				+	int i,j;
			
 
				+
			
 
				+	tmp = arr[last];
			
 
				+	arr[last] = arr[pivot];
			
 
				+	arr[pivot] = tmp;
			
 
				+
			
 
				+	j = first;
			
 
				+
			
 
				+	for (i = first; i < last; i++)
			
 
				+	{
			
 
				+
			
 
				+		if (arr[i] <= arr[last]){
			
 
				+			tmp = arr[i];
			
 
				+
			
 
				+			arr[i] = arr[j];
			
 
				+			arr[j] = tmp;
			
 
				+
			
 
				+			j++;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	tmp = arr[j];
			
 
				+	arr[j] = arr[last];
			
 
				+	arr[last] = tmp;
			
 
				+
			
 
				+	return j;
			
 
				+}
			
 
				+
			
 
				+void quicksort(double *arr, int first, int last)
			
 
				+{
			
 
				+
			
 
				+	if (first < last){
			
 
				+		int pivot = chose_pivot(first, last);
			
 
				+		int j;
			
 
				+
			
 
				+		j = partitionning(arr, first, last, pivot);
			
 
				+
			
 
				+		quicksort(arr, first, j - 1);
			
 
				+		quicksort(arr, j + 1, last);
			
 
				+	}
			
 
				+
			
 
				+}
			
--- a/julia/tst/includes/sorting.h
+++ b/julia/tst/includes/sorting.h
@@ -0,0 +1,12 @@
 
				+#ifndef SORTING_H
			
 
				+#define SORTING_H
			
 
				+
			
 
				+#include <stdio.h>
			
 
				+#include <stdint.h>
			
 
				+#include <stdlib.h>
			
 
				+
			
 
				+unsigned chose_pivot(int first, int last);
			
 
				+int partitionning(double *arr, int first, int last, int pivot);
			
 
				+void quicksort(double *arr, int first, int last);
			
 
				+
			
 
				+#endif
			
--- a/julia/tst/mandelbrot/cpu_cuda_mandelbrot.jl
+++ b/julia/tst/mandelbrot/cpu_cuda_mandelbrot.jl
@@ -0,0 +1,58 @@
 
				+include("../../src/Compiler/include.jl")  
			
 
				+
			
 
				+starpu_new_cpu_kernel_file("../build/generated_cpu_mandelbrot.c")
			
 
				+starpu_new_cuda_kernel_file("../build/generated_cuda_mandelbrot.cu")
			
 
				+
			
 
				+@cpu_cuda_kernel function mandelbrot(pixels ::Matrix{Int64}, params ::Vector{Float64}, slice_pos ::Vector{Int64}) :: Void
			
 
				+    
			
 
				+
			
 
				+    local_width ::Int64 = width(pixels) 
			
 
				+    local_height ::Int64 = height(pixels)
			
 
				+
			
 
				+    #max_iterations ::Int64 = 250
			
 
				+    conv_limit ::Float64 = 2.0
			
 
				+   
			
 
				+    @indep for x in (1 : local_width)
			
 
				+	@indep for y in (1 : local_height)
			
 
				+
			
 
				+            max_iterations ::Float64 = params[5]
			
 
				+
			
 
				+            zoom ::Float64 = params[3] * 0.25296875
			
 
				+
			
 
				+	    X ::Int64 = x + local_width * (slice_pos[1] - 1)
			
 
				+	    Y ::Int64 = y + local_height * (slice_pos[2] - 1)
			
 
				+	    
			
 
				+            cr ::Float64 = params[1] + (X - params[3]/2)/zoom
			
 
				+	    zr ::Float64 = cr
			
 
				+
			
 
				+            ci ::Float64 = params[2] + (Y - params[4]/2)/zoom
			
 
				+            zi ::Float64 = ci
			
 
				+
			
 
				+	    n ::Int64 = 0
			
 
				+
			
 
				+            b1 ::Int64 = (n < max_iterations) + (zr*zr + zi*zi < conv_limit * conv_limit)
			
 
				+
			
 
				+	    while (b1 >= 2)#n <= max_iterations) && (z * z < conv_limit * conv_limit) #Double condition impossible!!!
			
 
				+
			
 
				+                tmp ::Float64 = zr*zr - zi*zi + cr
			
 
				+                zi = 2*zr*zi + ci
			
 
				+                zr = tmp
			
 
				+
			
 
				+		n = n + 1
			
 
				+                b1 = (n <= max_iterations) + (zr*zr + zi*zi <= conv_limit * conv_limit)
			
 
				+
			
 
				+	    end 
			
 
				+
			
 
				+	    if (n < max_iterations)
			
 
				+		pixels[y,x] = 255 * n / max_iterations
			
 
				+	    else
			
 
				+	        pixels[y,x] = 0
			
 
				+	    end
			
 
				+	end
			
 
				+    end
			
 
				+end
			
 
				+
			
 
				+
			
 
				+compile_cpu_kernels("../build/generated_cpu_mandelbrot.so")
			
 
				+compile_cuda_kernels("../build/generated_cuda_mandelbrot.so")
			
 
				+combine_kernel_files("../build/generated_tasks_mandelbrot.so", ["../build/generated_cpu_mandelbrot.so","../build/generated_cuda_mandelbrot.so"])
			
--- a/julia/tst/mandelbrot/cpu_mandelbrot.c
+++ b/julia/tst/mandelbrot/cpu_mandelbrot.c
@@ -0,0 +1,74 @@
 
				+#include <stdint.h>
			
 
				+#include <starpu.h>
			
 
				+#include <math.h>
			
 
				+
			
 
				+
			
 
				+struct Params
			
 
				+{
			
 
				+	float cr;
			
 
				+	float ci;
			
 
				+	unsigned taskx;
			
 
				+	unsigned tasky;
			
 
				+	unsigned width;
			
 
				+	unsigned height;
			
 
				+};
			
 
				+
			
 
				+void cpu_mandelbrot(void *descr[], void *cl_arg)
			
 
				+{
			
 
				+	
			
 
				+	struct Params *params = cl_arg;
			
 
				+
			
 
				+	int *subP;
			
 
				+	uint32_t nxP, nyP;
			
 
				+	uint32_t ldP;
			
 
				+
			
 
				+	subP = (int *)STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				+
			
 
				+	nxP = STARPU_MATRIX_GET_NX(descr[0]);
			
 
				+	nyP = STARPU_MATRIX_GET_NY(descr[0]);
			
 
				+	
			
 
				+	ldP = STARPU_MATRIX_GET_LD(descr[0]);
			
 
				+
			
 
				+	float centerr = params->cr;
			
 
				+	float centeri = params->ci;
			
 
				+
			
 
				+	unsigned Idx = params->taskx;
			
 
				+	unsigned Idy = params->tasky;
			
 
				+
			
 
				+	unsigned width = params->width;
			
 
				+	unsigned height = params->height;
			
 
				+
			
 
				+	float zoom = width * 0.25296875;
			
 
				+	float conv_limit = 2.0;
			
 
				+	int max_iter = (width/2) * 0.049715909 * log10(zoom);
			
 
				+
			
 
				+	int x,y,n;
			
 
				+
			
 
				+	for (y = 0; y < nyP; y++){
			
 
				+		for (x = 0; x < nxP; x++){
			
 
				+			float X = x + Idx*nxP; //Coordinates in the whole matrice.
			
 
				+			float Y = y + Idy*nyP;
			
 
				+			float cr = centerr + (X - (width/2))/zoom;
			
 
				+			float ci = centeri + (Y - (height/2))/zoom;
			
 
				+			float zr = cr;
			
 
				+			float zi = ci;
			
 
				+			float m = zr * zr + zi * zi;
			
 
				+			
			
 
				+			for (n = 0; n <= max_iter && m < conv_limit * conv_limit; n++) {
			
 
				+
			
 
				+				float tmp = zr*zr - zi*zi + cr;
			
 
				+				zi = 2*zr*zi + ci;
			
 
				+				zr = tmp;
			
 
				+				m = zr*zr + zi*zi;
			
 
				+			}
			
 
				+			int color;
			
 
				+			if (n<max_iter)
			
 
				+				color = 255.*n/max_iter;
			
 
				+			else
			
 
				+				color = 0;
			
 
				+
			
 
				+			subP[x + y*ldP] = color;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+}
			
--- a/julia/tst/mandelbrot/cpu_mandelbrot_between.c
+++ b/julia/tst/mandelbrot/cpu_mandelbrot_between.c
@@ -0,0 +1,138 @@
 
				+#include <stdio.h>
			
 
				+#include <stdint.h>
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+struct Params
			
 
				+{
			
 
				+	float cr;
			
 
				+	float ci;
			
 
				+	unsigned taskx;
			
 
				+	unsigned tasky;
			
 
				+	unsigned width;
			
 
				+	unsigned height;
			
 
				+};
			
 
				+
			
 
				+static inline long long jlstarpu_max(long long a, long long b)
			
 
				+{
			
 
				+	return (a > b) ? a : b;
			
 
				+}
			
 
				+
			
 
				+static inline long long jlstarpu_interval_size(long long start, long long step, long long stop)
			
 
				+{
			
 
				+    if (stop >= start){
			
 
				+            return jlstarpu_max(0, (stop - start + 1) / step);
			
 
				+    } else {
			
 
				+            return jlstarpu_max(0, (stop - start - 1) / step);
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+void mandelbrot(void** buffers_86BwRM71, void* cl_arg_86BwRM71)
			
 
				+{
			
 
				+    uint32_t ld_o2BQqRir = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_86BwRM71[(1) - (1)]));
			
 
				+    int64_t* ptr_o2BQqRir = (int64_t*) (STARPU_MATRIX_GET_PTR(buffers_86BwRM71[(1) - (1)]));
			
 
				+    
			
 
				+//ARRAY PAR
			
 
				+    double* ptr_Ul4Ys0Mt = (double*) (STARPU_VECTOR_GET_PTR(buffers_86BwRM71[(2) - (1)]));
			
 
				+    int64_t* ptr_cE3zj60d = (int64_t*) (STARPU_VECTOR_GET_PTR(buffers_86BwRM71[(3) - (1)]));
			
 
				+//
			
 
				+
			
 
				+    int64_t local_width = (int64_t) (STARPU_MATRIX_GET_NY(buffers_86BwRM71[(1) - (1)]));
			
 
				+    int64_t local_height = (int64_t) (STARPU_MATRIX_GET_NX(buffers_86BwRM71[(1) - (1)]));
			
 
				+    double conv_limit = (double) (2.0);
			
 
				+
			
 
				+//STRUCT PAR
			
 
				+    
			
 
				+    /* struct Params *params = cl_arg_86BwRM71; */
			
 
				+  
			
 
				+
			
 
				+    /* double centerr = params->cr; */
			
 
				+    /* double centeri = params->ci; */
			
 
				+
			
 
				+    /* unsigned Idx = params->taskx; */
			
 
				+    /* unsigned Idy = params->tasky; */
			
 
				+
			
 
				+    /* unsigned width = params->width; */
			
 
				+    /* unsigned height = params->height; */
			
 
				+
			
 
				+    /* /\* printf("cr / ci: %f / %f\n", centerr, centeri); *\/ */
			
 
				+
			
 
				+    /* int64_t zoom = width * 0.25296875; */
			
 
				+    /* int64_t max_iterations = (width/2) * 0.049715909 * log10(zoom); */
			
 
				+
			
 
				+//
			
 
				+
			
 
				+
			
 
				+    int64_t start_qxJwMzwA = (int64_t) (1);
			
 
				+    int64_t stop_qxJwMzwA = (int64_t) (local_width);
			
 
				+    int64_t x;
			
 
				+
			
 
				+    for (x = start_qxJwMzwA ; x <= stop_qxJwMzwA ; x += 1)
			
 
				+    {
			
 
				+        
			
 
				+        int64_t start_ekV9GHK1 = (int64_t) (1);
			
 
				+        int64_t stop_ekV9GHK1 = (int64_t) (local_height);
			
 
				+        int64_t y;
			
 
				+
			
 
				+        for (y = start_ekV9GHK1 ; y <= stop_ekV9GHK1 ; y += 1)
			
 
				+        {
			
 
				+	    //ARRAY PAR
			
 
				+	    double max_iterations = (double) (ptr_Ul4Ys0Mt[(5) - (1)]);
			
 
				+            double zoom = (double) ((ptr_Ul4Ys0Mt[(3) - (1)]) * (0.25296875));
			
 
				+	    //
			
 
				+
			
 
				+	    //STRUCT PAR
			
 
				+	    /* double X = x + Idy*local_width; */
			
 
				+	    /* double Y = y + Idx*local_height; */
			
 
				+
			
 
				+	    /* double cr = centerr + (X - (width / 2))/zoom; */
			
 
				+	    /* double ci = centeri + (Y - (height / 2))/zoom; */
			
 
				+	    //
			
 
				+
			
 
				+	    //ARRAY PAR
			
 
				+            int64_t X = (int64_t) ((x) + ((local_width) * ((ptr_cE3zj60d[(2) - (1)]) - (1))));
			
 
				+            int64_t Y = (int64_t) ((y) + ((local_height) * ((ptr_cE3zj60d[(1) - (1)]) - (1))));
			
 
				+
			
 
				+	    double cr = (double) ((ptr_Ul4Ys0Mt[(1) - (1)]) + (((X) - ((ptr_Ul4Ys0Mt[(3) - (1)]) / (2))) / (zoom)));
			
 
				+            double ci = (double) ((ptr_Ul4Ys0Mt[(2) - (1)]) + (((Y) - ((ptr_Ul4Ys0Mt[(4) - (1)]) / (2))) / (zoom)));
			
 
				+	    //
			
 
				+
			
 
				+
			
 
				+            double zi = (double) (ci);
			
 
				+            double zr = (double) (cr);
			
 
				+
			
 
				+
			
 
				+            int64_t n = (int64_t) (0);
			
 
				+
			
 
				+	    float m = zr * zr + zi * zi;
			
 
				+            /* int64_t b1 = (int64_t) (((n) < (max_iterations)) + ((((zr) * (zr)) + ((zi) * (zi))) < ((conv_limit) * (conv_limit)))); */
			
 
				+            
			
 
				+            /* while ((b1) >= (2)) */
			
 
				+	    /* printf("%d\n", max_iterations); */
			
 
				+
			
 
				+            for (n = 0; n < max_iterations && m < conv_limit * conv_limit; n++)
			
 
				+	    {
			
 
				+                double tmp = (double) ((((zr) * (zr)) - ((zi) * (zi))) + (cr));
			
 
				+                zi = ((2) * (zr) * (zi)) + (ci);
			
 
				+                zr = tmp;
			
 
				+                /* n = (n) + (1); */
			
 
				+		m = zr*zr + zi*zi;
			
 
				+                /* b1 = ((n) <= (max_iterations)) + ((((zr) * (zr)) + ((zi) * (zi))) <= ((conv_limit) * (conv_limit))); */
			
 
				+            }
			
 
				+            ;
			
 
				+	    
			
 
				+	    /* printf("n: %d\n max_iter: %d\n", n, max_iterations); */
			
 
				+            if ((n) < (max_iterations))
			
 
				+            {
			
 
				+		    ptr_o2BQqRir[((y) + (((x) - (1)) * (ld_o2BQqRir))) - (1)] = 255 * (1.0 * n / (max_iterations));
			
 
				+            } else
			
 
				+            {
			
 
				+                ptr_o2BQqRir[((y) + (((x) - (1)) * (ld_o2BQqRir))) - (1)] = 0;
			
 
				+            }
			
 
				+            ;
			
 
				+        }
			
 
				+        ;
			
 
				+    }
			
 
				+    ;
			
 
				+}
			
 
				+
			
 
				+
			
--- a/julia/tst/mandelbrot/gpu_mandelbrot.cu
+++ b/julia/tst/mandelbrot/gpu_mandelbrot.cu
@@ -0,0 +1,95 @@
 
				+#include <starpu.h>
			
 
				+#include <stdint.h>
			
 
				+#include <stdio.h>
			
 
				+#include <math.h>
			
 
				+
			
 
				+struct Params
			
 
				+{
			
 
				+  float cr;
			
 
				+  float ci;
			
 
				+  unsigned taskx;
			
 
				+  unsigned tasky;
			
 
				+  unsigned width;
			
 
				+  unsigned height;
			
 
				+};
			
 
				+
			
 
				+
			
 
				+__global__ void gpuMandelbrotKernel
			
 
				+(
			
 
				+ uint32_t nxP, uint32_t nyP, 
			
 
				+ uint32_t ldP,
			
 
				+ int * subP,
			
 
				+ struct Params params
			
 
				+ )
			
 
				+{
			
 
				+  unsigned width = params.width;
			
 
				+  unsigned height = params.height;
			
 
				+  unsigned taskx = params.taskx;
			
 
				+  unsigned tasky = params.tasky;
			
 
				+
			
 
				+  float centerr = params.cr;
			
 
				+  float centeri = params.ci;
			
 
				+
			
 
				+  float zoom = width * 0.25296875;
			
 
				+  int maxiter = (width/2) * 0.049715909 * log10(zoom);
			
 
				+  float conv_lim = 2.0;
			
 
				+
			
 
				+  uint32_t id;
			
 
				+  int n,i,j,x,y;
			
 
				+  float zr,zi,cr,ci;
			
 
				+
			
 
				+  id = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				+  i = id % nxP;
			
 
				+  j = id / nxP;
			
 
				+  
			
 
				+
			
 
				+  if (j >= nyP){
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  x = i + taskx * nxP;
			
 
				+  y = j + tasky * nyP;
			
 
				+  
			
 
				+  zr = cr = centerr + (x - width/2.)/zoom;
			
 
				+  zi = ci = centeri + (y - height/2.)/zoom;
			
 
				+  float m = zr*zr + zi*zi;
			
 
				+  
			
 
				+  for (n = 0; n <= maxiter && m < conv_lim * conv_lim; n++) {
			
 
				+    float tmp = zr * zr - zi*zi + cr;
			
 
				+    zi = 2*zr*zi + ci;
			
 
				+    zr = tmp;
			
 
				+    m = zr*zr + zi*zi;
			
 
				+  }
			
 
				+  int color;
			
 
				+  if (n < maxiter)
			
 
				+     color = 255.*n/maxiter;
			
 
				+   else
			
 
				+     color = 0;
			
 
				+  subP[i + j*ldP] = color;
			
 
				+
			
 
				+}
			
 
				+
			
 
				+
			
 
				+#define THREADS_PER_BLOCK 64
			
 
				+
			
 
				+extern "C" void gpu_mandelbrot(void *descr[], void *args)
			
 
				+{
			
 
				+  int *d_subP;
			
 
				+  uint32_t nxP, nyP;
			
 
				+  uint32_t ldP;
			
 
				+  uint32_t nblocks;
			
 
				+  struct Params *params = (struct Params *) args;
			
 
				+
			
 
				+  d_subP = (int *) STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				+
			
 
				+  nxP = STARPU_MATRIX_GET_NX(descr[0]);
			
 
				+  nyP = STARPU_MATRIX_GET_NY(descr[0]);
			
 
				+
			
 
				+  ldP = STARPU_MATRIX_GET_LD(descr[0]);
			
 
				+
			
 
				+  nblocks = (nxP * nyP + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
			
 
				+
			
 
				+  gpuMandelbrotKernel <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream() >>> (nxP, nyP, ldP, d_subP, *params);
			
 
				+
			
 
				+  cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				+}
			
--- a/julia/tst/mandelbrot/gpu_mandelbrot_between.cu
+++ b/julia/tst/mandelbrot/gpu_mandelbrot_between.cu
@@ -0,0 +1,113 @@
 
				+#include <stdio.h>
			
 
				+#include <stdint.h>
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+#define THREADS_PER_BLOCK 64
			
 
				+
			
 
				+static inline long long jlstarpu_max(long long a, long long b)
			
 
				+{
			
 
				+	return (a > b) ? a : b;
			
 
				+}
			
 
				+
			
 
				+static inline long long jlstarpu_interval_size(long long start, long long step, long long stop)
			
 
				+{
			
 
				+    if (stop >= start){
			
 
				+            return jlstarpu_max(0, (stop - start + 1) / step);
			
 
				+    } else {
			
 
				+            return jlstarpu_max(0, (stop - start - 1) / step);
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+
			
 
				+__device__ static inline long long jlstarpu_max__device(long long a, long long b)
			
 
				+{
			
 
				+	return (a > b) ? a : b;
			
 
				+}
			
 
				+
			
 
				+__device__ static inline long long jlstarpu_interval_size__device(long long start, long long step, long long stop)
			
 
				+{
			
 
				+	if (stop >= start){
			
 
				+		return jlstarpu_max__device(0, (stop - start + 1) / step);
			
 
				+	} else {
			
 
				+		return jlstarpu_max__device(0, (stop - start - 1) / step);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+
			
 
				+__global__ void mandelbrot(int64_t kernel_ids__start_1, int64_t kernel_ids__step_1, int64_t kernel_ids__dim_1, int64_t kernel_ids__start_2, 
			
 
				+                           int64_t kernel_ids__step_2, int64_t kernel_ids__dim_2, double* ptr_hF6lCYyJ, int64_t local_width, 
			
 
				+                           int64_t* ptr_qoUGBRtY, int64_t local_height, double conv_limit, int64_t* ptr_A5zD9sJZ, 
			
 
				+                           uint32_t ld_A5zD9sJZ)
			
 
				+{
			
 
				+    int64_t THREAD_ID = (int64_t) ((((blockIdx).x) * ((blockDim).x)) + ((threadIdx).x));
			
 
				+    
			
 
				+    if ((THREAD_ID) >= (((1) * (kernel_ids__dim_2)) * (kernel_ids__dim_1)))
			
 
				+    {
			
 
				+        return ;
			
 
				+    };
			
 
				+    int64_t kernel_ids__index_1 = (int64_t) (((THREAD_ID) / ((1) * (kernel_ids__dim_2))) % (kernel_ids__dim_1));
			
 
				+    int64_t kernel_ids__index_2 = (int64_t) (((THREAD_ID) / (1)) % (kernel_ids__dim_2));
			
 
				+    int64_t x = (int64_t) ((kernel_ids__start_1) + ((kernel_ids__index_1) * (kernel_ids__step_1)));
			
 
				+    int64_t y = (int64_t) ((kernel_ids__start_2) + ((kernel_ids__index_2) * (kernel_ids__step_2)));
			
 
				+    double max_iterations = (double) (ptr_hF6lCYyJ[(5) - (1)]);
			
 
				+    double zoom = (double) ((ptr_hF6lCYyJ[(3) - (1)]) * (0.25296875));
			
 
				+    int64_t X = (int64_t) ((x) + ((local_width) * ((ptr_qoUGBRtY[(2) - (1)]) - (1))));
			
 
				+    int64_t Y = (int64_t) ((y) + ((local_height) * ((ptr_qoUGBRtY[(1) - (1)]) - (1))));
			
 
				+    double cr = (double) ((ptr_hF6lCYyJ[(1) - (1)]) + (((X) - ((ptr_hF6lCYyJ[(3) - (1)]) / (2))) / (zoom)));
			
 
				+    double zr = (double) (cr);
			
 
				+    double ci = (double) ((ptr_hF6lCYyJ[(2) - (1)]) + (((Y) - ((ptr_hF6lCYyJ[(4) - (1)]) / (2))) / (zoom)));
			
 
				+    double zi = (double) (ci);
			
 
				+    int64_t n = (int64_t) (0);
			
 
				+    int64_t b1 = (int64_t) (((n) < (max_iterations)) + ((((zr) * (zr)) + ((zi) * (zi))) < ((conv_limit) * (conv_limit))));
			
 
				+    
			
 
				+    while ((b1) >= (2))
			
 
				+    {
			
 
				+        double tmp = (double) ((((zr) * (zr)) - ((zi) * (zi))) + (cr));
			
 
				+        zi = ((2) * (zr) * (zi)) + (ci);
			
 
				+        zr = tmp;
			
 
				+        n = (n) + (1);
			
 
				+        b1 = ((n) <= (max_iterations)) + ((((zr) * (zr)) + ((zi) * (zi))) <= ((conv_limit) * (conv_limit)));
			
 
				+    }
			
 
				+    ;
			
 
				+    
			
 
				+    if ((n) < (max_iterations))
			
 
				+    {
			
 
				+        ptr_A5zD9sJZ[((y) + (((x) - (1)) * (ld_A5zD9sJZ))) - (1)] = ((255) * (n)) / (max_iterations);
			
 
				+    } else
			
 
				+    {
			
 
				+        ptr_A5zD9sJZ[((y) + (((x) - (1)) * (ld_A5zD9sJZ))) - (1)] = 0;
			
 
				+    }
			
 
				+    ;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+extern "C" void CUDA_mandelbrot(void** buffers_uwrYFDVe, void* cl_arg_uwrYFDVe)
			
 
				+{
			
 
				+    uint32_t ld_A5zD9sJZ = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_uwrYFDVe[(1) - (1)]));
			
 
				+    int64_t* ptr_A5zD9sJZ = (int64_t*) (STARPU_MATRIX_GET_PTR(buffers_uwrYFDVe[(1) - (1)]));
			
 
				+    double* ptr_hF6lCYyJ = (double*) (STARPU_VECTOR_GET_PTR(buffers_uwrYFDVe[(2) - (1)]));
			
 
				+    int64_t* ptr_qoUGBRtY = (int64_t*) (STARPU_VECTOR_GET_PTR(buffers_uwrYFDVe[(3) - (1)]));
			
 
				+    int64_t local_width = (int64_t) (STARPU_MATRIX_GET_NY(buffers_uwrYFDVe[(1) - (1)]));
			
 
				+    int64_t local_height = (int64_t) (STARPU_MATRIX_GET_NX(buffers_uwrYFDVe[(1) - (1)]));
			
 
				+    double conv_limit = (double) (2.0);
			
 
				+    int64_t kernel_ids__start_1 = (int64_t) (1);
			
 
				+    int64_t kernel_ids__step_1 = (int64_t) (1);
			
 
				+    int64_t kernel_ids__dim_1 = (int64_t) (jlstarpu_interval_size(kernel_ids__start_1, kernel_ids__step_1, local_width));
			
 
				+    int64_t kernel_ids__start_2 = (int64_t) (1);
			
 
				+    int64_t kernel_ids__step_2 = (int64_t) (1);
			
 
				+    int64_t kernel_ids__dim_2 = (int64_t) (jlstarpu_interval_size(kernel_ids__start_2, kernel_ids__step_2, local_height));
			
 
				+    int64_t nthreads = (int64_t) (((1) * (kernel_ids__dim_1)) * (kernel_ids__dim_2));
			
 
				+    int64_t nblocks = (int64_t) ((((nthreads) + (THREADS_PER_BLOCK)) - (1)) / (THREADS_PER_BLOCK));
			
 
				+    
			
 
				+    mandelbrot
			
 
				+        <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
			
 
				+        >>> (kernel_ids__start_1, kernel_ids__step_1, kernel_ids__dim_1, kernel_ids__start_2, 
			
 
				+             kernel_ids__step_2, kernel_ids__dim_2, ptr_hF6lCYyJ, local_width, 
			
 
				+             ptr_qoUGBRtY, local_height, conv_limit, ptr_A5zD9sJZ, 
			
 
				+             ld_A5zD9sJZ);
			
 
				+    ;
			
 
				+    cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				+}
			
 
				+
			
 
				+
			
--- a/julia/tst/mandelbrot/mandelbrot.c
+++ b/julia/tst/mandelbrot/mandelbrot.c
@@ -0,0 +1,248 @@
 
				+#include <stdio.h>
			
 
				+#include <stdlib.h>
			
 
				+#include <starpu.h>
			
 
				+#include "../display.h"
			
 
				+
			
 
				+void cpu_mandelbrot(void **, void *);
			
 
				+void gpu_mandelbrot(void **, void *);
			
 
				+
			
 
				+struct Params
			
 
				+{
			
 
				+	float cr;
			
 
				+	float ci;
			
 
				+	unsigned taskx;
			
 
				+	unsigned tasky;
			
 
				+	unsigned width;
			
 
				+	unsigned height;
			
 
				+};
			
 
				+
			
 
				+
			
 
				+
			
 
				+struct starpu_codelet cl =
			
 
				+{
			
 
				+	.cpu_funcs = {cpu_mandelbrot},
			
 
				+	.cuda_funcs = {gpu_mandelbrot},
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = {STARPU_RW}
			
 
				+};
			
 
				+
			
 
				+
			
 
				+void mandelbrot_with_starpu(int *pixels, float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy)
			
 
				+{
			
 
				+	starpu_data_handle_t p_handle;
			
 
				+
			
 
				+	starpu_matrix_data_register(&p_handle, STARPU_MAIN_RAM, (uintptr_t)pixels, width, width, height, sizeof(int));
			
 
				+
			
 
				+	struct starpu_data_filter vert =
			
 
				+	{
			
 
				+		.filter_func = starpu_matrix_filter_vertical_block,
			
 
				+		.nchildren = nslicesy
			
 
				+	};
			
 
				+
			
 
				+	struct starpu_data_filter horiz =
			
 
				+	{
			
 
				+		.filter_func = starpu_matrix_filter_block,
			
 
				+		.nchildren = nslicesx
			
 
				+	};
			
 
				+
			
 
				+	starpu_data_map_filters(p_handle, 2, &vert, &horiz);
			
 
				+
			
 
				+	unsigned taskx, tasky;
			
 
				+
			
 
				+	struct Params *params = malloc(nslicesx*nslicesy*sizeof(struct Params));
			
 
				+
			
 
				+	for (taskx = 0; taskx < nslicesx; taskx++){
			
 
				+		for (tasky = 0; tasky < nslicesy; tasky++){
			
 
				+			struct starpu_task *task = starpu_task_create();
			
 
				+			
			
 
				+			task->cl = &cl;
			
 
				+			task->handles[0] = starpu_data_get_sub_data(p_handle, 2, tasky, taskx);
			
 
				+			struct Params param = {cr, ci, taskx, tasky, width, height};
			
 
				+
			
 
				+			params[taskx + tasky*nslicesx] = param;
			
 
				+
			
 
				+			task->cl_arg = (params + taskx + tasky * nslicesx);
			
 
				+			task->cl_arg_size = sizeof(struct Params);
			
 
				+			
			
 
				+			starpu_task_submit(task);
			
 
				+		}
			
 
				+	}
			
 
				+	starpu_task_wait_for_all();
			
 
				+
			
 
				+	starpu_data_unpartition(p_handle, STARPU_MAIN_RAM);
			
 
				+
			
 
				+	starpu_data_unregister(p_handle);
			
 
				+
			
 
				+	free(params);
			
 
				+}
			
 
				+
			
 
				+void init_zero(int * pixels, unsigned width, unsigned height)
			
 
				+{
			
 
				+	unsigned i,j;
			
 
				+	for (i = 0; i < height; i++){
			
 
				+		for (j = 0; j < width; j++){
			
 
				+			pixels[j + i*width] = 0;
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void sort(double *arr, unsigned nbr_tests)
			
 
				+{
			
 
				+	unsigned j;
			
 
				+	
			
 
				+	int is_sort = 0;
			
 
				+	
			
 
				+	while (!is_sort){
			
 
				+
			
 
				+		is_sort = 1;
			
 
				+		
			
 
				+		for (j = 0; j < nbr_tests - 1; j++){
			
 
				+			if (arr[j] > arr[j+1]){
			
 
				+				is_sort = 0;
			
 
				+				double tmp = arr[j];
			
 
				+				arr[j] = arr[j+1];
			
 
				+				arr[j+1] = tmp;
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+double median_time(float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy, unsigned nbr_tests)
			
 
				+{
			
 
				+	int *Pixels = malloc(width*height*sizeof(int));
			
 
				+	
			
 
				+	unsigned i;
			
 
				+
			
 
				+	double exec_times[nbr_tests];
			
 
				+
			
 
				+	double start, stop, exec_t;
			
 
				+	for (i = 0; i < nbr_tests; i++){
			
 
				+		init_zero(Pixels, width, height);
			
 
				+		
			
 
				+		start = starpu_timing_now(); // starpu_timing_now() gives the time in microseconds.
			
 
				+		mandelbrot_with_starpu(Pixels, cr, ci, width, height, nslicesx, nslicesy);
			
 
				+		stop = starpu_timing_now();
			
 
				+		
			
 
				+		exec_t = (stop-start)/1.e6;
			
 
				+		exec_times[i] = exec_t;
			
 
				+	}
			
 
				+	char filename[30];
			
 
				+	sprintf(filename, "PPM/mandelbrot%d.ppm", width);
			
 
				+	printf("%s\n", filename);
			
 
				+
			
 
				+	mandelbrot_graph(filename, Pixels, width, height);
			
 
				+
			
 
				+	free(Pixels);
			
 
				+
			
 
				+	sort(exec_times, nbr_tests);
			
 
				+
			
 
				+	return exec_times[nbr_tests/2];	
			
 
				+}
			
 
				+
			
 
				+void fluctuation_time(float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy, unsigned nbr_tests, double *exec_times)
			
 
				+{
			
 
				+	int *Pixels = malloc(width*height*sizeof(int));
			
 
				+	
			
 
				+	unsigned i;
			
 
				+
			
 
				+	double start, stop, exec_t;
			
 
				+	for (i = 0; i < nbr_tests; i++){
			
 
				+		init_zero(Pixels, width, height);
			
 
				+		
			
 
				+		start = starpu_timing_now(); // starpu_timing_now() gives the time in microseconds.
			
 
				+		mandelbrot_with_starpu(Pixels, cr, ci, width, height, nslicesx, nslicesy);
			
 
				+		stop = starpu_timing_now();
			
 
				+		
			
 
				+		exec_t = (stop-start)/1.e6;
			
 
				+		exec_times[i] = exec_t;
			
 
				+
			
 
				+		/* char filename[33]; */
			
 
				+		/* sprintf(filename, "../PPM/mandelbrot%d.ppm", i + 1); */
			
 
				+		/* printf("%s\n", filename); */
			
 
				+		/* mandelbrot_graph(filename, Pixels, width, height); */
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+	free(Pixels);
			
 
				+
			
 
				+
			
 
				+
			
 
				+	
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void display_times(float cr, float ci, unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nslices, unsigned nbr_tests)
			
 
				+{
			
 
				+	
			
 
				+	unsigned dim;
			
 
				+
			
 
				+	FILE *myfile;
			
 
				+	myfile = fopen("DAT/mandelbrot_c_struct_times.dat", "w");
			
 
				+
			
 
				+	for (dim = start_dim; dim <= stop_dim; dim += step_dim){
			
 
				+		printf("Dimension: %u...\n", dim);
			
 
				+		double t = median_time(cr, ci, dim, dim, nslices, nslices, nbr_tests);
			
 
				+		
			
 
				+		printf("w = %u ; h = %u ; t = %f\n", dim, dim, t);
			
 
				+		
			
 
				+		fprintf(myfile, "%f\n", t);
			
 
				+		}
			
 
				+	
			
 
				+	fclose(myfile);
			
 
				+}
			
 
				+
			
 
				+void display_fluctuations(float cr, float ci, unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nslices, unsigned nbr_tests)
			
 
				+{
			
 
				+	
			
 
				+	unsigned dim;
			
 
				+
			
 
				+	FILE *myfile;
			
 
				+	myfile = fopen("DAT/mandelbrot_c_fluctuation.dat", "w");
			
 
				+
			
 
				+	double *exec_times = malloc(nbr_tests * sizeof(double));
			
 
				+	fluctuation_time(cr, ci, start_dim, start_dim, nslices, nslices, nbr_tests, exec_times);
			
 
				+		
			
 
				+	/* printf("w = %u ; h = %u ; t = %f\n", dim, dim, t); */
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < nbr_tests; i++){
			
 
				+		printf("test %u: %f seconds\n", i, exec_times[i]);
			
 
				+		fprintf(myfile, "%u %f\n", i, exec_times[i]);
			
 
				+	}
			
 
				+	
			
 
				+	fclose(myfile);
			
 
				+	free(exec_times);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+
			
 
				+	if (argc != 8){
			
 
				+		printf("Usage: %s cr ci start_dim step_dim stop_dim nslices(must divide dims) nbr_tests\n", argv[0]);
			
 
				+		return 1;
			
 
				+	}
			
 
				+	if (starpu_init(NULL) != EXIT_SUCCESS){
			
 
				+		fprintf(stderr, "ERROR\n");
			
 
				+		return 77;
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+	
			
 
				+	float cr = (float) atof(argv[1]);
			
 
				+	float ci = (float) atof(argv[2]);
			
 
				+	unsigned start_dim = (unsigned) atoi(argv[3]);
			
 
				+	unsigned step_dim = (unsigned) atoi(argv[4]);	
			
 
				+	unsigned stop_dim = (unsigned) atoi(argv[5]);
			
 
				+	unsigned nslices = (unsigned) atoi(argv[6]);
			
 
				+	unsigned nbr_tests = (unsigned) atoi(argv[7]);
			
 
				+
			
 
				+	display_times(cr, ci, start_dim, step_dim, stop_dim, nslices, nbr_tests);
			
 
				+	
			
 
				+	
			
 
				+	/* display_fluctuations(cr, ci, start_dim, step_dim, stop_dim, nslices, nbr_tests); */
			
 
				+
			
 
				+
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
--- a/julia/tst/mandelbrot/mandelbrot.jl
+++ b/julia/tst/mandelbrot/mandelbrot.jl
@@ -0,0 +1,30 @@
 
				+function mandelbrotjl(pixels ::Matrix{Int64}, centerr ::Float64, centeri ::Float64)
			
 
				+    height,width = size(pixels)
			
 
				+    zoom = width * 0.25296875
			
 
				+    val_diverge = 2.0
			
 
				+    max_iterations = (width/2) * 0.049715909 * log10(zoom);
			
 
				+
			
 
				+
			
 
				+    for y = 1:height
			
 
				+        for x = 1:width
			
 
				+            cr = centerr + (x - (width / 2))/zoom
			
 
				+            zr = cr
			
 
				+            ci = centeri + (y - (height / 2))/zoom
			
 
				+            zi = ci
			
 
				+
			
 
				+            n = 0
			
 
				+            while ((n < max_iterations) && (zr*zr + zi*zi < val_diverge*val_diverge))
			
 
				+                tmp = zr*zr - zi*zi + cr
			
 
				+                zi = 2*zr*zi + ci
			
 
				+                zr = tmp
			
 
				+                n = n+1
			
 
				+            end
			
 
				+            
			
 
				+            if (n < max_iterations)
			
 
				+                pixels[y,x] = round(255 * n / max_iterations)
			
 
				+            else
			
 
				+                pixels[y,x] = 0
			
 
				+            end
			
 
				+        end
			
 
				+    end
			
 
				+end
			
--- a/julia/tst/mandelbrot/mandelbrot_between.c
+++ b/julia/tst/mandelbrot/mandelbrot_between.c
@@ -0,0 +1,270 @@
 
				+#include <stdio.h>
			
 
				+#include <stdlib.h>
			
 
				+#include <starpu.h>
			
 
				+#include <stdint.h>
			
 
				+#include "../includes/display.h"
			
 
				+
			
 
				+void mandelbrot(void **, void *);
			
 
				+void CUDA_mandelbrot(void **, void *);
			
 
				+void test(void **, void *); /* Function used to test on my matrix, in the cpu_test_with_generated.c file. */
			
 
				+
			
 
				+struct Params
			
 
				+{
			
 
				+	float cr;
			
 
				+	float ci;
			
 
				+	unsigned taskx;
			
 
				+	unsigned tasky;
			
 
				+	unsigned width;
			
 
				+	unsigned height;
			
 
				+};
			
 
				+
			
 
				+
			
 
				+
			
 
				+struct starpu_codelet cl =
			
 
				+{
			
 
				+	/* .cpu_funcs = {test}, */ 
			
 
				+	.cpu_funcs = {mandelbrot},
			
 
				+	.cuda_funcs = {CUDA_mandelbrot},
			
 
				+
			
 
				+	//ARRAY PAR
			
 
				+	.nbuffers = 3,
			
 
				+	.modes = {STARPU_RW, STARPU_R, STARPU_R}
			
 
				+	//
			
 
				+
			
 
				+	//STRUCT PAR
			
 
				+	/* .nbuffers = 1, */
			
 
				+	/* .modes = {STARPU_RW} */
			
 
				+	//
			
 
				+};
			
 
				+
			
 
				+
			
 
				+void mandelbrot_with_starpu(int64_t *pixels, float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy, double *params)
			
 
				+{
			
 
				+	//ARRAY PAR
			
 
				+	starpu_data_handle_t p_handle, par_handle, v_handle;
			
 
				+	//
			
 
				+
			
 
				+	//STRUCT PAR
			
 
				+	/* starpu_data_handle_t p_handle; */
			
 
				+	//
			
 
				+
			
 
				+	starpu_matrix_data_register(&p_handle, STARPU_MAIN_RAM, (uintptr_t)pixels, width, width, height, sizeof(int64_t));
			
 
				+	//ARRAY PAR
			
 
				+	starpu_matrix_data_register(&par_handle, STARPU_MAIN_RAM, (uintptr_t)params, 5, 5, 1, sizeof(double));
			
 
				+	//
			
 
				+
			
 
				+	struct starpu_data_filter vert =
			
 
				+	{
			
 
				+		/* .filter_func = starpu_matrix_filter_block, */
			
 
				+		.filter_func = starpu_matrix_filter_vertical_block,
			
 
				+		.nchildren = nslicesy
			
 
				+	};
			
 
				+
			
 
				+	struct starpu_data_filter horiz =
			
 
				+	{
			
 
				+		.filter_func = starpu_matrix_filter_block,
			
 
				+		/* .filter_func = starpu_matrix_filter_vertical_block, */
			
 
				+		.nchildren = nslicesx
			
 
				+	};
			
 
				+
			
 
				+	starpu_data_map_filters(p_handle, 2, &vert, &horiz);
			
 
				+
			
 
				+	unsigned taskx, tasky;
			
 
				+
			
 
				+	//ARRAY PAR
			
 
				+	int64_t *V = malloc(2 * nslicesx * nslicesy * sizeof(int64_t));
			
 
				+	//
			
 
				+
			
 
				+	//STRUCT PAR
			
 
				+	/* struct Params *parameters = malloc(nslicesx * nslicesy * sizeof(struct Params)); */
			
 
				+	//
			
 
				+
			
 
				+	for (tasky = 0; tasky < nslicesy; tasky++){
			
 
				+		for (taskx = 0; taskx < nslicesx; taskx++){
			
 
				+			struct starpu_task *task = starpu_task_create();
			
 
				+
			
 
				+			//ARRAY PAR			
			
 
				+			V[2 * (taskx + nslicesx * tasky)] = taskx + 1;
			
 
				+			V[2 * (taskx + nslicesx * tasky) + 1] = tasky + 1;
			
 
				+			starpu_vector_data_register(&v_handle, STARPU_MAIN_RAM, (uintptr_t)&(V[2 * (taskx + nslicesx * tasky)]), 2, sizeof(int64_t));
			
 
				+			//
			
 
				+			
			
 
				+
			
 
				+			/* printf("Pre-Task%u_%u\n", taskx, tasky); */
			
 
				+
			
 
				+			task->cl = &cl;
			
 
				+
			
 
				+			
			
 
				+			task->handles[0] = starpu_data_get_sub_data(p_handle, 2, tasky, taskx);
			
 
				+			
			
 
				+			//ARRAY PAR
			
 
				+			task->handles[1] = par_handle;
			
 
				+			task->handles[2] = v_handle;
			
 
				+			//
			
 
				+
			
 
				+			//STRUCT PAR
			
 
				+			/* struct Params param = {cr, ci, taskx, tasky, width, height}; */
			
 
				+
			
 
				+			
			
 
				+
			
 
				+			/* parameters[taskx + tasky * nslicesx] = param; */
			
 
				+
			
 
				+			/* task->cl_arg = (parameters + taskx + tasky * nslicesx); */
			
 
				+			/* task->cl_arg_size = sizeof(struct Params); */
			
 
				+			//
			
 
				+
			
 
				+
			
 
				+			starpu_task_submit(task);
			
 
				+
			
 
				+			//ARRAY PAR
			
 
				+			starpu_data_unregister_submit(v_handle);
			
 
				+			//
			
 
				+		}
			
 
				+	}
			
 
				+	starpu_task_wait_for_all();
			
 
				+
			
 
				+	starpu_data_unpartition(p_handle, STARPU_MAIN_RAM);
			
 
				+
			
 
				+	starpu_data_unregister(p_handle);
			
 
				+
			
 
				+	//STRUCT PAR
			
 
				+	/* free(parameters); */
			
 
				+	//
			
 
				+
			
 
				+	//ARRAY PAR
			
 
				+	starpu_data_unregister(par_handle);
			
 
				+	/* starpu_data_unregister(v_handle); */
			
 
				+	free(V);
			
 
				+	//
			
 
				+}
			
 
				+
			
 
				+void init_zero(int64_t * pixels, unsigned width, unsigned height)
			
 
				+{
			
 
				+	unsigned i,j;
			
 
				+	for (i = 0; i < height; i++){
			
 
				+		for (j = 0; j < width; j++){
			
 
				+			pixels[j + i*width] = 0;
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void sort(double *arr, unsigned nbr_tests)
			
 
				+{
			
 
				+	unsigned j;
			
 
				+	
			
 
				+	int is_sort = 0;
			
 
				+	
			
 
				+	while (!is_sort){
			
 
				+
			
 
				+		is_sort = 1;
			
 
				+		
			
 
				+		for (j = 0; j < nbr_tests - 1; j++){
			
 
				+			if (arr[j] > arr[j+1]){
			
 
				+				is_sort = 0;
			
 
				+				double tmp = arr[j];
			
 
				+				arr[j] = arr[j+1];
			
 
				+				arr[j+1] = tmp;
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+double median_time(float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy, unsigned nbr_tests)
			
 
				+{
			
 
				+	int64_t *Pixels = malloc(width*height*sizeof(int64_t));
			
 
				+	double *params = malloc(4*sizeof(double));
			
 
				+	
			
 
				+	double max_iterations = (width/2) * 0.049715909 * log10(width * 0.25296875);
			
 
				+
			
 
				+	params[0] = (double) cr;
			
 
				+	params[1] = (double) ci;
			
 
				+	params[2] = (double) width;
			
 
				+	params[3] = (double) height;
			
 
				+	params[4] = (double) max_iterations;
			
 
				+	unsigned i;
			
 
				+
			
 
				+	double exec_times[nbr_tests];
			
 
				+
			
 
				+	double start, stop, exec_t;
			
 
				+	for (i = 0; i < nbr_tests; i++){
			
 
				+		init_zero(Pixels, width, height);
			
 
				+		
			
 
				+		start = starpu_timing_now(); // starpu_timing_now() gives the time in microseconds.
			
 
				+		mandelbrot_with_starpu(Pixels, cr, ci, width, height, nslicesx, nslicesy, params);
			
 
				+		stop = starpu_timing_now();
			
 
				+		
			
 
				+		exec_t = (stop-start)/1.e6;
			
 
				+		exec_times[i] = exec_t;
			
 
				+	}
			
 
				+
			
 
				+	
			
 
				+	char filename[34];
			
 
				+	sprintf(filename, "PPM/mandelbrottest%d.ppm", width);
			
 
				+	printf("%s\n", filename);
			
 
				+
			
 
				+	/* Due to Julia registering matrices differently in memory, we need to transpose the matrix we get from the Julia generated kernels */
			
 
				+
			
 
				+	mandelbrot_graph_transpose(filename, Pixels, width, height);
			
 
				+
			
 
				+
			
 
				+	/* mandelbrot_graph_transpose("PPM/mandelbrottest.ppm", Pixels, width, height); */
			
 
				+
			
 
				+	free(Pixels);
			
 
				+	free(params);
			
 
				+
			
 
				+	sort(exec_times, nbr_tests);
			
 
				+
			
 
				+	return exec_times[nbr_tests/2];	
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void display_times(float cr, float ci, unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nslices, unsigned nbr_tests)
			
 
				+{
			
 
				+	
			
 
				+	unsigned dim;
			
 
				+
			
 
				+	FILE *myfile;
			
 
				+	myfile = fopen("DAT/mandelbrot_c_array_times.dat", "w");
			
 
				+
			
 
				+	for (dim = start_dim; dim <= stop_dim; dim += step_dim){
			
 
				+		
			
 
				+		double t = median_time(cr, ci, dim, dim, nslices, nslices, nbr_tests);
			
 
				+		
			
 
				+		printf("w = %u ; h = %u ; t = %f\n", dim, dim, t);
			
 
				+		
			
 
				+		fprintf(myfile, "%f\n", t);
			
 
				+		}
			
 
				+	
			
 
				+	fclose(myfile);
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+
			
 
				+	if (argc != 8){
			
 
				+		printf("Usage: %s cr ci start_dim step_dim stop_dim nslices(must divide dims) nbr_tests\n", argv[0]);
			
 
				+		return 1;
			
 
				+	}
			
 
				+	if (starpu_init(NULL) != EXIT_SUCCESS){
			
 
				+		fprintf(stderr, "ERROR\n");
			
 
				+		return 77;
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+	
			
 
				+	float cr = (float) atof(argv[1]);
			
 
				+	float ci = (float) atof(argv[2]);
			
 
				+	unsigned start_dim = (unsigned) atoi(argv[3]);
			
 
				+	unsigned step_dim = (unsigned) atoi(argv[4]);	
			
 
				+	unsigned stop_dim = (unsigned) atoi(argv[5]);
			
 
				+	unsigned nslices = (unsigned) atoi(argv[6]);
			
 
				+	unsigned nbr_tests = (unsigned) atoi(argv[7]);
			
 
				+
			
 
				+	display_times(cr, ci, start_dim, step_dim, stop_dim, nslices, nbr_tests);
			
 
				+
			
 
				+
			
 
				+
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
--- a/julia/tst/mandelbrot/mandelbrot_def.jl
+++ b/julia/tst/mandelbrot/mandelbrot_def.jl
@@ -0,0 +1,197 @@
 
				+include("mandelbrot.jl")
			
 
				+
			
 
				+
			
 
				+function mandelbrot_with_starpu(A ::Matrix{Int64}, params ::Vector{Float64}, nslicesx ::Int64, nslicesy ::Int64) #mettre params en matrice. (pour que starpu le traite en matrice et non vecteur)
			
 
				+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesy)
			
 
				+    horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesx)
			
 
				+    @starpu_block let
			
 
				+	hA = starpu_data_register(A)
			
 
				+        hP = starpu_data_register(params)
			
 
				+	starpu_data_map_filters(hA, vert, horiz)
			
 
				+        
			
 
				+	@starpu_sync_tasks for tasky in (1:nslicesy)
			
 
				+            for taskx in (1 : nslicesx)
			
 
				+                @starpu_block let
			
 
				+                    v = Int64[tasky, taskx] #C'est le x qu'on augmente en fonction du nombre de slicey. Si il y a trois colonnes, x sera coupé en 3. Donc on inverse dans v.
			
 
				+                    hV = starpu_data_register(v)
			
 
				+                    @starpu_async_cl cl(hA[tasky, taskx], hP, hV)
			
 
				+	        end
			
 
				+            end
			
 
				+        end
			
 
				+    end
			
 
				+    return nothing
			
 
				+end
			
 
				+
			
 
				+function mandelbrot_with_starpu_cpu(A ::Matrix{Int64}, params ::Vector{Float64}, nslicesx ::Int64, nslicesy ::Int64) #mettre params en matrice. (pour que starpu le traite en matrice et non vecteur)
			
 
				+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesy)
			
 
				+    horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesx)
			
 
				+    @starpu_block let
			
 
				+	hA = starpu_data_register(A)
			
 
				+        hP = starpu_data_register(params)
			
 
				+	starpu_data_map_filters(hA, vert, horiz)
			
 
				+        
			
 
				+	@starpu_sync_tasks for tasky in (1:nslicesy)
			
 
				+            for taskx in (1 : nslicesx)
			
 
				+                v = Int64[tasky, taskx] #C'est le x qu'on augmente en fonction du nombre de slicey. Si il y a trois colonnes, x sera coupé en 3. Donc on inverse dans v.
			
 
				+                hV = starpu_data_register(v)
			
 
				+                @starpu_async_cl clcpu(hA[tasky, taskx], hP, hV)
			
 
				+	    end
			
 
				+        end
			
 
				+    end
			
 
				+    return nothing
			
 
				+end
			
 
				+
			
 
				+function mandelbrot_with_starpu_gpu(A ::Matrix{Int64}, params ::Vector{Float64}, nslicesx ::Int64, nslicesy ::Int64) #mettre params en matrice. (pour que starpu le traite en matrice et non vecteur)
			
 
				+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesx)
			
 
				+    horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesy)
			
 
				+    @starpu_block let
			
 
				+	hA = starpu_data_register(A)
			
 
				+        hP = starpu_data_register(params)
			
 
				+	starpu_data_map_filters(hA, vert, horiz)
			
 
				+        
			
 
				+	@starpu_sync_tasks for taskx in (1:nslicesx)
			
 
				+            for tasky in (1 : nslicesy)
			
 
				+                v = Int64[taskx, tasky] #C'est le x qu'on augmente en fonction du nombre de slicey. Si il y a trois colonnes, x sera coupé en 3. Donc on inverse dans v.
			
 
				+                hV = starpu_data_register(v)
			
 
				+                @starpu_async_cl clgpu(hA[taskx, tasky], hP, hV)
			
 
				+	    end
			
 
				+        end
			
 
				+    end
			
 
				+    return nothing
			
 
				+end
			
 
				+
			
 
				+function init_zero(Pixels ::Matrix{Int64}, width ::Int64, height ::Int64)
			
 
				+    for i in 1:height
			
 
				+        for j in 1:width
			
 
				+            Pixels[i,j] = 0
			
 
				+        end
			
 
				+    end
			
 
				+end
			
 
				+
			
 
				+function graph_pixels(Pixels ::Matrix{Int64}, width ::Int64, height ::Int64, filename ::String)
			
 
				+    open(filename, "w") do f
			
 
				+        write(f, "P3\n$width $height\n255\n")
			
 
				+        for i = 1:height
			
 
				+            for j = 1:width
			
 
				+                write(f, "$(Pixels[i,j]) 0 0 ")
			
 
				+            end
			
 
				+            write(f, "\n")
			
 
				+        end
			
 
				+    end
			
 
				+end
			
 
				+
			
 
				+function median_times(nbr_tests ::Int64, cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
			
 
				+
			
 
				+    exec_times_st ::Vector{Float64} = [0 for i = 1:nbr_tests]
			
 
				+    exec_times_cpu ::Vector{Float64} = [0 for i = 1:nbr_tests]
			
 
				+    exec_times_gpu ::Vector{Float64} = [0 for i = 1:nbr_tests]
			
 
				+    exec_times_jl ::Vector{Float64} = [0 for i = 1:nbr_tests]
			
 
				+    
			
 
				+    Pixels_st ::Matrix{Int64} = zeros(dim, dim)
			
 
				+    Pixels_cpu ::Matrix{Int64} = copy(Pixels_st)
			
 
				+    Pixels_gpu ::Matrix{Int64} = copy(Pixels_st)
			
 
				+    Pixels_jl ::Matrix{Int64} = copy(Pixels_st)
			
 
				+    
			
 
				+    max_iter ::Float64 = (dim/2) * 0.049715909 * log10(dim * 0.25296875)
			
 
				+
			
 
				+
			
 
				+    params = [cr, ci, dim, dim, max_iter]
			
 
				+
			
 
				+    for i = 1:nbr_tests
			
 
				+        
			
 
				+        tic()
			
 
				+        mandelbrot_with_starpu(Pixels_st, params, nslices, nslices)
			
 
				+        t = toq()
			
 
				+        
			
 
				+        
			
 
				+        exec_times_st[i] = t
			
 
				+        
			
 
				+        # tic()
			
 
				+        # mandelbrot_with_starpu_cpu(Pixels_cpu, params, nslices, nslices)
			
 
				+        # t = toq()
			
 
				+        
			
 
				+
			
 
				+        # exec_times_cpu[i] = t        
			
 
				+
			
 
				+        # tic()
			
 
				+        # mandelbrot_with_starpu_gpu(Pixels_gpu, params, nslices, nslices)
			
 
				+        # t = toq()
			
 
				+        
			
 
				+        
			
 
				+        # exec_times_gpu[i] = t
			
 
				+
			
 
				+
			
 
				+        # tic()
			
 
				+        # mandelbrotjl(Pixels_jl, cr, ci)
			
 
				+        # t = toq()
			
 
				+        
			
 
				+        
			
 
				+        # exec_times_jl[i] = t
			
 
				+     end
			
 
				+    # graph_pixels(Pixels_st, dim, dim, "../PPM/mandelbrotst$(dim).ppm")
			
 
				+    # graph_pixels(Pixels_cpu, dim, dim, "../PPM/mandelbrotcpu$(dim).ppm")
			
 
				+    # graph_pixels(Pixels_gpu, dim, dim, "../PPM/mandelbrotgpu$(dim).ppm")
			
 
				+    # graph_pixels(Pixels_jl, dim, dim, "../PPM/mandelbrotjl$(dim).ppm")
			
 
				+
			
 
				+    sort!(exec_times_st)
			
 
				+    # sort!(exec_times_cpu)
			
 
				+    # sort!(exec_times_gpu)
			
 
				+    # sort!(exec_times_jl)
			
 
				+
			
 
				+    
			
 
				+    results ::Vector{Float64} = [exec_times_st[1 + div(nbr_tests-1, 2)]]
			
 
				+    # results ::Vector{Float64} = [exec_times_st[1 + div(nbr_tests-1, 2)]]#, exec_times_cpu[1 + div(nbr_tests-1, 2)], exec_times_gpu[1 + div(nbr_tests-1, 2)]]#, exec_times_jl[1 + div(nbr_tests-1, 2)]]
			
 
				+    return results
			
 
				+end
			
 
				+
			
 
				+function display_time(cr ::Float64, ci ::Float64, start_dim ::Int64, step_dim ::Int64, stop_dim ::Int64, nslices ::Int64, nbr_tests ::Int64)
			
 
				+    # mtc = map( (x->parse(Float64,x)), open("../DAT/mandelbrot_c.dat") do f
			
 
				+    #             readlines(f)
			
 
				+    #             end)
			
 
				+
			
 
				+    # mtgen = map( (x->parse(Float64,x)), open("../DAT/mandelbrot_with_generated_times.dat") do f
			
 
				+    #             readlines(f)
			
 
				+    #             end)
			
 
				+
			
 
				+    mtjl = map( (x->parse(Float64,x)), open("../DAT/mandelbrot_jl_times.dat") do f
			
 
				+                    readlines(f)
			
 
				+                    end)
			
 
				+
			
 
				+    # mtjlcpu = map( (x->parse(Float64,x)), open("../DAT/mandelbrot_jl_cpu.dat") do f
			
 
				+    #             readlines(f)
			
 
				+    #             end)
			
 
				+
			
 
				+    mtstruct = map( (x->parse(Float64,x)), open("../DAT/mandelbrot_c_struct_times.dat") do f
			
 
				+                    readlines(f)
			
 
				+                    end)
			
 
				+
			
 
				+
			
 
				+    mtarray = map( (x->parse(Float64,x)), open("../DAT/mandelbrot_c_array_times.dat") do f
			
 
				+                    readlines(f)
			
 
				+                    end)
			
 
				+
			
 
				+
			
 
				+    i = 1
			
 
				+    # open("../DAT/mandelbrot.dat", "w") do f
			
 
				+    # open("../DAT/mandelbrot_gen_times.dat", "w") do ft
			
 
				+    open("../DAT/mandelbrot_speedups.dat", "w") do f
			
 
				+        for dim in (start_dim : step_dim : stop_dim)
			
 
				+            println("Dimension: $dim")
			
 
				+            
			
 
				+            res ::Vector{Float64} = median_times(nbr_tests, cr, ci, dim, nslices)
			
 
				+            # println("C: $(mtc[i])")
			
 
				+            # println("C with generated: $(mtgen[i])")
			
 
				+            # println("Julia with starpu: $(res[1])")
			
 
				+            # println("cpu: $(res[2])")
			
 
				+            println("c_struct: $(mtstruct[i])")
			
 
				+            println("c_array: $(mtarray[i])")
			
 
				+            println("jl_st: $(res[1])")
			
 
				+            # println("cpu: $(res[1])")
			
 
				+            # write(ft, "$(dim) $(res[1]) $(mtgen[i])\n")
			
 
				+            # write(f, "$(dim) $(res[4]/res[1]) $(res[4]/res[2]) $(res[4]/res[3]) $(res[4]/mtc[i])\n")
			
 
				+            write(f, "$(dim) $(mtjl[i]/res[1]) $(mtjl[i]/mtstruct[i]) $(mtjl[i]/mtarray[i])\n")
			
 
				+            # write(f, "$(res[1])\n")
			
 
				+            i = i + 1
			
 
				+        end
			
 
				+    end
			
 
				+end
			
--- a/julia/tst/mandelbrot/mandelbrot_generated.jl
+++ b/julia/tst/mandelbrot/mandelbrot_generated.jl
@@ -0,0 +1,48 @@
 
				+
			
 
				+
			
 
				+if length(ARGS) != 7
			
 
				+    println("Usage : julia prog.jl cr ci start_dim step_dim stop_dim nslices nbr_tests")
			
 
				+    quit()
			
 
				+end
			
 
				+
			
 
				+if (parse(Int64,ARGS[3]) % parse(Int64,ARGS[6]) != 0)
			
 
				+    println("The number of slices should divide all the dimensions.")
			
 
				+    quit()
			
 
				+end
			
 
				+
			
 
				+include("../../src/Wrapper/Julia/starpu_include.jl")
			
 
				+using StarPU
			
 
				+
			
 
				+@debugprint "starpu_init"
			
 
				+starpu_init(extern_task_path = "../build/generated_tasks_mandelbrot.so")
			
 
				+
			
 
				+perfmodel = StarpuPerfmodel(
			
 
				+    perf_type = STARPU_HISTORY_BASED,
			
 
				+    symbol = "history_perf"
			
 
				+)
			
 
				+
			
 
				+cl = StarpuCodelet(
			
 
				+    cpu_func = "mandelbrot",
			
 
				+    gpu_func = "CUDA_mandelbrot",
			
 
				+    modes = [STARPU_W, STARPU_R, STARPU_R],
			
 
				+    perfmodel = perfmodel
			
 
				+)
			
 
				+
			
 
				+clcpu = StarpuCodelet(
			
 
				+    cpu_func = "mandelbrot",
			
 
				+    modes = [STARPU_W, STARPU_R, STARPU_R],
			
 
				+    perfmodel = perfmodel
			
 
				+)
			
 
				+
			
 
				+clgpu = StarpuCodelet(
			
 
				+    gpu_func = "CUDA_mandelbrot",
			
 
				+    modes = [STARPU_W, STARPU_R, STARPU_R],
			
 
				+    perfmodel = perfmodel
			
 
				+)
			
 
				+
			
 
				+include("mandelbrot_def.jl")
			
 
				+
			
 
				+display_time(parse(Float64,ARGS[1]), parse(Float64,ARGS[2]), map((x -> parse(Int64, x)), ARGS[3:7])...)
			
 
				+
			
 
				+@debugprint "starpu_shutdown"
			
 
				+starpu_shutdown()
			
--- a/julia/tst/mult/cpu_cuda_mult.jl
+++ b/julia/tst/mult/cpu_cuda_mult.jl
@@ -0,0 +1,29 @@
 
				+
			
 
				+include("../../src/Compiler/include.jl")
			
 
				+
			
 
				+starpu_new_cpu_kernel_file("../build/generated_cpu_mult.c")
			
 
				+starpu_new_cuda_kernel_file("../build/generated_cuda_mult.cu")
			
 
				+
			
 
				+@cpu_cuda_kernel function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}) :: Void
			
 
				+
			
 
				+    width_m2 :: Int64 = width(m2)
			
 
				+    height_m1 :: Int64 = height(m1)
			
 
				+    width_m1 :: Int64 = width(m1)
			
 
				+    A ::Float64 = abs(-4.0)
			
 
				+    @indep for j in (1 : width_m2)
			
 
				+        @indep for i in (1 : height_m1)
			
 
				+
			
 
				+            sum :: Float32 = 0.
			
 
				+
			
 
				+            for k in (1 : width_m1)
			
 
				+                sum = sum + m1[i, k] * m2[k, j]
			
 
				+            end
			
 
				+
			
 
				+            m3[i, j] = sum
			
 
				+        end
			
 
				+    end
			
 
				+end
			
 
				+
			
 
				+compile_cpu_kernels("../build/generated_cpu_mult.so")
			
 
				+compile_cuda_kernels("../build/generated_cuda_mult.so")
			
 
				+combine_kernel_files("../build/generated_tasks.so", ["../build/generated_cpu_mult.so", "../build/generated_cuda_mult.so"])
			
--- a/julia/tst/mult/cpu_mult.c
+++ b/julia/tst/mult/cpu_mult.c
@@ -0,0 +1,54 @@
 
				+#include <stdint.h>
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+/*
			
 
				+ * The codelet is passed 3 matrices, the "descr" union-type field gives a
			
 
				+ * description of the layout of those 3 matrices in the local memory (ie. RAM
			
 
				+ * in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
			
 
				+ * registered data with the "matrix" data interface, we use the matrix macros.
			
 
				+ */
			
 
				+
			
 
				+void cpu_mult(void *descr[], void *arg)
			
 
				+{
			
 
				+	(void)arg;
			
 
				+	float *subA, *subB, *subC;
			
 
				+	uint32_t nxC, nyC, nyA;
			
 
				+	uint32_t ldA, ldB, ldC;
			
 
				+
			
 
				+	/* .blas.ptr gives a pointer to the first element of the local copy */
			
 
				+	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				+	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
			
 
				+	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
			
 
				+
			
 
				+
			
 
				+	/* .blas.nx is the number of rows (consecutive elements) and .blas.ny
			
 
				+	 * is the number of lines that are separated by .blas.ld elements (ld
			
 
				+	 * stands for leading dimension).
			
 
				+	 * NB: in case some filters were used, the leading dimension is not
			
 
				+	 * guaranteed to be the same in main memory (on the original matrix)
			
 
				+	 * and on the accelerator! */
			
 
				+	nxC = STARPU_MATRIX_GET_NX(descr[2]);
			
 
				+	nyC = STARPU_MATRIX_GET_NY(descr[2]);
			
 
				+	nyA = STARPU_MATRIX_GET_NY(descr[0]);
			
 
				+
			
 
				+	ldA = STARPU_MATRIX_GET_LD(descr[0]);
			
 
				+	ldB = STARPU_MATRIX_GET_LD(descr[1]);
			
 
				+	ldC = STARPU_MATRIX_GET_LD(descr[2]);
			
 
				+
			
 
				+	/* we assume a FORTRAN-ordering! */
			
 
				+	unsigned i,j,k;
			
 
				+	for (i = 0; i < nyC; i++)
			
 
				+	{
			
 
				+		for (j = 0; j < nxC; j++)
			
 
				+		{
			
 
				+			float sum = 0.0;
			
 
				+
			
 
				+			for (k = 0; k < nyA; k++)
			
 
				+			{
			
 
				+				sum += subA[j+k*ldA]*subB[k+i*ldB];
			
 
				+			}
			
 
				+
			
 
				+			subC[j + i*ldC] = sum;
			
 
				+		}
			
 
				+	}
			
 
				+}
			
--- a/julia/tst/mult/gpu_mult.cu
+++ b/julia/tst/mult/gpu_mult.cu
@@ -0,0 +1,68 @@
 
				+#include <starpu.h>
			
 
				+#include <stdint.h>
			
 
				+#include <stdio.h>
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+__global__ void gpuMultKernel
			
 
				+(
			
 
				+		uint32_t nxC, uint32_t nyC, uint32_t nyA,
			
 
				+		uint32_t ldA, uint32_t ldB, uint32_t ldC,
			
 
				+		float * subA, float * subB, float * subC
			
 
				+)
			
 
				+{
			
 
				+	uint32_t id, i, j, k;
			
 
				+	float sum;
			
 
				+
			
 
				+	id = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				+	i = id % nxC;
			
 
				+	j = id / nxC;
			
 
				+
			
 
				+	if (j >= nyC){
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	sum = 0.;
			
 
				+
			
 
				+	for (k = 0 ; k < nyA ; k++){
			
 
				+		sum += subA[i + k*ldA] * subB[k + j*ldB];
			
 
				+	}
			
 
				+
			
 
				+	subC[i + j*ldC] = sum;
			
 
				+
			
 
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+#define THREADS_PER_BLOCK 64
			
 
				+
			
 
				+extern "C" void gpu_mult(void * descr[], void * args)
			
 
				+{
			
 
				+
			
 
				+	float * d_subA, * d_subB, * d_subC;
			
 
				+	uint32_t nxC, nyC, nyA;
			
 
				+	uint32_t ldA, ldB, ldC;
			
 
				+	uint32_t nblocks;
			
 
				+
			
 
				+	d_subA = (float *) STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				+	d_subB = (float *) STARPU_MATRIX_GET_PTR(descr[1]);
			
 
				+	d_subC = (float *) STARPU_MATRIX_GET_PTR(descr[2]);
			
 
				+
			
 
				+	nxC = STARPU_MATRIX_GET_NX(descr[2]);
			
 
				+	nyC = STARPU_MATRIX_GET_NY(descr[2]);
			
 
				+	nyA = STARPU_MATRIX_GET_NY(descr[0]);
			
 
				+
			
 
				+	ldA = STARPU_MATRIX_GET_LD(descr[0]);
			
 
				+	ldB = STARPU_MATRIX_GET_LD(descr[1]);
			
 
				+	ldC = STARPU_MATRIX_GET_LD(descr[2]);
			
 
				+
			
 
				+	nblocks = (nxC * nyC + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
			
 
				+
			
 
				+	gpuMultKernel
			
 
				+		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
			
 
				+		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
			
 
				+
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				+
			
 
				+}
			
--- a/julia/tst/mult/mult.c
+++ b/julia/tst/mult/mult.c
@@ -0,0 +1,288 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2012-2013                                Inria
			
 
				+ * Copyright (C) 2009-2011,2013-2015                      Université de Bordeaux
			
 
				+ * Copyright (C) 2010                                     Mehdi Juhoor
			
 
				+ * Copyright (C) 2010-2013,2015,2017                      CNRS
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * This example shows a simple implementation of a blocked matrix
			
 
				+ * multiplication. Note that this is NOT intended to be an efficient
			
 
				+ * implementation of sgemm! In this example, we show:
			
 
				+ *  - how to declare dense matrices (starpu_matrix_data_register)
			
 
				+ *  - how to manipulate matrices within codelets (eg. descr[0].blas.ld)
			
 
				+ *  - how to use filters to partition the matrices into blocks
			
 
				+ *    (starpu_data_partition and starpu_data_map_filters)
			
 
				+ *  - how to unpartition data (starpu_data_unpartition) and how to stop
			
 
				+ *    monitoring data (starpu_data_unregister)
			
 
				+ *  - how to manipulate subsets of data (starpu_data_get_sub_data)
			
 
				+ *  - how to construct an autocalibrated performance model (starpu_perfmodel)
			
 
				+ *  - how to submit asynchronous tasks
			
 
				+ */
			
 
				+
			
 
				+#include <string.h>
			
 
				+#include <math.h>
			
 
				+#include <sys/types.h>
			
 
				+#include <signal.h>
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+ * That program should compute C = A * B
			
 
				+ *
			
 
				+ *   A of size (z,y)
			
 
				+ *   B of size (x,z)
			
 
				+ *   C of size (x,y)
			
 
				+
			
 
				+              |---------------|
			
 
				+            z |       B       |
			
 
				+              |---------------|
			
 
				+       z              x
			
 
				+     |----|   |---------------|
			
 
				+     |    |   |               |
			
 
				+     |    |   |               |
			
 
				+     | A  | y |       C       |
			
 
				+     |    |   |               |
			
 
				+     |    |   |               |
			
 
				+     |----|   |---------------|
			
 
				+
			
 
				+ */
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+void gpu_mult(void **, void *);
			
 
				+void cpu_mult(void **, void *);
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+static struct starpu_perfmodel model =
			
 
				+{
			
 
				+		.type = STARPU_HISTORY_BASED,
			
 
				+		.symbol = "history_perf"
			
 
				+};
			
 
				+
			
 
				+static struct starpu_codelet cl =
			
 
				+{
			
 
				+		.cpu_funcs = {cpu_mult},
			
 
				+		.cpu_funcs_name = {"cpu_mult"},
			
 
				+		.cuda_funcs = {gpu_mult},
			
 
				+		.nbuffers = 3,
			
 
				+		.modes = {STARPU_R, STARPU_R, STARPU_W},
			
 
				+		.model = &model
			
 
				+};
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+void multiply_with_starpu(float *A, float *B, float *C,  unsigned xdim,  unsigned ydim,  unsigned zdim, unsigned nslicesx, unsigned nslicesy)
			
 
				+{
			
 
				+	starpu_data_handle_t A_handle, B_handle, C_handle;
			
 
				+
			
 
				+
			
 
				+	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A,
			
 
				+			ydim, ydim, zdim, sizeof(float));
			
 
				+	starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B,
			
 
				+			zdim, zdim, xdim, sizeof(float));
			
 
				+	starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C,
			
 
				+			ydim, ydim, xdim, sizeof(float));
			
 
				+
			
 
				+
			
 
				+	struct starpu_data_filter vert =
			
 
				+	{
			
 
				+			.filter_func = starpu_matrix_filter_vertical_block,
			
 
				+			.nchildren = nslicesx
			
 
				+	};
			
 
				+
			
 
				+	struct starpu_data_filter horiz =
			
 
				+	{
			
 
				+			.filter_func = starpu_matrix_filter_block,
			
 
				+			.nchildren = nslicesy
			
 
				+	};
			
 
				+
			
 
				+
			
 
				+	starpu_data_partition(B_handle, &vert);
			
 
				+	starpu_data_partition(A_handle, &horiz);
			
 
				+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
			
 
				+
			
 
				+	unsigned taskx, tasky;
			
 
				+
			
 
				+	for (taskx = 0; taskx < nslicesx; taskx++){
			
 
				+		for (tasky = 0; tasky < nslicesy; tasky++){
			
 
				+
			
 
				+			struct starpu_task *task = starpu_task_create();
			
 
				+
			
 
				+			task->cl = &cl;
			
 
				+			task->handles[0] = starpu_data_get_sub_data(A_handle, 1, tasky);
			
 
				+			task->handles[1] = starpu_data_get_sub_data(B_handle, 1, taskx);
			
 
				+			task->handles[2] = starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
			
 
				+
			
 
				+			starpu_task_submit(task);
			
 
				+
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	starpu_task_wait_for_all();
			
 
				+
			
 
				+
			
 
				+	starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
			
 
				+	starpu_data_unpartition(B_handle, STARPU_MAIN_RAM);
			
 
				+	starpu_data_unpartition(C_handle, STARPU_MAIN_RAM);
			
 
				+
			
 
				+	starpu_data_unregister(A_handle);
			
 
				+	starpu_data_unregister(B_handle);
			
 
				+	starpu_data_unregister(C_handle);
			
 
				+
			
 
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+void init_rand(float * m, unsigned width, unsigned height)
			
 
				+{
			
 
				+	unsigned i,j;
			
 
				+
			
 
				+	for (j = 0 ; j < height ; j++){
			
 
				+		for (i = 0 ; i < width ; i++){
			
 
				+			m[j+i*height] = (float)(starpu_drand48());
			
 
				+			
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void init_zero(float * m, unsigned width, unsigned height)
			
 
				+{
			
 
				+	memset(m, 0, sizeof(float) * width * height);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+void sort(unsigned int size, double t[])
			
 
				+{
			
 
				+	unsigned int j;
			
 
				+
			
 
				+	int is_sort = 0;
			
 
				+
			
 
				+	while(!is_sort){
			
 
				+
			
 
				+		is_sort = 1;
			
 
				+
			
 
				+		for (j = 0 ; j < size - 1 ; j++){
			
 
				+
			
 
				+			if (t[j] > t[j+1]){
			
 
				+				double tmp = t[j];
			
 
				+				t[j] = t[j+1];
			
 
				+				t[j+1] = tmp;
			
 
				+				is_sort = 0;
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+}
			
 
				+
			
 
				+
			
 
				+double median_time(unsigned nb_test, unsigned xdim, unsigned ydim, unsigned zdim, unsigned nsclicesx, unsigned nsclicesy)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+
			
 
				+	float * A = (float *) valloc(zdim*ydim*sizeof(float));
			
 
				+	float * B = (float *) valloc(xdim*zdim*sizeof(float));
			
 
				+	float * C = (float *) valloc(xdim*ydim*sizeof(float));
			
 
				+
			
 
				+	double exec_times[nb_test];
			
 
				+
			
 
				+	for (i = 0 ; i < nb_test ; i++){
			
 
				+
			
 
				+		double start, stop, exec_t;
			
 
				+
			
 
				+		init_rand(A, zdim, ydim);
			
 
				+		init_rand(B, xdim, zdim);
			
 
				+		init_zero(C, xdim, ydim);
			
 
				+
			
 
				+		init_rand(A, zdim, ydim);
			
 
				+		init_rand(B, xdim, zdim);
			
 
				+		init_zero(C, xdim, ydim);
			
 
				+
			
 
				+		start = starpu_timing_now();
			
 
				+		multiply_with_starpu(A, B, C, xdim, ydim, zdim, nsclicesx, nsclicesy);
			
 
				+		stop = starpu_timing_now();
			
 
				+
			
 
				+		exec_t = (stop - start)/1.e6;
			
 
				+		exec_times[i] = exec_t;
			
 
				+	}
			
 
				+
			
 
				+	sort(nb_test, exec_times);
			
 
				+
			
 
				+	free(A);
			
 
				+	free(B);
			
 
				+	free(C);
			
 
				+
			
 
				+	/* return exec_times[nb_test/2]; */
			
 
				+
			
 
				+	return exec_times[0];
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void display_times(unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nb_tests, unsigned nsclicesx, unsigned nsclicesy)
			
 
				+{
			
 
				+	unsigned dim;
			
 
				+	FILE *fp;
			
 
				+	fp = fopen("../DAT/mult_c.dat", "w");
			
 
				+	for (dim = start_dim ; dim <= stop_dim ; dim += step_dim){
			
 
				+		double t = median_time(nb_tests, dim, dim, dim, nsclicesx, nsclicesy);
			
 
				+		fprintf(fp, "%f\n", t);
			
 
				+		printf("%u ; %f\n", dim, t);
			
 
				+	}
			
 
				+	fclose(fp);
			
 
				+
			
 
				+}
			
 
				+
			
 
				+
			
 
				+int main(int argc, char * argv[])
			
 
				+{
			
 
				+
			
 
				+	if (argc != 7){
			
 
				+		printf("Usage : %s start_dim step_dim stop_dim nb_tests nsclicesx nsclicesy\n", argv[0]);
			
 
				+		return 1;
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+	if (starpu_init(NULL) != EXIT_SUCCESS){
			
 
				+		fprintf(stderr, "ERROR\n");
			
 
				+		return 77;
			
 
				+	}
			
 
				+
			
 
				+	unsigned start_dim = (unsigned) atoi(argv[1]);
			
 
				+	unsigned step_dim = (unsigned) atoi(argv[2]);
			
 
				+	unsigned stop_dim = (unsigned) atoi(argv[3]);
			
 
				+	unsigned nb_tests = (unsigned) atoi(argv[4]);
			
 
				+	unsigned nsclicesx = (unsigned) atoi(argv[5]);
			
 
				+	unsigned nsclicesy = (unsigned) atoi(argv[6]);
			
 
				+
			
 
				+	display_times(start_dim, step_dim, stop_dim, nb_tests, nsclicesx, nsclicesy);
			
 
				+
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
--- a/julia/tst/mult/mult_def.jl
+++ b/julia/tst/mult/mult_def.jl
@@ -0,0 +1,228 @@
 
				+using Base.LinAlg
			
 
				+include("mult_naive.jl")
			
 
				+
			
 
				+#   A of size (y,z)
			
 
				+#   B of size (z,x)
			
 
				+#   C of size (y,x)
			
 
				+
			
 
				+
			
 
				+#              |---------------|
			
 
				+#            z |       B       |
			
 
				+#              |---------------|
			
 
				+#       z              x
			
 
				+#     |----|   |---------------|
			
 
				+#     |    |   |               |
			
 
				+#     |    |   |               |
			
 
				+#     | A  | y |       C       |
			
 
				+#     |    |   |               |
			
 
				+#     |    |   |               |
			
 
				+#     |----|   |---------------|
			
 
				+#
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
			
 
				+
			
 
				+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesx)
			
 
				+    horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesy)
			
 
				+
			
 
				+    @starpu_block let
			
 
				+
			
 
				+        hA,hB,hC = starpu_data_register(A, B, C)
			
 
				+
			
 
				+        starpu_data_partition(hB, vert)
			
 
				+        starpu_data_partition(hA, horiz)
			
 
				+        starpu_data_map_filters(hC, vert, horiz)
			
 
				+
			
 
				+        @starpu_sync_tasks for taskx in (1 : nslicesx)
			
 
				+            for tasky in (1 : nslicesy)
			
 
				+                @starpu_async_cl cl(hA[tasky], hB[taskx], hC[taskx, tasky])
			
 
				+            end
			
 
				+        end
			
 
				+    end
			
 
				+
			
 
				+    return nothing
			
 
				+end
			
 
				+
			
 
				+function multiply_with_starpu_cpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
			
 
				+
			
 
				+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesx)
			
 
				+    horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesy)
			
 
				+
			
 
				+    @starpu_block let
			
 
				+
			
 
				+        hA,hB,hC = starpu_data_register(A, B, C)
			
 
				+
			
 
				+        starpu_data_partition(hB, vert)
			
 
				+        starpu_data_partition(hA, horiz)
			
 
				+        starpu_data_map_filters(hC, vert, horiz)
			
 
				+
			
 
				+        @starpu_sync_tasks for taskx in (1 : nslicesx)
			
 
				+            for tasky in (1 : nslicesy)
			
 
				+                @starpu_async_cl clcpu(hA[tasky], hB[taskx], hC[taskx, tasky])
			
 
				+            end
			
 
				+        end
			
 
				+    end
			
 
				+
			
 
				+    return nothing
			
 
				+end
			
 
				+
			
 
				+function multiply_with_starpu_gpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
			
 
				+
			
 
				+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesx)
			
 
				+    horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesy)
			
 
				+
			
 
				+    @starpu_block let
			
 
				+
			
 
				+        hA,hB,hC = starpu_data_register(A, B, C)
			
 
				+
			
 
				+        starpu_data_partition(hB, vert)
			
 
				+        starpu_data_partition(hA, horiz)
			
 
				+        starpu_data_map_filters(hC, vert, horiz)
			
 
				+
			
 
				+        @starpu_sync_tasks for taskx in (1 : nslicesx)
			
 
				+            for tasky in (1 : nslicesy)
			
 
				+                @starpu_async_cl clgpu(hA[tasky], hB[taskx], hC[taskx, tasky])
			
 
				+            end
			
 
				+        end
			
 
				+    end
			
 
				+
			
 
				+    return nothing
			
 
				+end
			
 
				+
			
 
				+
			
 
				+
			
 
				+function approximately_equals(
			
 
				+    A :: Matrix{Cfloat},
			
 
				+    B :: Matrix{Cfloat},
			
 
				+    eps = 1e-2
			
 
				+)
			
 
				+    (height, width) = size(A)
			
 
				+
			
 
				+    for j in (1 : width)
			
 
				+        for i in (1 : height)
			
 
				+            if (abs(A[i,j] - B[i,j]) > eps * max(abs(B[i,j]), abs(A[i,j])))
			
 
				+                println("A[$i,$j] : $(A[i,j]), B[$i,$j] : $(B[i,j])")
			
 
				+                return false
			
 
				+            end
			
 
				+        end
			
 
				+    end
			
 
				+
			
 
				+    return true
			
 
				+end
			
 
				+
			
 
				+
			
 
				+
			
 
				+function median_times(nb_tests, xdim, zdim, ydim, nslicesx, nslicesy)
			
 
				+
			
 
				+    exec_times_st ::Vector{Float64} = [0 for i = 1:nb_tests]
			
 
				+    exec_times_cpu ::Vector{Float64} = [0 for i = 1:nb_tests]
			
 
				+    exec_times_gpu ::Vector{Float64} = [0 for i = 1:nb_tests]
			
 
				+    exec_times_jl ::Vector{Float64} = [0 for i = 1:nb_tests]
			
 
				+
			
 
				+    A = Array(rand(Cfloat, ydim, zdim))
			
 
				+    B = Array(rand(Cfloat, zdim, xdim))
			
 
				+    C = zeros(Float32, ydim, xdim)
			
 
				+    D  = A * B
			
 
				+
			
 
				+    for i in (1 : nb_tests)
			
 
				+        
			
 
				+        # tic()
			
 
				+        # multiply_with_starpu(A, B, C, nslicesx, nslicesy)
			
 
				+        # t = toq()
			
 
				+
			
 
				+        # if (!approximately_equals(D, C))
			
 
				+        #     error("Invalid st result")
			
 
				+        # end
			
 
				+        # exec_times_st[i] = t
			
 
				+
			
 
				+
			
 
				+
			
 
				+        # tic()
			
 
				+        # multiply_with_starpu_cpu(A, B, C, nslicesx, nslicesy)
			
 
				+        # tcpu = toq()
			
 
				+
			
 
				+        # if (!approximately_equals(D, C))
			
 
				+        #     error("Invalid cpu result")
			
 
				+        # end
			
 
				+        # exec_times_cpu[i] = tcpu
			
 
				+
			
 
				+
			
 
				+
			
 
				+        # tic()
			
 
				+        # multiply_with_starpu_gpu(A, B, C, nslicesx, nslicesy)
			
 
				+        # tgpu = toq()
			
 
				+
			
 
				+        # if (!approximately_equals(D, C))
			
 
				+        #     error("Invalid gpu result")
			
 
				+        # end
			
 
				+        # exec_times_gpu[i] = tgpu
			
 
				+
			
 
				+        al ::Float32 = 1.0
			
 
				+        be ::Float32 = 0.0 
			
 
				+
			
 
				+        tic()
			
 
				+        # multjl(A, B, C)
			
 
				+        BLAS.gemm!('N','N', al, A, B, be, C)
			
 
				+        # C = BLAS.gemm!('N', 'N', 1.0, A, B)
			
 
				+        tjl = toq()
			
 
				+
			
 
				+        if (!approximately_equals(D, C))
			
 
				+            error("Invalid jl result")
			
 
				+        end
			
 
				+
			
 
				+        exec_times_jl[i] = tjl
			
 
				+
			
 
				+    end
			
 
				+
			
 
				+  
			
 
				+    # sort!(exec_times_st)
			
 
				+    # sort!(exec_times_cpu)
			
 
				+    # sort!(exec_times_gpu)
			
 
				+    sort!(exec_times_jl)
			
 
				+  
			
 
				+    results ::Vector{Float64} = [exec_times_jl[1 + div(nb_tests-1, 2)]]#, exec_times_cpu[1 + div(nb_tests-1, 2)], exec_times_gpu[1 + div(nb_tests-1, 2)], exec_times_jl[1 + div(nb_tests-1, 2)]]
			
 
				+
			
 
				+    return results
			
 
				+end
			
 
				+
			
 
				+
			
 
				+
			
 
				+function display_times(start_dim, step_dim, stop_dim, nb_tests, nslicesx, nslicesy)
			
 
				+    # mtc = map( (x->parse(Float64,x)), open("DAT/mult_c.dat") do f
			
 
				+    #              readlines(f)
			
 
				+    #              end)
			
 
				+
			
 
				+    # mtext = map( (x->parse(Float64,x)), open("DAT/mult_ext.dat") do f
			
 
				+    #              readlines(f)
			
 
				+    #              end)
			
 
				+
			
 
				+    # mtjl = map( (x->parse(Float64,x)), open("DAT/mult_jl.dat") do f
			
 
				+    #             readlines(f)
			
 
				+    #             end)
			
 
				+    
			
 
				+
			
 
				+    # open("../DAT/mult_ext.dat", "w") do f    
			
 
				+    # open("../DAT/mult_jl.dat", "w") do f
			
 
				+    open("../DAT/mult_jl_times.dat", "w") do ft
			
 
				+        # open("DAT/mult.dat", "w") do f
			
 
				+            # i = 1
			
 
				+        for dim in (start_dim : step_dim : stop_dim)
			
 
				+            println("Dimension: $dim")
			
 
				+            # println("C: $(mtc[i])")
			
 
				+            res ::Vector{Float64} = median_times(nb_tests, dim, dim, dim, nslicesx, nslicesy)
			
 
				+            println("jl: $(res[1])")
			
 
				+            # println("jlcpu: $(res[2])")
			
 
				+            # println("jlgpu: $(res[3])")
			
 
				+            # println("jl: $(res[4])")
			
 
				+            # write(f, "$(dim) $(res[4]/res[1]) $(res[4]/res[2]) $(res[4]/res[3]) $(res[4]/mtc[i])\n")
			
 
				+            # write(f, "$dim $(mtjl[i]/res[1]) $(mtjl[i]/mtext[i]) $(mtjl[i]/mtc[i])\n")
			
 
				+            # write(ft, "$(res[1]) $(mtc[i]) $(mtext[i]) $(mtjl[i])\n")
			
 
				+            write(ft, "$(res[1])\n")
			
 
				+            # i = i + 1
			
 
				+            # end
			
 
				+        end
			
 
				+    end
			
 
				+end
			
--- a/julia/tst/mult/mult_extern.jl
+++ b/julia/tst/mult/mult_extern.jl
@@ -0,0 +1,30 @@
 
				+
			
 
				+if length(ARGS) != 6
			
 
				+    println("Usage : julia prog.jl start_dim step_dim stop_dim nb_tests nslicesx nslicesy")
			
 
				+    quit()
			
 
				+end
			
 
				+
			
 
				+include("../../src/Wrapper/Julia/starpu_include.jl")
			
 
				+using StarPU
			
 
				+
			
 
				+@debugprint "starpu_init"
			
 
				+starpu_init(extern_task_path = "../build/extern_tasks.so")
			
 
				+
			
 
				+perfmodel = StarpuPerfmodel(
			
 
				+    perf_type = STARPU_HISTORY_BASED,
			
 
				+    symbol = "history_perf"
			
 
				+)
			
 
				+
			
 
				+cl = StarpuCodelet(
			
 
				+    cpu_func = "cpu_mult",
			
 
				+    gpu_func = "gpu_mult",
			
 
				+    modes = [STARPU_R, STARPU_R, STARPU_W],
			
 
				+    perfmodel = perfmodel
			
 
				+)
			
 
				+
			
 
				+include("mult_def.jl")
			
 
				+
			
 
				+display_times(map((x -> parse(Int64,x)), ARGS)...)
			
 
				+
			
 
				+@debugprint "starpu_shutdown"
			
 
				+starpu_shutdown()
			
--- a/julia/tst/mult/mult_extern_graph.jl
+++ b/julia/tst/mult/mult_extern_graph.jl
@@ -0,0 +1,30 @@
 
				+
			
 
				+if length(ARGS) != 6
			
 
				+    println("Usage : julia prog.jl start_dim step_dim stop_dim nb_tests nslicesx nslicesy")
			
 
				+    quit()
			
 
				+end
			
 
				+
			
 
				+include("../../src/Wrapper/Julia/starpu_include.jl")
			
 
				+using StarPU
			
 
				+
			
 
				+@debugprint "starpu_init"
			
 
				+starpu_init(extern_task_path = "../build/extern_tasks.so")
			
 
				+
			
 
				+perfmodel = StarpuPerfmodel(
			
 
				+    perf_type = STARPU_HISTORY_BASED,
			
 
				+    symbol = "history_perf"
			
 
				+)
			
 
				+
			
 
				+cl = StarpuCodelet(
			
 
				+    cpu_func = "cpu_mult",
			
 
				+    gpu_func = "gpu_mult",
			
 
				+    modes = [STARPU_R, STARPU_R, STARPU_W],
			
 
				+    perfmodel = perfmodel
			
 
				+)
			
 
				+
			
 
				+include("mult_def.jl")
			
 
				+
			
 
				+display_times(map((x -> parse(Int64,x)), ARGS)..., "../mult_extern.dat")
			
 
				+
			
 
				+@debugprint "starpu_shutdown"
			
 
				+starpu_shutdown()
			
--- a/julia/tst/mult/mult_generated.jl
+++ b/julia/tst/mult/mult_generated.jl
@@ -0,0 +1,46 @@
 
				+
			
 
				+if length(ARGS) != 6
			
 
				+    println("Usage : julia prog.jl start_dim step_dim stop_dim nb_tests nslicesx nslicesy")
			
 
				+    quit()
			
 
				+end
			
 
				+
			
 
				+
			
 
				+include("../../src/Wrapper/Julia/starpu_include.jl")
			
 
				+using StarPU
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+@debugprint "starpu_init"
			
 
				+starpu_init(extern_task_path = "../build/generated_tasks.so")
			
 
				+
			
 
				+perfmodel = StarpuPerfmodel(
			
 
				+    perf_type = STARPU_HISTORY_BASED,
			
 
				+    symbol = "history_perf"
			
 
				+)
			
 
				+
			
 
				+cl = StarpuCodelet(
			
 
				+    cpu_func = "matrix_mult",
			
 
				+    gpu_func = "CUDA_matrix_mult",
			
 
				+    modes = [STARPU_R, STARPU_R, STARPU_W],
			
 
				+    perfmodel = perfmodel
			
 
				+)
			
 
				+
			
 
				+clcpu = StarpuCodelet(
			
 
				+    cpu_func = "matrix_mult",
			
 
				+    modes = [STARPU_R, STARPU_R, STARPU_W],
			
 
				+    perfmodel = perfmodel
			
 
				+)
			
 
				+
			
 
				+clgpu = StarpuCodelet(
			
 
				+    gpu_func = "CUDA_matrix_mult",
			
 
				+    modes = [STARPU_R, STARPU_R, STARPU_W],
			
 
				+    perfmodel = perfmodel
			
 
				+)
			
 
				+
			
 
				+include("mult_def.jl")
			
 
				+
			
 
				+display_times(map( (x -> parse(Int64,x)) , ARGS)...)
			
 
				+
			
 
				+@debugprint "starpu_shutdown"
			
 
				+starpu_shutdown()
			
--- a/julia/tst/mult/mult_generated_graph.jl
+++ b/julia/tst/mult/mult_generated_graph.jl
@@ -0,0 +1,34 @@
 
				+
			
 
				+if length(ARGS) != 6
			
 
				+    println("Usage : julia prog.jl start_dim step_dim stop_dim nb_tests nslicesx nslicesy")
			
 
				+    quit()
			
 
				+end
			
 
				+
			
 
				+
			
 
				+include("../../src/Wrapper/Julia/starpu_include.jl")
			
 
				+using StarPU
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+@debugprint "starpu_init"
			
 
				+starpu_init(extern_task_path = "../build/generated_tasks.so")
			
 
				+
			
 
				+perfmodel = StarpuPerfmodel(
			
 
				+    perf_type = STARPU_HISTORY_BASED,
			
 
				+    symbol = "history_perf"
			
 
				+)
			
 
				+
			
 
				+cl = StarpuCodelet(
			
 
				+    cpu_func = "matrix_mult",
			
 
				+    gpu_func = "CUDA_matrix_mult",
			
 
				+    modes = [STARPU_R, STARPU_R, STARPU_W],
			
 
				+    perfmodel = perfmodel
			
 
				+)
			
 
				+
			
 
				+include("mult_def.jl")
			
 
				+
			
 
				+display_times(map( (x -> parse(Int64,x)) , ARGS)..., "../mult_generated.dat")
			
 
				+
			
 
				+@debugprint "starpu_shutdown"
			
 
				+starpu_shutdown()
			
--- a/julia/tst/mult/mult_naive.jl
+++ b/julia/tst/mult/mult_naive.jl
@@ -0,0 +1,13 @@
 
				+function multjl(A ::Matrix{Float32}, B ::Matrix{Float32}, C ::Matrix{Float32})
			
 
				+    heightC, widthC = size(C)
			
 
				+    widthA = size(A)[2]
			
 
				+    for i = 1:heightC
			
 
				+        for j = 1:widthC
			
 
				+            sum = 0
			
 
				+            for k = 1:widthA
			
 
				+                sum = sum + A[i, k] * B[k, j]
			
 
				+            end
			
 
				+            C[i,j] = sum
			
 
				+        end
			
 
				+    end
			
 
				+end
			
--- a/julia/tst/nbody/cpu_cuda_nbody.jl
+++ b/julia/tst/nbody/cpu_cuda_nbody.jl
@@ -0,0 +1,55 @@
 
				+include("../../src/Compiler/include.jl")
			
 
				+
			
 
				+starpu_new_cpu_kernel_file("../build/generated_cpu_nbody.c")
			
 
				+starpu_new_cuda_kernel_file("../build/generated_cuda_nbody.cu")
			
 
				+
			
 
				+@cpu_cuda_kernel function nbody_acc(positions ::Matrix{Float64}, accelerations ::Matrix{Float64}, masses ::Vector{Float64}, parameters ::Vector{Float64}, sliceID ::Vector{Int64}) ::Void
			
 
				+    widthp ::Int64 = width(positions)
			
 
				+    widtha ::Int64 = width(accelerations)
			
 
				+    
			
 
				+    @indep for plan = 1:widtha
			
 
				+
			
 
				+        sumaccx ::Float64 = 0
			
 
				+        sumaccy ::Float64 = 0
			
 
				+
			
 
				+        for oplan = 1:widthp
			
 
				+            eps ::Float64 = parameters[3]
			
 
				+            Id ::Int64 = sliceID[1]*widtha
			
 
				+            G ::Float64 = parameters[1]
			
 
				+
			
 
				+            b ::Int64 = ((plan + Id) >= oplan) + ((plan + Id) <= oplan)
			
 
				+            if (b < 2)
			
 
				+
			
 
				+                dx ::Float64 = positions[1, oplan] - positions[1, plan + Id]
			
 
				+                dy ::Float64 = positions[2, oplan] - positions[2, plan + Id]
			
 
				+                modul ::Float64= sqrt(dx *dx + dy * dy)
			
 
				+
			
 
				+                sumaccx = sumaccx + (G * masses[oplan] * dx) / ((modul + eps) * (modul + eps) * (modul + eps)) 
			
 
				+                sumaccy = sumaccy + (G * masses[oplan] * dy) / ((modul + eps) * (modul + eps) * (modul + eps))
			
 
				+
			
 
				+                # sumaccx = sumaccx + (G * masses[oplan]) * (dx / sqrt(dx * dx + dy * dy)) / (dx * dx + dy * dy + eps)
			
 
				+                # sumaccy = sumaccy + (G * masses[oplan]) * (dy / sqrt(dx * dx + dy * dy)) / (dy * dy + dx * dx + eps)
			
 
				+            end
			
 
				+        end
			
 
				+        accelerations[1, plan] = sumaccx
			
 
				+        accelerations[2, plan] = sumaccy
			
 
				+    end
			
 
				+end
			
 
				+
			
 
				+@cpu_cuda_kernel function nbody_updt(positions ::Matrix{Float64}, velocities ::Matrix{Float64}, accelerations ::Matrix{Float64}, parameters ::Vector{Float64}) ::Void
			
 
				+    widthp ::Int64 = width(positions)
			
 
				+
			
 
				+    @indep for i = 1:widthp
			
 
				+
			
 
				+        velocities[1, i] = velocities[1, i] + accelerations[1, i] * parameters[2]
			
 
				+        velocities[2, i] = velocities[2, i] + accelerations[2, i] * parameters[2]
			
 
				+
			
 
				+        positions[1, i] = positions[1, i] + velocities[1, i] * parameters[2]
			
 
				+        positions[2, i] = positions[2, i] + velocities[2, i] * parameters[2]
			
 
				+    end
			
 
				+
			
 
				+end
			
 
				+
			
 
				+compile_cpu_kernels("../build/generated_cpu_nbody.so")
			
 
				+compile_cuda_kernels("../build/generated_cuda_nbody.so")
			
 
				+combine_kernel_files("../build/generated_tasks_nbody.so", ["../build/generated_cpu_nbody.so", "../build/generated_cuda_nbody.so"])
			
--- a/julia/tst/nbody/cpu_nbody.c
+++ b/julia/tst/nbody/cpu_nbody.c
@@ -0,0 +1,94 @@
 
				+#include <stdint.h>
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+struct Param {
			
 
				+	unsigned taskx;
			
 
				+	double epsilon;
			
 
				+};
			
 
				+
			
 
				+void cpu_nbody(void *descr[], void *arg)
			
 
				+{
			
 
				+	struct Param *params = arg;
			
 
				+
			
 
				+	double *P;
			
 
				+	double *subA;
			
 
				+	double *M;
			
 
				+
			
 
				+	uint32_t nxP, nxA, nxM;
			
 
				+	uint32_t ldP, ldA, ldM;
			
 
				+
			
 
				+	P = (double *)STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				+	subA = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
			
 
				+	M = (double *)STARPU_MATRIX_GET_PTR(descr[2]);
			
 
				+
			
 
				+	nxP = STARPU_MATRIX_GET_NX(descr[0]);
			
 
				+	nxA = STARPU_MATRIX_GET_NX(descr[1]);
			
 
				+	nxM = STARPU_MATRIX_GET_NX(descr[2]);
			
 
				+
			
 
				+	ldP = STARPU_MATRIX_GET_LD(descr[0]);
			
 
				+	ldA = STARPU_MATRIX_GET_LD(descr[1]);
			
 
				+	ldM = STARPU_MATRIX_GET_LD(descr[2]);
			
 
				+
			
 
				+	double epsilon = params->epsilon;
			
 
				+
			
 
				+	unsigned id = nxA * params->taskx;
			
 
				+
			
 
				+	uint32_t i,j;
			
 
				+	
			
 
				+	for (i = 0; i < nxA; i++){
			
 
				+		double sumaccx = 0;
			
 
				+		double sumaccy = 0;
			
 
				+		
			
 
				+		for (j = 0; j < nxP; j++){
			
 
				+			
			
 
				+			if (j != i + id){
			
 
				+				
			
 
				+				double dx = P[j] - P[i + id];
			
 
				+				double dy = P[j + ldP] - P[i + id + ldP];
			
 
				+
			
 
				+				double modul = sqrt(dx * dx + dy * dy);
			
 
				+
			
 
				+				sumaccx = sumaccx + 6.67e-11 * M[j] * dx / pow(modul + epsilon, 3);
			
 
				+				sumaccy = sumaccy + 6.67e-11 * M[j] * dy / pow(modul + epsilon, 3);
			
 
				+			}
			
 
				+
			
 
				+		}
			
 
				+		subA[i] = sumaccx;
			
 
				+		subA[i + ldA] = sumaccy;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void cpu_nbody2(void *descr[], void *arg)
			
 
				+{
			
 
				+	double *subP;
			
 
				+	double *subV;
			
 
				+	double *subA;
			
 
				+
			
 
				+	uint32_t nxP, nxV, nxA;
			
 
				+	uint32_t ldP, ldV, ldA;
			
 
				+
			
 
				+	subP = (double *)STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				+	subV = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
			
 
				+	subA = (double *)STARPU_MATRIX_GET_PTR(descr[2]);
			
 
				+
			
 
				+	nxP = STARPU_MATRIX_GET_NX(descr[0]);
			
 
				+	nxV = STARPU_MATRIX_GET_NX(descr[1]);
			
 
				+	nxA = STARPU_MATRIX_GET_NX(descr[2]);
			
 
				+	
			
 
				+	ldP = STARPU_MATRIX_GET_LD(descr[0]);
			
 
				+	ldV = STARPU_MATRIX_GET_LD(descr[1]);
			
 
				+	ldA = STARPU_MATRIX_GET_LD(descr[2]);
			
 
				+	
			
 
				+	
			
 
				+	unsigned i,dt;
			
 
				+	dt = 3600;
			
 
				+	for (i = 0; i < nxP; i++){
			
 
				+	
			
 
				+		subV[i] = subV[i] + dt*subA[i];
			
 
				+		subV[i + ldV] = subV[i + ldV] + dt*subA[i + ldA];
			
 
				+
			
 
				+		subP[i] = subP[i] + dt*subV[i];
			
 
				+		subP[i + ldP] = subP[i + ldP] + dt*subV[i + ldV];
			
 
				+	}
			
 
				+}
			
 
				+	      
			
--- a/julia/tst/nbody/cpu_nbody_between.c
+++ b/julia/tst/nbody/cpu_nbody_between.c
@@ -0,0 +1,104 @@
 
				+#include <stdio.h>
			
 
				+#include <stdint.h>
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+static inline long long jlstarpu_max(long long a, long long b)
			
 
				+{
			
 
				+	return (a > b) ? a : b;
			
 
				+}
			
 
				+
			
 
				+static inline long long jlstarpu_interval_size(long long start, long long step, long long stop)
			
 
				+{
			
 
				+    if (stop >= start){
			
 
				+            return jlstarpu_max(0, (stop - start + 1) / step);
			
 
				+    } else {
			
 
				+            return jlstarpu_max(0, (stop - start - 1) / step);
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+void nbody_acc(void** buffers_NJVQ1U4V, void* cl_arg_NJVQ1U4V)
			
 
				+{
			
 
				+    uint32_t ld_xIZ5HaKV = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_NJVQ1U4V[(1) - (1)]));
			
 
				+    double* ptr_xIZ5HaKV = (double*) (STARPU_MATRIX_GET_PTR(buffers_NJVQ1U4V[(1) - (1)]));
			
 
				+    uint32_t ld_QZvmSRYk = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_NJVQ1U4V[(2) - (1)]));
			
 
				+    double* ptr_QZvmSRYk = (double*) (STARPU_MATRIX_GET_PTR(buffers_NJVQ1U4V[(2) - (1)]));
			
 
				+    double* ptr_U7pwlAjr = (double*) (STARPU_VECTOR_GET_PTR(buffers_NJVQ1U4V[(3) - (1)]));
			
 
				+    double* ptr_AQKcQq1a = (double*) (STARPU_VECTOR_GET_PTR(buffers_NJVQ1U4V[(4) - (1)]));
			
 
				+    int64_t* ptr_qQoIJuDP = (int64_t*) (STARPU_VECTOR_GET_PTR(buffers_NJVQ1U4V[(5) - (1)]));
			
 
				+
			
 
				+    int64_t widthp = (int64_t) (STARPU_MATRIX_GET_NY(buffers_NJVQ1U4V[(1) - (1)]));
			
 
				+    int64_t widtha = (int64_t) (STARPU_MATRIX_GET_NY(buffers_NJVQ1U4V[(2) - (1)]));
			
 
				+    
			
 
				+    int64_t start_nj3HDzsW = (int64_t) (1);
			
 
				+    int64_t stop_nj3HDzsW = (int64_t) (widtha);
			
 
				+    int64_t plan;
			
 
				+
			
 
				+    for (plan = start_nj3HDzsW ; plan <= stop_nj3HDzsW ; plan += 1)
			
 
				+    {
			
 
				+        double sumaccx = (double) (0);
			
 
				+        double sumaccy = (double) (0);
			
 
				+        
			
 
				+        int64_t start_TzfU6QY7 = (int64_t) (1);
			
 
				+        int64_t stop_TzfU6QY7 = (int64_t) (widthp);
			
 
				+        int64_t oplan;
			
 
				+
			
 
				+        for (oplan = start_TzfU6QY7 ; oplan <= stop_TzfU6QY7 ; oplan += 1)
			
 
				+        {
			
 
				+            double eps = (double) (ptr_AQKcQq1a[(3) - (1)]);
			
 
				+            int64_t Id = (int64_t) ((ptr_qQoIJuDP[(1) - (1)]) * (widtha));
			
 
				+            double G = (double) (ptr_AQKcQq1a[(1) - (1)]);
			
 
				+            int64_t b = (int64_t) ((((plan) + (Id)) >= (oplan)) + (((plan) + (Id)) <= (oplan)));
			
 
				+            
			
 
				+            if ((b) < (2))
			
 
				+            {
			
 
				+                double dx = (double) ((ptr_xIZ5HaKV[((1) + (((oplan) - (1)) * (ld_xIZ5HaKV))) - (1)]) - (ptr_xIZ5HaKV[((1) + ((((plan) + (Id)) - (1)) * (ld_xIZ5HaKV))) - (1)]));
			
 
				+                double dy = (double) ((ptr_xIZ5HaKV[((2) + (((oplan) - (1)) * (ld_xIZ5HaKV))) - (1)]) - (ptr_xIZ5HaKV[((2) + ((((plan) + (Id)) - (1)) * (ld_xIZ5HaKV))) - (1)]));
			
 
				+                double modul = (double) (sqrt(((dx) * (dx)) + ((dy) * (dy))));
			
 
				+                sumaccx = (sumaccx) + (((G) * (ptr_U7pwlAjr[(oplan) - (1)]) * (dx)) / (((modul) + (eps)) * ((modul) + (eps)) * ((modul) + (eps))));
			
 
				+                sumaccy = (sumaccy) + (((G) * (ptr_U7pwlAjr[(oplan) - (1)]) * (dy)) / (((modul) + (eps)) * ((modul) + (eps)) * ((modul) + (eps))));
			
 
				+            };
			
 
				+	    
			
 
				+        }
			
 
				+        ;
			
 
				+
			
 
				+	
			
 
				+        ptr_QZvmSRYk[((1) + (((plan) - (1)) * (ld_QZvmSRYk))) - (1)] = sumaccx;
			
 
				+        ptr_QZvmSRYk[((2) + (((plan) - (1)) * (ld_QZvmSRYk))) - (1)] = sumaccy;
			
 
				+
			
 
				+        /* ptr_QZvmSRYk[((1) + (((plan) - (1)) * (ld_QZvmSRYk))) - (1)] = ptr_qQoIJuDP[(1) - (1)]; */
			
 
				+
			
 
				+        /* ptr_QZvmSRYk[((2) + (((plan) - (1)) * (ld_QZvmSRYk))) - (1)] = ptr_qQoIJuDP[(1) - (1)]; */
			
 
				+	
			
 
				+	
			
 
				+
			
 
				+    }
			
 
				+    ;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void nbody_updt(void** buffers_kCJlJluA, void* cl_arg_kCJlJluA)
			
 
				+{
			
 
				+    uint32_t ld_tlJ0FAub = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_kCJlJluA[(1) - (1)]));
			
 
				+    double* ptr_tlJ0FAub = (double*) (STARPU_MATRIX_GET_PTR(buffers_kCJlJluA[(1) - (1)]));
			
 
				+    uint32_t ld_CwAKodfw = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_kCJlJluA[(2) - (1)]));
			
 
				+    double* ptr_CwAKodfw = (double*) (STARPU_MATRIX_GET_PTR(buffers_kCJlJluA[(2) - (1)]));
			
 
				+    uint32_t ld_9CU1xW4b = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_kCJlJluA[(3) - (1)]));
			
 
				+    double* ptr_9CU1xW4b = (double*) (STARPU_MATRIX_GET_PTR(buffers_kCJlJluA[(3) - (1)]));
			
 
				+    double* ptr_D81NeTCr = (double*) (STARPU_VECTOR_GET_PTR(buffers_kCJlJluA[(4) - (1)]));
			
 
				+    int64_t widthp = (int64_t) (STARPU_MATRIX_GET_NY(buffers_kCJlJluA[(1) - (1)]));
			
 
				+    
			
 
				+    int64_t start_FQShiJ9y = (int64_t) (1);
			
 
				+    int64_t stop_FQShiJ9y = (int64_t) (widthp);
			
 
				+    int64_t i;
			
 
				+
			
 
				+    for (i = start_FQShiJ9y ; i <= stop_FQShiJ9y ; i += 1)
			
 
				+    {
			
 
				+        ptr_CwAKodfw[((1) + (((i) - (1)) * (ld_CwAKodfw))) - (1)] = (ptr_CwAKodfw[((1) + (((i) - (1)) * (ld_CwAKodfw))) - (1)]) + ((ptr_9CU1xW4b[((1) + (((i) - (1)) * (ld_9CU1xW4b))) - (1)]) * (ptr_D81NeTCr[(2) - (1)]));
			
 
				+        ptr_CwAKodfw[((2) + (((i) - (1)) * (ld_CwAKodfw))) - (1)] = (ptr_CwAKodfw[((2) + (((i) - (1)) * (ld_CwAKodfw))) - (1)]) + ((ptr_9CU1xW4b[((2) + (((i) - (1)) * (ld_9CU1xW4b))) - (1)]) * (ptr_D81NeTCr[(2) - (1)]));
			
 
				+        ptr_tlJ0FAub[((1) + (((i) - (1)) * (ld_tlJ0FAub))) - (1)] = (ptr_tlJ0FAub[((1) + (((i) - (1)) * (ld_tlJ0FAub))) - (1)]) + ((ptr_CwAKodfw[((1) + (((i) - (1)) * (ld_CwAKodfw))) - (1)]) * (ptr_D81NeTCr[(2) - (1)]));
			
 
				+        ptr_tlJ0FAub[((2) + (((i) - (1)) * (ld_tlJ0FAub))) - (1)] = (ptr_tlJ0FAub[((2) + (((i) - (1)) * (ld_tlJ0FAub))) - (1)]) + ((ptr_CwAKodfw[((2) + (((i) - (1)) * (ld_CwAKodfw))) - (1)]) * (ptr_D81NeTCr[(2) - (1)]));
			
 
				+    }
			
 
				+    ;
			
 
				+}
			
 
				+
			
 
				+
			
--- a/julia/tst/nbody/gpu_nbody.cu
+++ b/julia/tst/nbody/gpu_nbody.cu
@@ -0,0 +1,145 @@
 
				+#include <stdio.h>
			
 
				+#include <stdint.h>
			
 
				+#include <math.h>
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+struct Params
			
 
				+{
			
 
				+  unsigned taskx;
			
 
				+  unsigned epsilon;
			
 
				+};
			
 
				+
			
 
				+__global__ void gpuNbodyKernel(double *P, double *subA, double *M,
			
 
				+			     uint32_t nxP, uint32_t nxA, uint32_t nxM,
			
 
				+			     uint32_t ldP, uint32_t ldA,
			
 
				+			     struct Params params)
			
 
				+{
			
 
				+  uint32_t id, i, j, k;
			
 
				+  double dx, dy, modul;
			
 
				+
			
 
				+  id = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				+  i = id % nxA;
			
 
				+  j = id / nxA;
			
 
				+
			
 
				+  if (j >= 1){
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  double sumaccx;
			
 
				+  double sumaccy;
			
 
				+  
			
 
				+  for (k = 0; k < nxP; k++){
			
 
				+    if (k != id + nxA*params.taskx){
			
 
				+      dx = P[k] - P[id + nxA*params.taskx];
			
 
				+      dy = P[k + ldP] - P[id + nxA*params.taskx + ldP];
			
 
				+      
			
 
				+      modul = dx * dx + dy * dy;
			
 
				+
			
 
				+      sumaccx = 6.674e-11 * M[k] * dx / pow(modul + params.epsilon, 3);
			
 
				+      sumaccy = 6.674e-11 * M[k] * dy / pow(modul + params.epsilon, 3);
			
 
				+    }
			
 
				+  }
			
 
				+ 
			
 
				+  subA[i] = sumaccx;
			
 
				+  subA[i + ldA] = sumaccy;
			
 
				+
			
 
				+  // P[id + nxA * params.taskx] = subA[i];
			
 
				+
			
 
				+  // subA[i] = 0;
			
 
				+  // subA[i + ldA] = 1;
			
 
				+  
			
 
				+}
			
 
				+
			
 
				+#define THREADS_PER_BLOCK 64
			
 
				+
			
 
				+extern "C" void gpu_nbody(void * descr[], void * args)
			
 
				+{
			
 
				+
			
 
				+  double *d_P, *d_subA, *d_M;
			
 
				+  uint32_t nxP, nxA, nxM;
			
 
				+  uint32_t ldA, ldP;
			
 
				+  uint32_t nblocks;
			
 
				+
			
 
				+  struct Params *params = (struct Params *) args;
			
 
				+
			
 
				+  d_P = (double *) STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				+  d_subA = (double *) STARPU_MATRIX_GET_PTR(descr[1]);
			
 
				+  d_M = (double *) STARPU_MATRIX_GET_PTR(descr[2]);
			
 
				+
			
 
				+  nxP = STARPU_MATRIX_GET_NX(descr[0]);
			
 
				+  nxA = STARPU_MATRIX_GET_NX(descr[1]);
			
 
				+  nxM = STARPU_MATRIX_GET_NX(descr[2]);
			
 
				+
			
 
				+  ldP = STARPU_MATRIX_GET_LD(descr[0]);
			
 
				+  ldA = STARPU_MATRIX_GET_LD(descr[1]);
			
 
				+
			
 
				+  nblocks = (nxA + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
			
 
				+
			
 
				+  gpuNbodyKernel
			
 
				+    <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
			
 
				+    >>> (d_P,  d_subA, d_M, nxP, nxA, nxM, ldP, ldA, *params);
			
 
				+
			
 
				+  cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				+
			
 
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+__global__ void gpuNbody2Kernel(double *d_subP, double *d_subV, double *d_subA,
			
 
				+			      uint32_t nxP, uint32_t nxV, uint32_t nxA,
			
 
				+			      uint32_t ldP, uint32_t ldV, uint32_t ldA,
			
 
				+			      struct Params params)
			
 
				+{
			
 
				+
			
 
				+  uint32_t id, i, j;
			
 
				+
			
 
				+  id = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				+
			
 
				+  i = id % nxP;
			
 
				+  j = id / nxP;
			
 
				+
			
 
				+  if (j >= 1){
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  d_subV[i] = d_subV[i] + 3600*d_subA[i];
			
 
				+  d_subV[i + ldV] = d_subV[i + ldV] + 3600*d_subA[i + ldA];
			
 
				+
			
 
				+  d_subP[i] = d_subP[i] + 3600*d_subV[i];
			
 
				+  d_subP[i + ldP] = d_subP[i + ldP] + 3600*d_subV[i + ldV];
			
 
				+}
			
 
				+
			
 
				+
			
 
				+extern "C" void gpu_nbody2(void * descr[], void *args)
			
 
				+{
			
 
				+  double *d_subP, *d_subV, *d_subA;
			
 
				+  uint32_t nxP, nxV, nxA;
			
 
				+  uint32_t ldP, ldV, ldA;
			
 
				+  uint32_t nblocks;
			
 
				+
			
 
				+  struct Params *params = (struct Params *) args;
			
 
				+
			
 
				+  d_subP = (double *) STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				+  d_subV = (double *) STARPU_MATRIX_GET_PTR(descr[1]);
			
 
				+  d_subA = (double *) STARPU_MATRIX_GET_PTR(descr[2]);
			
 
				+
			
 
				+  nxP = STARPU_MATRIX_GET_NX(descr[0]);
			
 
				+  nxV = STARPU_MATRIX_GET_NX(descr[1]);
			
 
				+  nxA = STARPU_MATRIX_GET_NX(descr[2]);
			
 
				+
			
 
				+  ldP = STARPU_MATRIX_GET_LD(descr[0]);
			
 
				+  ldV = STARPU_MATRIX_GET_LD(descr[1]);
			
 
				+  ldA = STARPU_MATRIX_GET_LD(descr[2]);
			
 
				+
			
 
				+  nblocks = (nxA + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
			
 
				+
			
 
				+  gpuNbody2Kernel
			
 
				+    <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
			
 
				+    >>> (d_subP, d_subV, d_subA, nxP, nxV, nxA, ldP, ldV, ldA, *params);
			
 
				+
			
 
				+  cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				+}
			
--- a/julia/tst/nbody/gpu_nbody_between.cu
+++ b/julia/tst/nbody/gpu_nbody_between.cu
@@ -0,0 +1,151 @@
 
				+#include <stdio.h>
			
 
				+#include <stdint.h>
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+#define THREADS_PER_BLOCK 64
			
 
				+
			
 
				+static inline long long jlstarpu_max(long long a, long long b)
			
 
				+{
			
 
				+	return (a > b) ? a : b;
			
 
				+}
			
 
				+
			
 
				+static inline long long jlstarpu_interval_size(long long start, long long step, long long stop)
			
 
				+{
			
 
				+    if (stop >= start){
			
 
				+            return jlstarpu_max(0, (stop - start + 1) / step);
			
 
				+    } else {
			
 
				+            return jlstarpu_max(0, (stop - start - 1) / step);
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+
			
 
				+__device__ static inline long long jlstarpu_max__device(long long a, long long b)
			
 
				+{
			
 
				+	return (a > b) ? a : b;
			
 
				+}
			
 
				+
			
 
				+__device__ static inline long long jlstarpu_interval_size__device(long long start, long long step, long long stop)
			
 
				+{
			
 
				+	if (stop >= start){
			
 
				+		return jlstarpu_max__device(0, (stop - start + 1) / step);
			
 
				+	} else {
			
 
				+		return jlstarpu_max__device(0, (stop - start - 1) / step);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+
			
 
				+__global__ void nbody_acc(int64_t kernel_ids__start_1, int64_t kernel_ids__step_1, int64_t kernel_ids__dim_1, int64_t widthp, 
			
 
				+                          double* ptr_OhLp7E87, int64_t* ptr_DvYAWLG1, int64_t widtha, double* ptr_Xi5IjQJ9, 
			
 
				+                          uint32_t ld_Xi5IjQJ9, double* ptr_t4YHT0eY, double* ptr_mfUSUHkf, uint32_t ld_mfUSUHkf)
			
 
				+{
			
 
				+    int64_t THREAD_ID = (int64_t) ((((blockIdx).x) * ((blockDim).x)) + ((threadIdx).x));
			
 
				+    
			
 
				+    if ((THREAD_ID) >= ((1) * (kernel_ids__dim_1)))
			
 
				+    {
			
 
				+        return ;
			
 
				+    };
			
 
				+    int64_t kernel_ids__index_1 = (int64_t) (((THREAD_ID) / (1)) % (kernel_ids__dim_1));
			
 
				+    int64_t plan = (int64_t) ((kernel_ids__start_1) + ((kernel_ids__index_1) * (kernel_ids__step_1)));
			
 
				+    double sumaccx = (double) (0);
			
 
				+    double sumaccy = (double) (0);
			
 
				+    
			
 
				+    int64_t start_TzfU6QY7 = (int64_t) (1);
			
 
				+    int64_t stop_TzfU6QY7 = (int64_t) (widthp);
			
 
				+    int64_t oplan;
			
 
				+
			
 
				+    for (oplan = start_TzfU6QY7 ; oplan <= stop_TzfU6QY7 ; oplan += 1)
			
 
				+    {
			
 
				+        double eps = (double) (ptr_OhLp7E87[(3) - (1)]);
			
 
				+        int64_t Id = (int64_t) ((ptr_DvYAWLG1[(1) - (1)]) * (widtha));
			
 
				+        double G = (double) (ptr_OhLp7E87[(1) - (1)]);
			
 
				+        int64_t b = (int64_t) ((((plan) + (Id)) >= (oplan)) + (((plan) + (Id)) <= (oplan)));
			
 
				+        
			
 
				+        if ((b) < (2))
			
 
				+        {
			
 
				+            double dx = (double) ((ptr_Xi5IjQJ9[((1) + (((oplan) - (1)) * (ld_Xi5IjQJ9))) - (1)]) - (ptr_Xi5IjQJ9[((1) + ((((plan) + (Id)) - (1)) * (ld_Xi5IjQJ9))) - (1)]));
			
 
				+            double dy = (double) ((ptr_Xi5IjQJ9[((2) + (((oplan) - (1)) * (ld_Xi5IjQJ9))) - (1)]) - (ptr_Xi5IjQJ9[((2) + ((((plan) + (Id)) - (1)) * (ld_Xi5IjQJ9))) - (1)]));
			
 
				+            double modul = (double) (sqrt(((dx) * (dx)) + ((dy) * (dy))));
			
 
				+            sumaccx = (sumaccx) + (((G) * (ptr_t4YHT0eY[(oplan) - (1)]) * (dx)) / (((modul) + (eps)) * ((modul) + (eps)) * ((modul) + (eps))));
			
 
				+            sumaccy = (sumaccy) + (((G) * (ptr_t4YHT0eY[(oplan) - (1)]) * (dy)) / (((modul) + (eps)) * ((modul) + (eps)) * ((modul) + (eps))));
			
 
				+        };
			
 
				+    }
			
 
				+    ;
			
 
				+    ptr_mfUSUHkf[((1) + (((plan) - (1)) * (ld_mfUSUHkf))) - (1)] = sumaccx;
			
 
				+    ptr_mfUSUHkf[((2) + (((plan) - (1)) * (ld_mfUSUHkf))) - (1)] = sumaccy;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+extern "C" void CUDA_nbody_acc(void** buffers_qd9i9yfK, void* cl_arg_qd9i9yfK)
			
 
				+{
			
 
				+    uint32_t ld_Xi5IjQJ9 = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_qd9i9yfK[(1) - (1)]));
			
 
				+    double* ptr_Xi5IjQJ9 = (double*) (STARPU_MATRIX_GET_PTR(buffers_qd9i9yfK[(1) - (1)]));
			
 
				+    uint32_t ld_mfUSUHkf = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_qd9i9yfK[(2) - (1)]));
			
 
				+    double* ptr_mfUSUHkf = (double*) (STARPU_MATRIX_GET_PTR(buffers_qd9i9yfK[(2) - (1)]));
			
 
				+    double* ptr_t4YHT0eY = (double*) (STARPU_VECTOR_GET_PTR(buffers_qd9i9yfK[(3) - (1)]));
			
 
				+    double* ptr_OhLp7E87 = (double*) (STARPU_VECTOR_GET_PTR(buffers_qd9i9yfK[(4) - (1)]));
			
 
				+    int64_t* ptr_DvYAWLG1 = (int64_t*) (STARPU_VECTOR_GET_PTR(buffers_qd9i9yfK[(5) - (1)]));
			
 
				+    int64_t widthp = (int64_t) (STARPU_MATRIX_GET_NY(buffers_qd9i9yfK[(1) - (1)]));
			
 
				+    int64_t widtha = (int64_t) (STARPU_MATRIX_GET_NY(buffers_qd9i9yfK[(2) - (1)]));
			
 
				+    int64_t kernel_ids__start_1 = (int64_t) (1);
			
 
				+    int64_t kernel_ids__step_1 = (int64_t) (1);
			
 
				+    int64_t kernel_ids__dim_1 = (int64_t) (jlstarpu_interval_size(kernel_ids__start_1, kernel_ids__step_1, widtha));
			
 
				+    int64_t nthreads = (int64_t) ((1) * (kernel_ids__dim_1));
			
 
				+    int64_t nblocks = (int64_t) ((((nthreads) + (THREADS_PER_BLOCK)) - (1)) / (THREADS_PER_BLOCK));
			
 
				+    
			
 
				+    nbody_acc
			
 
				+        <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
			
 
				+        >>> (kernel_ids__start_1, kernel_ids__step_1, kernel_ids__dim_1, widthp, 
			
 
				+             ptr_OhLp7E87, ptr_DvYAWLG1, widtha, ptr_Xi5IjQJ9, 
			
 
				+             ld_Xi5IjQJ9, ptr_t4YHT0eY, ptr_mfUSUHkf, ld_mfUSUHkf);
			
 
				+    ;
			
 
				+    cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				+}
			
 
				+
			
 
				+
			
 
				+__global__ void nbody_updt(int64_t kernel_ids__start_1, int64_t kernel_ids__step_1, int64_t kernel_ids__dim_1, double* ptr_jJ5f8wMA, 
			
 
				+                           uint32_t ld_jJ5f8wMA, double* ptr_piPvdbTs, uint32_t ld_piPvdbTs, double* ptr_JBaPgPiT, 
			
 
				+                           double* ptr_0STm2S4k, uint32_t ld_0STm2S4k)
			
 
				+{
			
 
				+    int64_t THREAD_ID = (int64_t) ((((blockIdx).x) * ((blockDim).x)) + ((threadIdx).x));
			
 
				+    
			
 
				+    if ((THREAD_ID) >= ((1) * (kernel_ids__dim_1)))
			
 
				+    {
			
 
				+        return ;
			
 
				+    };
			
 
				+    int64_t kernel_ids__index_1 = (int64_t) (((THREAD_ID) / (1)) % (kernel_ids__dim_1));
			
 
				+    int64_t i = (int64_t) ((kernel_ids__start_1) + ((kernel_ids__index_1) * (kernel_ids__step_1)));
			
 
				+    ptr_jJ5f8wMA[((1) + (((i) - (1)) * (ld_jJ5f8wMA))) - (1)] = (ptr_jJ5f8wMA[((1) + (((i) - (1)) * (ld_jJ5f8wMA))) - (1)]) + ((ptr_piPvdbTs[((1) + (((i) - (1)) * (ld_piPvdbTs))) - (1)]) * (ptr_JBaPgPiT[(2) - (1)]));
			
 
				+    ptr_jJ5f8wMA[((2) + (((i) - (1)) * (ld_jJ5f8wMA))) - (1)] = (ptr_jJ5f8wMA[((2) + (((i) - (1)) * (ld_jJ5f8wMA))) - (1)]) + ((ptr_piPvdbTs[((2) + (((i) - (1)) * (ld_piPvdbTs))) - (1)]) * (ptr_JBaPgPiT[(2) - (1)]));
			
 
				+    ptr_0STm2S4k[((1) + (((i) - (1)) * (ld_0STm2S4k))) - (1)] = (ptr_0STm2S4k[((1) + (((i) - (1)) * (ld_0STm2S4k))) - (1)]) + ((ptr_jJ5f8wMA[((1) + (((i) - (1)) * (ld_jJ5f8wMA))) - (1)]) * (ptr_JBaPgPiT[(2) - (1)]));
			
 
				+    ptr_0STm2S4k[((2) + (((i) - (1)) * (ld_0STm2S4k))) - (1)] = (ptr_0STm2S4k[((2) + (((i) - (1)) * (ld_0STm2S4k))) - (1)]) + ((ptr_jJ5f8wMA[((2) + (((i) - (1)) * (ld_jJ5f8wMA))) - (1)]) * (ptr_JBaPgPiT[(2) - (1)]));
			
 
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+extern "C" void CUDA_nbody_updt(void** buffers_gj6UYWT4, void* cl_arg_gj6UYWT4)
			
 
				+{
			
 
				+    uint32_t ld_0STm2S4k = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_gj6UYWT4[(1) - (1)]));
			
 
				+    double* ptr_0STm2S4k = (double*) (STARPU_MATRIX_GET_PTR(buffers_gj6UYWT4[(1) - (1)]));
			
 
				+    uint32_t ld_jJ5f8wMA = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_gj6UYWT4[(2) - (1)]));
			
 
				+    double* ptr_jJ5f8wMA = (double*) (STARPU_MATRIX_GET_PTR(buffers_gj6UYWT4[(2) - (1)]));
			
 
				+    uint32_t ld_piPvdbTs = (uint32_t) (STARPU_MATRIX_GET_LD(buffers_gj6UYWT4[(3) - (1)]));
			
 
				+    double* ptr_piPvdbTs = (double*) (STARPU_MATRIX_GET_PTR(buffers_gj6UYWT4[(3) - (1)]));
			
 
				+    double* ptr_JBaPgPiT = (double*) (STARPU_VECTOR_GET_PTR(buffers_gj6UYWT4[(4) - (1)]));
			
 
				+    int64_t widthp = (int64_t) (STARPU_MATRIX_GET_NY(buffers_gj6UYWT4[(1) - (1)]));
			
 
				+    int64_t kernel_ids__start_1 = (int64_t) (1);
			
 
				+    int64_t kernel_ids__step_1 = (int64_t) (1);
			
 
				+    int64_t kernel_ids__dim_1 = (int64_t) (jlstarpu_interval_size(kernel_ids__start_1, kernel_ids__step_1, widthp));
			
 
				+    int64_t nthreads = (int64_t) ((1) * (kernel_ids__dim_1));
			
 
				+    int64_t nblocks = (int64_t) ((((nthreads) + (THREADS_PER_BLOCK)) - (1)) / (THREADS_PER_BLOCK));
			
 
				+    
			
 
				+    nbody_updt
			
 
				+        <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
			
 
				+        >>> (kernel_ids__start_1, kernel_ids__step_1, kernel_ids__dim_1, ptr_jJ5f8wMA, 
			
 
				+             ld_jJ5f8wMA, ptr_piPvdbTs, ld_piPvdbTs, ptr_JBaPgPiT, 
			
 
				+             ptr_0STm2S4k, ld_0STm2S4k);
			
 
				+    ;
			
 
				+    cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				+}
			
 
				+
			
 
				+
			
--- a/julia/tst/nbody/nbody.c
+++ b/julia/tst/nbody/nbody.c
@@ -0,0 +1,296 @@
 
				+#include <stdlib.h>
			
 
				+#include <stdio.h>
			
 
				+#include <starpu.h>
			
 
				+#include <math.h>
			
 
				+#include "../includes/display.h"
			
 
				+#include "../includes/sorting.h"
			
 
				+
			
 
				+struct Param {
			
 
				+	unsigned taskx;
			
 
				+	double epsilon;
			
 
				+};
			
 
				+
			
 
				+void cpu_nbody(void **, void *);
			
 
				+void cpu_nbody2(void **, void *);
			
 
				+
			
 
				+void gpu_nbody(void **, void *);
			
 
				+void gpu_nbody2(void **, void *);
			
 
				+
			
 
				+static struct starpu_perfmodel model =
			
 
				+{
			
 
				+	.type = STARPU_HISTORY_BASED,
			
 
				+	.symbol = "history_perf"
			
 
				+};
			
 
				+
			
 
				+static struct starpu_codelet cl =
			
 
				+{
			
 
				+	.cpu_funcs = {cpu_nbody},
			
 
				+	.cuda_funcs = {gpu_nbody},
			
 
				+	.nbuffers = 3,
			
 
				+	.modes = {STARPU_RW, STARPU_RW, STARPU_RW},
			
 
				+	.model = &model
			
 
				+};
			
 
				+
			
 
				+
			
 
				+static struct starpu_codelet cl2 =
			
 
				+{
			
 
				+	.cpu_funcs = {cpu_nbody2},
			
 
				+	.cuda_funcs = {gpu_nbody2},
			
 
				+	.nbuffers = 3,
			
 
				+	.modes = {STARPU_RW, STARPU_RW, STARPU_R},
			
 
				+	.model = &model
			
 
				+};
			
 
				+
			
 
				+void nbody_with_starpu(double *positions, double *velocities, double *accelerations, double *masses, unsigned nbr_planets, unsigned nbr_simulations, unsigned nslices)
			
 
				+{
			
 
				+
			
 
				+
			
 
				+
			
 
				+	starpu_data_handle_t P_handle, V_handle, A_handle, M_handle;
			
 
				+
			
 
				+       
			
 
				+	starpu_matrix_data_register(&P_handle, STARPU_MAIN_RAM, (uintptr_t)positions, nbr_planets, nbr_planets, 2, sizeof(double));
			
 
				+	starpu_matrix_data_register(&V_handle, STARPU_MAIN_RAM, (uintptr_t)velocities, nbr_planets, nbr_planets, 2, sizeof(double));
			
 
				+	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)accelerations, nbr_planets, nbr_planets, 2, sizeof(double));
			
 
				+	starpu_matrix_data_register(&M_handle, STARPU_MAIN_RAM, (uintptr_t)masses, nbr_planets, nbr_planets, 1, sizeof(double));
			
 
				+	
			
 
				+	struct starpu_data_filter horiz = 
			
 
				+		{
			
 
				+			.filter_func = starpu_matrix_filter_block,
			
 
				+			.nchildren = nslices
			
 
				+		};
			
 
				+	
			
 
				+	unsigned i;
			
 
				+		
			
 
				+	for (i = 0; i < nbr_simulations; i++){
			
 
				+		
			
 
				+		/* printf("Simulation %d: \n", i); */
			
 
				+		
			
 
				+		starpu_data_partition(A_handle, &horiz);
			
 
				+		
			
 
				+		struct Param *params = malloc(nslices * sizeof(struct Param));
			
 
				+		
			
 
				+		
			
 
				+		unsigned epsilon = 2.5e8;
			
 
				+		
			
 
				+		unsigned task1, task2;
			
 
				+		
			
 
				+		
			
 
				+		
			
 
				+		for (task1 = 0; task1 < nslices; task1++){
			
 
				+			
			
 
				+			struct starpu_task *task = starpu_task_create();
			
 
				+			
			
 
				+			struct Param param = {task1, epsilon};
			
 
				+			
			
 
				+			params[task1] = param;
			
 
				+			
			
 
				+			task->cl = &cl;
			
 
				+			task->handles[0] = P_handle;
			
 
				+			task->handles[1] = starpu_data_get_sub_data(A_handle, 1, task1);
			
 
				+			task->handles[2] = M_handle;
			
 
				+			
			
 
				+			task->cl_arg = &(params[task1]);
			
 
				+			task->cl_arg_size = sizeof(struct Param);
			
 
				+			
			
 
				+			starpu_task_submit(task);
			
 
				+		}
			
 
				+		starpu_task_wait_for_all();
			
 
				+
			
 
				+////////////////////////
			
 
				+
			
 
				+		starpu_data_partition(P_handle, &horiz);
			
 
				+		starpu_data_partition(V_handle, &horiz);
			
 
				+		
			
 
				+		for (task2 = 0; task2 < nslices; task2++){
			
 
				+			
			
 
				+			struct starpu_task *task = starpu_task_create();
			
 
				+			
			
 
				+			struct Param param = {task1, epsilon};
			
 
				+			
			
 
				+			params[task2] = param;
			
 
				+			
			
 
				+			task->cl = &cl2;
			
 
				+			task->handles[0] = starpu_data_get_sub_data(P_handle, 1, task2);
			
 
				+			task->handles[1] = starpu_data_get_sub_data(V_handle, 1, task2);
			
 
				+			task->handles[2] = starpu_data_get_sub_data(A_handle, 1, task2);
			
 
				+			
			
 
				+			task->cl_arg = &(params[task2]);
			
 
				+			task->cl_arg_size = sizeof(struct Param);
			
 
				+			
			
 
				+			starpu_task_submit(task);
			
 
				+		}
			
 
				+		
			
 
				+		starpu_task_wait_for_all();
			
 
				+		
			
 
				+		starpu_data_unpartition(P_handle, STARPU_MAIN_RAM);
			
 
				+		starpu_data_unpartition(V_handle, STARPU_MAIN_RAM);
			
 
				+		starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
			
 
				+
			
 
				+		/* char filename[41]; */
			
 
				+	
			
 
				+		/* sprintf(filename, "../PPM/nbody%d_%d.ppm", nbr_planets, i + 1); */
			
 
				+	
			
 
				+		/* nbody_graph(filename, positions, nbr_planets, 1000, 1000, -4e8, 4e8); */
			
 
				+
			
 
				+	}
			
 
				+	
			
 
				+		starpu_data_unregister(P_handle);
			
 
				+		starpu_data_unregister(V_handle);
			
 
				+		starpu_data_unregister(A_handle);
			
 
				+		starpu_data_unregister(M_handle);
			
 
				+
			
 
				+
			
 
				+}
			
 
				+
			
 
				+void init_positions(double *positions, unsigned nbr_planets)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+	double qiX, qiY;
			
 
				+	for (i = 0; i < nbr_planets; i++){
			
 
				+		double angle = ((RAND_MAX - rand()) / (double) (RAND_MAX)) * 2.0 * M_PI;
			
 
				+		double distToCenter = ((RAND_MAX - rand()) / (double) (RAND_MAX)) * 1.0e8 + 1.0e8;
			
 
				+
			
 
				+		qiX = cos(angle) * distToCenter;
			
 
				+		qiY = sin(angle) * distToCenter;
			
 
				+
			
 
				+		positions[i] = qiX;
			
 
				+		positions[i + nbr_planets] = qiY;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void init_velocities(double *positions, double *velocities, unsigned nbr_planets)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < nbr_planets; i++){
			
 
				+		double viX = positions[i + nbr_planets] * 4.0e-6;
			
 
				+		double viY = -positions[i] * 4.0e-6;
			
 
				+		velocities[i] = viX;
			
 
				+		velocities[i + nbr_planets] = viY;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void init_masses(double *masses, unsigned nbr_planets)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < nbr_planets; i++){
			
 
				+		double mi = (rand() / (double) RAND_MAX) * 5e25;
			
 
				+		masses[i] = mi;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+
			
 
				+	
			
 
				+
			
 
				+double median_times(unsigned nbr_planets, unsigned nbr_simulations, unsigned nslices, unsigned nbr_tests)
			
 
				+{
			
 
				+	double exec_times[nbr_tests];
			
 
				+
			
 
				+	double *positions = malloc(nbr_planets * 2 * sizeof(double));
			
 
				+	double *velocities = malloc(nbr_planets * 2 * sizeof(double));
			
 
				+	double *accelerations = calloc(nbr_planets * 2, sizeof(double));
			
 
				+	double *masses = malloc(nbr_planets * sizeof(double));
			
 
				+
			
 
				+	init_positions(positions, nbr_planets);
			
 
				+	init_velocities(positions, velocities, nbr_planets);
			
 
				+	init_masses(masses, nbr_planets);
			
 
				+
			
 
				+
			
 
				+	/* unsigned i; */
			
 
				+
			
 
				+	/* for (i = 0; i < nbr_planets; i++){ */
			
 
				+	/* 	accelerations[i] = i; */
			
 
				+	/* } */
			
 
				+
			
 
				+	
			
 
				+	
			
 
				+	/* nbody_graph("PPM/bug0.ppm", positions, nbr_planets, 1000, 1000, -4e8, 4e8); */
			
 
				+
			
 
				+	/* unsigned k; */
			
 
				+       
			
 
				+	/* for (k = 1; k <= nbr_simulations; k++){ */
			
 
				+	
			
 
				+	double start, stop, exec_t;
			
 
				+	unsigned i;
			
 
				+	
			
 
				+	for (i = 0; i < nbr_tests; i++){
			
 
				+
			
 
				+		start = starpu_timing_now();
			
 
				+
			
 
				+		nbody_with_starpu(positions, velocities, accelerations, masses, nbr_planets, nbr_simulations, nslices);
			
 
				+		
			
 
				+		stop = starpu_timing_now();
			
 
				+
			
 
				+		exec_t = (stop - start) / 1.e6;
			
 
				+
			
 
				+		exec_times[i] = exec_t;
			
 
				+	}
			
 
				+	/* printf("\n\nSIMULATION %d:\n\n", k); */
			
 
				+	
			
 
				+	
			
 
				+	/* char filename[23]; */
			
 
				+	
			
 
				+	/* sprintf(filename, "PPM/bug%d.ppm", k); */
			
 
				+	
			
 
				+	/* nbody_graph(filename, positions, nbr_planets, 1000, 1000, -4e8, 4e8); */
			
 
				+	/* } */
			
 
				+	
			
 
				+	free(positions);
			
 
				+	free(velocities);
			
 
				+	free(accelerations);
			
 
				+	free(masses);
			
 
				+
			
 
				+	quicksort(exec_times, 0, nbr_tests - 1);
			
 
				+
			
 
				+	return exec_times[nbr_tests / 2];
			
 
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+void display_times(unsigned start_nbr, unsigned step_nbr, unsigned stop_nbr, unsigned nbr_simulations, unsigned nslices, unsigned nbr_tests)
			
 
				+{
			
 
				+
			
 
				+	FILE *myfile;
			
 
				+	
			
 
				+	myfile = fopen("DAT/nbody_c_struct_times.dat", "w");
			
 
				+
			
 
				+	unsigned nbr_planets;
			
 
				+	
			
 
				+	for (nbr_planets = start_nbr; nbr_planets <= stop_nbr; nbr_planets += step_nbr){
			
 
				+		double t = median_times(nbr_planets, nbr_simulations, nslices, nbr_tests);
			
 
				+		printf("STRUCT: %u planets: %f seconds\n", nbr_planets, t);
			
 
				+		fprintf(myfile, "%f\n", t);
			
 
				+	}
			
 
				+	fclose(myfile);
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char * argv[])
			
 
				+{
			
 
				+
			
 
				+	if (argc != 7){
			
 
				+		printf("Usage: %s start_nbr step_nbr stop_nbr nbr_simulations nslices nbr_tests\n", argv[0]);
			
 
				+		return 1;
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+	if (starpu_init(NULL) != EXIT_SUCCESS){
			
 
				+		fprintf(stderr, "ERROR\n");
			
 
				+		return 77;
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+	unsigned start_nbr = (unsigned) atoi(argv[1]);
			
 
				+	unsigned step_nbr = (unsigned) atoi(argv[2]);
			
 
				+	unsigned stop_nbr = (unsigned) atoi(argv[3]);
			
 
				+	unsigned nbr_simulations = (unsigned) atoi(argv[4]);
			
 
				+	unsigned nslices = (unsigned) atoi(argv[5]);
			
 
				+	unsigned nbr_tests = (unsigned) atoi(argv[6]);
			
 
				+
			
 
				+	srand(time(NULL));
			
 
				+	
			
 
				+	display_times(start_nbr, step_nbr, stop_nbr, nbr_simulations, nslices, nbr_tests);
			
 
				+
			
 
				+	starpu_shutdown();
			
 
				+	
			
 
				+	return 0;
			
 
				+}
			
--- a/julia/tst/nbody/nbody.jl
+++ b/julia/tst/nbody/nbody.jl
@@ -0,0 +1,40 @@
 
				+include("nbody_display.jl")
			
 
				+function mod2(v ::Vector{Float64})
			
 
				+    return sqrt(v[1]^2 + v[2]^2)
			
 
				+end
			
 
				+
			
 
				+function compute_accelerations(positions ::Vector{Vector{Float64}}, accelerations ::Vector{Vector{Float64}}, masses ::Vector{Float64}, eps ::Float64)
			
 
				+    G ::Float64 = 6.67E-11
			
 
				+    n ::Int64 = length(accelerations)
			
 
				+    for i = 1:n
			
 
				+        sumacc ::Vector{Float64} = [0,0]
			
 
				+        for j = 1:n
			
 
				+            if i != j
			
 
				+                dv ::Vector{Float64} = positions[j] - positions[i]
			
 
				+                
			
 
				+                sumacc = sumacc + G * masses[j] * dv / (mod2(dv) + eps)^3
			
 
				+            end
			
 
				+        end
			
 
				+        accelerations[i] = sumacc
			
 
				+    end    
			
 
				+end
			
 
				+
			
 
				+function update_pos_vel(positions ::Vector{Vector{Float64}}, velocities ::Vector{Vector{Float64}}, accelerations ::Vector{Vector{Float64}}, dt ::Float64)
			
 
				+    n ::Int64 = length(positions)
			
 
				+    for i = 1:n
			
 
				+        velocities[i] = velocities[i] + accelerations[i] * dt
			
 
				+        positions[i] = positions[i] + velocities[i] * dt
			
 
				+    end
			
 
				+end
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+function nbody_jl(positions ::Vector{Vector{Float64}}, velocities ::Vector{Vector{Float64}}, accelerations ::Vector{Vector{Float64}}, masses ::Vector{Float64}, nbr_simulations ::Int64, eps ::Float64, dt ::Float64)
			
 
				+    for i = 1:nbr_simulations
			
 
				+        compute_accelerations(positions, accelerations, masses, eps)
			
 
				+        update_pos_vel(positions, velocities, accelerations,dt)
			
 
				+        # pixels ::Array{Array{Int64}} = [[0,0,0] for i = 1:1000*1000]
			
 
				+        # graph_planets(pixels, positions, -4E8, 4E8, 1000, 1000, "nbody$i.ppm")
			
 
				+    end
			
 
				+end
			
--- a/julia/tst/nbody/nbody_between.c
+++ b/julia/tst/nbody/nbody_between.c
@@ -0,0 +1,363 @@
 
				+#include <stdlib.h>
			
 
				+#include <stdio.h>
			
 
				+#include <starpu.h>
			
 
				+#include <math.h>
			
 
				+#include <stdint.h>
			
 
				+#include "../includes/display.h"
			
 
				+#include "../includes/sorting.h"
			
 
				+
			
 
				+struct Param {
			
 
				+	unsigned taskx;
			
 
				+	double epsilon;
			
 
				+};
			
 
				+
			
 
				+void nbody_acc(void **, void *);
			
 
				+void nbody_updt(void **, void *);
			
 
				+
			
 
				+void CUDA_nbody_acc(void **, void *);
			
 
				+void CUDA_nbody_updt(void **, void *);
			
 
				+
			
 
				+static struct starpu_perfmodel model =
			
 
				+{
			
 
				+	.type = STARPU_HISTORY_BASED,
			
 
				+	.symbol = "history_perf"
			
 
				+};
			
 
				+
			
 
				+static struct starpu_codelet cl =
			
 
				+{
			
 
				+	.cpu_funcs = {nbody_acc},
			
 
				+	.cuda_funcs = {CUDA_nbody_acc},
			
 
				+	//STRUCT PARAM
			
 
				+	/* .nbuffers = 3, */
			
 
				+	/* .modes = {STARPU_RW, STARPU_RW, STARPU_RW}, */
			
 
				+	//
			
 
				+	
			
 
				+	//ARRAY PARAM
			
 
				+	.nbuffers = 5,
			
 
				+	.modes = {STARPU_RW, STARPU_RW, STARPU_RW, STARPU_RW, STARPU_RW},
			
 
				+	//
			
 
				+	.model = &model
			
 
				+};
			
 
				+
			
 
				+
			
 
				+static struct starpu_codelet cl2 =
			
 
				+{
			
 
				+	.cpu_funcs = {nbody_updt},
			
 
				+	.cuda_funcs = {CUDA_nbody_updt},
			
 
				+	//STRUCT PARAM
			
 
				+	/* .nbuffers = 3, */
			
 
				+	/* .modes = {STARPU_RW, STARPU_RW, STARPU_R}, */
			
 
				+	//
			
 
				+	
			
 
				+	//ARRAY PARAM
			
 
				+	.nbuffers = 4,
			
 
				+	.modes = {STARPU_RW, STARPU_RW, STARPU_R, STARPU_R},
			
 
				+	//
			
 
				+	.model = &model
			
 
				+};
			
 
				+
			
 
				+void nbody_with_starpu(double *positions, double *velocities, double *accelerations, double *masses, double *parameters, unsigned nbr_planets, unsigned nbr_simulations, unsigned nslices)
			
 
				+{
			
 
				+
			
 
				+
			
 
				+
			
 
				+	starpu_data_handle_t P_handle, V_handle, A_handle, M_handle, Par_handle, Id_handle;
			
 
				+
			
 
				+       
			
 
				+	starpu_matrix_data_register(&P_handle, STARPU_MAIN_RAM, (uintptr_t)positions, 2, 2, nbr_planets, sizeof(double));
			
 
				+	starpu_matrix_data_register(&V_handle, STARPU_MAIN_RAM, (uintptr_t)velocities, 2, 2, nbr_planets, sizeof(double));
			
 
				+	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)accelerations, 2, 2, nbr_planets, sizeof(double));
			
 
				+	starpu_vector_data_register(&M_handle, STARPU_MAIN_RAM, (uintptr_t)masses, nbr_planets, sizeof(double));
			
 
				+	starpu_vector_data_register(&Par_handle, STARPU_MAIN_RAM, (uintptr_t)parameters, 3, sizeof(double));
			
 
				+	struct starpu_data_filter vert = 
			
 
				+		{
			
 
				+			.filter_func = starpu_matrix_filter_vertical_block,
			
 
				+			.nchildren = nslices
			
 
				+		};
			
 
				+	
			
 
				+	unsigned i;
			
 
				+
			
 
				+
			
 
				+	int64_t *Id = malloc(nslices * sizeof(int64_t));
			
 
				+		
			
 
				+	for (i = 0; i < nbr_simulations; i++){
			
 
				+		
			
 
				+		/* printf("Simulation %d: \n", i); */
			
 
				+		
			
 
				+		starpu_data_partition(A_handle, &vert);
			
 
				+
			
 
				+		//STRUCT PARAM
			
 
				+		/* struct Param *params = malloc(nslices * sizeof(struct Param)); */
			
 
				+		
			
 
				+		
			
 
				+		/* unsigned epsilon = 2.5e8; */
			
 
				+		//
			
 
				+
			
 
				+		int64_t task1, task2;
			
 
				+		
			
 
				+		
			
 
				+		for (task1 = 0; task1 < nslices; task1++){
			
 
				+			
			
 
				+			
			
 
				+			struct starpu_task *task = starpu_task_create();
			
 
				+			
			
 
				+			Id[task1] = task1;
			
 
				+			starpu_vector_data_register(&Id_handle, STARPU_MAIN_RAM, (uintptr_t)&(Id[task1]), 1, sizeof(int64_t));
			
 
				+			//STRUCT PARAM
			
 
				+			/* struct Param param = {task1, epsilon}; */
			
 
				+			
			
 
				+			/* params[task1] = param; */
			
 
				+			//
			
 
				+			
			
 
				+			task->cl = &cl;
			
 
				+			task->handles[0] = P_handle;
			
 
				+			task->handles[1] = starpu_data_get_sub_data(A_handle, 1, task1);
			
 
				+			task->handles[2] = M_handle;
			
 
				+			task->handles[3] = Par_handle;
			
 
				+			task->handles[4] = Id_handle;
			
 
				+			//STRUCT PARAM
			
 
				+			/* task->cl_arg = &(params[task1]); */
			
 
				+			/* task->cl_arg_size = sizeof(struct Param); */
			
 
				+			//
			
 
				+			
			
 
				+			starpu_task_submit(task);
			
 
				+			starpu_data_unregister_submit(Id_handle);
			
 
				+		}
			
 
				+		starpu_task_wait_for_all();
			
 
				+
			
 
				+////////////////////////
			
 
				+
			
 
				+		starpu_data_partition(P_handle, &vert);
			
 
				+		starpu_data_partition(V_handle, &vert);
			
 
				+		
			
 
				+		for (task2 = 0; task2 < nslices; task2++){
			
 
				+			
			
 
				+			struct starpu_task *task = starpu_task_create();
			
 
				+
			
 
				+			//STRUCT PARAM
			
 
				+			/* struct Param param = {task1, epsilon}; */
			
 
				+			
			
 
				+			/* params[task2] = param; */
			
 
				+			//
			
 
				+
			
 
				+			task->cl = &cl2;
			
 
				+			task->handles[0] = starpu_data_get_sub_data(P_handle, 1, task2);
			
 
				+			task->handles[1] = starpu_data_get_sub_data(V_handle, 1, task2);
			
 
				+			task->handles[2] = starpu_data_get_sub_data(A_handle, 1, task2);
			
 
				+			task->handles[3] = Par_handle;
			
 
				+			//STRUCT PARAM
			
 
				+			/* task->cl_arg = &(params[task2]); */
			
 
				+			/* task->cl_arg_size = sizeof(struct Param); */
			
 
				+			//
			
 
				+			starpu_task_submit(task);
			
 
				+		}
			
 
				+		
			
 
				+		starpu_task_wait_for_all();
			
 
				+		
			
 
				+		starpu_data_unpartition(P_handle, STARPU_MAIN_RAM);
			
 
				+		starpu_data_unpartition(V_handle, STARPU_MAIN_RAM);
			
 
				+		starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
			
 
				+		
			
 
				+		/* char filename[38]; */
			
 
				+	
			
 
				+		/* sprintf(filename, "PPM/nbody%d_%d.ppm", nbr_planets, i + 1); */
			
 
				+	
			
 
				+		/* nbody_graph_transpose(filename, positions, nbr_planets, 1000, 1000, -4e8, 4e8); */
			
 
				+
			
 
				+	}
			
 
				+
			
 
				+	
			
 
				+	starpu_data_unregister(P_handle);
			
 
				+	starpu_data_unregister(V_handle);
			
 
				+	starpu_data_unregister(A_handle);
			
 
				+	starpu_data_unregister(M_handle);
			
 
				+	starpu_data_unregister(Par_handle);
			
 
				+
			
 
				+		/* char filename[36]; */
			
 
				+	
			
 
				+		/* sprintf(filename, "PPM/bug%d_%d.ppm", nbr_planets, i); */
			
 
				+	
			
 
				+		/* nbody_graph(filename, positions, nbr_planets, 1000, 1000, -4e8, 4e8); */
			
 
				+
			
 
				+}
			
 
				+
			
 
				+void init_positions(double *positions, unsigned nbr_planets)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+	double qiX, qiY;
			
 
				+	for (i = 0; i < nbr_planets; i++){
			
 
				+		double angle = ((RAND_MAX - rand()) / (double) (RAND_MAX)) * 2.0 * M_PI;
			
 
				+		double distToCenter = ((RAND_MAX - rand()) / (double) (RAND_MAX)) * 1.0e8 + 1.0e8;
			
 
				+
			
 
				+		qiX = cos(angle) * distToCenter;
			
 
				+		qiY = sin(angle) * distToCenter;
			
 
				+
			
 
				+		positions[2*i] = qiX;
			
 
				+		positions[2*i + 1] = qiY;
			
 
				+
			
 
				+
			
 
				+	       
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void init_velocities(double *positions, double *velocities, unsigned nbr_planets)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < nbr_planets; i++){
			
 
				+		double viX = positions[2*i+1] * 4.0e-6;
			
 
				+		double viY = -positions[2*i] * 4.0e-6;
			
 
				+		velocities[2*i] = viX;
			
 
				+		velocities[2*i + 1] = viY;
			
 
				+		
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void init_masses(double *masses, unsigned nbr_planets)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < nbr_planets; i++){
			
 
				+		double mi = (rand() / (double) RAND_MAX) * 1e22;
			
 
				+		masses[i] = mi;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+
			
 
				+	
			
 
				+
			
 
				+double median_times(unsigned nbr_planets, unsigned nbr_simulations, unsigned nslices, unsigned nbr_tests)
			
 
				+{
			
 
				+	double exec_times[nbr_tests];
			
 
				+
			
 
				+	/* double *positions = malloc(4 * sizeof(double)); */
			
 
				+	/* double *velocities = calloc(4, sizeof(double)); */
			
 
				+	/* double *accelerations = calloc(4, sizeof(double)); */
			
 
				+	double *positions = malloc(nbr_planets * 2 * sizeof(double));
			
 
				+	double *velocities = malloc(nbr_planets * 2 * sizeof(double));
			
 
				+	double *accelerations = calloc(nbr_planets * 2, sizeof(double));
			
 
				+	double *masses = malloc(nbr_planets * sizeof(double));
			
 
				+	double *parameters = malloc(3 * sizeof(double));
			
 
				+
			
 
				+	double G = 6.67408e-11;
			
 
				+	/* double dt = 36000; */
			
 
				+	double dt = 3600;
			
 
				+	double epsilon = 2.5e8;
			
 
				+
			
 
				+	parameters[0] = G;
			
 
				+	parameters[1] = dt;
			
 
				+	parameters[2] = epsilon;
			
 
				+
			
 
				+	init_positions(positions, nbr_planets);
			
 
				+	init_velocities(positions, velocities, nbr_planets);
			
 
				+	init_masses(masses, nbr_planets);
			
 
				+	/* positions[0] = 0; */
			
 
				+	/* positions[1] = 300000; */
			
 
				+	/* positions[2] = 600000; */
			
 
				+	/* positions[3] = 300000; */
			
 
				+	/* masses[0] = 5.9e24; */
			
 
				+	/* masses[1] = 5.9e24; */
			
 
				+
			
 
				+	/* unsigned i; */
			
 
				+
			
 
				+	/* for (i = 0; i < nbr_planets; i++){ */
			
 
				+	/* 	accelerations[i] = i; */
			
 
				+	/* } */
			
 
				+
			
 
				+	
			
 
				+	
			
 
				+	/* nbody_graph("PPM/bug0.ppm", positions, nbr_planets, 1000, 1000, -4e8, 4e8); */
			
 
				+
			
 
				+	/* unsigned k; */
			
 
				+       
			
 
				+	/* for (k = 1; k <= nbr_simulations; k++){ */
			
 
				+	
			
 
				+	double start, stop, exec_t;
			
 
				+	unsigned i;
			
 
				+
			
 
				+
			
 
				+	
			
 
				+	for (i = 0; i < nbr_tests; i++){
			
 
				+
			
 
				+		start = starpu_timing_now();
			
 
				+
			
 
				+		nbody_with_starpu(positions, velocities, accelerations, masses, parameters, nbr_planets, nbr_simulations, nslices);
			
 
				+		
			
 
				+		stop = starpu_timing_now();
			
 
				+
			
 
				+		exec_t = (stop - start) / 1.e6;
			
 
				+
			
 
				+		exec_times[i] = exec_t;
			
 
				+	}
			
 
				+	/* printf("\n\nSIMULATION %d:\n\n", k); */
			
 
				+	
			
 
				+	
			
 
				+	/* char filename[23]; */
			
 
				+	
			
 
				+	/* sprintf(filename, "PPM/bug%d.ppm", k); */
			
 
				+	
			
 
				+	/* nbody_graph(filename, positions, nbr_planets, 1000, 1000, -4e8, 4e8); */
			
 
				+	/* } */
			
 
				+
			
 
				+	/* for (i = 0; i < nbr_planets; i++){ */
			
 
				+	/* 	printf("%f %f\n", accelerations[2*i], accelerations[2*i + 1]); */
			
 
				+	/* } */
			
 
				+	     
			
 
				+	
			
 
				+	free(positions);
			
 
				+	free(velocities);
			
 
				+	free(accelerations);
			
 
				+	free(masses);
			
 
				+
			
 
				+	quicksort(exec_times, 0, nbr_tests - 1);
			
 
				+
			
 
				+	return exec_times[nbr_tests / 2];
			
 
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+void display_times(unsigned start_nbr, unsigned step_nbr, unsigned stop_nbr, unsigned nbr_simulations, unsigned nslices, unsigned nbr_tests)
			
 
				+{
			
 
				+
			
 
				+	FILE *myfile;
			
 
				+	
			
 
				+	myfile = fopen("DAT/nbody_c_array_times.dat", "w");
			
 
				+
			
 
				+	unsigned nbr_planets;
			
 
				+	
			
 
				+	for (nbr_planets = start_nbr; nbr_planets <= stop_nbr; nbr_planets += step_nbr){
			
 
				+		double t = median_times(nbr_planets, nbr_simulations, nslices, nbr_tests);
			
 
				+		printf("ARRAY: %u planets: %f seconds\n", nbr_planets, t);
			
 
				+		fprintf(myfile, "%f\n", t);
			
 
				+	}
			
 
				+	fclose(myfile);
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char * argv[])
			
 
				+{
			
 
				+
			
 
				+	if (argc != 7){
			
 
				+		printf("Usage: %s start_nbr step_nbr stop_nbr nbr_simulations nslices nbr_tests\n", argv[0]);
			
 
				+		return 1;
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+	if (starpu_init(NULL) != EXIT_SUCCESS){
			
 
				+		fprintf(stderr, "ERROR\n");
			
 
				+		return 77;
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+	unsigned start_nbr = (unsigned) atoi(argv[1]);
			
 
				+	unsigned step_nbr = (unsigned) atoi(argv[2]);
			
 
				+	unsigned stop_nbr = (unsigned) atoi(argv[3]);
			
 
				+	unsigned nbr_simulations = (unsigned) atoi(argv[4]);
			
 
				+	unsigned nslices = (unsigned) atoi(argv[5]);
			
 
				+	unsigned nbr_tests = (unsigned) atoi(argv[6]);
			
 
				+
			
 
				+	srand(time(NULL));
			
 
				+	
			
 
				+	display_times(start_nbr, step_nbr, stop_nbr, nbr_simulations, nslices, nbr_tests);
			
 
				+
			
 
				+	starpu_shutdown();
			
 
				+	
			
 
				+	return 0;
			
 
				+}
			
--- a/julia/tst/nbody/nbody_def.jl
+++ b/julia/tst/nbody/nbody_def.jl
@@ -0,0 +1,366 @@
 
				+#include("./nbody_display.jl")
			
 
				+include("nbody.jl")
			
 
				+function nbody_with_starpu(positions ::Matrix{Float64}, velocities ::Matrix{Float64}, accelerations ::Matrix{Float64}, masses ::Vector{Float64}, parameters ::Vector{Float64}, nbr_simulations ::Int64, nslices ::Int64, nbr_planets ::Int64)
			
 
				+    
			
 
				+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslices)
			
 
				+
			
 
				+    for i = 1:nbr_simulations
			
 
				+ 
			
 
				+        @starpu_block let
			
 
				+            hPOS, hVEL, hACC, hPAR, hMA = starpu_data_register(positions,velocities,accelerations,parameters,masses)
			
 
				+            
			
 
				+            starpu_data_partition(hACC,vert)
			
 
				+            @starpu_sync_tasks for task in (1:nslices)
			
 
				+                @starpu_block let
			
 
				+                    id = Int64[task-1]
			
 
				+                    hID = starpu_data_register(id)
			
 
				+                    @starpu_async_cl claccst(hPOS,hACC[task],hMA,hPAR,hID)
			
 
				+                end
			
 
				+            end
			
 
				+            
			
 
				+            starpu_data_partition(hPOS,vert)
			
 
				+            starpu_data_partition(hVEL,vert)
			
 
				+            
			
 
				+            @starpu_sync_tasks for task in (1:nslices)
			
 
				+                @starpu_async_cl clupdtst(hPOS[task],hVEL[task],hACC[task],hPAR)
			
 
				+            end
			
 
				+            
			
 
				+        end
			
 
				+        # pixels ::Array{Array{Int64}} = [[0,0,0] for i = 1:1000*1000]
			
 
				+        # graph_planets(pixels, positions, -4E8, 4E8, 1000, 1000, "PPM/nbody_st$(nbr_planets)_$i.ppm")
			
 
				+    end
			
 
				+    return nothing
			
 
				+end
			
 
				+
			
 
				+
			
 
				+
			
 
				+function nbody_with_starpu_cpu(positions ::Matrix{Float64}, velocities ::Matrix{Float64}, accelerations ::Matrix{Float64}, masses ::Vector{Float64}, parameters ::Vector{Float64}, nbr_simulations ::Int64, nslices ::Int64)
			
 
				+
			
 
				+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslices)
			
 
				+ 
			
 
				+    for i = 1:nbr_simulations
			
 
				+
			
 
				+        @starpu_block let
			
 
				+            hPOS, hVEL, hACC, hPAR, hMA = starpu_data_register(positions,velocities,accelerations,parameters,masses)
			
 
				+            
			
 
				+            starpu_data_partition(hACC,vert)
			
 
				+            @starpu_sync_tasks for task in (1:nslices)
			
 
				+                id = Int64[task-1]
			
 
				+                hID = starpu_data_register(id)
			
 
				+                @starpu_async_cl clacccpu(hPOS,hACC[task],hMA,hPAR,hID)
			
 
				+            end
			
 
				+            
			
 
				+            starpu_data_partition(hPOS,vert)
			
 
				+            starpu_data_partition(hVEL,vert)
			
 
				+            
			
 
				+            @starpu_sync_tasks for task in (1:nslices)
			
 
				+                @starpu_async_cl clupdtcpu(hPOS[task],hVEL[task],hACC[task],hPAR)
			
 
				+            end
			
 
				+            
			
 
				+        end
			
 
				+        
			
 
				+        # pixels ::Array{Array{Int64}} = [[0,0,0] for i = 1:1000*1000]
			
 
				+        # graph_planets(pixels, positions, -4E8, 4E8, 1000, 1000, "PPM/nbody_cpu$i.ppm")
			
 
				+    end
			
 
				+    return nothing
			
 
				+end
			
 
				+
			
 
				+
			
 
				+
			
 
				+function nbody_with_starpu_gpu(positions ::Matrix{Float64}, velocities ::Matrix{Float64}, accelerations ::Matrix{Float64}, masses ::Vector{Float64}, parameters ::Vector{Float64}, nbr_simulations ::Int64, nslices ::Int64)
			
 
				+
			
 
				+    vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslices)
			
 
				+
			
 
				+    for i = 1:nbr_simulations
			
 
				+
			
 
				+        @starpu_block let
			
 
				+            hPOS, hVEL, hACC, hPAR, hMA = starpu_data_register(positions,velocities,accelerations,parameters,masses)
			
 
				+            
			
 
				+            starpu_data_partition(hACC,vert)
			
 
				+            @starpu_sync_tasks for task in (1:nslices)
			
 
				+                id = Int64[task-1]
			
 
				+                hID = starpu_data_register(id)
			
 
				+                @starpu_async_cl claccgpu(hPOS,hACC[task],hMA,hPAR,hID)
			
 
				+            end
			
 
				+            
			
 
				+            starpu_data_partition(hPOS,vert)
			
 
				+            starpu_data_partition(hVEL,vert)
			
 
				+            
			
 
				+            @starpu_sync_tasks for task in (1:nslices)
			
 
				+                @starpu_async_cl clupdtgpu(hPOS[task],hVEL[task],hACC[task],hPAR)
			
 
				+            end
			
 
				+        end       
			
 
				+    
			
 
				+        # pixels ::Array{Array{Int64}} = [[0,0,0] for i = 1:1000*1000]
			
 
				+        # graph_planets(pixels, positions, -4E8, 4E8, 1000, 1000, "PPM/nbody_gpu$i.ppm")
			
 
				+    end
			
 
				+    return nothing
			
 
				+end
			
 
				+
			
 
				+
			
 
				+
			
 
				+# function display_times(starting_nbr_planets::Int64, step_nbr ::Int64, nbr_steps ::Int64, nbr_simulations::Int64, nb_tests ::Int64, nslices::Int64)
			
 
				+#     width = 1000
			
 
				+#     height = 1000
			
 
				+
			
 
				+#     times_starpu ::Vector{Float64} = [0 for i = 1:nbr_steps+1]
			
 
				+#     times_julia ::Vector{Float64} = [0 for i = 1:nbr_steps+1]
			
 
				+    
			
 
				+#     for k = 0:nbr_steps
			
 
				+#         nbr_planets ::Int64 = starting_nbr_planets + k * step_nbr
			
 
				+#         println("Number of planets: $nbr_planets")
			
 
				+
			
 
				+#         epsilon ::Float64 = 2.5E8
			
 
				+#         dt ::Float64 = 36000
			
 
				+#         G ::Float64 = 6.67408E-11
			
 
				+#         parameters ::Vector{Float64} = [G,dt,epsilon]
			
 
				+        
			
 
				+#         maxcoord ::Int64 = 2E8
			
 
				+#         mincoord ::Int64 = -2E8
			
 
				+        
			
 
				+#         positions ::Matrix{Float64} = zeros(2, nbr_planets)
			
 
				+#         velocities ::Matrix{Float64} = zeros(2, nbr_planets)
			
 
				+#         accelerations ::Matrix{Float64} = zeros(2,nbr_planets)
			
 
				+        
			
 
				+#         positionsjl ::Vector{Vector{Float64}} = [[0, 0] for i = 1:nbr_planets]
			
 
				+#         velocitiesjl ::Vector{Vector{Float64}} = [[0, 0] for i = 1:nbr_planets]
			
 
				+#         accelerationsjl ::Vector{Vector{Float64}} = [[0, 0] for i = 1:nbr_planets]
			
 
				+        
			
 
				+        
			
 
				+#         masses ::Vector{Float64} = [0 for i = 1:nbr_planets]
			
 
				+        
			
 
				+        
			
 
				+        
			
 
				+
			
 
				+#         for i = 1:nbr_planets
			
 
				+#             mi ::Float64 = rand() * 5E22
			
 
				+#             ri = mi * 2.5E-15
			
 
				+            
			
 
				+#             angle ::Float64 = rand() * 2 * pi
			
 
				+#             distToCenter ::Float64 = rand() * 1.0E8 + 1.0E8
			
 
				+            
			
 
				+#             qix ::Float64 = cos(angle) * distToCenter
			
 
				+#             qiy ::Float64 = sin(angle) * distToCenter
			
 
				+            
			
 
				+#             vix ::Float64 = qiy * 4.0E-6
			
 
				+#             viy ::Float64 = -qix * 4.0E-6
			
 
				+            
			
 
				+#             masses[i] = mi
			
 
				+        
			
 
				+#             positions[1,i] = qix
			
 
				+#             positions[2,i] = qiy
			
 
				+            
			
 
				+#             positionsjl[i] = [qix, qiy]
			
 
				+            
			
 
				+#             velocities[1,i] = vix
			
 
				+#             velocities[2,i] = viy
			
 
				+            
			
 
				+#             velocitiesjl[i] = [vix, viy]
			
 
				+#         end
			
 
				+        
			
 
				+#         max_value = 4E8
			
 
				+#         min_value = -4E8
			
 
				+        
			
 
				+#         println("Starpu...")
			
 
				+#         tic()
			
 
				+#         for i = 1:nbr_simulations
			
 
				+#             nbody_with_starpu(positions, velocities, accelerations, masses, parameters, nslices)
			
 
				+#         end
			
 
				+#         t_starpu = toq()
			
 
				+        
			
 
				+#         println("No Starpu...")
			
 
				+#         tic()
			
 
				+#         for i = 1:nbr_simulations
			
 
				+#             nbody_jl(positionsjl, velocitiesjl, accelerationsjl, masses, epsilon, dt)
			
 
				+#         end
			
 
				+#         t_julia = toq()
			
 
				+        
			
 
				+        
			
 
				+        
			
 
				+#         times_starpu[k+1] = t_starpu
			
 
				+#         times_julia[k+1] = t_julia
			
 
				+#     end
			
 
				+#     open("./DAT/nbody.dat", "w") do f
			
 
				+#         for i = 0:nbr_steps
			
 
				+#             write(f, "$(starting_nbr_planets + i*step_nbr)")
			
 
				+#             write(f, " $(times_starpu[i+1])")
			
 
				+#             write(f, " $(times_julia[i+1])\n")
			
 
				+#         end
			
 
				+#     end
			
 
				+# end
			
 
				+
			
 
				+
			
 
				+function set_to_zero(A ::Array{<:Real,2})
			
 
				+    height,width = size(A)
			
 
				+    for i = 1:height
			
 
				+        for j = 1:width
			
 
				+            A[i,j] = 0
			
 
				+        end
			
 
				+    end
			
 
				+end
			
 
				+
			
 
				+function median_times(nbr_tests ::Int64, nbr_planets ::Int64, nbr_simulations ::Int64, nslices ::Int64)
			
 
				+
			
 
				+######################### INITIALIZATION #########################
			
 
				+
			
 
				+    width ::Int64 = 1000
			
 
				+    height ::Int64 = 1000
			
 
				+
			
 
				+    epsilon ::Float64 = 2.5E8
			
 
				+    dt ::Float64 = 3600
			
 
				+    G ::Float64 = 6.67408E-11
			
 
				+    parameters ::Vector{Float64} = [G,dt,epsilon]
			
 
				+
			
 
				+    # Coordinate interval for the final display screen.
			
 
				+    maxcoord ::Int64 = 2E8
			
 
				+    mincoord ::Int64 = -2E8
			
 
				+
			
 
				+    exec_times_st ::Vector{Float64} = [0 for i = 1:nbr_tests]
			
 
				+    exec_times_cpu ::Vector{Float64} = [0 for i = 1:nbr_tests]
			
 
				+    exec_times_gpu ::Vector{Float64} = [0 for i = 1:nbr_tests]
			
 
				+    # exec_times_jl ::Vector{Float64} = [0 for i = 1:nbr_tests]
			
 
				+
			
 
				+    # Arrays used for each of the starpu-using functions. 
			
 
				+    positions ::Matrix{Float64} = zeros(2, nbr_planets)
			
 
				+    velocities ::Matrix{Float64} = zeros(2, nbr_planets)
			
 
				+    accelerations_st ::Matrix{Float64} = zeros(2, nbr_planets)    
			
 
				+    
			
 
				+    # Arrays used for the naive julia function.
			
 
				+    # positions_jl ::Vector{Vector{Float64}} = [[0, 0] for i = 1:nbr_planets]
			
 
				+    # velocities_jl ::Vector{Vector{Float64}} = [[0, 0] for i = 1:nbr_planets]
			
 
				+    # accelerations_jl ::Vector{Vector{Float64}} = [[0, 0] for i = 1:nbr_planets]
			
 
				+    
			
 
				+    
			
 
				+    masses ::Vector{Float64} = [0 for i = 1:nbr_planets]
			
 
				+    
			
 
				+
			
 
				+    for k = 1:nbr_tests
			
 
				+        println("Test $k...")
			
 
				+        # Initializing the starpu and naive julia arrays with the same values.
			
 
				+        for i = 1:nbr_planets
			
 
				+            mi ::Float64 = rand() * 5E22
			
 
				+            ri = mi * 2.5E-15
			
 
				+            
			
 
				+            angle ::Float64 = rand() * 2 * pi
			
 
				+            distToCenter ::Float64 = rand() * 1.0E8 + 1.0E8
			
 
				+            
			
 
				+            qix ::Float64 = cos(angle) * distToCenter
			
 
				+            qiy ::Float64 = sin(angle) * distToCenter
			
 
				+            
			
 
				+            vix ::Float64 = qiy * 4.0E-6
			
 
				+            viy ::Float64 = -qix * 4.0E-6
			
 
				+            
			
 
				+            masses[i] = mi
			
 
				+            
			
 
				+            positions[1,i] = qix
			
 
				+            positions[2,i] = qiy
			
 
				+            
			
 
				+            #positions_jl[i] = [qix, qiy]
			
 
				+            
			
 
				+            velocities[1,i] = vix
			
 
				+            velocities[2,i] = viy
			
 
				+            
			
 
				+            #velocities_jl[i] = [vix, viy]
			
 
				+        end
			
 
				+
			
 
				+
			
 
				+######################### SIMULATION #########################
			
 
				+
			
 
				+        # Using new arrays for the starpu functions, so we can keep in memory the initial values.
			
 
				+        positions_st ::Matrix{Float64} = copy(positions)
			
 
				+        velocities_st ::Matrix{Float64} = copy(velocities)
			
 
				+        set_to_zero(accelerations_st)
			
 
				+
			
 
				+        tic()
			
 
				+        nbody_with_starpu(positions_st, velocities_st, accelerations_st, masses, parameters, nbr_simulations, nslices, nbr_planets)
			
 
				+        t_st = toq()
			
 
				+
			
 
				+
			
 
				+
			
 
				+        # positions_st = copy(positions)
			
 
				+        # velocities_st = copy(velocities)
			
 
				+        # set_to_zero(accelerations_st)
			
 
				+                
			
 
				+        # tic()
			
 
				+        # nbody_with_starpu_cpu(positions_st, velocities_st, accelerations_st, masses, parameters, nbr_simulations, nslices)
			
 
				+        # t_cpu = toq()
			
 
				+        
			
 
				+
			
 
				+
			
 
				+        # positions_st = copy(positions)
			
 
				+        # velocities_st = copy(velocities)
			
 
				+        # set_to_zero(accelerations_st)
			
 
				+
			
 
				+        # tic()
			
 
				+        # nbody_with_starpu_gpu(positions_st, velocities_st, accelerations_st, masses, parameters, nbr_simulations, nslices)
			
 
				+        # t_gpu = toq()
			
 
				+
			
 
				+        
			
 
				+        #tic()
			
 
				+        #nbody_jl(positions_jl, velocities_jl, accelerations_jl, masses, nbr_simulations, epsilon, dt)
			
 
				+        #t_jl = toq()
			
 
				+        
			
 
				+        exec_times_st[k] = t_st
			
 
				+        # exec_times_cpu[k] = t_cpu
			
 
				+        # exec_times_gpu[k] = t_gpu
			
 
				+        #exec_times_jl[k] = t_jl
			
 
				+        
			
 
				+    end
			
 
				+
			
 
				+    sort!(exec_times_st)
			
 
				+    # sort!(exec_times_cpu)
			
 
				+    # sort!(exec_times_gpu)
			
 
				+    #sort!(exec_times_jl)
			
 
				+
			
 
				+    res ::Vector{Float64} = [exec_times_st[1 + div(nbr_tests-1,2)]]#, exec_times_cpu[1 + div(nbr_tests-1,2)], exec_times_gpu[1 + div(nbr_tests-1,2)]]#, exec_times_jl[1 + div(nbr_tests-1,2)]]
			
 
				+    
			
 
				+    return res
			
 
				+end
			
 
				+# Adds the median times of each function inside a .DAT file.
			
 
				+function display_times(start_nbr ::Int64, step_nbr ::Int64, stop_nbr ::Int64, nbr_simulations ::Int64, nslices ::Int64, nbr_tests ::Int64)
			
 
				+
			
 
				+
			
 
				+    # mtc = map( (x->parse(Float64,x)), open("DAT/nbody_c_times.dat") do f
			
 
				+    #         readlines(f)
			
 
				+    #         end)
			
 
				+
			
 
				+    mtjl = map( (x->parse(Float64,x)), open("../DAT/nbody_jl.dat") do f
			
 
				+            readlines(f)
			
 
				+            end)
			
 
				+
			
 
				+    mtcstr = map( (x->parse(Float64,x)), open("../DAT/nbody_c_struct_times.dat") do f
			
 
				+                  readlines(f)
			
 
				+                  end)
			
 
				+
			
 
				+    mtcarr = map( (x->parse(Float64,x)), open("../DAT/nbody_c_array_times.dat") do f
			
 
				+                  readlines(f)
			
 
				+                  end)
			
 
				+
			
 
				+
			
 
				+    i = 1;
			
 
				+
			
 
				+    #open("./DAT/nbody_jl.dat", "w") do fjl
			
 
				+    open("../DAT/nbody_jl_array_times.dat", "w") do ft
			
 
				+        open("../DAT/nbody_speedups.dat", "w") do f
			
 
				+            for nbr_planets in (start_nbr : step_nbr : stop_nbr)
			
 
				+                println("$(nbr_planets) planets:")
			
 
				+                mt ::Vector{Float64} = median_times(nbr_tests, nbr_planets, nbr_simulations, nslices)
			
 
				+                println("C struct time: $(mtcstr[i])")
			
 
				+                println("C array time: $(mtcarr[i])")
			
 
				+                println("Starpujl time: $(mt[1])")
			
 
				+                # println("CPUjl time: $(mt[2])")
			
 
				+                # println("GPUjl time: $(mt[3])")
			
 
				+                println("Julia time: $(mtjl[i])")
			
 
				+                write(f, "$(nbr_planets) $(mtjl[i]/mt[1]) $(mtjl[i]/mtcstr[i]) $(mtjl[i]/mtcarr[i])\n") 
			
 
				+                write(ft, "$(mt[1])\n")
			
 
				+                #write(fjl, "$(mt[4])\n")
			
 
				+                i = i + 1
			
 
				+                
			
 
				+            end
			
 
				+        end
			
 
				+    end
			
 
				+    #end
			
 
				+end
			
 
				+
			
 
				+
			
 
				+        
			
--- a/julia/tst/nbody/nbody_display.jl
+++ b/julia/tst/nbody/nbody_display.jl
@@ -0,0 +1,74 @@
 
				+function get_planet_pixels(X ::Int64, Y ::Int64)
			
 
				+    pix ::Array{Tuple{Int64,Int64}} = []
			
 
				+    for i = X-1:X+1
			
 
				+        for j = Y-3:Y+3
			
 
				+            push!(pix,(i,j))
			
 
				+        end
			
 
				+    end
			
 
				+    for i = Y-2:Y+2
			
 
				+        push!(pix,(X-2,i))
			
 
				+        push!(pix,(X+2,i))
			
 
				+    end
			
 
				+    for i = Y-1:Y+1
			
 
				+        push!(pix,(X-3,i))
			
 
				+        push!(pix,(X+3,i))
			
 
				+    end
			
 
				+    return pix
			
 
				+end
			
 
				+
			
 
				+
			
 
				+function is_inside(t ::Tuple{Int64,Int64}, width ::Int64, height ::Int64)
			
 
				+    if (t[1] > 0 && t[1] <= width && t[2] > 0 && t[2] <= height)
			
 
				+        return true
			
 
				+    else
			
 
				+        return false
			
 
				+    end
			
 
				+end
			
 
				+
			
 
				+function graph_planets(pixels ::Array{Array{Int64}}, positions ::Matrix{Float64}, min_value ::Float64, max_value ::Float64, width ::Int64, height ::Int64, file_name ::String)
			
 
				+    n = size(positions)[2]
			
 
				+    for i = 1:n
			
 
				+        X ::Int64= round( ( (positions[1, i] - min_value) / (max_value - min_value) ) * (width - 1) ) + 1
			
 
				+        Y ::Int64= round( (1 - ( (positions[2, i] - min_value) / (max_value - min_value) ) ) * (height - 1) ) + 1
			
 
				+        pix ::Array{Tuple{Int64,Int64}} = get_planet_pixels(X,Y)
			
 
				+        for pixel in pix
			
 
				+            if is_inside(pixel, width, height)
			
 
				+                pixels[(pixel[2] - 1)*width + pixel[1]] = [125, round(255*i/n)]
			
 
				+            end
			
 
				+        end
			
 
				+    end
			
 
				+
			
 
				+    open(file_name,"w") do f
			
 
				+        write(f, "P3\n$width $height\n255\n")
			
 
				+        for he = 1:height
			
 
				+            for wi = 1:width
			
 
				+                write(f, "$(pixels[(he - 1)*width + wi][1]) 0 $(pixels[(he - 1)*width + wi][2]) ")
			
 
				+            end
			
 
				+            write(f,"\n")
			
 
				+        end
			
 
				+    end
			
 
				+end
			
 
				+
			
 
				+# function graph_planets(pixels ::Array{Array{Int64}}, positions ::Vector{Vector{Float64}}, min_value ::Float64, max_value ::Float64, width ::Int64, height ::Int64, file_name ::String)
			
 
				+#     n = length(positions)
			
 
				+#     for i = 1:n
			
 
				+#         X ::Int64= round( ( (positions[i][1] - min_value) / (max_value - min_value) ) * (width - 1) ) + 1
			
 
				+#         Y ::Int64= round( (1 - ( (positions[i][2] - min_value) / (max_value - min_value) ) ) * (height - 1) ) + 1
			
 
				+#         pix ::Array{Tuple{Int64,Int64}} = get_planet_pixels(X,Y)
			
 
				+#         for pixel in pix
			
 
				+#             if is_inside(pixel, width, height)
			
 
				+#                 pixels[(pixel[2] - 1)*width + pixel[1]] = [125, round(255*i/n)]
			
 
				+#             end
			
 
				+#         end
			
 
				+#     end
			
 
				+
			
 
				+#     open(file_name,"w") do f
			
 
				+#         write(f, "P3\n$width $height\n255\n")
			
 
				+#         for he = 1:height
			
 
				+#             for wi = 1:width
			
 
				+#                 write(f, "$(pixels[(he - 1)*width + wi][1]) 0 $(pixels[(he - 1)*width + wi][2]) ")
			
 
				+#             end
			
 
				+#             write(f,"\n")
			
 
				+#         end
			
 
				+#     end
			
 
				+# end
			
--- a/julia/tst/nbody/nbody_generated.jl
+++ b/julia/tst/nbody/nbody_generated.jl
@@ -0,0 +1,68 @@
 
				+if length(ARGS) != 6
			
 
				+    println("Usage: julia nbody_generated.jl start_nbr step_nbr stop_nbr nbr_simulations nbr_slices nbr_tests")
			
 
				+    quit()
			
 
				+end
			
 
				+
			
 
				+if parse(Int64, ARGS[1]) % parse(Int64, ARGS[5]) != 0
			
 
				+    println("The number of slices must divide the number of planets.")
			
 
				+    quit()
			
 
				+end
			
 
				+
			
 
				+include("../../src/Wrapper/Julia/starpu_include.jl")
			
 
				+using StarPU
			
 
				+
			
 
				+@debugprint "starpu_init"
			
 
				+starpu_init(extern_task_path = "../build/generated_tasks_nbody.so")
			
 
				+
			
 
				+perfmodel = StarpuPerfmodel(
			
 
				+    perf_type = STARPU_HISTORY_BASED,
			
 
				+    symbol = "history_perf"
			
 
				+)
			
 
				+
			
 
				+# Normal starpu codelets
			
 
				+claccst = StarpuCodelet(
			
 
				+    cpu_func = "nbody_acc",
			
 
				+    gpu_func = "CUDA_nbody_acc",
			
 
				+    modes = [STARPU_R, STARPU_RW, STARPU_R, STARPU_R, STARPU_R],
			
 
				+    perfmodel = perfmodel
			
 
				+)
			
 
				+
			
 
				+clupdtst = StarpuCodelet(
			
 
				+    cpu_func = "nbody_updt",
			
 
				+    gpu_func = "CUDA_nbody_updt",
			
 
				+    modes = [STARPU_RW, STARPU_RW, STARPU_R, STARPU_R],
			
 
				+    perfmodel = perfmodel
			
 
				+)
			
 
				+
			
 
				+# CPU_only codelets
			
 
				+clacccpu = StarpuCodelet(
			
 
				+    cpu_func = "nbody_acc",
			
 
				+    modes = [STARPU_R, STARPU_RW, STARPU_R, STARPU_R, STARPU_R],
			
 
				+    perfmodel = perfmodel
			
 
				+)
			
 
				+
			
 
				+clupdtcpu = StarpuCodelet(
			
 
				+    cpu_func = "nbody_updt",
			
 
				+    modes = [STARPU_RW, STARPU_RW, STARPU_R,STARPU_R],
			
 
				+    perfmodel = perfmodel
			
 
				+)
			
 
				+
			
 
				+# GPU_only codelets
			
 
				+claccgpu = StarpuCodelet(
			
 
				+    gpu_func = "CUDA_nbody_acc",
			
 
				+    modes = [STARPU_R, STARPU_RW, STARPU_R, STARPU_R, STARPU_R],
			
 
				+    perfmodel = perfmodel
			
 
				+)
			
 
				+
			
 
				+clupdtgpu = StarpuCodelet(
			
 
				+    gpu_func = "CUDA_nbody_updt",
			
 
				+    modes = [STARPU_RW, STARPU_RW, STARPU_R,STARPU_R],
			
 
				+    perfmodel = perfmodel
			
 
				+)
			
 
				+
			
 
				+include("nbody_def.jl")
			
 
				+
			
 
				+display_times(map((x -> parse(Int64, x)), ARGS)...)
			
 
				+
			
 
				+@debugprint "starpu_shutdown"
			
 
				+starpu_shutdown()