Selaa lähdekoodia

Merge branch 'master' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu

Samuel Thibault 5 vuotta sitten
vanhempi
commit
0fe5a8b02d

+ 4 - 6
julia/examples/mandelbrot/Makefile

@@ -21,8 +21,6 @@ ifneq ($(ENABLE_CUDA),yes)
 	CUDA_OBJECTS:=
 endif
 
-LIBPATH=${PWD}/../StarPU.jl/lib
-
 all: ${EXTERNLIB}
 
 mandelbrot: mandelbrot.c cpu_mandelbrot.o #gpu_mandelbrot.o
@@ -47,12 +45,12 @@ clean:
 
 # Performance Tests
 cstarpu.dat: mandelbrot
-	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./mandelbrot -0.800671 -0.158392 32 32 4096 4 > $@
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./mandelbrot > $@
 julia_generatedc.dat:
-	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot.jl $@
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot.jl $@
 julia_native.dat:
-	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot_native.jl $@
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot_native.jl $@
 julia_calllib.dat: ${EXTERNLIB}
-	LD_LIBRARY_PATH+=${LIBPATH} JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot.jl julia_calllib.dat
+	JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot.jl julia_calllib.dat
 
 test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat

+ 17 - 13
julia/examples/mandelbrot/cpu_mandelbrot.c

@@ -1,44 +1,47 @@
 #include <stdio.h>
 #include <starpu.h>
 #include <math.h>
+#include "cpu_mandelbrot.h"
 
 void cpu_mandelbrot(void *descr[], void *cl_arg)
 {
         long long *pixels;
-        float *params;
 
         pixels = (long long int *)STARPU_MATRIX_GET_PTR(descr[0]);
-        params = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+        struct params *params = (struct params *) cl_arg;
 
-        long long width = STARPU_MATRIX_GET_NY(descr[0]);
-        long long height = STARPU_MATRIX_GET_NX(descr[0]);
+        long width = STARPU_MATRIX_GET_NY(descr[0]);
+        long height = STARPU_MATRIX_GET_NX(descr[0]);
         double zoom = width * 0.25296875;
         double iz = 1. / zoom;
         float diverge = 4.0;
         float max_iterations = (width/2) * 0.049715909 * log10(zoom);
         float imi = 1. / max_iterations;
-        float centerr = params[0];
-        float centeri = params[1];
-        float offset = params[2];
-        float dim = params[3];
+        double centerr = params->centerr;
+        double centeri = params->centeri;
+        long offset = params->offset;
+        long dim = params->dim;
         double cr = 0;
         double zr = 0;
         double ci = 0;
         double zi = 0;
-        long long n = 0;
+        long n = 0;
         double tmp = 0;
         int ldP = STARPU_MATRIX_GET_LD(descr[0]);
 
         long long x,y;
 
-        for (y = 0; y < height; y++){
-                for (x = 0; x < width; x++){
+        for (y = 0; y < height; y++)
+	{
+                for (x = 0; x < width; x++)
+		{
                         cr = centerr + (x - (dim/2)) * iz;
 			zr = cr;
                         ci = centeri + (y+offset - (dim/2)) * iz;
                         zi = ci;
 
-                        for (n = 0; n <= max_iterations; n++) {
+                        for (n = 0; n <= max_iterations; n++)
+			{
 				if (zr*zr + zi*zi>diverge) break;
                                 tmp = zr*zr - zi*zi + cr;
                                 zi = 2*zr*zi + ci;
@@ -54,7 +57,8 @@ void cpu_mandelbrot(void *descr[], void *cl_arg)
 
 char* CPU = "cpu_mandelbrot";
 char* GPU = "gpu_mandelbrot";
-extern char *starpu_find_function(char *name, char *device) {
+extern char *starpu_find_function(char *name, char *device)
+{
 	if (!strcmp(device,"gpu")) return GPU;
 	return CPU;
 }

+ 9 - 0
julia/examples/mandelbrot/cpu_mandelbrot.h

@@ -0,0 +1,9 @@
+struct params
+{
+        double centerr;
+        double centeri;
+        long offset;
+        long dim;
+};
+
+

+ 77 - 56
julia/examples/mandelbrot/mandelbrot.c

@@ -13,36 +13,35 @@
  *
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <starpu.h>
+#include "cpu_mandelbrot.h"
 
 void cpu_mandelbrot(void **, void *);
 void gpu_mandelbrot(void **, void *);
 
 static struct starpu_perfmodel model =
 {
-		.type = STARPU_HISTORY_BASED,
-		.symbol = "history_perf"
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "history_perf"
 };
 
 static struct starpu_codelet cl =
 {
-	.cpu_funcs = {cpu_mandelbrot},
+ 	.cpu_funcs = {cpu_mandelbrot},
 	//.cuda_funcs = {gpu_mandelbrot},
-	.nbuffers = 2,
-	.modes = {STARPU_W, STARPU_R},
+	.nbuffers = 1,
+	.modes = {STARPU_W},
 	.model = &model
 };
 
-
-void mandelbrot_with_starpu(long long *pixels, float *params, long long dim, long long nslicesx)
+void mandelbrot_with_starpu(long long *pixels, struct params *p, long long dim, long long nslicesx)
 {
 	starpu_data_handle_t pixels_handle;
-	starpu_data_handle_t params_handle;
 
 	starpu_matrix_data_register(&pixels_handle, STARPU_MAIN_RAM, (uintptr_t)pixels, dim, dim, dim, sizeof(long long));
-	starpu_matrix_data_register(&params_handle, STARPU_MAIN_RAM, (uintptr_t)params, 4*nslicesx, 4*nslicesx, 1, sizeof(float));
 
 	struct starpu_data_filter horiz =
 	{
@@ -51,90 +50,95 @@ void mandelbrot_with_starpu(long long *pixels, float *params, long long dim, lon
 	};
 
 	starpu_data_partition(pixels_handle, &horiz);
-	starpu_data_partition(params_handle, &horiz);
 
 	long long taskx;
 
-	for (taskx = 0; taskx < nslicesx; taskx++){
+	for (taskx = 0; taskx < nslicesx; taskx++)
+	{
 		struct starpu_task *task = starpu_task_create();
 
 		task->cl = &cl;
 		task->handles[0] = starpu_data_get_child(pixels_handle, taskx);
-		task->handles[1] = starpu_data_get_child(params_handle, taskx);
+		task->cl_arg = p;
+		task->cl_arg_size = sizeof(*p);
 		if (starpu_task_submit(task)!=0) fprintf(stderr,"submit task error\n");
 	}
 
 	starpu_task_wait_for_all();
 
 	starpu_data_unpartition(pixels_handle, STARPU_MAIN_RAM);
-	starpu_data_unpartition(params_handle, STARPU_MAIN_RAM);
-
 	starpu_data_unregister(pixels_handle);
-	starpu_data_unregister(params_handle);
 }
 
 void pixels2img(long long *pixels, long long width, long long height, const char *filename)
 {
-  FILE *fp = fopen(filename, "w");
-  if (!fp)
-    return;
+	FILE *fp = fopen(filename, "w");
+	if (!fp)
+		return;
 
-  int MAPPING[16][3] = {{66,30,15},{25,7,26},{9,1,47},{4,4,73},{0,7,100},{12,44,138},{24,82,177},{57,125,209},{134,181,229},{211,236,248},{241,233,191},{248,201,95},{255,170,0},{204,128,0},{153,87,0},{106,52,3}};
+	int MAPPING[16][3] = {{66,30,15},{25,7,26},{9,1,47},{4,4,73},{0,7,100},{12,44,138},{24,82,177},{57,125,209},{134,181,229},{211,236,248},{241,233,191},{248,201,95},{255,170,0},{204,128,0},{153,87,0},{106,52,3}};
 
-  fprintf(fp, "P3\n%lld %lld\n255\n", width, height);
-  long long i, j;
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      fprintf(fp, "%d %d %d ", MAPPING[pixels[j*width+i]][0], MAPPING[pixels[j*width+i]][1], MAPPING[pixels[j*width+i]][2]);
-    }
-  }
+	fprintf(fp, "P3\n%lld %lld\n255\n", width, height);
+	long long i, j;
+	for (i = 0; i < height; ++i)
+	{
+		for (j = 0; j < width; ++j)
+		{
+			fprintf(fp, "%d %d %d ", MAPPING[pixels[j*width+i]][0], MAPPING[pixels[j*width+i]][1], MAPPING[pixels[j*width+i]][2]);
+		}
+	}
 
-  fclose(fp);
+	fclose(fp);
 }
 
-double min_times(double cr, double ci, long long dim, long long nslices)
+double min_times(double cr, double ci, long long dim, long long nslices, int gen_images)
 {
 	long long *pixels = calloc(dim*dim, sizeof(long long));
-	float *params = calloc(4*nslices, sizeof(float));
+	struct params *p = calloc(nslices, sizeof(struct params));
 
 	double t_min = 0;
 	long long i;
 
-	for (i=0; i<nslices; i++) {
-		params[4*i+0] = cr;
-		params[4*i+1] = ci;
-		params[4*i+2] = i*dim/nslices;
-		params[4*i+3] = dim;
+	for (i=0; i<nslices; i++)
+	{
+		p[i].centerr = cr;
+		p[i].centeri = ci;
+		p[i].offset = i*dim/nslices;
+		p[i].dim = dim;
 	}
 
 	double start, stop, exec_t;
-	for (i = 0; i < 10; i++){
+	for (i = 0; i < 10; i++)
+	{
 		start = starpu_timing_now(); // starpu_timing_now() gives the time in microseconds.
-		mandelbrot_with_starpu(pixels, params, dim, nslices);
+		mandelbrot_with_starpu(pixels, &p[i], dim, nslices);
 		stop = starpu_timing_now();
 		exec_t = (stop-start)*1.e3;
 		if (t_min==0 || t_min>exec_t)
 		  t_min = exec_t;
 	}
 
-	char filename[64];
-	snprintf(filename, 64, "out%lld.ppm", dim);
-	pixels2img(pixels,dim,dim,filename);
+	if (gen_images == 1)
+	{
+		char filename[64];
+		snprintf(filename, 64, "out%lld.ppm", dim);
+		pixels2img(pixels,dim,dim,filename);
+	}
 
 	free(pixels);
-	free(params);
+	free(p);
 
 	return t_min;
 }
 
-void display_times(double cr, double ci, long long start_dim, long long step_dim, long long stop_dim, long long nslices)
+void display_times(double cr, double ci, long long start_dim, long long step_dim, long long stop_dim, long long nslices, int gen_images)
 {
-
 	long long dim;
 
-	for (dim = start_dim; dim <= stop_dim; dim += step_dim) {
+	for (dim = start_dim; dim <= stop_dim; dim += step_dim)
+	{
 		printf("Dimension: %lld...\n", dim);
-		double res = min_times(cr, ci, dim, nslices);
+		double res = min_times(cr, ci, dim, nslices, gen_images);
 		res = res / dim / dim; // time per pixel
 		printf("%lld %lf\n", dim, res);
 	}
@@ -142,23 +146,40 @@ void display_times(double cr, double ci, long long start_dim, long long step_dim
 
 int main(int argc, char **argv)
 {
-	if (argc != 7){
-		printf("Usage: %s cr ci start_dim step_dim stop_dim nslices(must divide dims)\n", argv[0]);
-		return 1;
+	double cr, ci;
+	long long start_dim, step_dim, stop_dim, nslices;
+	int gen_images;
+
+	if (argc != 8)
+	{
+		printf("Usage: %s cr ci start_dim step_dim stop_dim nslices(must divide dims) gen_images. Using default parameters\n", argv[0]);
+
+		cr = -0.800671;
+		ci = -0.158392;
+		start_dim = 32;
+		step_dim = 32;
+		stop_dim = 512;
+		nslices = 4;
+		gen_images = 0;
 	}
-	if (starpu_init(NULL) != EXIT_SUCCESS){
+	else
+	{
+		cr = (float) atof(argv[1]);
+		ci = (float) atof(argv[2]);
+		start_dim = atoll(argv[3]);
+		step_dim = atoll(argv[4]);
+		stop_dim = atoll(argv[5]);
+		nslices = atoll(argv[6]);
+		gen_images = atoi(argv[7]);
+	}
+
+	if (starpu_init(NULL) != EXIT_SUCCESS)
+	{
 		fprintf(stderr, "ERROR\n");
 		return 77;
 	}
 
-	double cr = (float) atof(argv[1]);
-	double ci = (float) atof(argv[2]);
-	long long start_dim = atoll(argv[3]);
-	long long step_dim = atoll(argv[4]);
-	long long stop_dim = atoll(argv[5]);
-	long long nslices = atoll(argv[6]);
-
-	display_times(cr, ci, start_dim, step_dim, stop_dim, nslices);
+	display_times(cr, ci, start_dim, step_dim, stop_dim, nslices, gen_images);
 
 	starpu_shutdown();
 

+ 10 - 9
julia/examples/mandelbrot/mandelbrot.jl

@@ -34,7 +34,7 @@ using LinearAlgebra
                 zi = 2*zr*zi + ci
                 zr = tmp
             end
-            
+
             if (n < max_iterations)
                 pixels[y,x] = round(15 * n * imi)
             else
@@ -55,7 +55,7 @@ function mandelbrot_with_starpu(A ::Matrix{Int64}, cr ::Float64, ci ::Float64, d
 	starpu_data_partition(hA,horiz)
 
 	@starpu_sync_tasks for taskx in (1 : nslicesx)
-                @starpu_async_cl mandelbrot(hA[taskx]) [STARPU_W] [cr, ci, (taskx-1)*dim/nslicesx, dim]
+                @starpu_async_cl mandelbrot(hA[taskx]) [STARPU_W] (cr, ci, Int64((taskx-1)*dim/nslicesx), dim)
 	end
     end
 end
@@ -73,9 +73,9 @@ function pixels2img(pixels ::Matrix{Int64}, width ::Int64, height ::Int64, filen
     end
 end
 
-function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
+function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64, gen_images)
     tmin=0;
-    
+
     pixels ::Matrix{Int64} = zeros(dim, dim)
     for i = 1:10
         t = time_ns();
@@ -85,20 +85,21 @@ function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
             tmin=t
         end
     end
-    pixels2img(pixels,dim,dim,"out$(dim).ppm")
+    if (gen_images == 1)
+        pixels2img(pixels,dim,dim,"out$(dim).ppm")
+    end
     return tmin
 end
 
-function display_time(cr ::Float64, ci ::Float64, start_dim ::Int64, step_dim ::Int64, stop_dim ::Int64, nslices ::Int64)
+function display_time(cr ::Float64, ci ::Float64, start_dim ::Int64, step_dim ::Int64, stop_dim ::Int64, nslices ::Int64, gen_images)
     for dim in (start_dim : step_dim : stop_dim)
-        res = min_times(cr, ci, dim, nslices)
+        res = min_times(cr, ci, dim, nslices, gen_images)
         res=res/dim/dim; # time per pixel
         println("$(dim) $(res)")
     end
 end
 
 
-display_time(-0.800671,-0.158392,32,32,4096,4)
+display_time(-0.800671,-0.158392,32,32,512,4, 0)
 
 starpu_shutdown()
-

+ 7 - 5
julia/examples/mandelbrot/mandelbrot_native.jl

@@ -68,7 +68,7 @@ function pixels2img(pixels ::Matrix{Int64}, width ::Int64, height ::Int64, filen
     end
 end
 
-function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
+function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64, gen_images)
     tmin=0;
 
     pixels ::Matrix{Int64} = zeros(dim, dim)
@@ -80,17 +80,19 @@ function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
             tmin=t
         end
     end
-    pixels2img(pixels,dim,dim,"out$(dim).ppm")
+    if (gen_images == 1)
+        pixels2img(pixels,dim,dim,"out$(dim).ppm")
+    end
     return tmin
 end
 
-function display_time(cr ::Float64, ci ::Float64, start_dim ::Int64, step_dim ::Int64, stop_dim ::Int64, nslices ::Int64)
+function display_time(cr ::Float64, ci ::Float64, start_dim ::Int64, step_dim ::Int64, stop_dim ::Int64, nslices ::Int64, gen_images)
     for dim in (start_dim : step_dim : stop_dim)
-        res = min_times(cr, ci, dim, nslices)
+        res = min_times(cr, ci, dim, nslices, gen_images)
         res=res/dim/dim; # time per pixel
         println("$(dim) $(res)")
     end
 end
 
 
-display_time(-0.800671,-0.158392,32,32,4096,4)
+display_time(-0.800671,-0.158392,32,32,512,4, 0)

+ 10 - 15
julia/examples/mult/Makefile

@@ -1,9 +1,6 @@
-# tile size. Should be changed in mult.jl as well
-STRIDE=72
-
 # ICC compiler
 #CC =icc
-#CFLAGS=-restrict -unroll4 -ipo -falign-loops=256 -O3 -DSTRIDE=${STRIDE} -march=native $(shell pkg-config --cflags starpu-1.3)
+#CFLAGS=-restrict -unroll4 -ipo -falign-loops=256 -O3 -march=native $(shell pkg-config --cflags starpu-1.3)
 # GCC compiler
 CC=gcc
 NVCC=nvcc
@@ -14,7 +11,7 @@ ifeq ($(ENABLE_CUDA),yes)
         LD := ${NVCC}
 endif
 
-CFLAGS = -O3 -g -DSTRIDE=${STRIDE} $(shell pkg-config --cflags starpu-1.3)
+CFLAGS = -O3 -g $(shell pkg-config --cflags starpu-1.3)
 CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
 CUDA_CFLAGS = ${CFLAGS}
 LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
@@ -28,9 +25,6 @@ ifneq ($(ENABLE_CUDA),yes)
 	CUDA_OBJECTS:=
 endif
 
-
-LIBPATH=${PWD}/../StarPU.jl/lib
-
 all: ${EXTERNLIB}
 
 mult: mult.c cpu_mult.o #gpu_mult.o
@@ -54,13 +48,14 @@ clean:
 	rm -f mult *.so *.o genc_*.c gencuda_*.cu *.dat
 
 # Performance Tests
+STRIDE=72
 cstarpu.dat: mult
-	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./mult > $@
-julia_generatedc.dat:
-	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult.jl $@
-julia_native.dat:
-	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult_native.jl $@
-julia_calllib.dat: ${EXTERNLIB}
-	LD_LIBRARY_PATH+=${LIBPATH} JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult.jl julia_calllib.dat
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./mult $(STRIDE) > $@
+julia_generatedc.dat: mult.jl
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult.jl $(STRIDE) $@
+julia_native.dat: mult_native.jl
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult_native.jl $(STRIDE) $@
+julia_calllib.dat: ${EXTERNLIB} mult.jl
+	JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult.jl $(STRIDE) julia_calllib.dat
 
 test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat

+ 21 - 11
julia/examples/mult/cpu_mult.c

@@ -13,10 +13,12 @@
  *
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
+
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
 #include <starpu.h>
+
 /*
  * The codelet is passed 3 matrices, the "descr" union-type field gives a
  * description of the layout of those 3 matrices in the local memory (ie. RAM
@@ -25,14 +27,16 @@
  */
 void cpu_mult(void *descr[], void *arg)
 {
-	(void)arg;
+	int stride;
 	float *subA, *subB, *subC;
+
+	stride = *((int *)arg);
+
 	/* .blas.ptr gives a pointer to the first element of the local copy */
 	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
 	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
 	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
 
-
 	/* .blas.nx is the number of rows (consecutive elements) and .blas.ny
 	 * is the number of lines that are separated by .blas.ld elements (ld
 	 * stands for leading dimension).
@@ -50,14 +54,18 @@ void cpu_mult(void *descr[], void *arg)
 	int i,j,k,ii,jj,kk;
 	for (i = 0; i < nyC*nxC; i++) subC[i] = 0;
 	//fprintf(stderr,"inside cpu_mult %dx%dx%d %d/%d on %d\n",nyC,nyA,nxC,starpu_worker_get_id(),STARPU_NMAXWORKERS,starpu_worker_get_devid(starpu_worker_get_id()));
-	for (i=0;i<nyC;i+=STRIDE) {
-		for (k=0;k<nyA;k+=STRIDE) {
-			for (j=0;j<nxC;j+=STRIDE) {
-				
-				for (ii = i; ii < i+STRIDE; ii+=2) {
+	for (i=0;i<nyC;i+=stride)
+	{
+		for (k=0;k<nyA;k+=stride)
+		{
+			for (j=0;j<nxC;j+=stride)
+			{
+				for (ii = i; ii < i+stride; ii+=2)
+				{
 					float *sC0=subC+ii*ldC+j;
 					float *sC1=subC+ii*ldC+ldC+j;
-					for (kk = k; kk < k+STRIDE; kk+=4) {
+					for (kk = k; kk < k+stride; kk+=4)
+					{
 						float alpha00=subB[kk +  ii*ldB];
 						float alpha01=subB[kk+1+ii*ldB];
 						float alpha10=subB[kk+  ii*ldB+ldB];
@@ -70,7 +78,8 @@ void cpu_mult(void *descr[], void *arg)
 						float *sA1=subA+kk*ldA+ldA+j;
 						float *sA2=subA+kk*ldA+2*ldA+j;
 						float *sA3=subA+kk*ldA+3*ldA+j;
-						for (jj = 0; jj < STRIDE; jj+=1) {
+						for (jj = 0; jj < stride; jj+=1)
+						{
 							sC0[jj] += alpha00*sA0[jj]+alpha01*sA1[jj]+alpha02*sA2[jj]+alpha03*sA3[jj];
 							sC1[jj] += alpha10*sA0[jj]+alpha11*sA1[jj]+alpha12*sA2[jj]+alpha13*sA3[jj];
 						}
@@ -80,11 +89,12 @@ void cpu_mult(void *descr[], void *arg)
 		}
 	}
 	//fprintf(stderr,"inside cpu_mult %dx%dx%d\n",nyC,nyA,nxC);
-
 }
+
 char* CPU = "cpu_mult";
 char* GPU = "gpu_mult";
-extern char *starpu_find_function(char *name, char *device) {
+extern char *starpu_find_function(char *name, char *device)
+{
 	if (!strcmp(device,"gpu")) return GPU;
 	return CPU;
 }

+ 52 - 58
julia/examples/mult/mult.c

@@ -1,10 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2018                                     Alexis Juven
- * Copyright (C) 2012,2013                                Inria
- * Copyright (C) 2009-2011,2013-2015                      Université de Bordeaux
- * Copyright (C) 2010                                     Mehdi Juhoor
- * Copyright (C) 2010-2013,2015,2017                      CNRS
+ * Copyright (C) 2018       Alexis Juven
+ * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -40,8 +37,6 @@
 
 #include <starpu.h>
 
-
-
 /*
  * That program should compute C = A * B
  *
@@ -63,43 +58,32 @@
 
  */
 
-
-
-
-
 //void gpu_mult(void **, void *);
 void cpu_mult(void **, void *);
 
-
 static struct starpu_perfmodel model =
 {
-		.type = STARPU_HISTORY_BASED,
-		.symbol = "history_perf"
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "history_perf"
 };
 
 static struct starpu_codelet cl =
 {
-		.cpu_funcs = {cpu_mult},
-		.cpu_funcs_name = {"cpu_mult"},
-		//.cuda_funcs = {gpu_mult},
-		.nbuffers = 3,
-		.modes = {STARPU_R, STARPU_R, STARPU_W},
-		.model = &model
+	.cpu_funcs = {cpu_mult},
+	.cpu_funcs_name = {"cpu_mult"},
+	//.cuda_funcs = {gpu_mult},
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_W},
+	.model = &model
 };
 
-
-void multiply_with_starpu(float *A, float *B, float *C,  unsigned xdim,  unsigned ydim,  unsigned zdim, unsigned nslicesx, unsigned nslicesy)
+void multiply_with_starpu(float *A, float *B, float *C,  unsigned xdim,  unsigned ydim,  unsigned zdim, unsigned nslicesx, unsigned nslicesy, int stride)
 {
 	starpu_data_handle_t A_handle, B_handle, C_handle;
 
-
-	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A,
-			ydim, ydim, zdim, sizeof(float));
-	starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B,
-			zdim, zdim, xdim, sizeof(float));
-	starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C,
-			ydim, ydim, xdim, sizeof(float));
-
+	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A, ydim, ydim, zdim, sizeof(float));
+	starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B, zdim, zdim, xdim, sizeof(float));
+	starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C, ydim, ydim, xdim, sizeof(float));
 
 	struct starpu_data_filter vert =
 	{
@@ -113,31 +97,31 @@ void multiply_with_starpu(float *A, float *B, float *C,  unsigned xdim,  unsigne
 			.nchildren = nslicesy
 	};
 
-
 	starpu_data_partition(B_handle, &vert);
 	starpu_data_partition(A_handle, &horiz);
 	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
 
 	unsigned taskx, tasky;
 
-	for (taskx = 0; taskx < nslicesx; taskx++){
-		for (tasky = 0; tasky < nslicesy; tasky++){
-
+	for (taskx = 0; taskx < nslicesx; taskx++)
+	{
+		for (tasky = 0; tasky < nslicesy; tasky++)
+		{
 			struct starpu_task *task = starpu_task_create();
 
 			task->cl = &cl;
 			task->handles[0] = starpu_data_get_sub_data(A_handle, 1, tasky);
 			task->handles[1] = starpu_data_get_sub_data(B_handle, 1, taskx);
 			task->handles[2] = starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
+			task->cl_arg = &stride;
+			task->cl_arg_size = sizeof(stride);
 
 			if (starpu_task_submit(task)!=0) fprintf(stderr,"submit task error\n");
-
 		}
 	}
 
 	starpu_task_wait_for_all();
 
-
 	starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
 	starpu_data_unpartition(B_handle, STARPU_MAIN_RAM);
 	starpu_data_unpartition(C_handle, STARPU_MAIN_RAM);
@@ -145,31 +129,27 @@ void multiply_with_starpu(float *A, float *B, float *C,  unsigned xdim,  unsigne
 	starpu_data_unregister(A_handle);
 	starpu_data_unregister(B_handle);
 	starpu_data_unregister(C_handle);
-
 }
 
-
-
 void init_rand(float * m, unsigned width, unsigned height)
 {
 	unsigned i,j;
 
-	for (j = 0 ; j < height ; j++){
-		for (i = 0 ; i < width ; i++){
+	for (j = 0 ; j < height ; j++)
+	{
+		for (i = 0 ; i < width ; i++)
+		{
 			m[j+i*height] = (float)(starpu_drand48());
 		}
 	}
 }
 
-
 void init_zero(float * m, unsigned width, unsigned height)
 {
 	memset(m, 0, sizeof(float) * width * height);
 }
 
-
-
-double min_time(unsigned nb_test, unsigned xdim, unsigned ydim, unsigned zdim, unsigned nsclicesx, unsigned nsclicesy)
+double min_time(unsigned nb_test, unsigned xdim, unsigned ydim, unsigned zdim, unsigned nsclicesx, unsigned nsclicesy, int stride)
 {
 	unsigned i;
 
@@ -179,8 +159,8 @@ double min_time(unsigned nb_test, unsigned xdim, unsigned ydim, unsigned zdim, u
 
 	double exec_times=-1;
 
-	for (i = 0 ; i < nb_test ; i++){
-
+	for (i = 0 ; i < nb_test ; i++)
+	{
 		double start, stop, exec_t;
 
 		init_rand(A, zdim, ydim);
@@ -188,7 +168,7 @@ double min_time(unsigned nb_test, unsigned xdim, unsigned ydim, unsigned zdim, u
 		init_zero(C, xdim, ydim);
 
 		start = starpu_timing_now();
-		multiply_with_starpu(A, B, C, xdim, ydim, zdim, nsclicesx, nsclicesy);
+		multiply_with_starpu(A, B, C, xdim, ydim, zdim, nsclicesx, nsclicesy, stride);
 		stop = starpu_timing_now();
 
 		exec_t = (stop - start)*1.e3; // Put in ns instead of us
@@ -201,34 +181,48 @@ double min_time(unsigned nb_test, unsigned xdim, unsigned ydim, unsigned zdim, u
 	return exec_times;
 }
 
-
-void display_times(unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nb_tests, unsigned nsclicesx, unsigned nsclicesy)
+void display_times(unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nb_tests, unsigned nsclicesx, unsigned nsclicesy, int stride)
 {
 	unsigned dim;
 
-	for (dim = start_dim ; dim <= stop_dim ; dim += step_dim){
-		double t = min_time(nb_tests, dim, dim, dim, nsclicesx, nsclicesy);
+	for (dim = start_dim ; dim <= stop_dim ; dim += step_dim)
+	{
+		double t = min_time(nb_tests, dim, dim, dim, nsclicesx, nsclicesy, stride);
 		printf("%f %f\n", dim*dim*4.*3./1024./1024, (2.*dim-1.)*dim*dim/t);
 	}
-
 }
 
+#ifdef STARPU_QUICK_CHECK
+#define STRIDE_DEFAULT 4
+#else
+#define STRIDE_DEFAULT 72
+#endif
 
 int main(int argc, char * argv[])
 {
-	if (starpu_init(NULL) != EXIT_SUCCESS){
+	int stride=STRIDE_DEFAULT;
+	if (argc >= 2)
+		stride = atoi(argv[1]);
+	if (stride % 4 != 0)
+	{
+		fprintf(stderr, "STRIDE must be a multiple of 4 (%d)\n", stride);
+		return -1;
+	}
+
+	if (starpu_init(NULL) != EXIT_SUCCESS)
+	{
 		fprintf(stderr, "ERROR\n");
 		return 77;
 	}
 
-	unsigned start_dim = 16*STRIDE;
-	unsigned step_dim = 4*STRIDE;
-	unsigned stop_dim = 4096;
+	unsigned start_dim = 16*stride;
+	unsigned step_dim = 4*stride;
+	unsigned stop_dim = 128*stride;
 	unsigned nb_tests = 10;
 	unsigned nsclicesx = 2;
 	unsigned nsclicesy = 2;
 
-	display_times(start_dim, step_dim, stop_dim, nb_tests, nsclicesx, nsclicesy);
+	display_times(start_dim, step_dim, stop_dim, nb_tests, nsclicesx, nsclicesy, stride);
 
 	starpu_shutdown();
 

+ 14 - 11
julia/examples/mult/mult.jl

@@ -2,11 +2,8 @@ import Libdl
 using StarPU
 using LinearAlgebra
 
-#shoud be the same as in the makefile
-const STRIDE = 72
-
 @target STARPU_CPU+STARPU_CUDA
-@codelet function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}) :: Nothing
+@codelet function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}, stride ::Int32) :: Nothing
 
     width_m2 :: Int32 = width(m2)
     height_m1 :: Int32 = height(m1)
@@ -59,7 +56,7 @@ end
 
 starpu_init()
 
-function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
+function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy, stride)
     scale= 3
     tmin=0
     vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesx)
@@ -88,7 +85,7 @@ function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: M
                 for taskx in (1 : nslicesx)
                     for tasky in (1 : nslicesy)
                         handles = [hA[tasky], hB[taskx], hC[taskx, tasky]]
-                        task = StarpuTask(cl = cl, handles = handles)
+                        task = StarpuTask(cl = cl, handles = handles, cl_arg=(Int32(stride),))
                         starpu_task_submit(task)
                         #@starpu_async_cl matrix_mult(hA[tasky], hB[taskx], hC[taskx, tasky])
                     end
@@ -123,12 +120,12 @@ function approximately_equals(
     return true
 end
 
-function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
+function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy, stride)
     for dim in (start_dim : step_dim : stop_dim)
         A = Array(rand(Cfloat, dim, dim))
         B = Array(rand(Cfloat, dim, dim))
         C = zeros(Float32, dim, dim)
-        mt =  multiply_with_starpu(A, B, C, nslicesx, nslicesy)
+        mt =  multiply_with_starpu(A, B, C, nslicesx, nslicesy, stride)
         flops = (2*dim-1)*dim*dim/mt
         size=dim*dim*4*3/1024/1024
         println(io,"$size $flops")
@@ -136,9 +133,15 @@ function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
     end
 end
 
-
-io=open(ARGS[1],"w")
-compute_times(io,16*STRIDE,4*STRIDE,4096,2,2)
+if size(ARGS, 1) < 2
+    stride=4
+    filename="x.dat"
+else
+    stride=parse(Int, ARGS[1])
+    filename=ARGS[2]
+end
+io=open(filename,"w")
+compute_times(io,16*stride,4*stride,128*stride,2,2,stride)
 close(io)
 
 starpu_shutdown()

+ 12 - 9
julia/examples/mult/mult_native.jl

@@ -2,10 +2,7 @@ import Libdl
 using StarPU
 using LinearAlgebra
 
-#shoud be the same as in the makefile
-const STRIDE = 72
-
-function multiply_without_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
+function multiply_without_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy, stride)
     tmin = 0
     for i in (1 : 10 )
         t=time_ns()
@@ -19,12 +16,12 @@ function multiply_without_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :
 end
 
 
-function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
+function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy, stride)
     for dim in (start_dim : step_dim : stop_dim)
         A = Array(rand(Cfloat, dim, dim))
         B = Array(rand(Cfloat, dim, dim))
         C = zeros(Float32, dim, dim)
-        mt =  multiply_without_starpu(A, B, C, nslicesx, nslicesy)
+        mt =  multiply_without_starpu(A, B, C, nslicesx, nslicesy, stride)
         flops = (2*dim-1)*dim*dim/mt
         size=dim*dim*4*3/1024/1024
         println(io,"$size $flops")
@@ -32,8 +29,14 @@ function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
     end
 end
 
-
-io=open(ARGS[1],"w")
-compute_times(io,16*STRIDE,4*STRIDE,4096,2,2)
+if size(ARGS, 1) < 2
+    stride=4
+    filename="x.dat"
+else
+    stride=parse(Int, ARGS[1])
+    filename=ARGS[2]
+end
+io=open(filename,"w")
+compute_times(io,16*stride,4*stride,128*stride,2,2,stride)
 close(io)
 

+ 1 - 3
julia/examples/task_insert_color/Makefile

@@ -21,8 +21,6 @@ ifneq ($(ENABLE_CUDA),yes)
 	CUDA_OBJECTS:=
 endif
 
-LIBPATH=${PWD}/../StarPU.jl/lib
-
 all: task_insert_color
 
 task_insert_color: task_insert_color.o
@@ -46,6 +44,6 @@ clean:
 cstarpu.dat: task_insert_color
 	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./task_insert_color > $@
 julia_generatedc.dat:
-	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia task_insert_colorl.jl
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia task_insert_color.jl
 
 test: cstarpu.dat julia_generatedc.dat

+ 1 - 1
julia/examples/task_insert_color/task_insert_color.jl

@@ -34,7 +34,7 @@ function task_insert_color_with_starpu(val ::Ref{Int32})
             starpu_task_submit(StarpuTask(cl = cl2, handles = [hVal]))
 
             # In the trace file, the following tasks will be red as specified in @starpu_async_cl
-            @starpu_async_cl task_insert_color(hVal) [STARPU_RW] [] 0xFF0000
+            @starpu_async_cl task_insert_color(hVal) [STARPU_RW] () 0xFF0000
 
 	end
     end

+ 3 - 5
julia/examples/variable/Makefile

@@ -21,8 +21,6 @@ ifneq ($(ENABLE_CUDA),yes)
 	CUDA_OBJECTS:=
 endif
 
-LIBPATH=${PWD}/../StarPU.jl/lib
-
 all: ${EXTERNLIB}
 
 variable: variable.c cpu_variable.o #gpu_variable.o
@@ -49,10 +47,10 @@ clean:
 cstarpu.dat: variable
 	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./variable -0.800671 -0.158392 32 32 4096 4 > $@
 julia_generatedc.dat:
-	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia variable.jl $@
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia variable.jl $@
 julia_native.dat:
-	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia variable_native.jl $@
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia variable_native.jl $@
 julia_calllib.dat: ${EXTERNLIB}
-	LD_LIBRARY_PATH+=${LIBPATH} JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia variable.jl julia_calllib.dat
+	JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia variable.jl julia_calllib.dat
 
 test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat

+ 5 - 6
julia/examples/vector_scal/Makefile

@@ -21,8 +21,6 @@ ifneq ($(ENABLE_CUDA),yes)
 	CUDA_OBJECTS:=
 endif
 
-LIBPATH=${PWD}/../StarPU.jl/lib
-
 all: ${EXTERNLIB}
 
 vector_scal: vector_scal.c cpu_vector_scal.o #gpu_vector_scal.o
@@ -49,10 +47,11 @@ clean:
 cstarpu.dat: vector_scal
 	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./vector_scal > $@
 julia_generatedc.dat:
-	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal.jl $@
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal.jl $@
 julia_native.dat:
-	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal_native.jl $@
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal_native.jl $@
 julia_calllib.dat: ${EXTERNLIB}
-	LD_LIBRARY_PATH+=${LIBPATH} JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal.jl julia_calllib.dat
+	JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal.jl julia_calllib.dat
 
-test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat
+#test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat
+test: julia_generatedc.dat julia_calllib.dat

+ 1 - 1
julia/examples/vector_scal/cpu_vector_scal.c

@@ -29,7 +29,7 @@ float cpu_vector_scal(void *buffers[], void *cl_arg)
 
   /* scale the vector */
   for (unsigned i = 0; i < n; i++)
-    val[i] = val[i] * k + l + m;
+    val[i] = val[i] * m + l + k;
 
   return 0.0;
 }

+ 1 - 1
julia/examples/vector_scal/vector_scal.jl

@@ -37,7 +37,7 @@ function vector_scal_with_starpu(v :: Vector{Float32}, m :: Int32, k :: Float32,
             t=time_ns()
             @starpu_sync_tasks begin
                 handles = [hV]
-                task = StarpuTask(cl = cl, handles = handles, cl_arg=[m, k, l])
+                task = StarpuTask(cl = cl, handles = handles, cl_arg=(m, k, l))
                 starpu_task_submit(task)
             end
             # @starpu_sync_tasks for task in (1:1)

+ 7 - 5
julia/src/StarPU.jl

@@ -47,6 +47,8 @@ function jlstarpu_set_to_zero(x :: T) :: Ptr{Cvoid} where {T}
         )
 end
 
+tuple_len(::NTuple{N, Any}) where {N} = N
+
 export starpu_init
 export starpu_shutdown
 export starpu_memory_pin
@@ -483,7 +485,7 @@ mutable struct StarpuTask
 
         Creates a new task which will run the specified codelet on handle buffers and cl_args data
     """
-    function StarpuTask(; cl :: Union{Cvoid, StarpuCodelet} = nothing, handles :: Vector{StarpuDataHandle} = StarpuDataHandle[], cl_arg = [])
+    function StarpuTask(; cl :: Union{Cvoid, StarpuCodelet} = nothing, handles :: Vector{StarpuDataHandle} = StarpuDataHandle[], cl_arg = ())
 
         if (cl == nothing)
             error("\"cl\" field can't be empty when creating a StarpuTask")
@@ -508,13 +510,13 @@ mutable struct StarpuTask
         scalar_parameters = get(CODELETS_SCALARS, codelet_name, nothing)
         if scalar_parameters != nothing
             nb_scalar_required = length(scalar_parameters)
-            nb_scalar_provided = length(cl_arg)
+            nb_scalar_provided = tuple_len(cl_arg)
             if (nb_scalar_provided != nb_scalar_required)
                 error("$nb_scalar_provided scalar parameters provided but $nb_scalar_required are required by $codelet_name.")
             end
             output.cl_arg = create_param_struct_from_clarg(codelet_name, cl_arg)
         else
-            output.cl_arg = nothing
+            output.cl_arg = cl_arg
         end
 
         output.synchronous = false
@@ -583,7 +585,7 @@ mutable struct StarpuTaskTranslator
             output.cl_arg = C_NULL
             output.cl_arg_size = 0
         else
-            output.cl_arg = pointer_from_objref(task.cl_arg)
+            output.cl_arg = Base.unsafe_convert(Ptr{Cvoid}, Ref(task.cl_arg))
             output.cl_arg_size = sizeof(task.cl_arg)
         end
 
@@ -877,7 +879,7 @@ end
     Creates and submits an asynchronous task running cl Codelet function.
     Ex : @starpu_async_cl cl(handle1, handle2)
 """
-macro starpu_async_cl(expr, modes, cl_arg=[], color ::UInt32=0x00000000)
+macro starpu_async_cl(expr, modes, cl_arg=(), color ::UInt32=0x00000000)
 
     if (!isa(expr, Expr) || expr.head != :call)
         error("Invalid task submit syntax")