Bladeren bron

Merge branch 'master' into julia-autotools

Nathalie Furmento 5 jaren geleden
bovenliggende
commit
66ff633d82

+ 4 - 4
Makefile.am

@@ -163,28 +163,28 @@ DISTCLEANFILES = STARPU-REVISION
 recheck:
 	RET=0 ; \
 	for i in $(SUBDIRS) ; do \
-		make -C $$i recheck || RET=1 ; \
+		$(MAKE) -C $$i recheck || RET=1 ; \
 	done ; \
 	exit $$RET
 
 showfailed:
 	@RET=0 ; \
 	for i in $(SUBDIRS) ; do \
-		make -s -C $$i showfailed || RET=1 ; \
+		$(MAKE) -s -C $$i showfailed || RET=1 ; \
 	done ; \
 	exit $$RET
 
 showcheck:
 	RET=0 ; \
 	for i in $(SUBDIRS) ; do \
-		make -C $$i showcheck || RET=1 ; \
+		$(MAKE) -C $$i showcheck || RET=1 ; \
 	done ; \
 	exit $$RET
 
 showsuite:
 	RET=0 ; \
 	for i in $(SUBDIRS) ; do \
-		make -C $$i showsuite || RET=1 ; \
+		$(MAKE) -C $$i showsuite || RET=1 ; \
 	done ; \
 	exit $$RET
 

+ 8 - 0
examples/tag_example/tag_example.c

@@ -222,6 +222,14 @@ int main(int argc, char **argv)
 {
 	int ret;
 
+#ifdef STARPU_HAVE_HELGRIND_H
+	if (RUNNING_ON_VALGRIND) {
+		ni /= 2;
+		nj /= 2;
+		nk /= 2;
+	}
+#endif
+
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV)
 		exit(77);

+ 10 - 12
julia/examples/mult/Makefile.old

@@ -1,9 +1,6 @@
-# tile size. Should be changed in mult.jl as well
-STRIDE=72
-
 # ICC compiler
 #CC =icc
-#CFLAGS=-restrict -unroll4 -ipo -falign-loops=256 -O3 -DSTRIDE=${STRIDE} -march=native $(shell pkg-config --cflags starpu-1.3)
+#CFLAGS=-restrict -unroll4 -ipo -falign-loops=256 -O3 -march=native $(shell pkg-config --cflags starpu-1.3)
 # GCC compiler
 CC=gcc
 NVCC=nvcc
@@ -14,7 +11,7 @@ ifeq ($(ENABLE_CUDA),yes)
         LD := ${NVCC}
 endif
 
-CFLAGS = -O3 -g -DSTRIDE=${STRIDE} $(shell pkg-config --cflags starpu-1.3)
+CFLAGS = -O3 -g $(shell pkg-config --cflags starpu-1.3)
 CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
 CUDA_CFLAGS = ${CFLAGS}
 LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
@@ -52,13 +49,14 @@ clean:
 
 tjulia: julia_generatedc.dat
 # Performance Tests
+STRIDE=72
 cstarpu.dat: mult
-	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./mult > $@
-julia_generatedc.dat:
-	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult.jl $@
-julia_native.dat:
-	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult_native.jl $@
-julia_calllib.dat: ${EXTERNLIB}
-	LD_LIBRARY_PATH+=${LIBPATH} JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult.jl julia_calllib.dat
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./mult $(STRIDE) > $@
+julia_generatedc.dat: mult.jl
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult.jl $(STRIDE) $@
+julia_native.dat: mult_native.jl
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult_native.jl $(STRIDE) $@
+julia_calllib.dat: ${EXTERNLIB} mult.jl
+	JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult.jl $(STRIDE) julia_calllib.dat
 
 test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat

+ 21 - 11
julia/examples/mult/cpu_mult.c

@@ -13,10 +13,12 @@
  *
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
+
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
 #include <starpu.h>
+
 /*
  * The codelet is passed 3 matrices, the "descr" union-type field gives a
  * description of the layout of those 3 matrices in the local memory (ie. RAM
@@ -25,14 +27,16 @@
  */
 void cpu_mult(void *descr[], void *arg)
 {
-	(void)arg;
+	int stride;
 	float *subA, *subB, *subC;
+
+	stride = *((int *)arg);
+
 	/* .blas.ptr gives a pointer to the first element of the local copy */
 	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
 	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
 	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
 
-
 	/* .blas.nx is the number of rows (consecutive elements) and .blas.ny
 	 * is the number of lines that are separated by .blas.ld elements (ld
 	 * stands for leading dimension).
@@ -50,14 +54,18 @@ void cpu_mult(void *descr[], void *arg)
 	int i,j,k,ii,jj,kk;
 	for (i = 0; i < nyC*nxC; i++) subC[i] = 0;
 	//fprintf(stderr,"inside cpu_mult %dx%dx%d %d/%d on %d\n",nyC,nyA,nxC,starpu_worker_get_id(),STARPU_NMAXWORKERS,starpu_worker_get_devid(starpu_worker_get_id()));
-	for (i=0;i<nyC;i+=STRIDE) {
-		for (k=0;k<nyA;k+=STRIDE) {
-			for (j=0;j<nxC;j+=STRIDE) {
-				
-				for (ii = i; ii < i+STRIDE; ii+=2) {
+	for (i=0;i<nyC;i+=stride)
+	{
+		for (k=0;k<nyA;k+=stride)
+		{
+			for (j=0;j<nxC;j+=stride)
+			{
+				for (ii = i; ii < i+stride; ii+=2)
+				{
 					float *sC0=subC+ii*ldC+j;
 					float *sC1=subC+ii*ldC+ldC+j;
-					for (kk = k; kk < k+STRIDE; kk+=4) {
+					for (kk = k; kk < k+stride; kk+=4)
+					{
 						float alpha00=subB[kk +  ii*ldB];
 						float alpha01=subB[kk+1+ii*ldB];
 						float alpha10=subB[kk+  ii*ldB+ldB];
@@ -70,7 +78,8 @@ void cpu_mult(void *descr[], void *arg)
 						float *sA1=subA+kk*ldA+ldA+j;
 						float *sA2=subA+kk*ldA+2*ldA+j;
 						float *sA3=subA+kk*ldA+3*ldA+j;
-						for (jj = 0; jj < STRIDE; jj+=1) {
+						for (jj = 0; jj < stride; jj+=1)
+						{
 							sC0[jj] += alpha00*sA0[jj]+alpha01*sA1[jj]+alpha02*sA2[jj]+alpha03*sA3[jj];
 							sC1[jj] += alpha10*sA0[jj]+alpha11*sA1[jj]+alpha12*sA2[jj]+alpha13*sA3[jj];
 						}
@@ -80,11 +89,12 @@ void cpu_mult(void *descr[], void *arg)
 		}
 	}
 	//fprintf(stderr,"inside cpu_mult %dx%dx%d\n",nyC,nyA,nxC);
-
 }
+
 char* CPU = "cpu_mult";
 char* GPU = "gpu_mult";
-extern char *starpu_find_function(char *name, char *device) {
+extern char *starpu_find_function(char *name, char *device)
+{
 	if (!strcmp(device,"gpu")) return GPU;
 	return CPU;
 }

+ 52 - 58
julia/examples/mult/mult.c

@@ -1,10 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2018                                     Alexis Juven
- * Copyright (C) 2012,2013                                Inria
- * Copyright (C) 2009-2011,2013-2015                      Université de Bordeaux
- * Copyright (C) 2010                                     Mehdi Juhoor
- * Copyright (C) 2010-2013,2015,2017                      CNRS
+ * Copyright (C) 2018       Alexis Juven
+ * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -40,8 +37,6 @@
 
 #include <starpu.h>
 
-
-
 /*
  * That program should compute C = A * B
  *
@@ -63,43 +58,32 @@
 
  */
 
-
-
-
-
 //void gpu_mult(void **, void *);
 void cpu_mult(void **, void *);
 
-
 static struct starpu_perfmodel model =
 {
-		.type = STARPU_HISTORY_BASED,
-		.symbol = "history_perf"
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "history_perf"
 };
 
 static struct starpu_codelet cl =
 {
-		.cpu_funcs = {cpu_mult},
-		.cpu_funcs_name = {"cpu_mult"},
-		//.cuda_funcs = {gpu_mult},
-		.nbuffers = 3,
-		.modes = {STARPU_R, STARPU_R, STARPU_W},
-		.model = &model
+	.cpu_funcs = {cpu_mult},
+	.cpu_funcs_name = {"cpu_mult"},
+	//.cuda_funcs = {gpu_mult},
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_W},
+	.model = &model
 };
 
-
-void multiply_with_starpu(float *A, float *B, float *C,  unsigned xdim,  unsigned ydim,  unsigned zdim, unsigned nslicesx, unsigned nslicesy)
+void multiply_with_starpu(float *A, float *B, float *C,  unsigned xdim,  unsigned ydim,  unsigned zdim, unsigned nslicesx, unsigned nslicesy, int stride)
 {
 	starpu_data_handle_t A_handle, B_handle, C_handle;
 
-
-	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A,
-			ydim, ydim, zdim, sizeof(float));
-	starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B,
-			zdim, zdim, xdim, sizeof(float));
-	starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C,
-			ydim, ydim, xdim, sizeof(float));
-
+	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A, ydim, ydim, zdim, sizeof(float));
+	starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B, zdim, zdim, xdim, sizeof(float));
+	starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C, ydim, ydim, xdim, sizeof(float));
 
 	struct starpu_data_filter vert =
 	{
@@ -113,31 +97,31 @@ void multiply_with_starpu(float *A, float *B, float *C,  unsigned xdim,  unsigne
 			.nchildren = nslicesy
 	};
 
-
 	starpu_data_partition(B_handle, &vert);
 	starpu_data_partition(A_handle, &horiz);
 	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
 
 	unsigned taskx, tasky;
 
-	for (taskx = 0; taskx < nslicesx; taskx++){
-		for (tasky = 0; tasky < nslicesy; tasky++){
-
+	for (taskx = 0; taskx < nslicesx; taskx++)
+	{
+		for (tasky = 0; tasky < nslicesy; tasky++)
+		{
 			struct starpu_task *task = starpu_task_create();
 
 			task->cl = &cl;
 			task->handles[0] = starpu_data_get_sub_data(A_handle, 1, tasky);
 			task->handles[1] = starpu_data_get_sub_data(B_handle, 1, taskx);
 			task->handles[2] = starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
+			task->cl_arg = &stride;
+			task->cl_arg_size = sizeof(stride);
 
 			if (starpu_task_submit(task)!=0) fprintf(stderr,"submit task error\n");
-
 		}
 	}
 
 	starpu_task_wait_for_all();
 
-
 	starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
 	starpu_data_unpartition(B_handle, STARPU_MAIN_RAM);
 	starpu_data_unpartition(C_handle, STARPU_MAIN_RAM);
@@ -145,31 +129,27 @@ void multiply_with_starpu(float *A, float *B, float *C,  unsigned xdim,  unsigne
 	starpu_data_unregister(A_handle);
 	starpu_data_unregister(B_handle);
 	starpu_data_unregister(C_handle);
-
 }
 
-
-
 void init_rand(float * m, unsigned width, unsigned height)
 {
 	unsigned i,j;
 
-	for (j = 0 ; j < height ; j++){
-		for (i = 0 ; i < width ; i++){
+	for (j = 0 ; j < height ; j++)
+	{
+		for (i = 0 ; i < width ; i++)
+		{
 			m[j+i*height] = (float)(starpu_drand48());
 		}
 	}
 }
 
-
 void init_zero(float * m, unsigned width, unsigned height)
 {
 	memset(m, 0, sizeof(float) * width * height);
 }
 
-
-
-double min_time(unsigned nb_test, unsigned xdim, unsigned ydim, unsigned zdim, unsigned nsclicesx, unsigned nsclicesy)
+double min_time(unsigned nb_test, unsigned xdim, unsigned ydim, unsigned zdim, unsigned nsclicesx, unsigned nsclicesy, int stride)
 {
 	unsigned i;
 
@@ -179,8 +159,8 @@ double min_time(unsigned nb_test, unsigned xdim, unsigned ydim, unsigned zdim, u
 
 	double exec_times=-1;
 
-	for (i = 0 ; i < nb_test ; i++){
-
+	for (i = 0 ; i < nb_test ; i++)
+	{
 		double start, stop, exec_t;
 
 		init_rand(A, zdim, ydim);
@@ -188,7 +168,7 @@ double min_time(unsigned nb_test, unsigned xdim, unsigned ydim, unsigned zdim, u
 		init_zero(C, xdim, ydim);
 
 		start = starpu_timing_now();
-		multiply_with_starpu(A, B, C, xdim, ydim, zdim, nsclicesx, nsclicesy);
+		multiply_with_starpu(A, B, C, xdim, ydim, zdim, nsclicesx, nsclicesy, stride);
 		stop = starpu_timing_now();
 
 		exec_t = (stop - start)*1.e3; // Put in ns instead of us
@@ -201,34 +181,48 @@ double min_time(unsigned nb_test, unsigned xdim, unsigned ydim, unsigned zdim, u
 	return exec_times;
 }
 
-
-void display_times(unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nb_tests, unsigned nsclicesx, unsigned nsclicesy)
+void display_times(unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nb_tests, unsigned nsclicesx, unsigned nsclicesy, int stride)
 {
 	unsigned dim;
 
-	for (dim = start_dim ; dim <= stop_dim ; dim += step_dim){
-		double t = min_time(nb_tests, dim, dim, dim, nsclicesx, nsclicesy);
+	for (dim = start_dim ; dim <= stop_dim ; dim += step_dim)
+	{
+		double t = min_time(nb_tests, dim, dim, dim, nsclicesx, nsclicesy, stride);
 		printf("%f %f\n", dim*dim*4.*3./1024./1024, (2.*dim-1.)*dim*dim/t);
 	}
-
 }
 
+#ifdef STARPU_QUICK_CHECK
+#define STRIDE_DEFAULT 4
+#else
+#define STRIDE_DEFAULT 72
+#endif
 
 int main(int argc, char * argv[])
 {
-	if (starpu_init(NULL) != EXIT_SUCCESS){
+	/* Tile size handed to cpu_mult(); may be overridden by the first
+	 * command-line argument. */
+	int stride=STRIDE_DEFAULT;
+	if (argc >= 2)
+		stride = atoi(argv[1]);
+	/* The problem sizes below and the loops of cpu_mult() advance by
+	 * multiples of stride (and by 4 inside a tile). atoi() yields 0 for a
+	 * non-numeric argument, and 0 or a negative value passes the "% 4"
+	 * test, so reject non-positive strides explicitly: a zero step would
+	 * make these loops never terminate. */
+	if (stride <= 0 || stride % 4 != 0)
+	{
+		fprintf(stderr, "STRIDE must be a positive multiple of 4 (%d)\n", stride);
+		return -1;
+	}
+
+	if (starpu_init(NULL) != EXIT_SUCCESS)
+	{
 		fprintf(stderr, "ERROR\n");
 		return 77;
 	}
 
-	unsigned start_dim = 16*STRIDE;
-	unsigned step_dim = 4*STRIDE;
-	unsigned stop_dim = 4096;
+	unsigned start_dim = 16*stride;
+	unsigned step_dim = 4*stride;
+	unsigned stop_dim = 128*stride;
 	unsigned nb_tests = 10;
 	unsigned nsclicesx = 2;
 	unsigned nsclicesy = 2;
 
-	display_times(start_dim, step_dim, stop_dim, nb_tests, nsclicesx, nsclicesy);
+	display_times(start_dim, step_dim, stop_dim, nb_tests, nsclicesx, nsclicesy, stride);
 
 	starpu_shutdown();
 

+ 12 - 12
julia/examples/mult/mult.jl

@@ -2,9 +2,6 @@ import Libdl
 using StarPU
 using LinearAlgebra
 
-#shoud be the same as in the makefile
-const STRIDE = 72
-
 @target STARPU_CPU+STARPU_CUDA
 @codelet function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}) :: Nothing
 
@@ -59,7 +56,7 @@ end
 
 starpu_init()
 
-function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
+function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy, stride)
     scale= 3
     tmin=0
     vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesx)
@@ -88,7 +85,7 @@ function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: M
                 for taskx in (1 : nslicesx)
                     for tasky in (1 : nslicesy)
                         handles = [hA[tasky], hB[taskx], hC[taskx, tasky]]
-                        task = StarpuTask(cl = cl, handles = handles)
+                        # cl_arg must be a tuple: "(stride)" is just a parenthesised
+                        # scalar, a one-element tuple needs the trailing comma.
+                        # The C kernel reads the argument as a C int, hence Int32.
+                        task = StarpuTask(cl = cl, handles = handles, cl_arg=(Int32(stride),))
                         starpu_task_submit(task)
                         #@starpu_async_cl matrix_mult(hA[tasky], hB[taskx], hC[taskx, tasky])
                     end
@@ -123,12 +120,12 @@ function approximately_equals(
     return true
 end
 
-function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
+function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy, stride)
     for dim in (start_dim : step_dim : stop_dim)
         A = Array(rand(Cfloat, dim, dim))
         B = Array(rand(Cfloat, dim, dim))
         C = zeros(Float32, dim, dim)
-        mt =  multiply_with_starpu(A, B, C, nslicesx, nslicesy)
+        mt =  multiply_with_starpu(A, B, C, nslicesx, nslicesy, stride)
         flops = (2*dim-1)*dim*dim/mt
         size=dim*dim*4*3/1024/1024
         println(io,"$size $flops")
@@ -136,12 +133,15 @@ function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
     end
 end
 
-
-if size(ARGS, 1) == 0
-    error("Argument missing")
+if size(ARGS, 1) < 2
+    stride=4
+    filename="x.dat"
+else
+    stride=parse(Int, ARGS[1])
+    filename=ARGS[2]
 end
-io=open(ARGS[1],"w")
-compute_times(io,16*STRIDE,4*STRIDE,4096,2,2)
+io=open(filename,"w")
+compute_times(io,16*stride,4*stride,128*stride,2,2,stride)
 close(io)
 
 starpu_shutdown()

+ 12 - 9
julia/examples/mult/mult_native.jl

@@ -2,10 +2,7 @@ import Libdl
 using StarPU
 using LinearAlgebra
 
-#shoud be the same as in the makefile
-const STRIDE = 72
-
-function multiply_without_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
+function multiply_without_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy, stride)
     tmin = 0
     for i in (1 : 10 )
         t=time_ns()
@@ -19,12 +16,12 @@ function multiply_without_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :
 end
 
 
-function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
+function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy, stride)
     for dim in (start_dim : step_dim : stop_dim)
         A = Array(rand(Cfloat, dim, dim))
         B = Array(rand(Cfloat, dim, dim))
         C = zeros(Float32, dim, dim)
-        mt =  multiply_without_starpu(A, B, C, nslicesx, nslicesy)
+        mt =  multiply_without_starpu(A, B, C, nslicesx, nslicesy, stride)
         flops = (2*dim-1)*dim*dim/mt
         size=dim*dim*4*3/1024/1024
         println(io,"$size $flops")
@@ -32,8 +29,14 @@ function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
     end
 end
 
-
-io=open(ARGS[1],"w")
-compute_times(io,16*STRIDE,4*STRIDE,4096,2,2)
+if size(ARGS, 1) < 2
+    stride=4
+    filename="x.dat"
+else
+    stride=parse(Int, ARGS[1])
+    filename=ARGS[2]
+end
+io=open(filename,"w")
+compute_times(io,16*stride,4*stride,128*stride,2,2,stride)
 close(io)
 

+ 5 - 4
julia/examples/vector_scal/Makefile

@@ -49,10 +49,11 @@ clean:
 cstarpu.dat: vector_scal
 	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./vector_scal > $@
 julia_generatedc.dat:
-	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal.jl $@
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal.jl $@
 julia_native.dat:
-	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal_native.jl $@
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal_native.jl $@
 julia_calllib.dat: ${EXTERNLIB}
-	LD_LIBRARY_PATH+=${LIBPATH} JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal.jl julia_calllib.dat
+	JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal.jl julia_calllib.dat
 
-test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat
+#test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat
+test: julia_generatedc.dat julia_calllib.dat

+ 1 - 1
julia/examples/vector_scal/cpu_vector_scal.c

@@ -29,7 +29,7 @@ float cpu_vector_scal(void *buffers[], void *cl_arg)
 
   /* scale the vector */
   for (unsigned i = 0; i < n; i++)
-    val[i] = val[i] * k + l + m;
+    val[i] = val[i] * m + l + k;
 
   return 0.0;
 }

+ 1 - 1
julia/examples/vector_scal/vector_scal.jl

@@ -37,7 +37,7 @@ function vector_scal_with_starpu(v :: Vector{Float32}, m :: Int32, k :: Float32,
             t=time_ns()
             @starpu_sync_tasks begin
                 handles = [hV]
-                task = StarpuTask(cl = cl, handles = handles, cl_arg=[m, k, l])
+                task = StarpuTask(cl = cl, handles = handles, cl_arg=(m, k, l))
                 starpu_task_submit(task)
             end
             # @starpu_sync_tasks for task in (1:1)

+ 6 - 4
julia/src/StarPU.jl

@@ -55,6 +55,8 @@ function jlstarpu_set_to_zero(x :: T) :: Ptr{Cvoid} where {T}
         )
 end
 
+tuple_len(::NTuple{N, Any}) where {N} = N
+
 export starpu_init
 export starpu_shutdown
 export starpu_memory_pin
@@ -491,7 +493,7 @@ mutable struct StarpuTask
 
         Creates a new task which will run the specified codelet on handle buffers and cl_args data
     """
-    function StarpuTask(; cl :: Union{Cvoid, StarpuCodelet} = nothing, handles :: Vector{StarpuDataHandle} = StarpuDataHandle[], cl_arg = [])
+    function StarpuTask(; cl :: Union{Cvoid, StarpuCodelet} = nothing, handles :: Vector{StarpuDataHandle} = StarpuDataHandle[], cl_arg = ())
 
         if (cl == nothing)
             error("\"cl\" field can't be empty when creating a StarpuTask")
@@ -516,13 +518,13 @@ mutable struct StarpuTask
         scalar_parameters = get(CODELETS_SCALARS, codelet_name, nothing)
         if scalar_parameters != nothing
             nb_scalar_required = length(scalar_parameters)
-            nb_scalar_provided = length(cl_arg)
+            nb_scalar_provided = tuple_len(cl_arg)
             if (nb_scalar_provided != nb_scalar_required)
                 error("$nb_scalar_provided scalar parameters provided but $nb_scalar_required are required by $codelet_name.")
             end
             output.cl_arg = create_param_struct_from_clarg(codelet_name, cl_arg)
         else
-            output.cl_arg = nothing
+            output.cl_arg = cl_arg
         end
 
         output.synchronous = false
@@ -591,7 +593,7 @@ mutable struct StarpuTaskTranslator
             output.cl_arg = C_NULL
             output.cl_arg_size = 0
         else
-            output.cl_arg = pointer_from_objref(task.cl_arg)
+            output.cl_arg = Base.unsafe_convert(Ptr{Cvoid}, Ref(task.cl_arg))
             output.cl_arg_size = sizeof(task.cl_arg)
         end
 

+ 16 - 0
mpi/Makefile.am

@@ -26,3 +26,19 @@ versinclude_HEADERS = 					\
 	include/starpu_mpi.h				\
 	include/starpu_mpi_lb.h				\
 	include/fstarpu_mpi_mod.f90
+
+# Run "make check" in every subdirectory with a reduced job count: when a
+# numeric -jN is found in MAKEFLAGS, the sub-makes get -j(N/4), or -j1 when
+# N < 4 (presumably because each MPI test spawns several processes -- TODO
+# confirm). Without a numeric -j, MAKEFLAGS is passed down unchanged.
+# Failures are accumulated in RET so that all subdirectories are checked
+# before the recipe exits with an error.
+# The sed expression only uses POSIX basic regular expression syntax
+# (" *" and "[0-9][0-9]*"): "\?" and "\+" are GNU extensions, and a sed
+# lacking them would silently fail to detect the job count.
+check-recursive:
+	RET=0 ; \
+	NJOBS=`printf %s "$(MAKEFLAGS)" | sed -ne 's/.*-j *\([0-9][0-9]*\).*/\1/p'` ; \
+	JOBS="" ; \
+	if [ -n "$$NJOBS" ] ; then \
+		if [ "$$NJOBS" -ge 4 ] ; then \
+			JOBS="-j$$(($$NJOBS / 4))" ; \
+		else \
+			JOBS="-j1" ; \
+		fi ; \
+	fi ; \
+	for i in $(SUBDIRS) ; do \
+		$(MAKE) check -C $$i MAKEFLAGS="$(MAKEFLAGS) $$JOBS" || RET=1; \
+	done ; \
+	exit $$RET

+ 1 - 1
socl/Makefile.am

@@ -25,6 +25,6 @@ dist_SOCL_vendors_DATA = @SOCL_VENDORS@
 recheck:
 	RET=0 ; \
 	for i in $(SUBDIRS) ; do \
-		make -C $$i recheck || RET=1 ; \
+		$(MAKE) -C $$i recheck || RET=1 ; \
 	done ; \
 	exit $$RET

+ 52 - 26
src/datawizard/interfaces/bcsr_interface.c

@@ -133,10 +133,13 @@ void starpu_bcsr_data_register(starpu_data_handle_t *handleptr, int home_node,
 #ifndef STARPU_SIMGRID
 	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
 	{
-		STARPU_ASSERT_ACCESSIBLE(nzval);
-		STARPU_ASSERT_ACCESSIBLE(nzval + nnz*elemsize*r*c - 1);
-		STARPU_ASSERT_ACCESSIBLE(colind);
-		STARPU_ASSERT_ACCESSIBLE((uintptr_t) colind + nnz*sizeof(uint32_t) - 1);
+		if (nnz)
+		{
+			STARPU_ASSERT_ACCESSIBLE(nzval);
+			STARPU_ASSERT_ACCESSIBLE(nzval + nnz*elemsize*r*c - 1);
+			STARPU_ASSERT_ACCESSIBLE(colind);
+			STARPU_ASSERT_ACCESSIBLE((uintptr_t) colind + nnz*sizeof(uint32_t) - 1);
+		}
 		STARPU_ASSERT_ACCESSIBLE(rowptr);
 		STARPU_ASSERT_ACCESSIBLE((uintptr_t) rowptr + (nrow+1)*sizeof(uint32_t) - 1);
 	}
@@ -325,12 +328,21 @@ static starpu_ssize_t allocate_bcsr_buffer_on_node(void *data_interface_, unsign
 	uint32_t r = bcsr_interface->r;
 	uint32_t c = bcsr_interface->c;
 
-	addr_nzval = starpu_malloc_on_node(dst_node, nnz*r*c*elemsize);
-	if (!addr_nzval)
-		goto fail_nzval;
-	addr_colind = starpu_malloc_on_node(dst_node, nnz*sizeof(uint32_t));
-	if (!addr_colind)
-		goto fail_colind;
+	STARPU_ASSERT_MSG(r && c, "partitioning bcsr with several memory nodes is not supported yet");
+
+	if (nnz)
+	{
+		addr_nzval = starpu_malloc_on_node(dst_node, nnz*r*c*elemsize);
+		if (!addr_nzval)
+			goto fail_nzval;
+		addr_colind = starpu_malloc_on_node(dst_node, nnz*sizeof(uint32_t));
+		if (!addr_colind)
+			goto fail_colind;
+	}
+	else
+	{
+		addr_nzval = addr_colind = 0;
+	}
 	addr_rowptr = starpu_malloc_on_node(dst_node, (nrow+1)*sizeof(uint32_t));
 	if (!addr_rowptr)
 		goto fail_rowptr;
@@ -347,9 +359,11 @@ static starpu_ssize_t allocate_bcsr_buffer_on_node(void *data_interface_, unsign
 	return allocated_memory;
 
 fail_rowptr:
-	starpu_free_on_node(dst_node, addr_colind, nnz*sizeof(uint32_t));
+	if (nnz)
+		starpu_free_on_node(dst_node, addr_colind, nnz*sizeof(uint32_t));
 fail_colind:
-	starpu_free_on_node(dst_node, addr_nzval, nnz*r*c*elemsize);
+	if (nnz)
+		starpu_free_on_node(dst_node, addr_nzval, nnz*r*c*elemsize);
 fail_nzval:
 	/* allocation failed */
 	return -ENOMEM;
@@ -364,8 +378,11 @@ static void free_bcsr_buffer_on_node(void *data_interface, unsigned node)
 	uint32_t r = bcsr_interface->r;
 	uint32_t c = bcsr_interface->c;
 
-	starpu_free_on_node(node, bcsr_interface->nzval, nnz*r*c*elemsize);
-	starpu_free_on_node(node, (uintptr_t) bcsr_interface->colind, nnz*sizeof(uint32_t));
+	if (nnz)
+	{
+		starpu_free_on_node(node, bcsr_interface->nzval, nnz*r*c*elemsize);
+		starpu_free_on_node(node, (uintptr_t) bcsr_interface->colind, nnz*sizeof(uint32_t));
+	}
 	starpu_free_on_node(node, (uintptr_t) bcsr_interface->rowptr, (nrow+1)*sizeof(uint32_t));
 }
 
@@ -383,11 +400,14 @@ static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_int
 
 	int ret = 0;
 
-	if (starpu_interface_copy(src_bcsr->nzval, 0, src_node, dst_bcsr->nzval, 0, dst_node, nnz*elemsize*r*c, async_data))
-		ret = -EAGAIN;
+	if (nnz)
+	{
+		if (starpu_interface_copy(src_bcsr->nzval, 0, src_node, dst_bcsr->nzval, 0, dst_node, nnz*elemsize*r*c, async_data))
+			ret = -EAGAIN;
 
-	if (starpu_interface_copy((uintptr_t)src_bcsr->colind, 0, src_node, (uintptr_t)dst_bcsr->colind, 0, dst_node, nnz*sizeof(uint32_t), async_data))
-		ret = -EAGAIN;
+		if (starpu_interface_copy((uintptr_t)src_bcsr->colind, 0, src_node, (uintptr_t)dst_bcsr->colind, 0, dst_node, nnz*sizeof(uint32_t), async_data))
+			ret = -EAGAIN;
+	}
 
 	if (starpu_interface_copy((uintptr_t)src_bcsr->rowptr, 0, src_node, (uintptr_t)dst_bcsr->rowptr, 0, dst_node, (nrow+1)*sizeof(uint32_t), async_data))
 		ret = -EAGAIN;
@@ -425,10 +445,13 @@ static int pack_data(starpu_data_handle_t handle, unsigned node, void **ptr, sta
 	{
 		*ptr = (void *)starpu_malloc_on_node_flags(node, *count, 0);
 		char *tmp = *ptr;
-		memcpy(tmp, (void*)bcsr->colind, bcsr->nnz * sizeof(bcsr->colind[0]));
-		tmp += bcsr->nnz * sizeof(bcsr->colind[0]);
+		/* Buffer layout: colind, rowptr, nzval.
+		 * colind and nzval are not allocated when nnz == 0 (see
+		 * allocate_bcsr_buffer_on_node), so they are only touched when
+		 * there are non-zero blocks. rowptr always holds nrow+1 entries
+		 * and unpack_data expects them in the buffer, so it is packed
+		 * unconditionally. */
+		if (bcsr->nnz)
+		{
+			memcpy(tmp, (void*)bcsr->colind, bcsr->nnz * sizeof(bcsr->colind[0]));
+			tmp += bcsr->nnz * sizeof(bcsr->colind[0]);
+		}
 		memcpy(tmp, (void*)bcsr->rowptr, (bcsr->nrow + 1) * sizeof(bcsr->rowptr[0]));
 		tmp += (bcsr->nrow + 1) * sizeof(bcsr->rowptr[0]);
-		memcpy(tmp, (void*)bcsr->nzval, bcsr->r * bcsr->c * bcsr->nnz * bcsr->elemsize);
+		if (bcsr->nnz)
+			memcpy(tmp, (void*)bcsr->nzval, bcsr->r * bcsr->c * bcsr->nnz * bcsr->elemsize);
 	}
 
@@ -444,10 +467,13 @@ static int unpack_data(starpu_data_handle_t handle, unsigned node, void *ptr, si
 	STARPU_ASSERT(count == (bcsr->nnz * sizeof(bcsr->colind[0]))+((bcsr->nrow + 1) * sizeof(bcsr->rowptr[0]))+(bcsr->r * bcsr->c * bcsr->nnz * bcsr->elemsize));
 
 	char *tmp = ptr;
-	memcpy((void*)bcsr->colind, tmp, bcsr->nnz * sizeof(bcsr->colind[0]));
-	tmp += bcsr->nnz * sizeof(bcsr->colind[0]);
+	/* Buffer layout: colind, rowptr, nzval (count is asserted above to
+	 * cover all three). colind and nzval are not allocated when nnz == 0
+	 * (see allocate_bcsr_buffer_on_node), so they are only written when
+	 * there are non-zero blocks; rowptr always holds nrow+1 entries and
+	 * must be restored even for an empty matrix. */
+	if (bcsr->nnz)
+	{
+		memcpy((void*)bcsr->colind, tmp, bcsr->nnz * sizeof(bcsr->colind[0]));
+		tmp += bcsr->nnz * sizeof(bcsr->colind[0]);
+	}
 	memcpy((void*)bcsr->rowptr, tmp, (bcsr->nrow + 1) * sizeof(bcsr->rowptr[0]));
 	tmp += (bcsr->nrow + 1) * sizeof(bcsr->rowptr[0]);
-	memcpy((void*)bcsr->nzval, tmp, bcsr->r * bcsr->c * bcsr->nnz * bcsr->elemsize);
+	if (bcsr->nnz)
+		memcpy((void*)bcsr->nzval, tmp, bcsr->r * bcsr->c * bcsr->nnz * bcsr->elemsize);
 
 	starpu_free_on_node_flags(node, (uintptr_t)ptr, count, 0);

+ 51 - 26
src/datawizard/interfaces/csr_interface.c

@@ -115,10 +115,13 @@ void starpu_csr_data_register(starpu_data_handle_t *handleptr, int home_node,
 #ifndef STARPU_SIMGRID
 	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
 	{
-		STARPU_ASSERT_ACCESSIBLE(nzval);
-		STARPU_ASSERT_ACCESSIBLE(nzval + nnz*elemsize - 1);
-		STARPU_ASSERT_ACCESSIBLE(colind);
-		STARPU_ASSERT_ACCESSIBLE((uintptr_t) colind + nnz*sizeof(uint32_t) - 1);
+		if (nnz)
+		{
+			STARPU_ASSERT_ACCESSIBLE(nzval);
+			STARPU_ASSERT_ACCESSIBLE(nzval + nnz*elemsize - 1);
+			STARPU_ASSERT_ACCESSIBLE(colind);
+			STARPU_ASSERT_ACCESSIBLE((uintptr_t) colind + nnz*sizeof(uint32_t) - 1);
+		}
 		STARPU_ASSERT_ACCESSIBLE(rowptr);
 		STARPU_ASSERT_ACCESSIBLE((uintptr_t) rowptr + (nrow+1)*sizeof(uint32_t) - 1);
 	}
@@ -272,12 +275,20 @@ static starpu_ssize_t allocate_csr_buffer_on_node(void *data_interface_, unsigne
 	uint32_t nrow = csr_interface->nrow;
 	size_t elemsize = csr_interface->elemsize;
 
-	addr_nzval = starpu_malloc_on_node(dst_node, nnz*elemsize);
-	if (!addr_nzval)
-		goto fail_nzval;
-	addr_colind = (uint32_t*) starpu_malloc_on_node(dst_node, nnz*sizeof(uint32_t));
-	if (!addr_colind)
-		goto fail_colind;
+	if (nnz)
+	{
+		addr_nzval = starpu_malloc_on_node(dst_node, nnz*elemsize);
+		if (!addr_nzval)
+			goto fail_nzval;
+		addr_colind = (uint32_t*) starpu_malloc_on_node(dst_node, nnz*sizeof(uint32_t));
+		if (!addr_colind)
+			goto fail_colind;
+	}
+	else
+	{
+		addr_nzval = 0;
+		addr_colind = NULL;
+	}
 	addr_rowptr = (uint32_t*) starpu_malloc_on_node(dst_node, (nrow+1)*sizeof(uint32_t));
 	if (!addr_rowptr)
 		goto fail_rowptr;
@@ -294,9 +305,11 @@ static starpu_ssize_t allocate_csr_buffer_on_node(void *data_interface_, unsigne
 	return allocated_memory;
 
 fail_rowptr:
-	starpu_free_on_node(dst_node, (uintptr_t) addr_colind, nnz*sizeof(uint32_t));
+	if (nnz)
+		starpu_free_on_node(dst_node, (uintptr_t) addr_colind, nnz*sizeof(uint32_t));
 fail_colind:
-	starpu_free_on_node(dst_node, addr_nzval, nnz*elemsize);
+	if (nnz)
+		starpu_free_on_node(dst_node, addr_nzval, nnz*elemsize);
 fail_nzval:
 	/* allocation failed */
 	return -ENOMEM;
@@ -309,8 +322,11 @@ static void free_csr_buffer_on_node(void *data_interface, unsigned node)
 	uint32_t nrow = csr_interface->nrow;
 	size_t elemsize = csr_interface->elemsize;
 
-	starpu_free_on_node(node, csr_interface->nzval, nnz*elemsize);
-	starpu_free_on_node(node, (uintptr_t) csr_interface->colind, nnz*sizeof(uint32_t));
+	if (nnz)
+	{
+		starpu_free_on_node(node, csr_interface->nzval, nnz*elemsize);
+		starpu_free_on_node(node, (uintptr_t) csr_interface->colind, nnz*sizeof(uint32_t));
+	}
 	starpu_free_on_node(node, (uintptr_t) csr_interface->rowptr, (nrow+1)*sizeof(uint32_t));
 }
 
@@ -325,11 +341,14 @@ static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_int
 	size_t elemsize = src_csr->elemsize;
 	int ret = 0;
 
-	if (starpu_interface_copy(src_csr->nzval, 0, src_node, dst_csr->nzval, 0, dst_node, nnz*elemsize, async_data))
-		ret = -EAGAIN;
+	if (nnz)
+	{
+		if (starpu_interface_copy(src_csr->nzval, 0, src_node, dst_csr->nzval, 0, dst_node, nnz*elemsize, async_data))
+			ret = -EAGAIN;
 
-	if (starpu_interface_copy((uintptr_t)src_csr->colind, 0, src_node, (uintptr_t)dst_csr->colind, 0, dst_node, nnz*sizeof(uint32_t), async_data))
-		ret = -EAGAIN;
+		if (starpu_interface_copy((uintptr_t)src_csr->colind, 0, src_node, (uintptr_t)dst_csr->colind, 0, dst_node, nnz*sizeof(uint32_t), async_data))
+			ret = -EAGAIN;
+	}
 
 	if (starpu_interface_copy((uintptr_t)src_csr->rowptr, 0, src_node, (uintptr_t)dst_csr->rowptr, 0, dst_node, (nrow+1)*sizeof(uint32_t), async_data))
 		ret = -EAGAIN;
@@ -365,10 +384,13 @@ static int pack_data(starpu_data_handle_t handle, unsigned node, void **ptr, sta
 	{
 		*ptr = (void *)starpu_malloc_on_node_flags(node, *count, 0);
 		char *tmp = *ptr;
-		memcpy(tmp, (void*)csr->colind, csr->nnz * sizeof(csr->colind[0]));
-		tmp += csr->nnz * sizeof(csr->colind[0]);
+		/* Buffer layout: colind, rowptr, nzval.
+		 * colind and nzval are not allocated when nnz == 0 (see
+		 * allocate_csr_buffer_on_node), so they are only touched when
+		 * there are non-zero elements. rowptr always holds nrow+1
+		 * entries and unpack_data expects them in the buffer, so it is
+		 * packed unconditionally. */
+		if (csr->nnz)
+		{
+			memcpy(tmp, (void*)csr->colind, csr->nnz * sizeof(csr->colind[0]));
+			tmp += csr->nnz * sizeof(csr->colind[0]);
+		}
 		memcpy(tmp, (void*)csr->rowptr, (csr->nrow + 1) * sizeof(csr->rowptr[0]));
 		tmp += (csr->nrow + 1) * sizeof(csr->rowptr[0]);
-		memcpy(tmp, (void*)csr->nzval, csr->nnz * csr->elemsize);
+		if (csr->nnz)
+			memcpy(tmp, (void*)csr->nzval, csr->nnz * csr->elemsize);
 	}
 
@@ -384,10 +406,13 @@ static int unpack_data(starpu_data_handle_t handle, unsigned node, void *ptr, si
 	STARPU_ASSERT(count == (csr->nnz * sizeof(csr->colind[0]))+((csr->nrow + 1) * sizeof(csr->rowptr[0]))+(csr->nnz * csr->elemsize));
 
 	char *tmp = ptr;
-	memcpy((void*)csr->colind, tmp, csr->nnz * sizeof(csr->colind[0]));
-	tmp += csr->nnz * sizeof(csr->colind[0]);
+	/* Buffer layout: colind, rowptr, nzval (count is asserted above to
+	 * cover all three). colind and nzval are not allocated when nnz == 0
+	 * (see allocate_csr_buffer_on_node), so they are only written when
+	 * there are non-zero elements; rowptr always holds nrow+1 entries and
+	 * must be restored even for an empty matrix. */
+	if (csr->nnz)
+	{
+		memcpy((void*)csr->colind, tmp, csr->nnz * sizeof(csr->colind[0]));
+		tmp += csr->nnz * sizeof(csr->colind[0]);
+	}
 	memcpy((void*)csr->rowptr, tmp, (csr->nrow + 1) * sizeof(csr->rowptr[0]));
 	tmp += (csr->nrow + 1) * sizeof(csr->rowptr[0]);
-	memcpy((void*)csr->nzval, tmp, csr->nnz * csr->elemsize);
+	if (csr->nnz)
+		memcpy((void*)csr->nzval, tmp, csr->nnz * csr->elemsize);
 
 	starpu_free_on_node_flags(node, (uintptr_t)ptr, count, 0);

+ 19 - 10
src/datawizard/memalloc.c

@@ -113,6 +113,11 @@ static unsigned tidying[STARPU_MAXNODES];
 /* Whether some thread is currently reclaiming memory for this node */
 static unsigned reclaiming[STARPU_MAXNODES];
 
+/* This records that a prefetch on this node ran out of memory. Further
+ * prefetches will probably fail as well, so we do not trace each and every
+ * attempt. */
+static volatile int prefetch_out_of_memory[STARPU_MAXNODES];
+
 int _starpu_is_reclaiming(unsigned node)
 {
 	STARPU_ASSERT(node < STARPU_MAXNODES);
@@ -184,6 +189,7 @@ void _starpu_init_mem_chunk_lists(void)
 		STARPU_HG_DISABLE_CHECKING(mc_cache_size[i]);
 		STARPU_HG_DISABLE_CHECKING(mc_nb[i]);
 		STARPU_HG_DISABLE_CHECKING(mc_clean_nb[i]);
+		STARPU_HG_DISABLE_CHECKING(prefetch_out_of_memory[i]);
 	}
 	/* We do not enable forcing available memory by default, since
 	  this makes StarPU spuriously free data when prefetching fills the
@@ -1432,7 +1438,6 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 	starpu_ssize_t data_size = _starpu_data_get_alloc_size(handle);
 	int told_reclaiming = 0;
 	int reused = 0;
-	static int prefetch_out_of_memory[STARPU_MAXNODES];
 
 	_starpu_spin_checklocked(&handle->header_lock);
 
@@ -1441,17 +1446,19 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 	/* perhaps we can directly reuse a buffer in the free-list */
 	uint32_t footprint = _starpu_compute_data_footprint(handle);
 
+	int prefetch_oom = is_prefetch && prefetch_out_of_memory[dst_node];
+
 #ifdef STARPU_USE_ALLOCATION_CACHE
-	if (!(is_prefetch && prefetch_out_of_memory[dst_node]))
+	if (!prefetch_oom)
 		_STARPU_TRACE_START_ALLOC_REUSE(dst_node, data_size, handle, is_prefetch);
 	if (try_to_find_reusable_mc(dst_node, handle, replicate, footprint))
 	{
 		_starpu_allocation_cache_hit(dst_node);
-		if (!(is_prefetch && prefetch_out_of_memory[dst_node]))
+		if (!prefetch_oom)
 			_STARPU_TRACE_END_ALLOC_REUSE(dst_node, handle, 1);
 		return data_size;
 	}
-	if (!(is_prefetch && prefetch_out_of_memory[dst_node]))
+	if (!prefetch_oom)
 		_STARPU_TRACE_END_ALLOC_REUSE(dst_node, handle, 0);
 #endif
 	STARPU_ASSERT(handle->ops);
@@ -1473,7 +1480,7 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 
 	do
 	{
-		if (!(is_prefetch && prefetch_out_of_memory[dst_node]))
+		if (!prefetch_oom)
 			_STARPU_TRACE_START_ALLOC(dst_node, data_size, handle, is_prefetch);
 
 #if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
@@ -1488,11 +1495,9 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 #endif
 
 		allocated_memory = handle->ops->allocate_data_on_node(data_interface, dst_node);
-		if (!(is_prefetch && prefetch_out_of_memory[dst_node]))
+		if (!prefetch_oom)
 			_STARPU_TRACE_END_ALLOC(dst_node, handle, allocated_memory);
 
-		prefetch_out_of_memory[dst_node] = 0;
-
 		if (allocated_memory == -ENOMEM)
 		{
 			size_t handle_size = _starpu_data_get_alloc_size(handle);
@@ -1501,9 +1506,11 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 			/* First try to flush data explicitly marked for freeing */
 			size_t freed = flush_memchunk_cache(dst_node, reclaim);
 
-			if (freed >= reclaim)
+			if (freed >= reclaim) {
 				/* That freed enough data, retry allocating */
+				prefetch_out_of_memory[dst_node] = 0;
 				continue;
+			}
 			reclaim -= freed;
 
 			/* Try to reuse an allocated data with the same interface (to avoid spurious free/alloc) */
@@ -1535,7 +1542,9 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 			_STARPU_TRACE_START_MEMRECLAIM(dst_node,is_prefetch);
 			_starpu_memory_reclaim_generic(dst_node, 0, reclaim);
 			_STARPU_TRACE_END_MEMRECLAIM(dst_node,is_prefetch);
-		}
+			prefetch_out_of_memory[dst_node] = 0;
+		} else
+			prefetch_out_of_memory[dst_node] = 0;
 	}
 	while((allocated_memory == -ENOMEM) && attempts++ < 2);
 

+ 1 - 0
src/drivers/cpu/driver_cpu.c

@@ -301,6 +301,7 @@ int _starpu_cpu_driver_run_once(struct _starpu_worker *cpu_worker)
 	if (pending_task != NULL && cpu_worker->nb_buffers_transferred == cpu_worker->nb_buffers_totransfer)
 	{
 		int ret;
+		STARPU_RMB();
 		_STARPU_TRACE_END_PROGRESS(memnode);
 		j = _starpu_get_job_associated_to_task(pending_task);
 

+ 1 - 0
src/drivers/cuda/driver_cuda.c

@@ -825,6 +825,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		task = worker->task_transferring;
 		if (task && worker->nb_buffers_transferred == worker->nb_buffers_totransfer)
 		{
+			STARPU_RMB();
 			_STARPU_TRACE_END_PROGRESS(memnode);
 			j = _starpu_get_job_associated_to_task(task);
 

+ 1 - 0
src/drivers/mp_common/source_common.c

@@ -927,6 +927,7 @@ static void _starpu_src_common_worker_internal_work(struct _starpu_worker_set *
 		/* We send all buffers to execute the task */
 		if (task != NULL && worker_set->workers[i].nb_buffers_transferred == worker_set->workers[i].nb_buffers_totransfer)
 		{
+			STARPU_RMB();
 			struct _starpu_job * j = _starpu_get_job_associated_to_task(task);
 
 			_STARPU_TRACE_END_PROGRESS(memnode);

+ 1 - 0
src/drivers/opencl/driver_opencl.c

@@ -700,6 +700,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
 		idle_transfers++;
 	if (task && worker->nb_buffers_transferred == worker->nb_buffers_totransfer)
 	{
+		STARPU_RMB();
 		_STARPU_TRACE_END_PROGRESS(memnode);
 		j = _starpu_get_job_associated_to_task(task);
 

+ 1 - 1
starpufft/Makefile.am

@@ -32,6 +32,6 @@ pkgconfig_DATA = libstarpufft.pc starpufft-1.0.pc starpufft-1.1.pc starpufft-1.2
 recheck:
 	RET=0 ; \
 	for i in $(SUBDIRS) ; do \
-		make -C $$i recheck || RET=1 ; \
+		$(MAKE) -C $$i recheck || RET=1 ; \
 	done ; \
 	exit $$RET

+ 17 - 8
tests/datawizard/bcsr.c

@@ -39,7 +39,7 @@ void cpu_show_bcsr(void *descr[], void *arg)
 
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
-	printf("nnz %d elemsize %d\n", nnz, elemsize);
+	printf("\nnnz %d elemsize %d\n", nnz, elemsize);
 
 	for (i = 0; i < nrow; i++)
 	{
@@ -81,11 +81,13 @@ struct starpu_codelet show_cl =
  *   |  2   3   0   0 |
  *   |  4   5   8   9 |
  *   |  6   7  10  11 |
+ *   |  0   0   0   0 |
+ *   |  0   0   0   0 |
  *   +----------------+
  *
  * nzval  = [0, 1, 2, 3] ++ [4, 5, 6, 7] ++ [8, 9, 10, 11]
- * colind = [0, 0, 1]
- * rowptr = [0, 1, 3 ]
+ * colind = [0, 0, 1] (column index of each non-zero block)
+ * rowptr = [0, 1, 3] (index of first non-zero block for each row)
  * r = c = 2
  */
 
@@ -93,10 +95,10 @@ struct starpu_codelet show_cl =
 #define R              2
 #define C              2
 
-#define NNZ_BLOCKS     3   /* out of 4 */
+#define NNZ_BLOCKS     3   /* out of 6 */
 #define NZVAL_SIZE     (R*C*NNZ_BLOCKS)
 
-#define NROWS          2
+#define NROWS          3
 
 static int nzval[NZVAL_SIZE]  =
 {
@@ -106,16 +108,21 @@ static int nzval[NZVAL_SIZE]  =
 };
 static uint32_t colind[NNZ_BLOCKS] = { 0, 0, 1 };
 
-static uint32_t rowptr[NROWS+1] = { 0, 1, NNZ_BLOCKS };
+static uint32_t rowptr[NROWS+1] = { 0, 1, NNZ_BLOCKS, NNZ_BLOCKS };
 
 int main(int argc, char **argv)
 {
 	struct starpu_conf conf;
 	starpu_conf_init(&conf);
 
-	if (starpu_initialize(&conf, &argc, &argv) == -ENODEV || starpu_cpu_worker_get_count() == 0)
+	if (starpu_initialize(&conf, &argc, &argv) == -ENODEV)
 		return STARPU_TEST_SKIPPED;
 
+	if (starpu_cpu_worker_get_count() == 0 || starpu_memory_nodes_get_count() > 1) {
+		starpu_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+
 	starpu_bcsr_data_register(&bcsr_handle,
 				  STARPU_MAIN_RAM,
 				  NNZ_BLOCKS,
@@ -133,12 +140,14 @@ int main(int argc, char **argv)
 	struct starpu_data_filter filter =
 	{
 		.filter_func = starpu_bcsr_filter_vertical_block,
-		.nchildren = 2,
+		.nchildren = 3,
 	};
 	starpu_data_partition(bcsr_handle, &filter);
 
 	starpu_task_insert(&show_cl, STARPU_R, starpu_data_get_sub_data(bcsr_handle, 1, 0), 0);
 	starpu_task_insert(&show_cl, STARPU_R, starpu_data_get_sub_data(bcsr_handle, 1, 1), 0);
+	starpu_task_insert(&show_cl, STARPU_R, starpu_data_get_sub_data(bcsr_handle, 1, 2), 0);
+
 	starpu_data_unpartition(bcsr_handle, STARPU_MAIN_RAM);
 
 	starpu_data_unregister(bcsr_handle);

+ 1 - 0
tools/dev/tsan/starpu.suppr

@@ -22,6 +22,7 @@ race:^ntasks$
 race:^mc_cache_size$
 race:^mc_nb$
 race:^mc_clean_nb$
+race:^prefetch_out_of_memory$
 race:^data_requests$
 race:^prefetch_requests$
 race:^idle_requests$