5 anni fa · f1cffb99e7
--- a/doc/doxygen/chapters/320_scheduling.doxy
+++ b/doc/doxygen/chapters/320_scheduling.doxy
@@ -185,6 +185,11 @@ already gives the good results that a precise estimation would give.
 
				 
			
 
				 \section Energy-basedScheduling Energy-based Scheduling
			
 
				 
			
 
				+Note: by default StarPU does not let CPU workers sleep, to let them react to
			
 
				+task release as quickly as possible. For idle time to really let CPU cores save
			
 
				+energy, one needs to use the \ref enable-blocking-drivers
			
 
				+"--enable-blocking-drivers" configuration option.
			
 
				+
			
 
				 If the application can provide some energy consumption performance model (through
			
 
				 the field starpu_codelet::energy_model), StarPU will
			
 
				 take it into account when distributing tasks. The target function that
			
--- a/doc/doxygen/chapters/380_offline_performance_tools.doxy
+++ b/doc/doxygen/chapters/380_offline_performance_tools.doxy
@@ -586,19 +586,31 @@ $ starpu_paje_sort paje.trace
 
				 \section PapiCounters PAPI counters
			
 
				 
			
 
				 Performance counter values could be obtained from the PAPI framework if
			
 
				-<c>./configure</c> detected the libpapi. One has to set the \ref STARPU_PROFILING
			
 
				-environment variable to 1 and then specify which events to record with the
			
 
				-\ref STARPU_PROF_PAPI_EVENTS environment variable. For instance:
			
 
				+<c>./configure</c> detected the libpapi.
			
 
				+
			
 
				+In Debian, packages <c>libpapi-dev</c> and <c>libpapi5.7</c> provide the required
			
 
				+files.  Package <c>papi-tools</c> contains a set of useful tools, for example
			
 
				+<c>papi_avail</c> to see which counters are available.
			
 
				+
			
 
				+To be able to use PAPI counters, one may need to reduce the level of the kernel
			
 
				+parameter <c>kernel.perf_event_paranoid</c> to at most 2. See
			
 
				+https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html for the
			
 
				+security impact of this parameter.
			
 
				+
			
 
				+Then one has to set the \ref STARPU_PROFILING environment variable to 1 and
			
 
				+specify which events to record with the \ref STARPU_PROF_PAPI_EVENTS
			
 
				+environment variable. For instance:
			
 
				 
			
 
				 \verbatim
			
 
				 export STARPU_PROFILING=1 STARPU_PROF_PAPI_EVENTS="PAPI_TOT_INS PAPI_TOT_CYC"
			
 
				 \endverbatim
			
 
				 
			
 
				+The comma can also be used to separate events to monitor.
			
 
				+
			
 
				 In the current simple implementation, only CPU tasks have their events measured
			
 
				-and require CPUs that support the PAPI events. All events that PAPI support are
			
 
				-available from their documentation (https://icl.cs.utk.edu/projects/papi/wiki/PAPIC:Preset_Event_Definitions).
			
 
				-It is important to note that not all events are available on all systems, and
			
 
				-general PAPI recommendations should be followed.
			
 
				+and require CPUs that support the PAPI events. It is important to note that not
			
 
				+all events are available on all systems, and general PAPI recommendations
			
 
				+should be followed.
			
 
				 
			
 
				 The counter values can be accessed using the profiling interface:
			
 
				 \code{.c}
			
--- a/doc/doxygen/chapters/510_configure_options.doxy
+++ b/doc/doxygen/chapters/510_configure_options.doxy
@@ -1,6 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/doc/doxygen/chapters/code/vector_scal_opencl.c
+++ b/doc/doxygen/chapters/code/vector_scal_opencl.c
@@ -57,6 +57,7 @@ void scal_opencl_func(void *buffers[], void *_args)
 
				         err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
			
 
				         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				         if (local > global) local=global;
			
 
				+        else global = (global + local-1) / local * local;
			
 
				 
			
 
				         err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
			
 
				         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
--- a/examples/axpy/axpy_opencl.c
+++ b/examples/axpy/axpy_opencl.c
@@ -60,6 +60,8 @@ void axpy_opencl(void *buffers[], void *_args)
 
				 			STARPU_OPENCL_REPORT_ERROR(err);
			
 
				                 if (local > global)
			
 
				 			local=global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
			
 
				 		if (err != CL_SUCCESS)
			
--- a/examples/basic_examples/multiformat_conversion_codelets_opencl.c
+++ b/examples/basic_examples/multiformat_conversion_codelets_opencl.c
@@ -74,6 +74,8 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 
				 
			
 
				                 if (local > global)
			
 
				 			local = global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue,
			
 
				 					kernel,
			
--- a/examples/basic_examples/multiformat_opencl.c
+++ b/examples/basic_examples/multiformat_opencl.c
@@ -68,6 +68,8 @@ void multiformat_scal_opencl_func(void *buffers[], void *args)
 
				 
			
 
				                 if (local > global)
			
 
				 			local = global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue,
			
 
				 					kernel,
			
--- a/examples/basic_examples/vector_scal_opencl.c
+++ b/examples/basic_examples/vector_scal_opencl.c
@@ -57,6 +57,7 @@ void scal_opencl_func(void *buffers[], void *_args)
 
				                 err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
			
 
				                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				                 if (local > global) local=global;
			
 
				+                else global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
			
 
				 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
--- a/examples/filters/custom_mf/conversion_opencl.c
+++ b/examples/filters/custom_mf/conversion_opencl.c
@@ -76,6 +76,8 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 
				 
			
 
				                 if (local > global)
			
 
				 			local = global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(
			
 
				 				queue,
			
--- a/examples/filters/custom_mf/custom_opencl.c
+++ b/examples/filters/custom_mf/custom_opencl.c
@@ -75,6 +75,8 @@ void custom_scal_opencl_func(void *buffers[], void *args)
 
				 
			
 
				                 if (local > global)
			
 
				 			local = global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(
			
 
				 				queue,
			
--- a/examples/interface/complex_kernels_opencl.c
+++ b/examples/interface/complex_kernels_opencl.c
@@ -64,6 +64,8 @@ void copy_complex_codelet_opencl(void *buffers[], void *_args)
 
				 			STARPU_OPENCL_REPORT_ERROR(err);
			
 
				                 if (local > global)
			
 
				 			local=global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
			
 
				 		if (err != CL_SUCCESS)
			
--- a/examples/reductions/dot_product.c
+++ b/examples/reductions/dot_product.c
@@ -185,18 +185,12 @@ void redux_opencl_func(void *buffers[], void *args)
 
				 
			
 
				 	{
			
 
				 		size_t global=1;
			
 
				-		size_t local;
			
 
				+                size_t local=1;
			
 
				                 size_t s;
			
 
				                 cl_device_id device;
			
 
				 
			
 
				                 starpu_opencl_get_device(devid, &device);
			
 
				 
			
 
				-                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
			
 
				-                if (err != CL_SUCCESS)
			
 
				-			STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-                if (local > global)
			
 
				-			local=global;
			
 
				-
			
 
				 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
			
 
				 		if (err != CL_SUCCESS)
			
 
				 			STARPU_OPENCL_REPORT_ERROR(err);
			
@@ -306,18 +300,12 @@ void dot_opencl_func(void *buffers[], void *cl_arg)
 
				 
			
 
				 	{
			
 
				 		size_t global=1;
			
 
				-		size_t local;
			
 
				+                size_t local=1;
			
 
				                 size_t s;
			
 
				                 cl_device_id device;
			
 
				 
			
 
				                 starpu_opencl_get_device(devid, &device);
			
 
				 
			
 
				-                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
			
 
				-                if (err != CL_SUCCESS)
			
 
				-			STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-                if (local > global)
			
 
				-			local=global;
			
 
				-
			
 
				 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
			
 
				 		if (err != CL_SUCCESS)
			
 
				 			STARPU_OPENCL_REPORT_ERROR(err);
			
--- a/examples/reductions/dot_product_opencl_kernels.cl
+++ b/examples/reductions/dot_product_opencl_kernels.cl
@@ -31,6 +31,7 @@ __kernel void _dot_opencl(__global float *x,
 
				 			  __global DOT_TYPE *dot,
			
 
				 			  unsigned n)
			
 
				 {
			
 
				+/* FIXME: real parallel implementation */
			
 
				 	unsigned i;
			
 
				 	__local double tmp;
			
 
				 	tmp = 0.0;
			
--- a/julia/examples/cholesky/cholesky_common.jl
+++ b/julia/examples/cholesky/cholesky_common.jl
@@ -1,3 +1,18 @@
 
				+# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+#
			
 
				+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+#
			
 
				+# StarPU is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# StarPU is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+#
			
 
				 # Standard kernels for the Cholesky factorization
			
 
				 # U22 is the gemm update
			
 
				 # U21 is the trsm update
			
--- a/julia/examples/cholesky/cholesky_native.jl
+++ b/julia/examples/cholesky/cholesky_native.jl
@@ -1,3 +1,18 @@
 
				+# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+#
			
 
				+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+#
			
 
				+# StarPU is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# StarPU is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+#
			
 
				 using LinearAlgebra
			
 
				 
			
 
				 function check(mat::Matrix{Float32})
			
--- a/julia/src/openblas_ldflags.jl
+++ b/julia/src/openblas_ldflags.jl
@@ -1,3 +1,18 @@
 
				+# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+#
			
 
				+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+#
			
 
				+# StarPU is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# StarPU is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+#
			
 
				 import LinearAlgebra.BLAS
			
 
				 import Libdl
			
 
				 
			
--- a/mpi/examples/Makefile.am
+++ b/mpi/examples/Makefile.am
@@ -83,6 +83,10 @@ EXTRA_DIST = 				\
 
				 	matrix_decomposition/mpi_decomposition_params.h	\
			
 
				 	matrix_decomposition/mpi_decomposition_matrix.h	\
			
 
				 	user_datatype/my_interface.h			\
			
 
				+	benchs/abstract_sendrecv_bench.h	\
			
 
				+	benchs/bench_helper.h			\
			
 
				+	benchs/gemm_helper.h			\
			
 
				+	benchs/burst_helper.h			\
			
 
				 	helper.h
			
 
				 
			
 
				 examplebindir = $(libdir)/starpu/mpi
			
@@ -399,3 +403,68 @@ native_fortran/nf_mm_task_build.o: nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.
 
				 native_fortran/nf_basic_ring.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
			
 
				 endif
			
 
				 endif
			
 
				+
			
 
				+
			
 
				+##########
			
 
				+# benchs #
			
 
				+##########
			
 
				+
			
 
				+examplebin_PROGRAMS +=		\
			
 
				+	benchs/sendrecv_bench	\
			
 
				+	benchs/burst
			
 
				+
			
 
				+if !STARPU_USE_MPI_MPI
			
 
				+examplebin_PROGRAMS +=		\
			
 
				+	benchs/sendrecv_parallel_tasks_bench
			
 
				+endif
			
 
				+
			
 
				+if !STARPU_NO_BLAS_LIB
			
 
				+examplebin_PROGRAMS +=		\
			
 
				+	benchs/sendrecv_gemm_bench			\
			
 
				+	benchs/burst_gemm
			
 
				+endif
			
 
				+
			
 
				+if !STARPU_SIMGRID
			
 
				+starpu_mpi_EXAMPLES	+=	\
			
 
				+	benchs/sendrecv_bench	\
			
 
				+	benchs/burst
			
 
				+
			
 
				+if !STARPU_USE_MPI_MPI
			
 
				+starpu_mpi_EXAMPLES	+=	\
			
 
				+	benchs/sendrecv_parallel_tasks_bench
			
 
				+endif
			
 
				+
			
 
				+if !STARPU_NO_BLAS_LIB
			
 
				+starpu_mpi_EXAMPLES	+=	\
			
 
				+	benchs/sendrecv_gemm_bench			\
			
 
				+	benchs/burst_gemm
			
 
				+endif
			
 
				+endif
			
 
				+
			
 
				+benchs_sendrecv_bench_SOURCES = benchs/sendrecv_bench.c
			
 
				+benchs_sendrecv_bench_SOURCES += benchs/bench_helper.c
			
 
				+benchs_sendrecv_bench_SOURCES += benchs/abstract_sendrecv_bench.c
			
 
				+
			
 
				+benchs_sendrecv_parallel_tasks_bench_SOURCES = benchs/sendrecv_parallel_tasks_bench.c
			
 
				+benchs_sendrecv_parallel_tasks_bench_SOURCES += benchs/bench_helper.c
			
 
				+benchs_sendrecv_parallel_tasks_bench_SOURCES += benchs/abstract_sendrecv_bench.c
			
 
				+
			
 
				+benchs_burst_SOURCES = benchs/burst.c
			
 
				+benchs_burst_SOURCES += benchs/burst_helper.c
			
 
				+
			
 
				+if !STARPU_NO_BLAS_LIB
			
 
				+benchs_sendrecv_gemm_bench_SOURCES = benchs/sendrecv_gemm_bench.c
			
 
				+benchs_sendrecv_gemm_bench_SOURCES += benchs/bench_helper.c
			
 
				+benchs_sendrecv_gemm_bench_SOURCES += benchs/gemm_helper.c
			
 
				+benchs_sendrecv_gemm_bench_SOURCES += benchs/abstract_sendrecv_bench.c
			
 
				+benchs_sendrecv_gemm_bench_SOURCES += ../../examples/common/blas.c
			
 
				+
			
 
				+benchs_sendrecv_gemm_bench_LDADD = $(STARPU_BLAS_LDFLAGS)
			
 
				+
			
 
				+benchs_burst_gemm_SOURCES = benchs/burst_gemm.c
			
 
				+benchs_burst_gemm_SOURCES += benchs/gemm_helper.c
			
 
				+benchs_burst_gemm_SOURCES += benchs/burst_helper.c
			
 
				+benchs_burst_gemm_SOURCES += ../../examples/common/blas.c
			
 
				+
			
 
				+benchs_burst_gemm_LDADD = $(STARPU_BLAS_LDFLAGS)
			
 
				+endif
			
--- a/mpi/examples/benchs/abstract_sendrecv_bench.c
+++ b/mpi/examples/benchs/abstract_sendrecv_bench.c
--- a/mpi/examples/benchs/abstract_sendrecv_bench.h
+++ b/mpi/examples/benchs/abstract_sendrecv_bench.h
--- a/mpi/examples/benchs/bench_helper.c
+++ b/mpi/examples/benchs/bench_helper.c
--- a/mpi/examples/benchs/bench_helper.h
+++ b/mpi/examples/benchs/bench_helper.h
--- a/mpi/tests/burst.c
+++ b/mpi/tests/burst.c
@@ -49,13 +49,11 @@ void parse_args(int argc, char **argv)
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				-	int ret, rank, mpi_init, other_rank;
			
 
				+	int ret, rank, other_rank;
			
 
				 
			
 
				 	parse_args(argc, argv);
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				-
			
 
				-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
			
 
				+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
			
 
				 
			
 
				 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
@@ -68,8 +66,6 @@ int main(int argc, char **argv)
 
				 	burst_free_data(rank);
			
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				-	if (!mpi_init)
			
 
				-		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/burst_gemm.c
+++ b/mpi/tests/burst_gemm.c
@@ -90,12 +90,11 @@ void parse_args(int argc, char **argv)
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				-	int ret, mpi_init, worldsize, mpi_rank;
			
 
				+	int ret, worldsize, mpi_rank;
			
 
				 
			
 
				 	parse_args(argc, argv);
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
			
 
				+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
			
 
				 	if (ret == -ENODEV)
			
 
				 		return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
			
@@ -109,8 +108,7 @@ int main(int argc, char **argv)
 
				 			FPRINTF(stderr, "We need 2 processes.\n");
			
 
				 
			
 
				 		starpu_mpi_shutdown();
			
 
				-		if (!mpi_init)
			
 
				-			MPI_Finalize();
			
 
				+
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -203,8 +201,6 @@ enodev:
 
				 	burst_free_data(mpi_rank);
			
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				-	if (!mpi_init)
			
 
				-		MPI_Finalize();
			
 
				 
			
 
				 	return ret;
			
 
				 }
			
--- a/mpi/examples/benchs/burst_helper.c
+++ b/mpi/examples/benchs/burst_helper.c
--- a/mpi/examples/benchs/burst_helper.h
+++ b/mpi/examples/benchs/burst_helper.h
--- a/mpi/examples/benchs/gemm_helper.c
+++ b/mpi/examples/benchs/gemm_helper.c
--- a/mpi/examples/benchs/gemm_helper.h
+++ b/mpi/examples/benchs/gemm_helper.h
--- a/mpi/tests/sendrecv_bench.c
+++ b/mpi/tests/sendrecv_bench.c
@@ -26,7 +26,6 @@
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int ret, rank, worldsize;
			
 
				-	int mpi_init;
			
 
				 	int pause_workers = 0;
			
 
				 
			
 
				 
			
@@ -52,8 +51,7 @@ int main(int argc, char **argv)
 
				 	}
			
 
				 
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
			
 
				+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
			
 
				 
			
 
				 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
@@ -65,8 +63,7 @@ int main(int argc, char **argv)
 
				 			FPRINTF(stderr, "We need 2 processes.\n");
			
 
				 
			
 
				 		starpu_mpi_shutdown();
			
 
				-		if (!mpi_init)
			
 
				-			MPI_Finalize();
			
 
				+
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -85,8 +82,6 @@ int main(int argc, char **argv)
 
				 	}
			
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				-	if (!mpi_init)
			
 
				-		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/sendrecv_gemm_bench.c
+++ b/mpi/tests/sendrecv_gemm_bench.c
@@ -53,7 +53,7 @@ static void* comm_thread_func(void* arg)
 
				 	{
			
 
				 		char hostname[65];
			
 
				 		gethostname(hostname, sizeof(hostname));
			
 
				-		_STARPU_DISP("[%s] No core was available for the comm thread. You should increase STARPU_RESERVE_NCPU or decrease STARPU_NCPU\n", hostname);
			
 
				+		fprintf(stderr, "[%s] No core was available for the comm thread. You should increase STARPU_RESERVE_NCPU or decrease STARPU_NCPU\n", hostname);
			
 
				 	}
			
 
				 
			
 
				 	sendrecv_bench(mpi_rank, &thread_barrier);
			
@@ -118,7 +118,7 @@ void parse_args(int argc, char **argv)
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	double start, end;
			
 
				-	int ret, mpi_init, worldsize;
			
 
				+	int ret, worldsize;
			
 
				 	starpu_pthread_t comm_thread;
			
 
				 
			
 
				 	char hostname[255];
			
@@ -128,8 +128,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_fxt_autostart_profiling(0);
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
			
 
				+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
			
 
				 	if (ret == -ENODEV)
			
 
				 		return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
			
@@ -143,8 +142,7 @@ int main(int argc, char **argv)
 
				 			FPRINTF(stderr, "We need 2 processes.\n");
			
 
				 
			
 
				 		starpu_mpi_shutdown();
			
 
				-		if (!mpi_init)
			
 
				-			MPI_Finalize();
			
 
				+
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -162,7 +160,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	if (mpi_rank == 0)
			
 
				 	{
			
 
				-		PRINTF("# node\tx\ty\tz\tms\tGFlops\n");
			
 
				+		printf("# node\tx\ty\tz\tms\tGFlops\n");
			
 
				 	}
			
 
				 
			
 
				 	starpu_pause();
			
@@ -185,7 +183,7 @@ int main(int argc, char **argv)
 
				 	double timing = end - start;
			
 
				 	double flops = 2.0*((unsigned long long)matrix_dim) * ((unsigned long long)matrix_dim)*((unsigned long long)matrix_dim);
			
 
				 
			
 
				-	PRINTF("%s\t%u\t%u\t%u\t%.0f\t%.1f\n", hostname, matrix_dim, matrix_dim, matrix_dim, timing/1000.0, flops/timing/1000.0);
			
 
				+	printf("%s\t%u\t%u\t%u\t%.0f\t%.1f\n", hostname, matrix_dim, matrix_dim, matrix_dim, timing/1000.0, flops/timing/1000.0);
			
 
				 
			
 
				 
			
 
				 enodev:
			
@@ -200,8 +198,6 @@ enodev:
 
				 
			
 
				 	starpu_resume();
			
 
				 	starpu_mpi_shutdown();
			
 
				-	if (!mpi_init)
			
 
				-		MPI_Finalize();
			
 
				 
			
 
				 	return ret;
			
 
				 }
			
--- a/mpi/tests/sendrecv_parallel_tasks_bench.c
+++ b/mpi/tests/sendrecv_parallel_tasks_bench.c
@@ -134,10 +134,8 @@ static struct starpu_codelet cl =
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int ret, rank, worldsize;
			
 
				-	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
			
 
				+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
			
 
				 
			
 
				 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
@@ -149,8 +147,7 @@ int main(int argc, char **argv)
 
				 			FPRINTF(stderr, "We need 2 processes.\n");
			
 
				 
			
 
				 		starpu_mpi_shutdown();
			
 
				-		if (!mpi_init)
			
 
				-			MPI_Finalize();
			
 
				+
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -162,8 +159,7 @@ int main(int argc, char **argv)
 
				 	else if (rank >= 2)
			
 
				 	{
			
 
				 		starpu_mpi_shutdown();
			
 
				-		if (!mpi_init)
			
 
				-			MPI_Finalize();
			
 
				+
			
 
				 		return 0;
			
 
				 	}
			
 
				 
			
@@ -222,8 +218,6 @@ int main(int argc, char **argv)
 
				 	free(mpi_tags);
			
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				-	if (!mpi_init)
			
 
				-		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/Makefile.am
+++ b/mpi/tests/Makefile.am
@@ -62,11 +62,7 @@ BUILT_SOURCES =
 
				 CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log
			
 
				 
			
 
				 EXTRA_DIST = 				\
			
 
				-	abstract_sendrecv_bench.h	\
			
 
				-	bench_helper.h			\
			
 
				 	helper.h			\
			
 
				-	gemm_helper.h			\
			
 
				-	burst_helper.h			\
			
 
				 	user_defined_datatype_value.h
			
 
				 
			
 
				 examplebindir = $(libdir)/starpu/examples/mpi
			
@@ -142,21 +138,8 @@ starpu_mpi_TESTS +=				\
 
				 	temporary				\
			
 
				 	user_defined_datatype			\
			
 
				 	early_stuff				\
			
 
				-	sendrecv_bench				\
			
 
				-	burst						\
			
 
				 	display_bindings
			
 
				 
			
 
				-if !STARPU_USE_MPI_MPI
			
 
				-starpu_mpi_TESTS +=				\
			
 
				-	sendrecv_parallel_tasks_bench
			
 
				-endif
			
 
				-
			
 
				-if !STARPU_NO_BLAS_LIB
			
 
				-starpu_mpi_TESTS +=				\
			
 
				-	sendrecv_gemm_bench			\
			
 
				-	burst_gemm
			
 
				-endif
			
 
				-
			
 
				 if !STARPU_SIMGRID
			
 
				 # missing support in simgrid
			
 
				 starpu_mpi_TESTS +=				\
			
@@ -244,18 +227,9 @@ noinst_PROGRAMS +=				\
 
				 	starpu_redefine				\
			
 
				 	load_balancer				\
			
 
				 	driver					\
			
 
				-	sendrecv_bench				\
			
 
				-	sendrecv_parallel_tasks_bench		\
			
 
				-	burst					\
			
 
				 	nothing							\
			
 
				 	display_bindings
			
 
				 
			
 
				-if !STARPU_NO_BLAS_LIB
			
 
				-noinst_PROGRAMS +=				\
			
 
				-	sendrecv_gemm_bench			\
			
 
				-	burst_gemm
			
 
				-endif
			
 
				-
			
 
				 XFAIL_TESTS=					\
			
 
				 	policy_register_toomany			\
			
 
				 	policy_unregister			\
			
@@ -285,31 +259,3 @@ mpi_earlyrecv2_SOURCES = mpi_earlyrecv2.c
 
				 mpi_earlyrecv2_SOURCES += ../../examples/interface/complex_interface.c
			
 
				 mpi_earlyrecv2_sync_SOURCES = mpi_earlyrecv2_sync.c
			
 
				 mpi_earlyrecv2_sync_SOURCES += ../../examples/interface/complex_interface.c
			
 
				-
			
 
				-sendrecv_bench_SOURCES = sendrecv_bench.c
			
 
				-sendrecv_bench_SOURCES += bench_helper.c
			
 
				-sendrecv_bench_SOURCES += abstract_sendrecv_bench.c
			
 
				-
			
 
				-sendrecv_parallel_tasks_bench_SOURCES = sendrecv_parallel_tasks_bench.c
			
 
				-sendrecv_parallel_tasks_bench_SOURCES += bench_helper.c
			
 
				-sendrecv_parallel_tasks_bench_SOURCES += abstract_sendrecv_bench.c
			
 
				-
			
 
				-burst_SOURCES = burst.c
			
 
				-burst_SOURCES += burst_helper.c
			
 
				-
			
 
				-if !STARPU_NO_BLAS_LIB
			
 
				-sendrecv_gemm_bench_SOURCES = sendrecv_gemm_bench.c
			
 
				-sendrecv_gemm_bench_SOURCES += bench_helper.c
			
 
				-sendrecv_gemm_bench_SOURCES += gemm_helper.c
			
 
				-sendrecv_gemm_bench_SOURCES += abstract_sendrecv_bench.c
			
 
				-sendrecv_gemm_bench_SOURCES += ../../examples/common/blas.c
			
 
				-
			
 
				-sendrecv_gemm_bench_LDADD = $(STARPU_BLAS_LDFLAGS)
			
 
				-
			
 
				-burst_gemm_SOURCES = burst_gemm.c
			
 
				-burst_gemm_SOURCES += gemm_helper.c
			
 
				-burst_gemm_SOURCES += burst_helper.c
			
 
				-burst_gemm_SOURCES += ../../examples/common/blas.c
			
 
				-
			
 
				-burst_gemm_LDADD = $(STARPU_BLAS_LDFLAGS)
			
 
				-endif
			
--- a/src/common/utils.c
+++ b/src/common/utils.c
@@ -743,7 +743,7 @@ int starpu_get_env_size_default(const char *str, int defval)
 
				 
			
 
				 void starpu_display_bindings(void)
			
 
				 {
			
 
				-#ifdef STARPU_HAVE_HWLOC
			
 
				+#if defined(STARPU_HAVE_HWLOC) && !defined(STARPU_SIMGRID)
			
 
				 	int hwloc_ret = system("hwloc-ps -a -t -c");
			
 
				 	if (hwloc_ret)
			
 
				 	{
			
--- a/src/profiling/profiling.c
+++ b/src/profiling/profiling.c
@@ -46,6 +46,7 @@ static struct timespec executing_start_date[STARPU_NMAXWORKERS];
 
				 #ifdef STARPU_PAPI
			
 
				 static int papi_events[PAPI_MAX_HWCTRS];
			
 
				 static int papi_nevents = 0;
			
 
				+static int warned_component_unavailable = 0;
			
 
				 #endif
			
 
				 
			
 
				 /* Store the busid of the different (src, dst) pairs. busid_matrix[src][dst]
			
@@ -160,7 +161,7 @@ void _starpu_profiling_init(void)
 
				 		conf_papi_events = starpu_getenv("STARPU_PROF_PAPI_EVENTS");
			
 
				 		if (conf_papi_events != NULL)
			
 
				 		{
			
 
				-			while ((papi_event_name = strtok_r(conf_papi_events, " ", &conf_papi_events)))
			
 
				+			while ((papi_event_name = strtok_r(conf_papi_events, " ,", &conf_papi_events)))
			
 
				 			{
			
 
				 				_STARPU_DEBUG("Loading PAPI Event:%s\n", papi_event_name);
			
 
				 				retval = PAPI_event_name_to_code ((char*)papi_event_name, &papi_events[papi_nevents]);
			
@@ -188,7 +189,12 @@ void _starpu_profiling_papi_task_start_counters(struct starpu_task *task)
 
				 		PAPI_create_eventset(&profiling_info->papi_event_set);
			
 
				 		for(int i=0; i<papi_nevents; i++)
			
 
				 		{
			
 
				-			PAPI_add_event(profiling_info->papi_event_set, papi_events[i]);
			
 
				+			int ret = PAPI_add_event(profiling_info->papi_event_set, papi_events[i]);
			
 
				+			if (ret == PAPI_ECMP_DISABLED && !warned_component_unavailable)
			
 
				+			{
			
 
				+				_STARPU_MSG("Error while registering Papi event: Component containing event is disabled. Try running `papi_component_avail` to get more information.\n");
			
 
				+				warned_component_unavailable = 1;
			
 
				+			}
			
 
				 			profiling_info->papi_values[i]=0;
			
 
				 		}
			
 
				 		PAPI_reset(profiling_info->papi_event_set);
			
--- a/src/sched_policies/component_heft.c
+++ b/src/sched_policies/component_heft.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				  * Copyright (C) 2013       Simon Archipoff
			
 
				+ * Copyright (C) 2020       Télécom-Sud Paris
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/src/sched_policies/component_heteroprio.c
+++ b/src/sched_policies/component_heteroprio.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				  * Copyright (C) 2013       Simon Archipoff
			
 
				+ * Copyright (C) 2020       Télécom-Sud Paris
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/src/sched_policies/component_mct.c
+++ b/src/sched_policies/component_mct.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				  * Copyright (C) 2013       Simon Archipoff
			
 
				+ * Copyright (C) 2020       Télécom-Sud Paris
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/src/sched_policies/deque_modeling_policy_data_aware.c
+++ b/src/sched_policies/deque_modeling_policy_data_aware.c
@@ -6,6 +6,7 @@
 
				  * Copyright (C) 2013       Simon Archipoff
			
 
				  * Copyright (C) 2013       Thibaut Lambert
			
 
				  * Copyright (C) 2016       Uppsala University
			
 
				+ * Copyright (C) 2020       Télécom-Sud Paris
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -1014,6 +1015,10 @@ static void initialize_dmda_policy(unsigned sched_ctx_id)
 
				 	dt->alpha = starpu_get_env_float_default("STARPU_SCHED_ALPHA", _STARPU_SCHED_ALPHA_DEFAULT);
			
 
				 	dt->beta = starpu_get_env_float_default("STARPU_SCHED_BETA", _STARPU_SCHED_BETA_DEFAULT);
			
 
				 	/* data->_gamma: cost of one Joule in us. If gamma is set to 10^6, then one Joule cost 1s */
			
 
				+#ifdef STARPU_NON_BLOCKING_DRIVERS
			
 
				+	if (starpu_getenv("STARPU_SCHED_GAMMA"))
			
 
				+		_STARPU_DISP("Warning: STARPU_SCHED_GAMMA was used, but --enable-blocking-drivers configuration was not set, CPU cores will not actually be sleeping\n");
			
 
				+#endif
			
 
				 	dt->_gamma = starpu_get_env_float_default("STARPU_SCHED_GAMMA", _STARPU_SCHED_GAMMA_DEFAULT);
			
 
				 	/* data->idle_power: Idle power of the whole machine in Watt */
			
 
				 	dt->idle_power = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0);
			
--- a/src/sched_policies/helper_mct.c
+++ b/src/sched_policies/helper_mct.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				  * Copyright (C) 2013       Simon Archipoff
			
 
				+ * Copyright (C) 2020       Télécom-Sud Paris
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -45,6 +46,10 @@ struct _starpu_mct_data *starpu_mct_init_parameters(struct starpu_sched_componen
 
				 	{
			
 
				 		data->alpha = starpu_get_env_float_default("STARPU_SCHED_ALPHA", _STARPU_SCHED_ALPHA_DEFAULT);
			
 
				 		data->beta = starpu_get_env_float_default("STARPU_SCHED_BETA", _STARPU_SCHED_BETA_DEFAULT);
			
 
				+#ifdef STARPU_NON_BLOCKING_DRIVERS
			
 
				+		if (starpu_getenv("STARPU_SCHED_GAMMA"))
			
 
				+			_STARPU_DISP("Warning: STARPU_SCHED_GAMMA was used, but --enable-blocking-drivers configuration was not set, CPU cores will not actually be sleeping\n");
			
 
				+#endif
			
 
				 		data->_gamma = starpu_get_env_float_default("STARPU_SCHED_GAMMA", _STARPU_SCHED_GAMMA_DEFAULT);
			
 
				 		data->idle_power = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0);
			
 
				 	}
			
--- a/src/sched_policies/helper_mct.h
+++ b/src/sched_policies/helper_mct.h
@@ -1,6 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ * Copyright (C) 2020       Télécom-Sud Paris
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/src/sched_policies/parallel_heft.c
+++ b/src/sched_policies/parallel_heft.c
@@ -567,6 +567,10 @@ static void initialize_parallel_heft_policy(unsigned sched_ctx_id)
 
				 
			
 
				 	hd->alpha = starpu_get_env_float_default("STARPU_SCHED_ALPHA", _STARPU_SCHED_ALPHA_DEFAULT);
			
 
				 	hd->beta = starpu_get_env_float_default("STARPU_SCHED_BETA", _STARPU_SCHED_BETA_DEFAULT);
			
 
				+#ifdef STARPU_NON_BLOCKING_DRIVERS
			
 
				+	if (starpu_getenv("STARPU_SCHED_GAMMA"))
			
 
				+		_STARPU_DISP("Warning: STARPU_SCHED_GAMMA was used, but --enable-blocking-drivers configuration was not set, CPU cores will not actually be sleeping\n");
			
 
				+#endif
			
 
				 	hd->_gamma = starpu_get_env_float_default("STARPU_SCHED_GAMMA", _STARPU_SCHED_GAMMA_DEFAULT);
			
 
				 	hd->idle_power = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0);
			
 
				 
			
--- a/starpufft/src/starpufftx3d.c
+++ b/starpufft/src/starpufftx3d.c
@@ -164,7 +164,6 @@ static struct starpu_task *
 
				 STARPUFFT(start3dC2C)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
			
 
				 {
			
 
				 	STARPU_ASSERT(plan->type == C2C);
			
 
				-	int z;
			
 
				 	int ret;
			
 
				 
			
 
				 if (PARALLEL) {
			
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -61,7 +61,6 @@ EXTRA_DIST =					\
 
				 	datawizard/interfaces/block/block_opencl_kernel.cl \
			
 
				 	datawizard/interfaces/tensor/tensor_opencl_kernel.cl \
			
 
				 	perfmodels/opencl_memset_kernel.cl \
			
 
				-	perfmodels/opencl_memset_kernel_01.cl \
			
 
				 	$(MICROBENCHS:=.sh) \
			
 
				 	microbenchs/microbench.sh \
			
 
				 	model-checking/platform.xml \
			
@@ -359,7 +358,8 @@ myPROGRAMS +=				\
 
				 	perfmodels/regression_based		\
			
 
				 	perfmodels/regression_based_01		\
			
 
				 	perfmodels/regression_based_02		\
			
 
				-	perfmodels/regression_based_03		\	
			
 
				+	perfmodels/regression_based_03		\
			
 
				+	perfmodels/regression_based_04		\
			
 
				 	perfmodels/non_linear_regression_based	\
			
 
				 	perfmodels/feed				\
			
 
				 	perfmodels/user_base			\
			
@@ -1011,6 +1011,9 @@ perfmodels_regression_based_02_SOURCES=\
 
				 perfmodels_regression_based_03_SOURCES=\
			
 
				 	perfmodels/regression_based_03.c
			
 
				 
			
 
				+perfmodels_regression_based_04_SOURCES=\
			
 
				+	perfmodels/regression_based_04.c
			
 
				+
			
 
				 perfmodels_max_fpga_SOURCES=\
			
 
				 	perfmodels/max_fpga.c
			
 
				 perfmodels_max_fpga_LDADD = $(LDADD) \
			
@@ -1020,6 +1023,9 @@ if STARPU_USE_OPENCL
 
				 perfmodels_regression_based_SOURCES+=\
			
 
				 	perfmodels/opencl_memset.c
			
 
				 
			
 
				+perfmodels_regression_based_04_SOURCES+=\
			
 
				+	perfmodels/opencl_memset.c
			
 
				+
			
 
				 nobase_STARPU_OPENCL_DATA_DATA += \
			
 
				 	perfmodels/opencl_memset_kernel.cl
			
 
				 endif
			
--- a/tests/datawizard/interfaces/bcsr/bcsr_opencl.c
+++ b/tests/datawizard/interfaces/bcsr/bcsr_opencl.c
@@ -95,6 +95,8 @@ test_bcsr_opencl_func(void *buffers[], void *args)
 
				 
			
 
				                 if (local > global)
			
 
				 			local = global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue,
			
 
				 					kernel,
			
--- a/tests/datawizard/interfaces/coo/coo_opencl.c
+++ b/tests/datawizard/interfaces/coo/coo_opencl.c
@@ -93,6 +93,8 @@ test_coo_opencl_func(void *buffers[], void *args)
 
				 
			
 
				                 if (local > global)
			
 
				 			local = global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue,
			
 
				 					kernel,
			
--- a/tests/datawizard/interfaces/csr/csr_opencl.c
+++ b/tests/datawizard/interfaces/csr/csr_opencl.c
@@ -93,6 +93,8 @@ test_csr_opencl_func(void *buffers[], void *args)
 
				 
			
 
				                 if (local > global)
			
 
				 			local = global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue,
			
 
				 					kernel,
			
--- a/tests/datawizard/interfaces/matrix/matrix_opencl.c
+++ b/tests/datawizard/interfaces/matrix/matrix_opencl.c
@@ -92,6 +92,8 @@ void test_matrix_opencl_func(void *buffers[], void *args)
 
				 
			
 
				                 if (local > global)
			
 
				 			local = global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue,
			
 
				 					kernel,
			
--- a/tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets_opencl.c
+++ b/tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets_opencl.c
@@ -84,6 +84,8 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 
				 
			
 
				                 if (local > global)
			
 
				 			local = global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue,
			
 
				 					kernel,
			
--- a/tests/datawizard/interfaces/multiformat/multiformat_opencl.c
+++ b/tests/datawizard/interfaces/multiformat/multiformat_opencl.c
@@ -98,6 +98,8 @@ void test_multiformat_opencl_func(void *buffers[], void *args)
 
				 
			
 
				                 if (local > global)
			
 
				 			local = global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue,
			
 
				 					kernel,
			
--- a/tests/datawizard/interfaces/tensor/tensor_opencl.c
+++ b/tests/datawizard/interfaces/tensor/tensor_opencl.c
@@ -87,7 +87,7 @@ test_tensor_opencl_func(void *buffers[], void *args)
 
				 	}
			
 
				 			
			
 
				 	{
			
 
				-		size_t global = nx * ny * nz * nt;
			
 
				+                size_t global = 1;
			
 
				 		err = clEnqueueNDRangeKernel(queue,
			
 
				 					     kernel,
			
 
				 					     1,
			
--- a/tests/datawizard/interfaces/variable/variable_opencl.c
+++ b/tests/datawizard/interfaces/variable/variable_opencl.c
@@ -73,24 +73,12 @@ void test_variable_opencl_func(void *buffers[], void *args)
 
				 
			
 
				 	{
			
 
				 		size_t global = 1;
			
 
				-		size_t local;
			
 
				+                size_t local = 1;
			
 
				                 size_t s;
			
 
				                 cl_device_id device;
			
 
				 
			
 
				                 starpu_opencl_get_device(devid, &device);
			
 
				 
			
 
				-                err = clGetKernelWorkGroupInfo (kernel,
			
 
				-						device,
			
 
				-						CL_KERNEL_WORK_GROUP_SIZE,
			
 
				-						sizeof(local),
			
 
				-						&local,
			
 
				-						&s);
			
 
				-                if (err != CL_SUCCESS)
			
 
				-			STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-
			
 
				-                if (local > global)
			
 
				-			local = global;
			
 
				-
			
 
				 		err = clEnqueueNDRangeKernel(queue,
			
 
				 					kernel,
			
 
				 					1,
			
--- a/tests/datawizard/interfaces/vector/vector_opencl.c
+++ b/tests/datawizard/interfaces/vector/vector_opencl.c
@@ -91,6 +91,8 @@ test_vector_opencl_func(void *buffers[], void *args)
 
				 
			
 
				                 if (local > global)
			
 
				 			local = global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue,
			
 
				 					kernel,
			
--- a/tests/datawizard/scal.c
+++ b/tests/datawizard/scal.c
@@ -73,6 +73,7 @@ void scal_func_opencl(void *buffers[], void *cl_arg)
 
				                 err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
			
 
				                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				                 if (local > global) local=global;
			
 
				+                else global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
			
 
				 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
--- a/tests/datawizard/scratch_opencl.c
+++ b/tests/datawizard/scratch_opencl.c
@@ -73,6 +73,8 @@ void opencl_f(void *buffers[], void *args)
 
				 			STARPU_OPENCL_REPORT_ERROR(err);
			
 
				                 if (local > global)
			
 
				 			local=global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
			
 
				 		if (err != CL_SUCCESS)
			
--- a/tests/main/display_binding.c
+++ b/tests/main/display_binding.c
@@ -38,6 +38,8 @@ int main(void)
 
				 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				 	return EXIT_SUCCESS;
			
 
				 }
			
 
				 #endif
			
--- a/tests/perfmodels/opencl_memset.c
+++ b/tests/perfmodels/opencl_memset.c
@@ -22,7 +22,7 @@
 
				 
			
 
				 extern struct starpu_opencl_program opencl_program;
			
 
				 
			
 
				-void memset_opencl(void *buffers[], void *args)
			
 
				+void _memset_opencl(void *buffers[], void *args, const char *name)
			
 
				 {
			
 
				 	(void) args;
			
 
				 	int id, devid;
			
@@ -36,7 +36,7 @@ void memset_opencl(void *buffers[], void *args)
 
				 	id = starpu_worker_get_id_check();
			
 
				 	devid = starpu_worker_get_devid(id);
			
 
				 
			
 
				-	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_memset_opencl", devid);
			
 
				+	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, name, devid);
			
 
				 	if (err != CL_SUCCESS)
			
 
				 		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
@@ -58,6 +58,8 @@ void memset_opencl(void *buffers[], void *args)
 
				 			STARPU_OPENCL_REPORT_ERROR(err);
			
 
				                 if (local > global)
			
 
				 			local=global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
			
 
				 		if (err != CL_SUCCESS)
			
@@ -65,3 +67,13 @@ void memset_opencl(void *buffers[], void *args)
 
				 	}
			
 
				 	starpu_opencl_release_kernel(kernel);
			
 
				 }
			
 
				+
			
 
				+void memset_opencl(void *buffers[], void *args, const char *kernel)
			
 
				+{
			
 
				+	_memset_opencl(buffers, args, "_memset_opencl");
			
 
				+}
			
 
				+
			
 
				+void memset0_opencl(void *buffers[], void *args, const char *kernel)
			
 
				+{
			
 
				+	_memset_opencl(buffers, args, "_memset0_opencl");
			
 
				+}
			
--- a/tests/perfmodels/opencl_memset_kernel.cl
+++ b/tests/perfmodels/opencl_memset_kernel.cl
@@ -14,6 +14,13 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				+ __kernel void _memset0_opencl(__global int *val, int nx)
			
 
				+{
			
 
				+        const int i = get_global_id(0);
			
 
				+        if (i < nx)
			
 
				+                val[0] += i;
			
 
				+}
			
 
				+
			
 
				 __kernel void _memset_opencl(__global int *val, int nx)
			
 
				 {
			
 
				         const int i = get_global_id(0);
			
--- a/tests/perfmodels/opencl_memset_kernel_01.cl
+++ b/tests/perfmodels/opencl_memset_kernel_01.cl
@@ -1,31 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2012-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				- __kernel void _memset0_opencl(__global int *val, int nx)
			
 
				-{
			
 
				-        const int i = get_global_id(0);
			
 
				-        if (i < nx)
			
 
				-                val[0] += i;
			
 
				-}
			
 
				-
			
 
				-__kernel void _memset_opencl(__global int *val, int nx)
			
 
				-{
			
 
				-        const int i = get_global_id(0);
			
 
				-        if (i < nx)
			
 
				-                val[i] = 42;
			
 
				-}
			
 
				-
			
 
				-
			
--- a/tests/perfmodels/regression_based.c
+++ b/tests/perfmodels/regression_based.c
@@ -23,6 +23,7 @@
 
				  * Benchmark memset with a linear regression
			
 
				  */
			
 
				 
			
 
				+#define STARTlin 1024
			
 
				 #define START 1024
			
 
				 #ifdef STARPU_QUICK_CHECK
			
 
				 #define END 1048576
			
@@ -184,11 +185,14 @@ int main(int argc, char **argv)
 
				 #endif
			
 
				 
			
 
				 	int size;
			
 
				-	for (size = START; size < END; size *= 2)
			
 
				+	for (size = STARTlin; size < END; size *= 2)
			
 
				 	{
			
 
				 		/* Use a linear regression */
			
 
				 		test_memset(size, &memset_cl);
			
 
				+	}
			
 
				 
			
 
				+	for (size = START; size < END; size *= 2)
			
 
				+	{
			
 
				 		/* Use a non-linear regression */
			
 
				 		test_memset(size, &nl_memset_cl);
			
 
				 	}
			
--- a/tests/perfmodels/regression_based_01.c
+++ b/tests/perfmodels/regression_based_01.c
@@ -1,6 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ * Copyright (C) 2011-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ * Copyright (C) 2011       Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -14,23 +15,20 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				-/*
			
 
				- * Dans ce benchmark:
			
 
				- - calibrer le modèle linéaire seulement pour des grandes tailles STARTlin 1048576
			
 
				- - séparer la boucle test_memset en deux boucles:
			
 
				-        *linéaire: démarrer à partir de 1 048 576
			
 
				-        *non linéaire: conserver le démarrage à 1024
			
 
				- */
			
 
				-
			
 
				 #include <starpu.h>
			
 
				 #include <assert.h>
			
 
				 #include <starpu_scheduler.h>
			
 
				 #include <unistd.h>
			
 
				 #include "../helper.h"
			
 
				 
			
 
				-#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
			
 
				+/*
			
 
				+ * - Calibrate the linear model only for large sizes: STARTlin 131072
			
 
				+ * - Separate the test_memset loop in two loops:
			
 
				+ *   - linear: start from 131072 (STARTlin)
			
 
				+ *   - non-linear: keep start at 1024
			
 
				+ */
			
 
				 
			
 
				-#define STARTlin 1048576
			
 
				+#define STARTlin 131072
			
 
				 #define START 1024
			
 
				 #ifdef STARPU_QUICK_CHECK
			
 
				 #define END 1048576
			
@@ -38,168 +36,159 @@
 
				 #define END 16777216
			
 
				 #endif
			
 
				 
			
 
				-int ret;
			
 
				-
			
 
				 
			
 
				 void memset_cpu(void *descr[], void *arg)
			
 
				 {
			
 
				-    (void)arg;
			
 
				-    STARPU_SKIP_IF_VALGRIND;
			
 
				+	(void)arg;
			
 
				+	STARPU_SKIP_IF_VALGRIND;
			
 
				 
			
 
				-    int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				-    unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				+	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				+	unsigned i;
			
 
				 
			
 
				-    usleep(1000);
			
 
				-    int i;
			
 
				+	usleep(1000);
			
 
				 
			
 
				-    for (i=0; i<n ; i++)
			
 
				-    {
			
 
				+	for (i=0; i<n ; i++)
			
 
				+	{
			
 
				 
			
 
				-        ptr[0] += i;
			
 
				-    }
			
 
				+		ptr[0] += i;
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				-
			
 
				 static struct starpu_perfmodel model =
			
 
				 {
			
 
				-    .type = STARPU_REGRESSION_BASED,
			
 
				-    .symbol = "memset_regression_based"
			
 
				+	.type = STARPU_REGRESSION_BASED,
			
 
				+	.symbol = "memset_regression_based"
			
 
				 };
			
 
				 
			
 
				 static struct starpu_perfmodel nl_model =
			
 
				 {
			
 
				-    .type = STARPU_NL_REGRESSION_BASED,
			
 
				-    .symbol = "non_linear_memset_regression_based"
			
 
				+	.type = STARPU_NL_REGRESSION_BASED,
			
 
				+	.symbol = "non_linear_memset_regression_based"
			
 
				 };
			
 
				 
			
 
				 static struct starpu_codelet memset_cl =
			
 
				 {
			
 
				-    .cpu_funcs = {memset_cpu},
			
 
				-    .cpu_funcs_name = {"memset_cpu"},
			
 
				-    .model = &model,
			
 
				-    .nbuffers = 1,
			
 
				-    .modes = {STARPU_W}
			
 
				+	.cpu_funcs = {memset_cpu},
			
 
				+	.cpu_funcs_name = {"memset_cpu"},
			
 
				+	.model = &model,
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = {STARPU_W}
			
 
				 };
			
 
				 
			
 
				 static struct starpu_codelet nl_memset_cl =
			
 
				 {
			
 
				-    .cpu_funcs = {memset_cpu},
			
 
				-    .cpu_funcs_name = {"memset_cpu"},
			
 
				-    .model = &nl_model,
			
 
				-    .nbuffers = 1,
			
 
				-    .modes = {STARPU_W}
			
 
				+	.cpu_funcs = {memset_cpu},
			
 
				+	.cpu_funcs_name = {"memset_cpu"},
			
 
				+	.model = &nl_model,
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = {STARPU_W}
			
 
				 };
			
 
				 
			
 
				-
			
 
				 static void test_memset(int nelems, struct starpu_codelet *codelet)
			
 
				 {
			
 
				-    int nloops = 100;
			
 
				-    int loop;
			
 
				-    starpu_data_handle_t handle;
			
 
				+	int nloops = 100;
			
 
				+	int loop;
			
 
				+	starpu_data_handle_t handle;
			
 
				 
			
 
				-    starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, nelems, sizeof(int));
			
 
				-    for (loop = 0; loop < nloops; loop++)
			
 
				-    {
			
 
				-        struct starpu_task *task = starpu_task_create();
			
 
				+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, nelems, sizeof(int));
			
 
				+	for (loop = 0; loop < nloops; loop++)
			
 
				+	{
			
 
				+		struct starpu_task *task = starpu_task_create();
			
 
				 
			
 
				-        task->cl = codelet;
			
 
				-        task->handles[0] = handle;
			
 
				+		task->cl = codelet;
			
 
				+		task->handles[0] = handle;
			
 
				 
			
 
				-        int ret = starpu_task_submit(task);
			
 
				-        if (ret == -ENODEV)
			
 
				-            exit(STARPU_TEST_SKIPPED);
			
 
				-        STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				-    }
			
 
				+		int ret = starpu_task_submit(task);
			
 
				+		if (ret == -ENODEV)
			
 
				+			exit(STARPU_TEST_SKIPPED);
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+	}
			
 
				 
			
 
				-    starpu_data_unregister(handle);
			
 
				+	starpu_data_unregister(handle);
			
 
				 }
			
 
				 
			
 
				-
			
 
				-static void compare_performance(int size, struct starpu_codelet *codelet, struct starpu_task *task)
			
 
				+static void compare_performance(int size, struct starpu_codelet *codelet, struct starpu_task *compar_task)
			
 
				 {
			
 
				-    unsigned i;
			
 
				-    int niter = 100;
			
 
				-    starpu_data_handle_t handle;
			
 
				-
			
 
				-    starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
			
 
				+	unsigned i;
			
 
				+	unsigned niter = 100;
			
 
				+	starpu_data_handle_t handle;
			
 
				 
			
 
				-    struct starpu_task **tasks = (struct starpu_task **) malloc(niter*sizeof(struct starpu_task *));
			
 
				-    assert(tasks);
			
 
				+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
			
 
				 
			
 
				-    for (i = 0; i < niter; i++)
			
 
				-    {
			
 
				-        struct starpu_task *task = starpu_task_create();
			
 
				+	struct starpu_task **tasks = (struct starpu_task **) malloc(niter*sizeof(struct starpu_task *));
			
 
				+	assert(tasks);
			
 
				 
			
 
				-        task->cl = codelet;
			
 
				-        task->handles[0] = handle;
			
 
				+	for (i = 0; i < niter; i++)
			
 
				+	{
			
 
				+		struct starpu_task *task = starpu_task_create();
			
 
				 
			
 
				-        /* create a synchronous task: any call to starpu_task_submit will block
			
 
				-         * until it is terminated */
			
 
				+		task->cl = codelet;
			
 
				+		task->handles[0] = handle;
			
 
				 
			
 
				-        task->synchronous = 1;
			
 
				+		task->synchronous = 1;
			
 
				 
			
 
				-        /* We will destroy the task structure by hand so that we can
			
 
				-         * query the profiling info before the task is destroyed. */
			
 
				-        task->destroy = 0;
			
 
				+		/* We will destroy the task structure by hand so that we can
			
 
				+		 * query the profiling info before the task is destroyed. */
			
 
				+		task->destroy = 0;
			
 
				 
			
 
				-        tasks[i] = task;
			
 
				+		tasks[i] = task;
			
 
				 
			
 
				-        ret = starpu_task_submit(task);
			
 
				+		int ret = starpu_task_submit(task);
			
 
				 
			
 
				-        if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				-        {
			
 
				-            FPRINTF(stderr, "No worker may execute this task\n");
			
 
				-            exit(0);
			
 
				-        }
			
 
				-    }
			
 
				+		if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				+		{
			
 
				+			FPRINTF(stderr, "No worker may execute this task\n");
			
 
				+			exit(0);
			
 
				+		}
			
 
				+	}
			
 
				 
			
 
				-    starpu_data_unregister(handle);
			
 
				+	starpu_data_unregister(handle);
			
 
				 
			
 
				-    starpu_task_wait_for_all();
			
 
				+	starpu_task_wait_for_all();
			
 
				 
			
 
				-    double length_sum = 0.0;
			
 
				+	double length_sum = 0.0;
			
 
				 
			
 
				-    for (i = 0; i < niter; i++)
			
 
				-    {
			
 
				-        struct starpu_task *task = tasks[i];
			
 
				-        struct starpu_profiling_task_info *info = task->profiling_info;
			
 
				+	for (i = 0; i < niter; i++)
			
 
				+	{
			
 
				+		struct starpu_task *task = tasks[i];
			
 
				+		struct starpu_profiling_task_info *info = task->profiling_info;
			
 
				 
			
 
				 
			
 
				-        /* How long was the task execution ? */
			
 
				-        length_sum += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
			
 
				+		/* How long was the task execution ? */
			
 
				+		length_sum += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
			
 
				 
			
 
				-        /* We don't need the task structure anymore */
			
 
				-        starpu_task_destroy(task);
			
 
				-    }
			
 
				+		/* We don't need the task structure anymore */
			
 
				+		starpu_task_destroy(task);
			
 
				+	}
			
 
				 
			
 
				 
			
 
				-    /* Display the occupancy of all workers during the test */
			
 
				-    unsigned worker;
			
 
				-    for (worker = 0; worker < starpu_worker_get_count(); worker++)
			
 
				-    {
			
 
				-        struct starpu_profiling_worker_info worker_info;
			
 
				-        ret = starpu_profiling_worker_get_info(worker, &worker_info);
			
 
				-        STARPU_ASSERT(!ret);
			
 
				+	/* Display the occupancy of all workers during the test */
			
 
				+	unsigned worker;
			
 
				+	for (worker = 0; worker < starpu_worker_get_count(); worker++)
			
 
				+	{
			
 
				+		struct starpu_profiling_worker_info worker_info;
			
 
				+		int ret = starpu_profiling_worker_get_info(worker, &worker_info);
			
 
				+		STARPU_ASSERT(!ret);
			
 
				 
			
 
				-        char workername[128];
			
 
				-        starpu_worker_get_name(worker, workername, sizeof(workername));
			
 
				-        unsigned nimpl;
			
 
				+		char workername[128];
			
 
				+		starpu_worker_get_name(worker, workername, sizeof(workername));
			
 
				+		unsigned nimpl;
			
 
				 
			
 
				 
			
 
				-        if (starpu_worker_get_type(worker)==STARPU_CPU_WORKER)
			
 
				-        {
			
 
				-            FPRINTF(stdout, "\n Worker :%s ::::::::::\n\n", workername);
			
 
				+		if (starpu_worker_get_type(worker)==STARPU_CPU_WORKER)
			
 
				+		{
			
 
				+			FPRINTF(stdout, "\n Worker :%s ::::::::::\n\n", workername);
			
 
				 
			
 
				-            for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
			
 
				-            {
			
 
				+			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
			
 
				+			{
			
 
				 
			
 
				-                FPRINTF(stdout, "Expected time for %d on %s (impl %u): %f, Measured time: %f\n",
			
 
				-                        size, workername, nimpl,starpu_task_expected_length(task, starpu_worker_get_perf_archtype(worker, task->sched_ctx), nimpl), ((length_sum)/niter));
			
 
				+				FPRINTF(stdout, "Expected time for %d on %s (impl %u): %f, Measured time: %f\n",
			
 
				+						size, workername, nimpl,starpu_task_expected_length(compar_task, starpu_worker_get_perf_archtype(worker, compar_task->sched_ctx), nimpl), ((length_sum)/niter));
			
 
				 
			
 
				-            }
			
 
				-        }
			
 
				-
			
 
				-    }
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				 
			
 
				 
			
 
				 }
			
@@ -207,78 +196,76 @@ static void compare_performance(int size, struct starpu_codelet *codelet, struct
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				-    /* Enable profiling */
			
 
				-    starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
			
 
				-
			
 
				-    struct starpu_conf conf;
			
 
				-    starpu_data_handle_t handle;
			
 
				-    int ret;
			
 
				-
			
 
				-    starpu_conf_init(&conf);
			
 
				+	/* Enable profiling */
			
 
				+	starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
			
 
				 
			
 
				-    conf.sched_policy_name = "eager";
			
 
				-    conf.calibrate = 2;
			
 
				+	struct starpu_conf conf;
			
 
				+	starpu_data_handle_t handle;
			
 
				+	int ret;
			
 
				 
			
 
				-    ret = starpu_initialize(&conf, &argc, &argv);
			
 
				-    if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+	starpu_conf_init(&conf);
			
 
				 
			
 
				-    int size;
			
 
				-    for (size = STARTlin; size < END; size *= 2)
			
 
				-    {
			
 
				-        /* Use a linear regression */
			
 
				-        test_memset(size, &memset_cl);
			
 
				-    }
			
 
				+	conf.sched_policy_name = "eager";
			
 
				+	conf.calibrate = 2;
			
 
				 
			
 
				-    for (size = START; size < END; size *= 2)
			
 
				-    {
			
 
				-        /* Use a non-linear regression */
			
 
				-        test_memset(size, &nl_memset_cl);
			
 
				-    }
			
 
				+	ret = starpu_initialize(&conf, &argc, &argv);
			
 
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-    ret = starpu_task_wait_for_all();
			
 
				-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
			
 
				+	int size;
			
 
				+	for (size = STARTlin; size < END; size *= 2)
			
 
				+	{
			
 
				+		/* Use a linear regression */
			
 
				+		test_memset(size, &memset_cl);
			
 
				+	}
			
 
				 
			
 
				-    starpu_shutdown();
			
 
				+	for (size = START; size < END; size *= 2)
			
 
				+	{
			
 
				+		/* Use a non-linear regression */
			
 
				+		test_memset(size, &nl_memset_cl);
			
 
				+	}
			
 
				 
			
 
				+	ret = starpu_task_wait_for_all();
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
			
 
				 
			
 
				-    /* Test Phase */
			
 
				-    starpu_conf_init(&conf);
			
 
				+	starpu_shutdown();
			
 
				 
			
 
				-    conf.sched_policy_name = "eager";
			
 
				-    conf.calibrate = 0;
			
 
				 
			
 
				-    ret = starpu_initialize(&conf, &argc, &argv);
			
 
				-    if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+	/* Test Phase */
			
 
				+	starpu_conf_init(&conf);
			
 
				 
			
 
				-    /* Now create a dummy task just to estimate its duration according to the regression */
			
 
				+	conf.sched_policy_name = "eager";
			
 
				+	conf.calibrate = 0;
			
 
				 
			
 
				-    size = 1234567;
			
 
				+	ret = starpu_initialize(&conf, &argc, &argv);
			
 
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-    starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
			
 
				+	/* Now create a dummy task just to estimate its duration according to the regression */
			
 
				 
			
 
				-    struct starpu_task *task = starpu_task_create();
			
 
				-    task->cl = &memset_cl;
			
 
				-    task->handles[0] = handle;
			
 
				-    task->destroy = 0;
			
 
				+	size = 1234567;
			
 
				 
			
 
				-    FPRINTF(stdout, "\n ////linear regression results////\n");
			
 
				-    compare_performance(size, &memset_cl,task);
			
 
				+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
			
 
				 
			
 
				+	struct starpu_task *task = starpu_task_create();
			
 
				+	task->cl = &memset_cl;
			
 
				+	task->handles[0] = handle;
			
 
				+	task->destroy = 0;
			
 
				 
			
 
				-    task->cl = &nl_memset_cl;
			
 
				+	FPRINTF(stdout, "\n ////linear regression results////\n");
			
 
				+	compare_performance(size, &memset_cl, task);
			
 
				 
			
 
				-    FPRINTF(stdout, "\n ////non linear regression results////\n");
			
 
				+	task->cl = &nl_memset_cl;
			
 
				 
			
 
				-    compare_performance(size, &nl_memset_cl,task);
			
 
				+	FPRINTF(stdout, "\n ////non linear regression results////\n");
			
 
				 
			
 
				+	compare_performance(size, &nl_memset_cl, task);
			
 
				 
			
 
				-    starpu_task_destroy(task);
			
 
				+	starpu_task_destroy(task);
			
 
				 
			
 
				-    starpu_data_unregister(handle);
			
 
				+	starpu_data_unregister(handle);
			
 
				 
			
 
				-    starpu_shutdown();
			
 
				+	starpu_shutdown();
			
 
				 
			
 
				-    return 0;
			
 
				+	return EXIT_SUCCESS;
			
 
				 }
			
--- a/tests/perfmodels/regression_based_02.c
+++ b/tests/perfmodels/regression_based_02.c
@@ -33,202 +33,197 @@
 
				 #define END 16777216
			
 
				 #endif
			
 
				 
			
 
				-int ret;
			
 
				-
			
 
				-//1er implémentation avec un delai initial (100 us)
			
 
				+// first implementation with an initial delay (100 us)
			
 
				 void memset0_cpu(void *descr[], void *arg)
			
 
				 {
			
 
				-    (void)arg;
			
 
				-    STARPU_SKIP_IF_VALGRIND;
			
 
				+	(void)arg;
			
 
				+	STARPU_SKIP_IF_VALGRIND;
			
 
				 
			
 
				-    int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				-    unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				-    int i;
			
 
				+	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				+	unsigned i;
			
 
				 
			
 
				-    //usleep(100);
			
 
				+	usleep(100);
			
 
				 
			
 
				-    for (i=0; i<n ; i++)
			
 
				-    {
			
 
				-        ptr[0] += i;
			
 
				-    }
			
 
				+	for (i=0; i<n ; i++)
			
 
				+	{
			
 
				+		ptr[0] += i;
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				-//deuxième implémentation sans delai initial usleep() et fait 2.5 plus de tours de boucles
			
 
				+// second implementation: no initial delay, but 6.5 times as many loop iterations
			
 
				 void memset_cpu(void *descr[], void *arg)
			
 
				 {
			
 
				-    (void)arg;
			
 
				-    STARPU_SKIP_IF_VALGRIND;
			
 
				+	(void)arg;
			
 
				+	STARPU_SKIP_IF_VALGRIND;
			
 
				 
			
 
				-    int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				-    unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				-    int i;
			
 
				+	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				+	int i;
			
 
				 
			
 
				-    for (i=0; i<6.5*n ; i++)
			
 
				-    {
			
 
				-        ptr[0] += i;
			
 
				-    }
			
 
				+	for (i=0; i<6.5*n ; i++)
			
 
				+	{
			
 
				+		ptr[0] += i;
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 //fonction pour mesurer l'energie
			
 
				 double energy_function(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
			
 
				 
			
 
				 {
			
 
				-    double energy;
			
 
				-    int factor;
			
 
				-    if (nimpl == 0)
			
 
				-        factor = 10;
			
 
				-    else
			
 
				-        factor = 1;
			
 
				+	double energy;
			
 
				+	int factor;
			
 
				+	if (nimpl == 0)
			
 
				+		factor = 10;
			
 
				+	else
			
 
				+		factor = 1;
			
 
				 
			
 
				-    energy=starpu_task_expected_length(task, arch, nimpl)*factor;
			
 
				+	energy=starpu_task_expected_length(task, arch, nimpl)*factor;
			
 
				 
			
 
				-    return energy;
			
 
				+	return energy;
			
 
				 }
			
 
				 
			
 
				 static struct starpu_perfmodel model =
			
 
				 {
			
 
				-    .type = STARPU_REGRESSION_BASED,
			
 
				-    .symbol = "memset_regression_based"
			
 
				+	.type = STARPU_REGRESSION_BASED,
			
 
				+	.symbol = "memset_regression_based"
			
 
				 };
			
 
				 
			
 
				 static struct starpu_perfmodel nl_model =
			
 
				 {
			
 
				-    .type = STARPU_NL_REGRESSION_BASED,
			
 
				-    .symbol = "non_linear_memset_regression_based"
			
 
				+	.type = STARPU_NL_REGRESSION_BASED,
			
 
				+	.symbol = "non_linear_memset_regression_based"
			
 
				 };
			
 
				 
			
 
				 static struct starpu_perfmodel nl_energy_model=
			
 
				 {
			
 
				-    .type = STARPU_PER_ARCH,
			
 
				-    .symbol = "non_linear_energy_model",
			
 
				-    .arch_cost_function={energy_function},
			
 
				+	.type = STARPU_PER_ARCH,
			
 
				+	.symbol = "non_linear_energy_model",
			
 
				+	.arch_cost_function=energy_function,
			
 
				 };
			
 
				 
			
 
				 static struct starpu_codelet memset_cl =
			
 
				 {
			
 
				-    .cpu_funcs = {memset0_cpu, memset_cpu},
			
 
				-    .cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
			
 
				-    .model = &model,
			
 
				-    .nbuffers = 1,
			
 
				-    .modes = {STARPU_W}
			
 
				+	.cpu_funcs = {memset0_cpu, memset_cpu},
			
 
				+	.cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
			
 
				+	.model = &model,
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = {STARPU_W}
			
 
				 };
			
 
				 
			
 
				 static struct starpu_codelet nl_memset_cl =
			
 
				 {
			
 
				-    .cpu_funcs = {memset0_cpu, memset_cpu},
			
 
				-    .cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
			
 
				-    .model = &nl_model,
			
 
				-    .energy_model = &nl_energy_model,
			
 
				-    .nbuffers = 1,
			
 
				-    .modes = {STARPU_W}
			
 
				+	.cpu_funcs = {memset0_cpu, memset_cpu},
			
 
				+	.cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
			
 
				+	.model = &nl_model,
			
 
				+	.energy_model = &nl_energy_model,
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = {STARPU_W}
			
 
				 };
			
 
				 
			
 
				 static void test_memset(int nelems, struct starpu_codelet *codelet)
			
 
				 {
			
 
				-    int nloops = 100;
			
 
				-    int loop;
			
 
				-    starpu_data_handle_t handle;
			
 
				+	int nloops = 100;
			
 
				+	int loop;
			
 
				+	starpu_data_handle_t handle;
			
 
				 
			
 
				-    starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, nelems, sizeof(int));
			
 
				-    for (loop = 0; loop < nloops; loop++)
			
 
				-    {
			
 
				-        struct starpu_task *task = starpu_task_create();
			
 
				+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, nelems, sizeof(int));
			
 
				+	for (loop = 0; loop < nloops; loop++)
			
 
				+	{
			
 
				+		struct starpu_task *task = starpu_task_create();
			
 
				 
			
 
				-        task->cl = codelet;
			
 
				-        task->handles[0] = handle;
			
 
				+		task->cl = codelet;
			
 
				+		task->handles[0] = handle;
			
 
				 
			
 
				-        int ret = starpu_task_submit(task);
			
 
				-        if (ret == -ENODEV)
			
 
				-            exit(STARPU_TEST_SKIPPED);
			
 
				-        STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				-    }
			
 
				+		int ret = starpu_task_submit(task);
			
 
				+		if (ret == -ENODEV)
			
 
				+			exit(STARPU_TEST_SKIPPED);
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+	}
			
 
				 
			
 
				-    starpu_data_unregister(handle);
			
 
				+	starpu_data_unregister(handle);
			
 
				 }
			
 
				 
			
 
				-static void compare_performance(int size, struct starpu_codelet *codelet, struct starpu_task *task)
			
 
				+static void compare_performance(int size, struct starpu_codelet *codelet, struct starpu_task *compar_task)
			
 
				 {
			
 
				-    unsigned i;
			
 
				-    int niter = 100;
			
 
				-    starpu_data_handle_t handle;
			
 
				-
			
 
				-    starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
			
 
				+	unsigned i;
			
 
				+	unsigned niter = 100;
			
 
				+	starpu_data_handle_t handle;
			
 
				 
			
 
				-    struct starpu_task **tasks = (struct starpu_task **) malloc(niter*sizeof(struct starpu_task *));
			
 
				-    assert(tasks);
			
 
				+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
			
 
				 
			
 
				-    for (i = 0; i < niter; i++)
			
 
				-    {
			
 
				-        //fabriquer la tache
			
 
				-        struct starpu_task *task = starpu_task_create();
			
 
				+	struct starpu_task **tasks = (struct starpu_task **) malloc(niter*sizeof(struct starpu_task *));
			
 
				+	assert(tasks);
			
 
				 
			
 
				-        task->cl = codelet;
			
 
				-        task->handles[0] = handle;
			
 
				+	for (i = 0; i < niter; i++)
			
 
				+	{
			
 
				+		struct starpu_task *task = starpu_task_create();
			
 
				 
			
 
				-        task->synchronous = 1;
			
 
				+		task->cl = codelet;
			
 
				+		task->handles[0] = handle;
			
 
				 
			
 
				-        /* We will destroy the task structure by hand so that we can
			
 
				-         * query the profiling info before the task is destroyed. */
			
 
				-        task->destroy = 0;
			
 
				+		task->synchronous = 1;
			
 
				 
			
 
				-        tasks[i] = task;
			
 
				+		/* We will destroy the task structure by hand so that we can
			
 
				+		 * query the profiling info before the task is destroyed. */
			
 
				+		task->destroy = 0;
			
 
				 
			
 
				-        //soumettre la tache
			
 
				-        ret = starpu_task_submit(task);
			
 
				+		tasks[i] = task;
			
 
				 
			
 
				-        if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				-        {
			
 
				-            FPRINTF(stderr, "No worker may execute this task\n");
			
 
				-            exit(0);
			
 
				-        }
			
 
				-    }
			
 
				+		int ret = starpu_task_submit(task);
			
 
				 
			
 
				-    starpu_data_unregister(handle);
			
 
				+		if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				+		{
			
 
				+			FPRINTF(stderr, "No worker may execute this task\n");
			
 
				+			exit(0);
			
 
				+		}
			
 
				+	}
			
 
				 
			
 
				-    starpu_task_wait_for_all();
			
 
				+	starpu_data_unregister(handle);
			
 
				 
			
 
				-    double length_sum = 0.0;
			
 
				+	starpu_task_wait_for_all();
			
 
				 
			
 
				-    for (i = 0; i < niter; i++)
			
 
				-    {
			
 
				-        struct starpu_task *task = tasks[i];
			
 
				+	double length_sum = 0.0;
			
 
				 
			
 
				-        struct starpu_profiling_task_info *info = task->profiling_info;
			
 
				+	for (i = 0; i < niter; i++)
			
 
				+	{
			
 
				+		struct starpu_task *task = tasks[i];
			
 
				+		struct starpu_profiling_task_info *info = task->profiling_info;
			
 
				 
			
 
				 
			
 
				-        /* How long was the task execution ? */
			
 
				-        length_sum += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
			
 
				+		/* How long was the task execution ? */
			
 
				+		length_sum += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
			
 
				 
			
 
				-        /* We don't need the task structure anymore */
			
 
				-        starpu_task_destroy(task);
			
 
				-    }
			
 
				+		/* We don't need the task structure anymore */
			
 
				+		starpu_task_destroy(task);
			
 
				+	}
			
 
				 
			
 
				 
			
 
				-    /* Display the occupancy of all workers during the test */
			
 
				-    unsigned worker;
			
 
				-    for (worker = 0; worker < starpu_worker_get_count(); worker++)
			
 
				-    {
			
 
				-        struct starpu_profiling_worker_info worker_info;
			
 
				-        ret = starpu_profiling_worker_get_info(worker, &worker_info);
			
 
				-        STARPU_ASSERT(!ret);
			
 
				+	/* Display the occupancy of all workers during the test */
			
 
				+	unsigned worker;
			
 
				+	for (worker = 0; worker < starpu_worker_get_count(); worker++)
			
 
				+	{
			
 
				+		struct starpu_profiling_worker_info worker_info;
			
 
				+		int ret = starpu_profiling_worker_get_info(worker, &worker_info);
			
 
				+		STARPU_ASSERT(!ret);
			
 
				 
			
 
				-        char workername[128];
			
 
				-        starpu_worker_get_name(worker, workername, sizeof(workername));
			
 
				-        unsigned nimpl;
			
 
				+		char workername[128];
			
 
				+		starpu_worker_get_name(worker, workername, sizeof(workername));
			
 
				+		unsigned nimpl;
			
 
				 
			
 
				-        if (starpu_worker_get_type(worker)==STARPU_CPU_WORKER)
			
 
				-        {
			
 
				-            FPRINTF(stdout, "\n Worker :%s ::::::::::\n\n", workername);
			
 
				+		if (starpu_worker_get_type(worker)==STARPU_CPU_WORKER)
			
 
				+		{
			
 
				+			FPRINTF(stdout, "\n Worker :%s ::::::::::\n\n", workername);
			
 
				 
			
 
				-            for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
			
 
				-            {
			
 
				+			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
			
 
				+			{
			
 
				 
			
 
				-                FPRINTF(stdout, "Expected time for %d on %s (impl %u): %f, Measured time: %f, Expected energy: %f\n",
			
 
				-                        size, workername, nimpl,starpu_task_expected_length(task, starpu_worker_get_perf_archtype(worker, task->sched_ctx), nimpl), ((length_sum)/niter),
			
 
				-                        starpu_task_expected_energy(task, starpu_worker_get_perf_archtype(worker, task->sched_ctx), nimpl));
			
 
				-            }
			
 
				-        }
			
 
				-    }
			
 
				+				FPRINTF(stdout, "Expected time for %d on %s (impl %u): %f, Measured time: %f, Expected energy: %f\n",
			
 
				+						size, workername, nimpl,starpu_task_expected_length(compar_task, starpu_worker_get_perf_archtype(worker, compar_task->sched_ctx), nimpl), ((length_sum)/niter),
			
 
				+						starpu_task_expected_energy(compar_task, starpu_worker_get_perf_archtype(worker, compar_task->sched_ctx), nimpl));
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				 
			
 
				 
			
 
				 }
			
@@ -236,74 +231,75 @@ static void compare_performance(int size, struct starpu_codelet *codelet, struct
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				+	/* Enable profiling */
			
 
				+	starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
			
 
				 
			
 
				-    /* Enable profiling */
			
 
				-    starpu_profiling_status_set(1);
			
 
				+	struct starpu_conf conf;
			
 
				+	starpu_data_handle_t handle;
			
 
				+	int ret;
			
 
				 
			
 
				-    struct starpu_conf conf;
			
 
				-    starpu_data_handle_t handle;
			
 
				-    int ret;
			
 
				+	starpu_conf_init(&conf);
			
 
				 
			
 
				-    starpu_conf_init(&conf);
			
 
				+	conf.sched_policy_name = "dmda";
			
 
				+	conf.calibrate = 2;
			
 
				 
			
 
				-    conf.sched_policy_name = "dmda";
			
 
				-    conf.calibrate = 2;
			
 
				+	ret = starpu_initialize(&conf, &argc, &argv);
			
 
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-    ret = starpu_initialize(&conf, &argc, &argv);
			
 
				-    if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-
			
 
				-    int size;
			
 
				-    /*for (size = STARTlin; size < END; size *= 2)
			
 
				-    {
			
 
				-        /* Use a linear regression */
			
 
				-    //test_memset(size, &memset_cl);
			
 
				-    //}
			
 
				+	int size;
			
 
				+#if 0
			
 
				+	for (size = STARTlin; size < END; size *= 2)
			
 
				+	{
			
 
				+		/* Use a linear regression */
			
 
				+		test_memset(size, &memset_cl);
			
 
				+	}
			
 
				+#endif
			
 
				 
			
 
				-    for (size = START; size < END; size *= 2)
			
 
				-    {
			
 
				-        /* Use a non-linear regression */
			
 
				-        test_memset(size, &nl_memset_cl);
			
 
				-    }
			
 
				+	for (size = START; size < END; size *= 2)
			
 
				+	{
			
 
				+		/* Use a non-linear regression */
			
 
				+		test_memset(size, &nl_memset_cl);
			
 
				+	}
			
 
				 
			
 
				-    ret = starpu_task_wait_for_all();
			
 
				-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
			
 
				+	ret = starpu_task_wait_for_all();
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
			
 
				 
			
 
				-    starpu_shutdown();
			
 
				+	starpu_shutdown();
			
 
				 
			
 
				 
			
 
				-    /* Test Phase */
			
 
				-    starpu_conf_init(&conf);
			
 
				+	/* Test Phase */
			
 
				+	starpu_conf_init(&conf);
			
 
				 
			
 
				-    conf.sched_policy_name = "dmda";
			
 
				-    conf.calibrate = 0;
			
 
				+	conf.sched_policy_name = "dmda";
			
 
				+	conf.calibrate = 0;
			
 
				 
			
 
				-    ret = starpu_initialize(&conf, &argc, &argv);
			
 
				-    if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+	ret = starpu_initialize(&conf, &argc, &argv);
			
 
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-    /* Now create a dummy task just to estimate its duration according to the regression */
			
 
				+	/* Now create a dummy task just to estimate its duration according to the regression */
			
 
				 
			
 
				-    size = 1234567;
			
 
				+	size = 1234567;
			
 
				 
			
 
				-    starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
			
 
				+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
			
 
				 
			
 
				-    struct starpu_task *task = starpu_task_create();
			
 
				-    task->cl = &memset_cl;
			
 
				-    task->handles[0] = handle;
			
 
				-    task->destroy = 0;
			
 
				+	struct starpu_task *task = starpu_task_create();
			
 
				+	task->cl = &memset_cl;
			
 
				+	task->handles[0] = handle;
			
 
				+	task->destroy = 0;
			
 
				 
			
 
				-    task->cl = &nl_memset_cl;
			
 
				+	task->cl = &nl_memset_cl;
			
 
				 
			
 
				-    FPRINTF(stdout, "\n ////non linear regression results////\n");
			
 
				+	FPRINTF(stdout, "\n ////non linear regression results////\n");
			
 
				 
			
 
				-    compare_performance(size, &nl_memset_cl,task);
			
 
				+	compare_performance(size, &nl_memset_cl, task);
			
 
				 
			
 
				-    starpu_task_destroy(task);
			
 
				+	starpu_task_destroy(task);
			
 
				 
			
 
				-    starpu_data_unregister(handle);
			
 
				+	starpu_data_unregister(handle);
			
 
				 
			
 
				-    starpu_shutdown();
			
 
				+	starpu_shutdown();
			
 
				 
			
 
				-    return EXIT_SUCCESS;
			
 
				+	return EXIT_SUCCESS;
			
 
				 }
			
--- a/tests/perfmodels/regression_based_03.c
+++ b/tests/perfmodels/regression_based_03.c
@@ -34,204 +34,197 @@
 
				 #define END 16777216
			
 
				 #endif
			
 
				 
			
 
				-
			
 
				-int ret;
			
 
				-
			
 
				-//1er implémentation avec un delai initial (100 us)
			
 
				+// first implementation with an initial delay (100 us)
			
 
				 void memset0_cpu(void *descr[], void *arg)
			
 
				 {
			
 
				-    (void)arg;
			
 
				-    STARPU_SKIP_IF_VALGRIND;
			
 
				+	(void)arg;
			
 
				+	STARPU_SKIP_IF_VALGRIND;
			
 
				 
			
 
				-    int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				-    unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				-    int i;
			
 
				+	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				+	unsigned i;
			
 
				 
			
 
				-    //usleep () function
			
 
				-    //usleep(100);
			
 
				+	usleep(100);
			
 
				 
			
 
				-    for (i=0; i<n ; i++)
			
 
				-    {
			
 
				-        ptr[0] += i;
			
 
				-    }
			
 
				+	for (i=0; i<n ; i++)
			
 
				+	{
			
 
				+		ptr[0] += i;
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				-//deuxième implémentation sans delai initial usleep() et fait 2.5 plus de tours de boucles
			
 
				+// second implementation: no initial delay, but 6.5 times as many loop iterations
			
 
				 void memset_cpu(void *descr[], void *arg)
			
 
				 {
			
 
				-    (void)arg;
			
 
				-    STARPU_SKIP_IF_VALGRIND;
			
 
				+	(void)arg;
			
 
				+	STARPU_SKIP_IF_VALGRIND;
			
 
				 
			
 
				-    int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				-    unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				-    int i;
			
 
				+	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				+	int i;
			
 
				 
			
 
				-    for (i=0; i<6.5*n ; i++)
			
 
				-    {
			
 
				-        ptr[0] += i;
			
 
				-    }
			
 
				+	for (i=0; i<6.5*n ; i++)
			
 
				+	{
			
 
				+		ptr[0] += i;
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 //fonction pour mesurer l'energie
			
 
				 double energy_function(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
			
 
				 
			
 
				 {
			
 
				-    double energy;
			
 
				-    int factor;
			
 
				-    if (nimpl == 0)
			
 
				-        factor = 10;
			
 
				-    else
			
 
				-        factor = 1;
			
 
				+	double energy;
			
 
				+	int factor;
			
 
				+	if (nimpl == 0)
			
 
				+		factor = 10;
			
 
				+	else
			
 
				+		factor = 1;
			
 
				 
			
 
				-    energy=starpu_task_expected_length(task, arch, nimpl)*factor;
			
 
				+	energy=starpu_task_expected_length(task, arch, nimpl)*factor;
			
 
				 
			
 
				-    return energy;
			
 
				+	return energy;
			
 
				 }
			
 
				 
			
 
				 static struct starpu_perfmodel model =
			
 
				 {
			
 
				-    .type = STARPU_REGRESSION_BASED,
			
 
				-    .symbol = "memset_regression_based"
			
 
				+	.type = STARPU_REGRESSION_BASED,
			
 
				+	.symbol = "memset_regression_based"
			
 
				 };
			
 
				 
			
 
				 static struct starpu_perfmodel nl_model =
			
 
				 {
			
 
				-    .type = STARPU_NL_REGRESSION_BASED,
			
 
				-    .symbol = "non_linear_memset_regression_based"
			
 
				+	.type = STARPU_NL_REGRESSION_BASED,
			
 
				+	.symbol = "non_linear_memset_regression_based"
			
 
				 };
			
 
				 
			
 
				 static struct starpu_perfmodel nl_energy_model=
			
 
				 {
			
 
				-    .type = STARPU_PER_ARCH,
			
 
				-    .symbol = "non_linear_energy_model",
			
 
				-    .arch_cost_function={energy_function},
			
 
				+	.type = STARPU_PER_ARCH,
			
 
				+	.symbol = "non_linear_energy_model",
			
 
				+	.arch_cost_function=energy_function,
			
 
				 };
			
 
				 
			
 
				 static struct starpu_codelet memset_cl =
			
 
				 {
			
 
				-    .cpu_funcs = {memset0_cpu, memset_cpu},
			
 
				-    .cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
			
 
				-    .model = &model,
			
 
				-    .nbuffers = 1,
			
 
				-    .modes = {STARPU_W}
			
 
				+	.cpu_funcs = {memset0_cpu, memset_cpu},
			
 
				+	.cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
			
 
				+	.model = &model,
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = {STARPU_W}
			
 
				 };
			
 
				 
			
 
				 static struct starpu_codelet nl_memset_cl =
			
 
				 {
			
 
				-    .cpu_funcs = {memset0_cpu, memset_cpu},
			
 
				-    .cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
			
 
				-    .model = &nl_model,
			
 
				-    .energy_model = &nl_energy_model,
			
 
				-    .nbuffers = 1,
			
 
				-    .modes = {STARPU_W}
			
 
				+	.cpu_funcs = {memset0_cpu, memset_cpu},
			
 
				+	.cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
			
 
				+	.model = &nl_model,
			
 
				+	.energy_model = &nl_energy_model,
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = {STARPU_W}
			
 
				 };
			
 
				 
			
 
				 static void test_memset(int nelems, struct starpu_codelet *codelet)
			
 
				 {
			
 
				-    int nloops = 100;
			
 
				-    int loop;
			
 
				-    starpu_data_handle_t handle;
			
 
				+	int nloops = 100;
			
 
				+	int loop;
			
 
				+	starpu_data_handle_t handle;
			
 
				 
			
 
				-    starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, nelems, sizeof(int));
			
 
				-    for (loop = 0; loop < nloops; loop++)
			
 
				-    {
			
 
				-        struct starpu_task *task = starpu_task_create();
			
 
				+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, nelems, sizeof(int));
			
 
				+	for (loop = 0; loop < nloops; loop++)
			
 
				+	{
			
 
				+		struct starpu_task *task = starpu_task_create();
			
 
				 
			
 
				-        task->cl = codelet;
			
 
				-        task->handles[0] = handle;
			
 
				+		task->cl = codelet;
			
 
				+		task->handles[0] = handle;
			
 
				 
			
 
				-        int ret = starpu_task_submit(task);
			
 
				-        if (ret == -ENODEV)
			
 
				-            exit(STARPU_TEST_SKIPPED);
			
 
				-        STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				-    }
			
 
				+		int ret = starpu_task_submit(task);
			
 
				+		if (ret == -ENODEV)
			
 
				+			exit(STARPU_TEST_SKIPPED);
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+	}
			
 
				 
			
 
				-    starpu_data_unregister(handle);
			
 
				+	starpu_data_unregister(handle);
			
 
				 }
			
 
				 
			
 
				-static void compare_performance(int size, struct starpu_codelet *codelet, struct starpu_task *task)
			
 
				+static void compare_performance(int size, struct starpu_codelet *codelet, struct starpu_task *compar_task)
			
 
				 {
			
 
				-    unsigned i;
			
 
				-    int niter = 100;
			
 
				-    starpu_data_handle_t handle;
			
 
				+	unsigned i;
			
 
				+	unsigned niter = 100;
			
 
				+	starpu_data_handle_t handle;
			
 
				 
			
 
				-    starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
			
 
				+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
			
 
				 
			
 
				-    struct starpu_task **tasks = (struct starpu_task **) malloc(niter*sizeof(struct starpu_task *));
			
 
				-    assert(tasks);
			
 
				+	struct starpu_task **tasks = (struct starpu_task **) malloc(niter*sizeof(struct starpu_task *));
			
 
				+	assert(tasks);
			
 
				 
			
 
				-    for (i = 0; i < niter; i++)
			
 
				-    {
			
 
				-        //fabriquer la tache
			
 
				-        struct starpu_task *task = starpu_task_create();
			
 
				+	for (i = 0; i < niter; i++)
			
 
				+	{
			
 
				+		struct starpu_task *task = starpu_task_create();
			
 
				 
			
 
				-        task->cl = codelet;
			
 
				-        task->handles[0] = handle;
			
 
				+		task->cl = codelet;
			
 
				+		task->handles[0] = handle;
			
 
				 
			
 
				-        task->synchronous = 1;
			
 
				+		task->synchronous = 1;
			
 
				 
			
 
				-        /* We will destroy the task structure by hand so that we can
			
 
				-         * query the profiling info before the task is destroyed. */
			
 
				-        task->destroy = 0;
			
 
				+		/* We will destroy the task structure by hand so that we can
			
 
				+		 * query the profiling info before the task is destroyed. */
			
 
				+		task->destroy = 0;
			
 
				 
			
 
				-        tasks[i] = task;
			
 
				+		tasks[i] = task;
			
 
				 
			
 
				-        //soumettre la tache
			
 
				-        ret = starpu_task_submit(task);
			
 
				+		int ret = starpu_task_submit(task);
			
 
				 
			
 
				-        if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				-        {
			
 
				-            FPRINTF(stderr, "No worker may execute this task\n");
			
 
				-            exit(0);
			
 
				-        }
			
 
				-    }
			
 
				+		if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				+		{
			
 
				+			FPRINTF(stderr, "No worker may execute this task\n");
			
 
				+			exit(0);
			
 
				+		}
			
 
				+	}
			
 
				 
			
 
				-    starpu_data_unregister(handle);
			
 
				+	starpu_data_unregister(handle);
			
 
				 
			
 
				-    starpu_task_wait_for_all();
			
 
				+	starpu_task_wait_for_all();
			
 
				 
			
 
				-    double length_sum = 0.0;
			
 
				+	double length_sum = 0.0;
			
 
				 
			
 
				-    for (i = 0; i < niter; i++)
			
 
				-    {
			
 
				-        struct starpu_task *task = tasks[i];
			
 
				+	for (i = 0; i < niter; i++)
			
 
				+	{
			
 
				+		struct starpu_task *task = tasks[i];
			
 
				+		struct starpu_profiling_task_info *info = task->profiling_info;
			
 
				 
			
 
				-        struct starpu_profiling_task_info *info = task->profiling_info;
			
 
				 
			
 
				+		/* How long was the task execution ? */
			
 
				+		length_sum += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
			
 
				 
			
 
				-        /* How long was the task execution ? */
			
 
				-        length_sum += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
			
 
				+		/* We don't need the task structure anymore */
			
 
				+		starpu_task_destroy(task);
			
 
				+	}
			
 
				 
			
 
				-        /* We don't need the task structure anymore */
			
 
				-        starpu_task_destroy(task);
			
 
				-    }
			
 
				 
			
 
				+	/* Display the occupancy of all workers during the test */
			
 
				+	unsigned worker;
			
 
				+	for (worker = 0; worker < starpu_worker_get_count(); worker++)
			
 
				+	{
			
 
				+		struct starpu_profiling_worker_info worker_info;
			
 
				+		int ret = starpu_profiling_worker_get_info(worker, &worker_info);
			
 
				+		STARPU_ASSERT(!ret);
			
 
				 
			
 
				-    /* Display the occupancy of all workers during the test */
			
 
				-    unsigned worker;
			
 
				-    for (worker = 0; worker < starpu_worker_get_count(); worker++)
			
 
				-    {
			
 
				-        struct starpu_profiling_worker_info worker_info;
			
 
				-        ret = starpu_profiling_worker_get_info(worker, &worker_info);
			
 
				-        STARPU_ASSERT(!ret);
			
 
				+		char workername[128];
			
 
				+		starpu_worker_get_name(worker, workername, sizeof(workername));
			
 
				+		unsigned nimpl;
			
 
				 
			
 
				-        char workername[128];
			
 
				-        starpu_worker_get_name(worker, workername, sizeof(workername));
			
 
				-        unsigned nimpl;
			
 
				+		if (starpu_worker_get_type(worker)==STARPU_CPU_WORKER)
			
 
				+		{
			
 
				+			FPRINTF(stdout, "\n Worker :%s ::::::::::\n\n", workername);
			
 
				 
			
 
				-        if (starpu_worker_get_type(worker)==STARPU_CPU_WORKER)
			
 
				-        {
			
 
				-            FPRINTF(stdout, "\n Worker :%s ::::::::::\n\n", workername);
			
 
				+			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
			
 
				+			{
			
 
				 
			
 
				-            for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
			
 
				-            {
			
 
				-
			
 
				-                FPRINTF(stdout, "Expected time for %d on %s (impl %u): %f, Measured time: %f, Expected energy: %f\n",
			
 
				-                        size, workername, nimpl,starpu_task_expected_length(task, starpu_worker_get_perf_archtype(worker, task->sched_ctx), nimpl), ((length_sum)/niter),
			
 
				-                        starpu_task_expected_energy(task, starpu_worker_get_perf_archtype(worker, task->sched_ctx), nimpl));
			
 
				-            }
			
 
				-        }
			
 
				-    }
			
 
				+				FPRINTF(stdout, "Expected time for %d on %s (impl %u): %f, Measured time: %f, Expected energy: %f\n",
			
 
				+						size, workername, nimpl,starpu_task_expected_length(compar_task, starpu_worker_get_perf_archtype(worker, compar_task->sched_ctx), nimpl), ((length_sum)/niter),
			
 
				+						starpu_task_expected_energy(compar_task, starpu_worker_get_perf_archtype(worker, compar_task->sched_ctx), nimpl));
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				 
			
 
				 
			
 
				 }
			
@@ -239,74 +232,73 @@ static void compare_performance(int size, struct starpu_codelet *codelet, struct
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				+	/* Enable profiling */
			
 
				+	starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
			
 
				 
			
 
				-    /* Enable profiling */
			
 
				-    starpu_profiling_status_set(1);
			
 
				-
			
 
				-    struct starpu_conf conf;
			
 
				-    starpu_data_handle_t handle;
			
 
				-    int ret;
			
 
				+	struct starpu_conf conf;
			
 
				+	starpu_data_handle_t handle;
			
 
				+	int ret;
			
 
				 
			
 
				-    starpu_conf_init(&conf);
			
 
				+	starpu_conf_init(&conf);
			
 
				 
			
 
				-    conf.sched_policy_name = "dmda";
			
 
				-    conf.calibrate = 2;
			
 
				+	conf.sched_policy_name = "dmda";
			
 
				+	conf.calibrate = 2;
			
 
				 
			
 
				-    ret = starpu_initialize(&conf, &argc, &argv);
			
 
				-    if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+	ret = starpu_initialize(&conf, &argc, &argv);
			
 
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-    int size;
			
 
				-    for (size = STARTlin; size < END; size *= 2)
			
 
				-    {
			
 
				-        /* Use a linear regression */
			
 
				-        test_memset(size, &memset_cl);
			
 
				-    }
			
 
				+	int size;
			
 
				+	for (size = STARTlin; size < END; size *= 2)
			
 
				+	{
			
 
				+		/* Use a linear regression */
			
 
				+		test_memset(size, &memset_cl);
			
 
				+	}
			
 
				 
			
 
				-    for (size = START; size < END; size *= 2)
			
 
				-    {
			
 
				-        /* Use a non-linear regression */
			
 
				-        test_memset(size, &nl_memset_cl);
			
 
				-    }
			
 
				+	for (size = START; size < END; size *= 2)
			
 
				+	{
			
 
				+		/* Use a non-linear regression */
			
 
				+		test_memset(size, &nl_memset_cl);
			
 
				+	}
			
 
				 
			
 
				-    ret = starpu_task_wait_for_all();
			
 
				-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
			
 
				+	ret = starpu_task_wait_for_all();
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
			
 
				 
			
 
				-    starpu_shutdown();
			
 
				+	starpu_shutdown();
			
 
				 
			
 
				 
			
 
				-    /* Test Phase */
			
 
				-    starpu_conf_init(&conf);
			
 
				+	/* Test Phase */
			
 
				+	starpu_conf_init(&conf);
			
 
				 
			
 
				-    conf.sched_policy_name = "dmda";
			
 
				-    conf.calibrate = 0;
			
 
				+	conf.sched_policy_name = "dmda";
			
 
				+	conf.calibrate = 0;
			
 
				 
			
 
				-    ret = starpu_initialize(&conf, &argc, &argv);
			
 
				-    if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+	ret = starpu_initialize(&conf, &argc, &argv);
			
 
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-    /* Now create a dummy task just to estimate its duration according to the regression */
			
 
				+	/* Now create a dummy task just to estimate its duration according to the regression */
			
 
				 
			
 
				-    size = 1234567;
			
 
				+	size = 1234567;
			
 
				 
			
 
				-    starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
			
 
				+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
			
 
				 
			
 
				-    struct starpu_task *task = starpu_task_create();
			
 
				-    task->cl = &memset_cl;
			
 
				-    task->handles[0] = handle;
			
 
				-    task->destroy = 0;
			
 
				+	struct starpu_task *task = starpu_task_create();
			
 
				+	task->cl = &memset_cl;
			
 
				+	task->handles[0] = handle;
			
 
				+	task->destroy = 0;
			
 
				 
			
 
				-    task->cl = &nl_memset_cl;
			
 
				+	task->cl = &nl_memset_cl;
			
 
				 
			
 
				-    FPRINTF(stdout, "\n ////non linear regression results////\n");
			
 
				+	FPRINTF(stdout, "\n ////non linear regression results////\n");
			
 
				 
			
 
				-    compare_performance(size, &nl_memset_cl,task);
			
 
				+	compare_performance(size, &nl_memset_cl, task);
			
 
				 
			
 
				-    starpu_task_destroy(task);
			
 
				+	starpu_task_destroy(task);
			
 
				 
			
 
				-    starpu_data_unregister(handle);
			
 
				+	starpu_data_unregister(handle);
			
 
				 
			
 
				-    starpu_shutdown();
			
 
				+	starpu_shutdown();
			
 
				 
			
 
				-    return EXIT_SUCCESS;
			
 
				+	return EXIT_SUCCESS;
			
 
				 }
			
--- a/tests/perfmodels/regression_based_04.c
+++ b/tests/perfmodels/regression_based_04.c
@@ -0,0 +1,387 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2011-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ * Copyright (C) 2011       Télécom-SudParis
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_scheduler.h>
			
 
				+#include "../helper.h"
			
 
				+
			
 
				+/*
			
 
				+ * A multi-implementation benchmark with dmda scheduler
			
 
				+ * we aim to test OPENCL workers and calculate the estimated time for each type of worker (CPU or OPENCL or CUDA)
			
 
				+ * dmda chooses OPENCL workers for large sizes (variable size of compare_performance) size=1234567
			
 
				+ * dmda chooses CPU workers for small sizes (size=1234)
			
 
				+ */
			
 
				+
			
 
				+/* First vector size (in elements) of the linear-regression calibration loop in main(). */
				+#define STARTlin (512*1024)
			
 
				+/* Base size of the non-linear calibration loop; main() starts that loop at START*1.5. */
				+#define START 1024
			
 
				+/* Exclusive upper bound of both calibration loops; reduced when STARPU_QUICK_CHECK is defined. */
				+#ifdef STARPU_QUICK_CHECK
			
 
				+#define END 1048576
			
 
				+#else
			
 
				+#define END 16777216
			
 
				+#endif
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+/*
				+ * CUDA implementation of the codelet: queue an asynchronous memset of
				+ * the whole vector (byte value 42) on the stream returned by
				+ * starpu_cuda_get_local_stream().
				+ */
				+static void memset_cuda(void *descr[], void *arg)
				+{
				+	(void)arg;
				+	STARPU_SKIP_IF_VALGRIND;
				+
				+	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
				+	int *vec = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
				+
				+	cudaMemsetAsync(vec, 42, nx * sizeof(*vec), starpu_cuda_get_local_stream());
				+}
			
 
				+#endif
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+/* OpenCL implementations of the codelet; only declared here, their definitions live outside this file. */
				+extern void memset0_opencl(void *buffers[], void *args);
			
 
				+extern void memset_opencl(void *buffers[], void *args);
			
 
				+#endif
			
 
				+
			
 
				+/*
				+ * First CPU implementation (slot 0 of cpu_funcs). Despite its name it
				+ * does not fill the vector: it walks over the nx element indices and
				+ * accumulates each index into element 0 only.
				+ */
				+void memset0_cpu(void *descr[], void *arg)
				+{
				+	(void)arg;
				+	STARPU_SKIP_IF_VALGRIND;
				+
				+	unsigned count = STARPU_VECTOR_GET_NX(descr[0]);
				+	int *vec = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
				+	unsigned k = 0;
				+
				+	while (k < count)
				+	{
				+		vec[0] += k;
				+		k++;
				+	}
				+}
			
 
				+
			
 
				+/*
				+ * Second CPU implementation (slot 1 of cpu_funcs): fill every byte of
				+ * the vector with the value 42 using memset().
				+ */
				+void memset_cpu(void *descr[], void *arg)
				+{
				+	(void)arg;
				+	STARPU_SKIP_IF_VALGRIND;
				+
				+	unsigned count = STARPU_VECTOR_GET_NX(descr[0]);
				+	int *vec = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
				+
				+	memset(vec, 42, count * sizeof(*vec));
				+}
			
 
				+
			
 
				+/* Performance model attached to memset_cl: regression-based, stored under the symbol below. */
				+static struct starpu_perfmodel model =
				+{
				+	.symbol = "memset_regression_based",
				+	.type = STARPU_REGRESSION_BASED
				+};
			
 
				+
			
 
				+/* Performance model attached to nl_memset_cl: non-linear regression, stored under the symbol below. */
				+static struct starpu_perfmodel nl_model =
				+{
				+	.symbol = "non_linear_memset_regression_based",
				+	.type = STARPU_NL_REGRESSION_BASED
				+};
			
 
				+
			
 
				+/*
				+ * Codelet calibrated with the linear regression model. It writes one
				+ * buffer and offers two CPU implementations, plus CUDA / OpenCL ones
				+ * when StarPU is built with the corresponding support.
				+ */
				+static struct starpu_codelet memset_cl =
				+{
				+	.nbuffers = 1,
				+	.modes = {STARPU_W},
				+	.model = &model,
				+	.cpu_funcs = {memset0_cpu, memset_cpu},
				+	.cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
				+#ifdef STARPU_USE_OPENCL
				+	/* Two OpenCL implementations; opencl_flags names a single entry. */
				+	.opencl_funcs = {memset0_opencl, memset_opencl},
				+	.opencl_flags = {STARPU_OPENCL_ASYNC},
				+#endif
				+#ifdef STARPU_USE_CUDA
				+	.cuda_funcs = {memset_cuda},
				+	.cuda_flags = {STARPU_CUDA_ASYNC},
				+#endif
				+};
			
 
				+
			
 
				+/*
				+ * Same kernels as memset_cl, but attached to the non-linear
				+ * regression model nl_model.
				+ */
				+static struct starpu_codelet nl_memset_cl =
				+{
				+	.nbuffers = 1,
				+	.modes = {STARPU_W},
				+	.model = &nl_model,
				+	.cpu_funcs = {memset0_cpu, memset_cpu},
				+	.cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
				+#ifdef STARPU_USE_OPENCL
				+	/* Two OpenCL implementations; opencl_flags names a single entry. */
				+	.opencl_funcs = {memset0_opencl, memset_opencl},
				+	.opencl_flags = {STARPU_OPENCL_ASYNC},
				+#endif
				+#ifdef STARPU_USE_CUDA
				+	.cuda_funcs = {memset_cuda},
				+	.cuda_flags = {STARPU_CUDA_ASYNC},
				+#endif
				+};
			
 
				+
			
 
				+/*
				+ * Submit 100 tasks running `codelet` on a freshly registered vector of
				+ * `nelems` ints (home node -1, no user buffer), then unregister the
				+ * vector. Exits with STARPU_TEST_SKIPPED when submission returns
				+ * -ENODEV; any other submission error goes through
				+ * STARPU_CHECK_RETURN_VALUE.
				+ */
				+static void test_memset(int nelems, struct starpu_codelet *codelet)
				+{
				+	const int nsubmissions = 100;
				+	starpu_data_handle_t vector;
				+	int n;
				+
				+	starpu_vector_data_register(&vector, -1, (uintptr_t)NULL, nelems, sizeof(int));
				+
				+	n = 0;
				+	while (n < nsubmissions)
				+	{
				+		struct starpu_task *t = starpu_task_create();
				+		int ret;
				+
				+		t->handles[0] = vector;
				+		t->cl = codelet;
				+
				+		ret = starpu_task_submit(t);
				+		if (ret == -ENODEV)
				+			exit(STARPU_TEST_SKIPPED);
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
				+
				+		n++;
				+	}
				+
				+	starpu_data_unregister(vector);
				+}
			
 
				+
			
 
				+/*
				+ * Run `codelet` 100 times on a vector of `size` ints and compare what
				+ * was measured with what the performance model predicts.
				+ *
				+ * Each task is submitted synchronously with destroy = 0 so that its
				+ * profiling info can still be read afterwards. Execution times are
				+ * accumulated separately for CPU workers and for accelerator workers
				+ * (OpenCL and CUDA share one accumulator), and each mean is taken over
				+ * the tasks that actually ran on that kind of worker.
				+ *
				+ * compar_task is only used to query the model through
				+ * starpu_task_expected_length(); it is not submitted here.
				+ *
				+ * Exits with STARPU_TEST_SKIPPED if no worker can run the codelet.
				+ */
				+static void compare_performance(int size, struct starpu_codelet *codelet, struct starpu_task *compar_task)
				+{
				+	unsigned i;
				+	unsigned niter = 100;
				+	starpu_data_handle_t handle;
				+
				+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
				+
				+	struct starpu_task **tasks = malloc(niter * sizeof(*tasks));
				+	assert(tasks);
				+
				+	for (i = 0; i < niter; i++)
				+	{
				+		struct starpu_task *task = starpu_task_create();
				+
				+		task->cl = codelet;
				+		task->handles[0] = handle;
				+		task->synchronous = 1;
				+
				+		/* We will destroy the task structure by hand so that we can
				+		 * query the profiling info before the task is destroyed. */
				+		task->destroy = 0;
				+
				+		tasks[i] = task;
				+
				+		int ret = starpu_task_submit(task);
				+
				+		if (STARPU_UNLIKELY(ret == -ENODEV))
				+		{
				+			FPRINTF(stderr, "No worker may execute this task\n");
				+			/* Report "skipped" like test_memset() does, not success. */
				+			exit(STARPU_TEST_SKIPPED);
				+		}
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
				+	}
				+
				+	starpu_data_unregister(handle);
				+
				+	starpu_task_wait_for_all();
				+
				+	double length_cpu_sum = 0.0;
				+	double length_gpu_sum = 0.0;
				+	/* Number of tasks behind each sum, so the means are not diluted
				+	 * by tasks that ran on the other kind of worker. */
				+	unsigned ncpu_tasks = 0;
				+	unsigned ngpu_tasks = 0;
				+
				+	for (i = 0; i < niter; i++)
				+	{
				+		struct starpu_task *task = tasks[i];
				+		struct starpu_profiling_task_info *info = task->profiling_info;
				+
				+		/* How long was the task execution ? */
				+		double length = starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
				+
				+		switch (starpu_worker_get_type(info->workerid))
				+		{
				+		case STARPU_CPU_WORKER:
				+			FPRINTF(stdout, "cpuuu\n");
				+			length_cpu_sum += length;
				+			ncpu_tasks++;
				+			break;
				+
				+		case STARPU_OPENCL_WORKER:
				+			FPRINTF(stdout, "openclllllll\n");
				+			length_gpu_sum += length;
				+			ngpu_tasks++;
				+			break;
				+
				+		case STARPU_CUDA_WORKER:
				+			FPRINTF(stdout, "cudaaaaaa\n");
				+			length_gpu_sum += length;
				+			ngpu_tasks++;
				+			break;
				+
				+		default:
				+			FPRINTF(stdout, "unsupported!\n");
				+			break;
				+		}
				+
				+		/* We don't need the task structure anymore */
				+		starpu_task_destroy(task);
				+	}
				+
				+	/* Every task has been destroyed; release the bookkeeping array too. */
				+	free(tasks);
				+
				+	double cpu_mean = ncpu_tasks ? length_cpu_sum / ncpu_tasks : 0.0;
				+	double gpu_mean = ngpu_tasks ? length_gpu_sum / ngpu_tasks : 0.0;
				+
				+	unsigned worker;
				+
				+	/* Display the number of workers of each kind */
				+	unsigned ncpus = starpu_cpu_worker_get_count();
				+	unsigned ngpus = starpu_opencl_worker_get_count()+starpu_cuda_worker_get_count();
				+
				+	FPRINTF(stderr, "ncpus %u \n", ncpus);
				+	FPRINTF(stderr, "ngpus %u \n", ngpus);
				+
				+	for (worker = 0; worker < starpu_worker_get_count(); worker++)
				+	{
				+		struct starpu_profiling_worker_info worker_info;
				+		int ret = starpu_profiling_worker_get_info(worker, &worker_info);
				+		STARPU_ASSERT(!ret);
				+
				+		char workername[128];
				+		starpu_worker_get_name(worker, workername, sizeof(workername));
				+		unsigned nimpl;
				+
				+		FPRINTF(stdout, "\n Worker :%s ::::::::::\n\n", workername);
				+
				+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
				+		{
				+			switch (starpu_worker_get_type(worker))
				+			{
				+			case STARPU_CPU_WORKER:
				+				FPRINTF(stdout, "Expected time for %d on %s (impl %u): %f, Measured time: %f \n",
				+						size, workername, nimpl,starpu_task_expected_length(compar_task, starpu_worker_get_perf_archtype(worker, compar_task->sched_ctx), nimpl), cpu_mean);
				+				break;
				+
				+			case STARPU_OPENCL_WORKER:
				+			case STARPU_CUDA_WORKER:
				+				/* Both accelerator kinds report the shared accelerator mean. */
				+				FPRINTF(stdout, "Expectedd time for %d on %s (impl %u): %f, Measuredd time: %f \n",
				+						size, workername, nimpl,starpu_task_expected_length(compar_task, starpu_worker_get_perf_archtype(worker, compar_task->sched_ctx), nimpl), gpu_mean);
				+				break;
				+
				+			default:
				+				FPRINTF(stdout, "unsupported!\n");
				+				break;
				+			}
				+		}
				+	}
				+}
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+/* OpenCL program handle, loaded and unloaded by main(); it only exists when STARPU_USE_OPENCL is defined.
				+ * NOTE(review): non-static, presumably so the external memset*_opencl implementations can reference it -- confirm. */
				+struct starpu_opencl_program opencl_program;
			
 
				+#endif
			
 
				+
			
 
				+/*
				+ * Test driver, in two StarPU sessions:
				+ *  1. calibration: run with conf.calibrate = 2 and the dmda scheduler,
				+ *     submitting memset tasks over a range of sizes for both the
				+ *     linear and the non-linear regression codelets;
				+ *  2. test: restart StarPU with conf.calibrate = 0 and compare, for a
				+ *     size of 1234567 elements, the time predicted by the non-linear
				+ *     model with the time actually measured.
				+ *
				+ * Returns STARPU_TEST_SKIPPED when StarPU reports -ENODEV at
				+ * initialization, EXIT_SUCCESS otherwise.
				+ */
				+int main(int argc, char **argv)
				+{
				+	/* Enable profiling */
				+	starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
				+
				+	struct starpu_conf conf;
				+	starpu_data_handle_t handle;
				+	int ret;
				+
				+	/* Calibration Phase */
				+	starpu_conf_init(&conf);
				+
				+	conf.sched_policy_name = "dmda";
				+	conf.calibrate = 2;
				+
				+	ret = starpu_initialize(&conf, &argc, &argv);
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
				+
				+#ifdef STARPU_USE_OPENCL
				+	ret = starpu_opencl_load_opencl_from_file("tests/perfmodels/opencl_memset_kernel.cl",
				+			&opencl_program, NULL);
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
				+#endif
				+
				+	int size;
				+	for (size = STARTlin; size < END; size *= 2)
				+	{
				+		/* Use a linear regression */
				+		test_memset(size, &memset_cl);
				+	}
				+
				+	for (size = START*1.5; size < END; size *= 2)
				+	{
				+		/* Use a non-linear regression */
				+		test_memset(size, &nl_memset_cl);
				+	}
				+
				+	ret = starpu_task_wait_for_all();
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
				+
				+#ifdef STARPU_USE_OPENCL
				+	/* Release the program before this session ends: it is loaded
				+	 * again once StarPU has been re-initialized below. */
				+	ret = starpu_opencl_unload_opencl(&opencl_program);
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
				+#endif
				+	starpu_shutdown();
				+
				+	/* Test Phase */
				+	starpu_conf_init(&conf);
				+
				+	conf.sched_policy_name = "dmda";
				+	conf.calibrate = 0;
				+
				+	ret = starpu_initialize(&conf, &argc, &argv);
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
				+
				+#ifdef STARPU_USE_OPENCL
				+	/* opencl_program only exists when OpenCL support is compiled in,
				+	 * so this load must be guarded like the first one. */
				+	ret = starpu_opencl_load_opencl_from_file("tests/perfmodels/opencl_memset_kernel.cl",
				+			&opencl_program, NULL);
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
				+#endif
				+
				+	/* Now create a dummy task just to estimate its duration according to the regression */
				+
				+	size = 1234567;
				+
				+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
				+
				+	/* Only the non-linear model is compared; the linear one could be
				+	 * checked the same way with memset_cl. */
				+	struct starpu_task *task = starpu_task_create();
				+	task->cl = &nl_memset_cl;
				+	task->handles[0] = handle;
				+	task->destroy = 0;
				+
				+	FPRINTF(stdout, "\n ////non linear regression results////\n");
				+
				+	compare_performance(size, &nl_memset_cl, task);
				+
				+	starpu_task_destroy(task);
				+
				+	starpu_data_unregister(handle);
				+
				+#ifdef STARPU_USE_OPENCL
				+	ret = starpu_opencl_unload_opencl(&opencl_program);
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
				+#endif
				+	starpu_shutdown();
				+
				+	return EXIT_SUCCESS;
				+}