Browse Source

Merge branch 'master' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu into fpga

Samuel Thibault 5 years ago
parent
commit
f1cffb99e7
62 changed files with 1125 additions and 677 deletions
  1. 5 0
      doc/doxygen/chapters/320_scheduling.doxy
  2. 19 7
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  3. 1 0
      doc/doxygen/chapters/510_configure_options.doxy
  4. 1 0
      doc/doxygen/chapters/code/vector_scal_opencl.c
  5. 2 0
      examples/axpy/axpy_opencl.c
  6. 2 0
      examples/basic_examples/multiformat_conversion_codelets_opencl.c
  7. 2 0
      examples/basic_examples/multiformat_opencl.c
  8. 1 0
      examples/basic_examples/vector_scal_opencl.c
  9. 2 0
      examples/filters/custom_mf/conversion_opencl.c
  10. 2 0
      examples/filters/custom_mf/custom_opencl.c
  11. 2 0
      examples/interface/complex_kernels_opencl.c
  12. 2 14
      examples/reductions/dot_product.c
  13. 1 0
      examples/reductions/dot_product_opencl_kernels.cl
  14. 15 0
      julia/examples/cholesky/cholesky_common.jl
  15. 15 0
      julia/examples/cholesky/cholesky_native.jl
  16. 15 0
      julia/src/openblas_ldflags.jl
  17. 69 0
      mpi/examples/Makefile.am
  18. 0 0
      mpi/examples/benchs/abstract_sendrecv_bench.c
  19. 0 0
      mpi/examples/benchs/abstract_sendrecv_bench.h
  20. 0 0
      mpi/examples/benchs/bench_helper.c
  21. 0 0
      mpi/examples/benchs/bench_helper.h
  22. 2 6
      mpi/tests/burst.c
  23. 3 7
      mpi/tests/burst_gemm.c
  24. 0 0
      mpi/examples/benchs/burst_helper.c
  25. 0 0
      mpi/examples/benchs/burst_helper.h
  26. 0 0
      mpi/examples/benchs/gemm_helper.c
  27. 0 0
      mpi/examples/benchs/gemm_helper.h
  28. 2 7
      mpi/tests/sendrecv_bench.c
  29. 6 10
      mpi/tests/sendrecv_gemm_bench.c
  30. 3 9
      mpi/tests/sendrecv_parallel_tasks_bench.c
  31. 0 54
      mpi/tests/Makefile.am
  32. 1 1
      src/common/utils.c
  33. 8 2
      src/profiling/profiling.c
  34. 1 0
      src/sched_policies/component_heft.c
  35. 1 0
      src/sched_policies/component_heteroprio.c
  36. 1 0
      src/sched_policies/component_mct.c
  37. 5 0
      src/sched_policies/deque_modeling_policy_data_aware.c
  38. 5 0
      src/sched_policies/helper_mct.c
  39. 1 0
      src/sched_policies/helper_mct.h
  40. 4 0
      src/sched_policies/parallel_heft.c
  41. 0 1
      starpufft/src/starpufftx3d.c
  42. 8 2
      tests/Makefile.am
  43. 2 0
      tests/datawizard/interfaces/bcsr/bcsr_opencl.c
  44. 2 0
      tests/datawizard/interfaces/coo/coo_opencl.c
  45. 2 0
      tests/datawizard/interfaces/csr/csr_opencl.c
  46. 2 0
      tests/datawizard/interfaces/matrix/matrix_opencl.c
  47. 2 0
      tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets_opencl.c
  48. 2 0
      tests/datawizard/interfaces/multiformat/multiformat_opencl.c
  49. 1 1
      tests/datawizard/interfaces/tensor/tensor_opencl.c
  50. 1 13
      tests/datawizard/interfaces/variable/variable_opencl.c
  51. 2 0
      tests/datawizard/interfaces/vector/vector_opencl.c
  52. 1 0
      tests/datawizard/scal.c
  53. 2 0
      tests/datawizard/scratch_opencl.c
  54. 2 0
      tests/main/display_binding.c
  55. 14 2
      tests/perfmodels/opencl_memset.c
  56. 7 0
      tests/perfmodels/opencl_memset_kernel.cl
  57. 0 31
      tests/perfmodels/opencl_memset_kernel_01.cl
  58. 5 1
      tests/perfmodels/regression_based.c
  59. 152 165
      tests/perfmodels/regression_based_01.c
  60. 167 171
      tests/perfmodels/regression_based_02.c
  61. 165 173
      tests/perfmodels/regression_based_03.c
  62. 387 0
      tests/perfmodels/regression_based_04.c

+ 5 - 0
doc/doxygen/chapters/320_scheduling.doxy

@@ -185,6 +185,11 @@ already gives the good results that a precise estimation would give.
 
 
 \section Energy-basedScheduling Energy-based Scheduling
 \section Energy-basedScheduling Energy-based Scheduling
 
 
+Note: by default StarPU does not let CPU workers sleep, to let them react to
+task release as quickly as possible. For idle time to really let CPU cores save
+energy, one needs to use the \ref enable-blocking-drivers
+"--enable-blocking-drivers" configuration option.
+
 If the application can provide some energy consumption performance model (through
 If the application can provide some energy consumption performance model (through
 the field starpu_codelet::energy_model), StarPU will
 the field starpu_codelet::energy_model), StarPU will
 take it into account when distributing tasks. The target function that
 take it into account when distributing tasks. The target function that

+ 19 - 7
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -586,19 +586,31 @@ $ starpu_paje_sort paje.trace
 \section PapiCounters PAPI counters
 \section PapiCounters PAPI counters
 
 
 Performance counter values could be obtained from the PAPI framework if
 Performance counter values could be obtained from the PAPI framework if
-<c>./configure</c> detected the libpapi. One has to set the \ref STARPU_PROFILING
-environment variable to 1 and then specify which events to record with the
-\ref STARPU_PROF_PAPI_EVENTS environment variable. For instance:
+<c>./configure</c> detected the libpapi.
+
+On Debian, the packages <c>libpapi-dev</c> and <c>libpapi5.7</c> provide the
+required files. The package <c>papi-tools</c> contains a set of useful tools,
+such as <c>papi_avail</c>, which shows which counters are available.
+
+To be able to use PAPI counters, one may need to lower the kernel parameter
+<c>kernel.perf_event_paranoid</c> to 2 or below. See
+https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html for the
+security impact of this parameter.
+
+Then one has to set the \ref STARPU_PROFILING environment variable to 1 and
+specify which events to record with the \ref STARPU_PROF_PAPI_EVENTS
+environment variable. For instance:
 
 
 \verbatim
 \verbatim
 export STARPU_PROFILING=1 STARPU_PROF_PAPI_EVENTS="PAPI_TOT_INS PAPI_TOT_CYC"
 export STARPU_PROFILING=1 STARPU_PROF_PAPI_EVENTS="PAPI_TOT_INS PAPI_TOT_CYC"
 \endverbatim
 \endverbatim
 
 
+The events to monitor can also be separated by commas instead of spaces.
+
 In the current simple implementation, only CPU tasks have their events measured
 In the current simple implementation, only CPU tasks have their events measured
-and require CPUs that support the PAPI events. All events that PAPI support are
-available from their documentation (https://icl.cs.utk.edu/projects/papi/wiki/PAPIC:Preset_Event_Definitions).
-It is important to note that not all events are available on all systems, and
-general PAPI recommendations should be followed.
+and require CPUs that support the PAPI events. It is important to note that not
+all events are available on all systems, and general PAPI recommendations
+should be followed.
 
 
 The counter values can be accessed using the profiling interface:
 The counter values can be accessed using the profiling interface:
 \code{.c}
 \code{.c}

+ 1 - 0
doc/doxygen/chapters/510_configure_options.doxy

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 0
doc/doxygen/chapters/code/vector_scal_opencl.c

@@ -57,6 +57,7 @@ void scal_opencl_func(void *buffers[], void *_args)
         err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
         err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
         if (local > global) local=global;
         if (local > global) local=global;
+        else global = (global + local-1) / local * local;
 
 
         err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
         err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

+ 2 - 0
examples/axpy/axpy_opencl.c

@@ -60,6 +60,8 @@ void axpy_opencl(void *buffers[], void *_args)
 			STARPU_OPENCL_REPORT_ERROR(err);
 			STARPU_OPENCL_REPORT_ERROR(err);
                 if (local > global)
                 if (local > global)
 			local=global;
 			local=global;
+                else
+                        global = (global + local-1) / local * local;
 
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)
 		if (err != CL_SUCCESS)

+ 2 - 0
examples/basic_examples/multiformat_conversion_codelets_opencl.c

@@ -74,6 +74,8 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 
 
                 if (local > global)
                 if (local > global)
 			local = global;
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 
 		err = clEnqueueNDRangeKernel(queue,
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,
 					kernel,

+ 2 - 0
examples/basic_examples/multiformat_opencl.c

@@ -68,6 +68,8 @@ void multiformat_scal_opencl_func(void *buffers[], void *args)
 
 
                 if (local > global)
                 if (local > global)
 			local = global;
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 
 		err = clEnqueueNDRangeKernel(queue,
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,
 					kernel,

+ 1 - 0
examples/basic_examples/vector_scal_opencl.c

@@ -57,6 +57,7 @@ void scal_opencl_func(void *buffers[], void *_args)
                 err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
                 err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
                 if (local > global) local=global;
                 if (local > global) local=global;
+                else global = (global + local-1) / local * local;
 
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

+ 2 - 0
examples/filters/custom_mf/conversion_opencl.c

@@ -76,6 +76,8 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 
 
                 if (local > global)
                 if (local > global)
 			local = global;
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 
 		err = clEnqueueNDRangeKernel(
 		err = clEnqueueNDRangeKernel(
 				queue,
 				queue,

+ 2 - 0
examples/filters/custom_mf/custom_opencl.c

@@ -75,6 +75,8 @@ void custom_scal_opencl_func(void *buffers[], void *args)
 
 
                 if (local > global)
                 if (local > global)
 			local = global;
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 
 		err = clEnqueueNDRangeKernel(
 		err = clEnqueueNDRangeKernel(
 				queue,
 				queue,

+ 2 - 0
examples/interface/complex_kernels_opencl.c

@@ -64,6 +64,8 @@ void copy_complex_codelet_opencl(void *buffers[], void *_args)
 			STARPU_OPENCL_REPORT_ERROR(err);
 			STARPU_OPENCL_REPORT_ERROR(err);
                 if (local > global)
                 if (local > global)
 			local=global;
 			local=global;
+                else
+                        global = (global + local-1) / local * local;
 
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)
 		if (err != CL_SUCCESS)

+ 2 - 14
examples/reductions/dot_product.c

@@ -185,18 +185,12 @@ void redux_opencl_func(void *buffers[], void *args)
 
 
 	{
 	{
 		size_t global=1;
 		size_t global=1;
-		size_t local;
+                size_t local=1;
                 size_t s;
                 size_t s;
                 cl_device_id device;
                 cl_device_id device;
 
 
                 starpu_opencl_get_device(devid, &device);
                 starpu_opencl_get_device(devid, &device);
 
 
-                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
-                if (err != CL_SUCCESS)
-			STARPU_OPENCL_REPORT_ERROR(err);
-                if (local > global)
-			local=global;
-
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)
 		if (err != CL_SUCCESS)
 			STARPU_OPENCL_REPORT_ERROR(err);
 			STARPU_OPENCL_REPORT_ERROR(err);
@@ -306,18 +300,12 @@ void dot_opencl_func(void *buffers[], void *cl_arg)
 
 
 	{
 	{
 		size_t global=1;
 		size_t global=1;
-		size_t local;
+                size_t local=1;
                 size_t s;
                 size_t s;
                 cl_device_id device;
                 cl_device_id device;
 
 
                 starpu_opencl_get_device(devid, &device);
                 starpu_opencl_get_device(devid, &device);
 
 
-                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
-                if (err != CL_SUCCESS)
-			STARPU_OPENCL_REPORT_ERROR(err);
-                if (local > global)
-			local=global;
-
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)
 		if (err != CL_SUCCESS)
 			STARPU_OPENCL_REPORT_ERROR(err);
 			STARPU_OPENCL_REPORT_ERROR(err);

+ 1 - 0
examples/reductions/dot_product_opencl_kernels.cl

@@ -31,6 +31,7 @@ __kernel void _dot_opencl(__global float *x,
 			  __global DOT_TYPE *dot,
 			  __global DOT_TYPE *dot,
 			  unsigned n)
 			  unsigned n)
 {
 {
+/* FIXME: real parallel implementation */
 	unsigned i;
 	unsigned i;
 	__local double tmp;
 	__local double tmp;
 	tmp = 0.0;
 	tmp = 0.0;

+ 15 - 0
julia/examples/cholesky/cholesky_common.jl

@@ -1,3 +1,18 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
 # Standard kernels for the Cholesky factorization
 # Standard kernels for the Cholesky factorization
 # U22 is the gemm update
 # U22 is the gemm update
 # U21 is the trsm update
 # U21 is the trsm update

+ 15 - 0
julia/examples/cholesky/cholesky_native.jl

@@ -1,3 +1,18 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
 using LinearAlgebra
 using LinearAlgebra
 
 
 function check(mat::Matrix{Float32})
 function check(mat::Matrix{Float32})

+ 15 - 0
julia/src/openblas_ldflags.jl

@@ -1,3 +1,18 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
 import LinearAlgebra.BLAS
 import LinearAlgebra.BLAS
 import Libdl
 import Libdl
 
 

+ 69 - 0
mpi/examples/Makefile.am

@@ -83,6 +83,10 @@ EXTRA_DIST = 				\
 	matrix_decomposition/mpi_decomposition_params.h	\
 	matrix_decomposition/mpi_decomposition_params.h	\
 	matrix_decomposition/mpi_decomposition_matrix.h	\
 	matrix_decomposition/mpi_decomposition_matrix.h	\
 	user_datatype/my_interface.h			\
 	user_datatype/my_interface.h			\
+	benchs/abstract_sendrecv_bench.h	\
+	benchs/bench_helper.h			\
+	benchs/gemm_helper.h			\
+	benchs/burst_helper.h			\
 	helper.h
 	helper.h
 
 
 examplebindir = $(libdir)/starpu/mpi
 examplebindir = $(libdir)/starpu/mpi
@@ -399,3 +403,68 @@ native_fortran/nf_mm_task_build.o: nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.
 native_fortran/nf_basic_ring.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
 native_fortran/nf_basic_ring.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
 endif
 endif
 endif
 endif
+
+
+##########
+# benchs #
+##########
+
+examplebin_PROGRAMS +=		\
+	benchs/sendrecv_bench	\
+	benchs/burst
+
+if !STARPU_USE_MPI_MPI
+examplebin_PROGRAMS +=		\
+	benchs/sendrecv_parallel_tasks_bench
+endif
+
+if !STARPU_NO_BLAS_LIB
+examplebin_PROGRAMS +=		\
+	benchs/sendrecv_gemm_bench			\
+	benchs/burst_gemm
+endif
+
+if !STARPU_SIMGRID
+starpu_mpi_EXAMPLES	+=	\
+	benchs/sendrecv_bench	\
+	benchs/burst
+
+if !STARPU_USE_MPI_MPI
+starpu_mpi_EXAMPLES	+=	\
+	benchs/sendrecv_parallel_tasks_bench
+endif
+
+if !STARPU_NO_BLAS_LIB
+starpu_mpi_EXAMPLES	+=	\
+	benchs/sendrecv_gemm_bench			\
+	benchs/burst_gemm
+endif
+endif
+
+benchs_sendrecv_bench_SOURCES = benchs/sendrecv_bench.c
+benchs_sendrecv_bench_SOURCES += benchs/bench_helper.c
+benchs_sendrecv_bench_SOURCES += benchs/abstract_sendrecv_bench.c
+
+benchs_sendrecv_parallel_tasks_bench_SOURCES = benchs/sendrecv_parallel_tasks_bench.c
+benchs_sendrecv_parallel_tasks_bench_SOURCES += benchs/bench_helper.c
+benchs_sendrecv_parallel_tasks_bench_SOURCES += benchs/abstract_sendrecv_bench.c
+
+benchs_burst_SOURCES = benchs/burst.c
+benchs_burst_SOURCES += benchs/burst_helper.c
+
+if !STARPU_NO_BLAS_LIB
+benchs_sendrecv_gemm_bench_SOURCES = benchs/sendrecv_gemm_bench.c
+benchs_sendrecv_gemm_bench_SOURCES += benchs/bench_helper.c
+benchs_sendrecv_gemm_bench_SOURCES += benchs/gemm_helper.c
+benchs_sendrecv_gemm_bench_SOURCES += benchs/abstract_sendrecv_bench.c
+benchs_sendrecv_gemm_bench_SOURCES += ../../examples/common/blas.c
+
+benchs_sendrecv_gemm_bench_LDADD = $(STARPU_BLAS_LDFLAGS)
+
+benchs_burst_gemm_SOURCES = benchs/burst_gemm.c
+benchs_burst_gemm_SOURCES += benchs/gemm_helper.c
+benchs_burst_gemm_SOURCES += benchs/burst_helper.c
+benchs_burst_gemm_SOURCES += ../../examples/common/blas.c
+
+benchs_burst_gemm_LDADD = $(STARPU_BLAS_LDFLAGS)
+endif

mpi/tests/abstract_sendrecv_bench.c → mpi/examples/benchs/abstract_sendrecv_bench.c


mpi/tests/abstract_sendrecv_bench.h → mpi/examples/benchs/abstract_sendrecv_bench.h


mpi/tests/bench_helper.c → mpi/examples/benchs/bench_helper.c


mpi/tests/bench_helper.h → mpi/examples/benchs/bench_helper.h


+ 2 - 6
mpi/tests/burst.c

@@ -49,13 +49,11 @@ void parse_args(int argc, char **argv)
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-	int ret, rank, mpi_init, other_rank;
+	int ret, rank, other_rank;
 
 
 	parse_args(argc, argv);
 	parse_args(argc, argv);
 
 
-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
-
-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
 
 
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
@@ -68,8 +66,6 @@ int main(int argc, char **argv)
 	burst_free_data(rank);
 	burst_free_data(rank);
 
 
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
-	if (!mpi_init)
-		MPI_Finalize();
 
 
 	return 0;
 	return 0;
 }
 }

+ 3 - 7
mpi/tests/burst_gemm.c

@@ -90,12 +90,11 @@ void parse_args(int argc, char **argv)
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-	int ret, mpi_init, worldsize, mpi_rank;
+	int ret, worldsize, mpi_rank;
 
 
 	parse_args(argc, argv);
 	parse_args(argc, argv);
 
 
-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
 	if (ret == -ENODEV)
 	if (ret == -ENODEV)
 		return 77;
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
@@ -109,8 +108,7 @@ int main(int argc, char **argv)
 			FPRINTF(stderr, "We need 2 processes.\n");
 			FPRINTF(stderr, "We need 2 processes.\n");
 
 
 		starpu_mpi_shutdown();
 		starpu_mpi_shutdown();
-		if (!mpi_init)
-			MPI_Finalize();
+
 		return STARPU_TEST_SKIPPED;
 		return STARPU_TEST_SKIPPED;
 	}
 	}
 
 
@@ -203,8 +201,6 @@ enodev:
 	burst_free_data(mpi_rank);
 	burst_free_data(mpi_rank);
 
 
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
-	if (!mpi_init)
-		MPI_Finalize();
 
 
 	return ret;
 	return ret;
 }
 }

mpi/tests/burst_helper.c → mpi/examples/benchs/burst_helper.c


mpi/tests/burst_helper.h → mpi/examples/benchs/burst_helper.h


mpi/tests/gemm_helper.c → mpi/examples/benchs/gemm_helper.c


mpi/tests/gemm_helper.h → mpi/examples/benchs/gemm_helper.h


+ 2 - 7
mpi/tests/sendrecv_bench.c

@@ -26,7 +26,6 @@
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
 	int ret, rank, worldsize;
 	int ret, rank, worldsize;
-	int mpi_init;
 	int pause_workers = 0;
 	int pause_workers = 0;
 
 
 
 
@@ -52,8 +51,7 @@ int main(int argc, char **argv)
 	}
 	}
 
 
 
 
-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
 
 
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
@@ -65,8 +63,7 @@ int main(int argc, char **argv)
 			FPRINTF(stderr, "We need 2 processes.\n");
 			FPRINTF(stderr, "We need 2 processes.\n");
 
 
 		starpu_mpi_shutdown();
 		starpu_mpi_shutdown();
-		if (!mpi_init)
-			MPI_Finalize();
+
 		return STARPU_TEST_SKIPPED;
 		return STARPU_TEST_SKIPPED;
 	}
 	}
 
 
@@ -85,8 +82,6 @@ int main(int argc, char **argv)
 	}
 	}
 
 
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
-	if (!mpi_init)
-		MPI_Finalize();
 
 
 	return 0;
 	return 0;
 }
 }

+ 6 - 10
mpi/tests/sendrecv_gemm_bench.c

@@ -53,7 +53,7 @@ static void* comm_thread_func(void* arg)
 	{
 	{
 		char hostname[65];
 		char hostname[65];
 		gethostname(hostname, sizeof(hostname));
 		gethostname(hostname, sizeof(hostname));
-		_STARPU_DISP("[%s] No core was available for the comm thread. You should increase STARPU_RESERVE_NCPU or decrease STARPU_NCPU\n", hostname);
+		fprintf(stderr, "[%s] No core was available for the comm thread. You should increase STARPU_RESERVE_NCPU or decrease STARPU_NCPU\n", hostname);
 	}
 	}
 
 
 	sendrecv_bench(mpi_rank, &thread_barrier);
 	sendrecv_bench(mpi_rank, &thread_barrier);
@@ -118,7 +118,7 @@ void parse_args(int argc, char **argv)
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
 	double start, end;
 	double start, end;
-	int ret, mpi_init, worldsize;
+	int ret, worldsize;
 	starpu_pthread_t comm_thread;
 	starpu_pthread_t comm_thread;
 
 
 	char hostname[255];
 	char hostname[255];
@@ -128,8 +128,7 @@ int main(int argc, char **argv)
 
 
 	starpu_fxt_autostart_profiling(0);
 	starpu_fxt_autostart_profiling(0);
 
 
-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
 	if (ret == -ENODEV)
 	if (ret == -ENODEV)
 		return 77;
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
@@ -143,8 +142,7 @@ int main(int argc, char **argv)
 			FPRINTF(stderr, "We need 2 processes.\n");
 			FPRINTF(stderr, "We need 2 processes.\n");
 
 
 		starpu_mpi_shutdown();
 		starpu_mpi_shutdown();
-		if (!mpi_init)
-			MPI_Finalize();
+
 		return STARPU_TEST_SKIPPED;
 		return STARPU_TEST_SKIPPED;
 	}
 	}
 
 
@@ -162,7 +160,7 @@ int main(int argc, char **argv)
 
 
 	if (mpi_rank == 0)
 	if (mpi_rank == 0)
 	{
 	{
-		PRINTF("# node\tx\ty\tz\tms\tGFlops\n");
+		printf("# node\tx\ty\tz\tms\tGFlops\n");
 	}
 	}
 
 
 	starpu_pause();
 	starpu_pause();
@@ -185,7 +183,7 @@ int main(int argc, char **argv)
 	double timing = end - start;
 	double timing = end - start;
 	double flops = 2.0*((unsigned long long)matrix_dim) * ((unsigned long long)matrix_dim)*((unsigned long long)matrix_dim);
 	double flops = 2.0*((unsigned long long)matrix_dim) * ((unsigned long long)matrix_dim)*((unsigned long long)matrix_dim);
 
 
-	PRINTF("%s\t%u\t%u\t%u\t%.0f\t%.1f\n", hostname, matrix_dim, matrix_dim, matrix_dim, timing/1000.0, flops/timing/1000.0);
+	printf("%s\t%u\t%u\t%u\t%.0f\t%.1f\n", hostname, matrix_dim, matrix_dim, matrix_dim, timing/1000.0, flops/timing/1000.0);
 
 
 
 
 enodev:
 enodev:
@@ -200,8 +198,6 @@ enodev:
 
 
 	starpu_resume();
 	starpu_resume();
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
-	if (!mpi_init)
-		MPI_Finalize();
 
 
 	return ret;
 	return ret;
 }
 }

+ 3 - 9
mpi/tests/sendrecv_parallel_tasks_bench.c

@@ -134,10 +134,8 @@ static struct starpu_codelet cl =
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
 	int ret, rank, worldsize;
 	int ret, rank, worldsize;
-	int mpi_init;
 
 
-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
 
 
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
@@ -149,8 +147,7 @@ int main(int argc, char **argv)
 			FPRINTF(stderr, "We need 2 processes.\n");
 			FPRINTF(stderr, "We need 2 processes.\n");
 
 
 		starpu_mpi_shutdown();
 		starpu_mpi_shutdown();
-		if (!mpi_init)
-			MPI_Finalize();
+
 		return STARPU_TEST_SKIPPED;
 		return STARPU_TEST_SKIPPED;
 	}
 	}
 
 
@@ -162,8 +159,7 @@ int main(int argc, char **argv)
 	else if (rank >= 2)
 	else if (rank >= 2)
 	{
 	{
 		starpu_mpi_shutdown();
 		starpu_mpi_shutdown();
-		if (!mpi_init)
-			MPI_Finalize();
+
 		return 0;
 		return 0;
 	}
 	}
 
 
@@ -222,8 +218,6 @@ int main(int argc, char **argv)
 	free(mpi_tags);
 	free(mpi_tags);
 
 
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
-	if (!mpi_init)
-		MPI_Finalize();
 
 
 	return 0;
 	return 0;
 }
 }

+ 0 - 54
mpi/tests/Makefile.am

@@ -62,11 +62,7 @@ BUILT_SOURCES =
 CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log
 CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log
 
 
 EXTRA_DIST = 				\
 EXTRA_DIST = 				\
-	abstract_sendrecv_bench.h	\
-	bench_helper.h			\
 	helper.h			\
 	helper.h			\
-	gemm_helper.h			\
-	burst_helper.h			\
 	user_defined_datatype_value.h
 	user_defined_datatype_value.h
 
 
 examplebindir = $(libdir)/starpu/examples/mpi
 examplebindir = $(libdir)/starpu/examples/mpi
@@ -142,21 +138,8 @@ starpu_mpi_TESTS +=				\
 	temporary				\
 	temporary				\
 	user_defined_datatype			\
 	user_defined_datatype			\
 	early_stuff				\
 	early_stuff				\
-	sendrecv_bench				\
-	burst						\
 	display_bindings
 	display_bindings
 
 
-if !STARPU_USE_MPI_MPI
-starpu_mpi_TESTS +=				\
-	sendrecv_parallel_tasks_bench
-endif
-
-if !STARPU_NO_BLAS_LIB
-starpu_mpi_TESTS +=				\
-	sendrecv_gemm_bench			\
-	burst_gemm
-endif
-
 if !STARPU_SIMGRID
 if !STARPU_SIMGRID
 # missing support in simgrid
 # missing support in simgrid
 starpu_mpi_TESTS +=				\
 starpu_mpi_TESTS +=				\
@@ -244,18 +227,9 @@ noinst_PROGRAMS +=				\
 	starpu_redefine				\
 	starpu_redefine				\
 	load_balancer				\
 	load_balancer				\
 	driver					\
 	driver					\
-	sendrecv_bench				\
-	sendrecv_parallel_tasks_bench		\
-	burst					\
 	nothing							\
 	nothing							\
 	display_bindings
 	display_bindings
 
 
-if !STARPU_NO_BLAS_LIB
-noinst_PROGRAMS +=				\
-	sendrecv_gemm_bench			\
-	burst_gemm
-endif
-
 XFAIL_TESTS=					\
 XFAIL_TESTS=					\
 	policy_register_toomany			\
 	policy_register_toomany			\
 	policy_unregister			\
 	policy_unregister			\
@@ -285,31 +259,3 @@ mpi_earlyrecv2_SOURCES = mpi_earlyrecv2.c
 mpi_earlyrecv2_SOURCES += ../../examples/interface/complex_interface.c
 mpi_earlyrecv2_SOURCES += ../../examples/interface/complex_interface.c
 mpi_earlyrecv2_sync_SOURCES = mpi_earlyrecv2_sync.c
 mpi_earlyrecv2_sync_SOURCES = mpi_earlyrecv2_sync.c
 mpi_earlyrecv2_sync_SOURCES += ../../examples/interface/complex_interface.c
 mpi_earlyrecv2_sync_SOURCES += ../../examples/interface/complex_interface.c
-
-sendrecv_bench_SOURCES = sendrecv_bench.c
-sendrecv_bench_SOURCES += bench_helper.c
-sendrecv_bench_SOURCES += abstract_sendrecv_bench.c
-
-sendrecv_parallel_tasks_bench_SOURCES = sendrecv_parallel_tasks_bench.c
-sendrecv_parallel_tasks_bench_SOURCES += bench_helper.c
-sendrecv_parallel_tasks_bench_SOURCES += abstract_sendrecv_bench.c
-
-burst_SOURCES = burst.c
-burst_SOURCES += burst_helper.c
-
-if !STARPU_NO_BLAS_LIB
-sendrecv_gemm_bench_SOURCES = sendrecv_gemm_bench.c
-sendrecv_gemm_bench_SOURCES += bench_helper.c
-sendrecv_gemm_bench_SOURCES += gemm_helper.c
-sendrecv_gemm_bench_SOURCES += abstract_sendrecv_bench.c
-sendrecv_gemm_bench_SOURCES += ../../examples/common/blas.c
-
-sendrecv_gemm_bench_LDADD = $(STARPU_BLAS_LDFLAGS)
-
-burst_gemm_SOURCES = burst_gemm.c
-burst_gemm_SOURCES += gemm_helper.c
-burst_gemm_SOURCES += burst_helper.c
-burst_gemm_SOURCES += ../../examples/common/blas.c
-
-burst_gemm_LDADD = $(STARPU_BLAS_LDFLAGS)
-endif

+ 1 - 1
src/common/utils.c

@@ -743,7 +743,7 @@ int starpu_get_env_size_default(const char *str, int defval)
 
 
 void starpu_display_bindings(void)
 void starpu_display_bindings(void)
 {
 {
-#ifdef STARPU_HAVE_HWLOC
+#if defined(STARPU_HAVE_HWLOC) && !defined(STARPU_SIMGRID)
 	int hwloc_ret = system("hwloc-ps -a -t -c");
 	int hwloc_ret = system("hwloc-ps -a -t -c");
 	if (hwloc_ret)
 	if (hwloc_ret)
 	{
 	{

+ 8 - 2
src/profiling/profiling.c

@@ -46,6 +46,7 @@ static struct timespec executing_start_date[STARPU_NMAXWORKERS];
 #ifdef STARPU_PAPI
 #ifdef STARPU_PAPI
 static int papi_events[PAPI_MAX_HWCTRS];
 static int papi_events[PAPI_MAX_HWCTRS];
 static int papi_nevents = 0;
 static int papi_nevents = 0;
+static int warned_component_unavailable = 0;
 #endif
 #endif
 
 
 /* Store the busid of the different (src, dst) pairs. busid_matrix[src][dst]
 /* Store the busid of the different (src, dst) pairs. busid_matrix[src][dst]
@@ -160,7 +161,7 @@ void _starpu_profiling_init(void)
 		conf_papi_events = starpu_getenv("STARPU_PROF_PAPI_EVENTS");
 		conf_papi_events = starpu_getenv("STARPU_PROF_PAPI_EVENTS");
 		if (conf_papi_events != NULL)
 		if (conf_papi_events != NULL)
 		{
 		{
-			while ((papi_event_name = strtok_r(conf_papi_events, " ", &conf_papi_events)))
+			while ((papi_event_name = strtok_r(conf_papi_events, " ,", &conf_papi_events)))
 			{
 			{
 				_STARPU_DEBUG("Loading PAPI Event:%s\n", papi_event_name);
 				_STARPU_DEBUG("Loading PAPI Event:%s\n", papi_event_name);
 				retval = PAPI_event_name_to_code ((char*)papi_event_name, &papi_events[papi_nevents]);
 				retval = PAPI_event_name_to_code ((char*)papi_event_name, &papi_events[papi_nevents]);
@@ -188,7 +189,12 @@ void _starpu_profiling_papi_task_start_counters(struct starpu_task *task)
 		PAPI_create_eventset(&profiling_info->papi_event_set);
 		PAPI_create_eventset(&profiling_info->papi_event_set);
 		for(int i=0; i<papi_nevents; i++)
 		for(int i=0; i<papi_nevents; i++)
 		{
 		{
-			PAPI_add_event(profiling_info->papi_event_set, papi_events[i]);
+			int ret = PAPI_add_event(profiling_info->papi_event_set, papi_events[i]);
+			if (ret == PAPI_ECMP_DISABLED && !warned_component_unavailable)
+			{
+				_STARPU_MSG("Error while registering Papi event: Component containing event is disabled. Try running `papi_component_avail` to get more information.\n");
+				warned_component_unavailable = 1;
+			}
 			profiling_info->papi_values[i]=0;
 			profiling_info->papi_values[i]=0;
 		}
 		}
 		PAPI_reset(profiling_info->papi_event_set);
 		PAPI_reset(profiling_info->papi_event_set);

+ 1 - 0
src/sched_policies/component_heft.c

@@ -2,6 +2,7 @@
  *
  *
  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2013       Simon Archipoff
  * Copyright (C) 2013       Simon Archipoff
+ * Copyright (C) 2020       Télécom-Sud Paris
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 0
src/sched_policies/component_heteroprio.c

@@ -2,6 +2,7 @@
  *
  *
  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2013       Simon Archipoff
  * Copyright (C) 2013       Simon Archipoff
+ * Copyright (C) 2020       Télécom-Sud Paris
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 0
src/sched_policies/component_mct.c

@@ -2,6 +2,7 @@
  *
  *
  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2013       Simon Archipoff
  * Copyright (C) 2013       Simon Archipoff
+ * Copyright (C) 2020       Télécom-Sud Paris
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by

+ 5 - 0
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -6,6 +6,7 @@
  * Copyright (C) 2013       Simon Archipoff
  * Copyright (C) 2013       Simon Archipoff
  * Copyright (C) 2013       Thibaut Lambert
  * Copyright (C) 2013       Thibaut Lambert
  * Copyright (C) 2016       Uppsala University
  * Copyright (C) 2016       Uppsala University
+ * Copyright (C) 2020       Télécom-Sud Paris
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -1014,6 +1015,10 @@ static void initialize_dmda_policy(unsigned sched_ctx_id)
 	dt->alpha = starpu_get_env_float_default("STARPU_SCHED_ALPHA", _STARPU_SCHED_ALPHA_DEFAULT);
 	dt->alpha = starpu_get_env_float_default("STARPU_SCHED_ALPHA", _STARPU_SCHED_ALPHA_DEFAULT);
 	dt->beta = starpu_get_env_float_default("STARPU_SCHED_BETA", _STARPU_SCHED_BETA_DEFAULT);
 	dt->beta = starpu_get_env_float_default("STARPU_SCHED_BETA", _STARPU_SCHED_BETA_DEFAULT);
 	/* data->_gamma: cost of one Joule in us. If gamma is set to 10^6, then one Joule cost 1s */
 	/* data->_gamma: cost of one Joule in us. If gamma is set to 10^6, then one Joule cost 1s */
+#ifdef STARPU_NON_BLOCKING_DRIVERS
+	if (starpu_getenv("STARPU_SCHED_GAMMA"))
+		_STARPU_DISP("Warning: STARPU_SCHED_GAMMA was used, but --enable-blocking-drivers configuration was not set, CPU cores will not actually be sleeping\n");
+#endif
 	dt->_gamma = starpu_get_env_float_default("STARPU_SCHED_GAMMA", _STARPU_SCHED_GAMMA_DEFAULT);
 	dt->_gamma = starpu_get_env_float_default("STARPU_SCHED_GAMMA", _STARPU_SCHED_GAMMA_DEFAULT);
 	/* data->idle_power: Idle power of the whole machine in Watt */
 	/* data->idle_power: Idle power of the whole machine in Watt */
 	dt->idle_power = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0);
 	dt->idle_power = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0);

+ 5 - 0
src/sched_policies/helper_mct.c

@@ -2,6 +2,7 @@
  *
  *
  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2013       Simon Archipoff
  * Copyright (C) 2013       Simon Archipoff
+ * Copyright (C) 2020       Télécom-Sud Paris
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -45,6 +46,10 @@ struct _starpu_mct_data *starpu_mct_init_parameters(struct starpu_sched_componen
 	{
 	{
 		data->alpha = starpu_get_env_float_default("STARPU_SCHED_ALPHA", _STARPU_SCHED_ALPHA_DEFAULT);
 		data->alpha = starpu_get_env_float_default("STARPU_SCHED_ALPHA", _STARPU_SCHED_ALPHA_DEFAULT);
 		data->beta = starpu_get_env_float_default("STARPU_SCHED_BETA", _STARPU_SCHED_BETA_DEFAULT);
 		data->beta = starpu_get_env_float_default("STARPU_SCHED_BETA", _STARPU_SCHED_BETA_DEFAULT);
+#ifdef STARPU_NON_BLOCKING_DRIVERS
+		if (starpu_getenv("STARPU_SCHED_GAMMA"))
+			_STARPU_DISP("Warning: STARPU_SCHED_GAMMA was used, but --enable-blocking-drivers configuration was not set, CPU cores will not actually be sleeping\n");
+#endif
 		data->_gamma = starpu_get_env_float_default("STARPU_SCHED_GAMMA", _STARPU_SCHED_GAMMA_DEFAULT);
 		data->_gamma = starpu_get_env_float_default("STARPU_SCHED_GAMMA", _STARPU_SCHED_GAMMA_DEFAULT);
 		data->idle_power = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0);
 		data->idle_power = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0);
 	}
 	}

+ 1 - 0
src/sched_policies/helper_mct.h

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Télécom-Sud Paris
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by

+ 4 - 0
src/sched_policies/parallel_heft.c

@@ -567,6 +567,10 @@ static void initialize_parallel_heft_policy(unsigned sched_ctx_id)
 
 
 	hd->alpha = starpu_get_env_float_default("STARPU_SCHED_ALPHA", _STARPU_SCHED_ALPHA_DEFAULT);
 	hd->alpha = starpu_get_env_float_default("STARPU_SCHED_ALPHA", _STARPU_SCHED_ALPHA_DEFAULT);
 	hd->beta = starpu_get_env_float_default("STARPU_SCHED_BETA", _STARPU_SCHED_BETA_DEFAULT);
 	hd->beta = starpu_get_env_float_default("STARPU_SCHED_BETA", _STARPU_SCHED_BETA_DEFAULT);
+#ifdef STARPU_NON_BLOCKING_DRIVERS
+	if (starpu_getenv("STARPU_SCHED_GAMMA"))
+		_STARPU_DISP("Warning: STARPU_SCHED_GAMMA was used, but --enable-blocking-drivers configuration was not set, CPU cores will not actually be sleeping\n");
+#endif
 	hd->_gamma = starpu_get_env_float_default("STARPU_SCHED_GAMMA", _STARPU_SCHED_GAMMA_DEFAULT);
 	hd->_gamma = starpu_get_env_float_default("STARPU_SCHED_GAMMA", _STARPU_SCHED_GAMMA_DEFAULT);
 	hd->idle_power = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0);
 	hd->idle_power = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0);
 
 

+ 0 - 1
starpufft/src/starpufftx3d.c

@@ -164,7 +164,6 @@ static struct starpu_task *
 STARPUFFT(start3dC2C)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
 STARPUFFT(start3dC2C)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
 {
 {
 	STARPU_ASSERT(plan->type == C2C);
 	STARPU_ASSERT(plan->type == C2C);
-	int z;
 	int ret;
 	int ret;
 
 
 if (PARALLEL) {
 if (PARALLEL) {

+ 8 - 2
tests/Makefile.am

@@ -61,7 +61,6 @@ EXTRA_DIST =					\
 	datawizard/interfaces/block/block_opencl_kernel.cl \
 	datawizard/interfaces/block/block_opencl_kernel.cl \
 	datawizard/interfaces/tensor/tensor_opencl_kernel.cl \
 	datawizard/interfaces/tensor/tensor_opencl_kernel.cl \
 	perfmodels/opencl_memset_kernel.cl \
 	perfmodels/opencl_memset_kernel.cl \
-	perfmodels/opencl_memset_kernel_01.cl \
 	$(MICROBENCHS:=.sh) \
 	$(MICROBENCHS:=.sh) \
 	microbenchs/microbench.sh \
 	microbenchs/microbench.sh \
 	model-checking/platform.xml \
 	model-checking/platform.xml \
@@ -359,7 +358,8 @@ myPROGRAMS +=				\
 	perfmodels/regression_based		\
 	perfmodels/regression_based		\
 	perfmodels/regression_based_01		\
 	perfmodels/regression_based_01		\
 	perfmodels/regression_based_02		\
 	perfmodels/regression_based_02		\
-	perfmodels/regression_based_03		\	
+	perfmodels/regression_based_03		\
+	perfmodels/regression_based_04		\
 	perfmodels/non_linear_regression_based	\
 	perfmodels/non_linear_regression_based	\
 	perfmodels/feed				\
 	perfmodels/feed				\
 	perfmodels/user_base			\
 	perfmodels/user_base			\
@@ -1011,6 +1011,9 @@ perfmodels_regression_based_02_SOURCES=\
 perfmodels_regression_based_03_SOURCES=\
 perfmodels_regression_based_03_SOURCES=\
 	perfmodels/regression_based_03.c
 	perfmodels/regression_based_03.c
 
 
+perfmodels_regression_based_04_SOURCES=\
+	perfmodels/regression_based_04.c
+
 perfmodels_max_fpga_SOURCES=\
 perfmodels_max_fpga_SOURCES=\
 	perfmodels/max_fpga.c
 	perfmodels/max_fpga.c
 perfmodels_max_fpga_LDADD = $(LDADD) \
 perfmodels_max_fpga_LDADD = $(LDADD) \
@@ -1020,6 +1023,9 @@ if STARPU_USE_OPENCL
 perfmodels_regression_based_SOURCES+=\
 perfmodels_regression_based_SOURCES+=\
 	perfmodels/opencl_memset.c
 	perfmodels/opencl_memset.c
 
 
+perfmodels_regression_based_04_SOURCES+=\
+	perfmodels/opencl_memset.c
+
 nobase_STARPU_OPENCL_DATA_DATA += \
 nobase_STARPU_OPENCL_DATA_DATA += \
 	perfmodels/opencl_memset_kernel.cl
 	perfmodels/opencl_memset_kernel.cl
 endif
 endif

+ 2 - 0
tests/datawizard/interfaces/bcsr/bcsr_opencl.c

@@ -95,6 +95,8 @@ test_bcsr_opencl_func(void *buffers[], void *args)
 
 
                 if (local > global)
                 if (local > global)
 			local = global;
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 
 		err = clEnqueueNDRangeKernel(queue,
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,
 					kernel,

+ 2 - 0
tests/datawizard/interfaces/coo/coo_opencl.c

@@ -93,6 +93,8 @@ test_coo_opencl_func(void *buffers[], void *args)
 
 
                 if (local > global)
                 if (local > global)
 			local = global;
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 
 		err = clEnqueueNDRangeKernel(queue,
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,
 					kernel,

+ 2 - 0
tests/datawizard/interfaces/csr/csr_opencl.c

@@ -93,6 +93,8 @@ test_csr_opencl_func(void *buffers[], void *args)
 
 
                 if (local > global)
                 if (local > global)
 			local = global;
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 
 		err = clEnqueueNDRangeKernel(queue,
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,
 					kernel,

+ 2 - 0
tests/datawizard/interfaces/matrix/matrix_opencl.c

@@ -92,6 +92,8 @@ void test_matrix_opencl_func(void *buffers[], void *args)
 
 
                 if (local > global)
                 if (local > global)
 			local = global;
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 
 		err = clEnqueueNDRangeKernel(queue,
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,
 					kernel,

+ 2 - 0
tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets_opencl.c

@@ -84,6 +84,8 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 
 
                 if (local > global)
                 if (local > global)
 			local = global;
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 
 		err = clEnqueueNDRangeKernel(queue,
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,
 					kernel,

+ 2 - 0
tests/datawizard/interfaces/multiformat/multiformat_opencl.c

@@ -98,6 +98,8 @@ void test_multiformat_opencl_func(void *buffers[], void *args)
 
 
                 if (local > global)
                 if (local > global)
 			local = global;
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 
 		err = clEnqueueNDRangeKernel(queue,
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,
 					kernel,

+ 1 - 1
tests/datawizard/interfaces/tensor/tensor_opencl.c

@@ -87,7 +87,7 @@ test_tensor_opencl_func(void *buffers[], void *args)
 	}
 	}
 			
 			
 	{
 	{
-		size_t global = nx * ny * nz * nt;
+                size_t global = 1;
 		err = clEnqueueNDRangeKernel(queue,
 		err = clEnqueueNDRangeKernel(queue,
 					     kernel,
 					     kernel,
 					     1,
 					     1,

+ 1 - 13
tests/datawizard/interfaces/variable/variable_opencl.c

@@ -73,24 +73,12 @@ void test_variable_opencl_func(void *buffers[], void *args)
 
 
 	{
 	{
 		size_t global = 1;
 		size_t global = 1;
-		size_t local;
+                size_t local = 1;
                 size_t s;
                 size_t s;
                 cl_device_id device;
                 cl_device_id device;
 
 
                 starpu_opencl_get_device(devid, &device);
                 starpu_opencl_get_device(devid, &device);
 
 
-                err = clGetKernelWorkGroupInfo (kernel,
-						device,
-						CL_KERNEL_WORK_GROUP_SIZE,
-						sizeof(local),
-						&local,
-						&s);
-                if (err != CL_SUCCESS)
-			STARPU_OPENCL_REPORT_ERROR(err);
-
-                if (local > global)
-			local = global;
-
 		err = clEnqueueNDRangeKernel(queue,
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,
 					kernel,
 					1,
 					1,

+ 2 - 0
tests/datawizard/interfaces/vector/vector_opencl.c

@@ -91,6 +91,8 @@ test_vector_opencl_func(void *buffers[], void *args)
 
 
                 if (local > global)
                 if (local > global)
 			local = global;
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 
 		err = clEnqueueNDRangeKernel(queue,
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,
 					kernel,

+ 1 - 0
tests/datawizard/scal.c

@@ -73,6 +73,7 @@ void scal_func_opencl(void *buffers[], void *cl_arg)
                 err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
                 err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
                 if (local > global) local=global;
                 if (local > global) local=global;
+                else global = (global + local-1) / local * local;
 
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

+ 2 - 0
tests/datawizard/scratch_opencl.c

@@ -73,6 +73,8 @@ void opencl_f(void *buffers[], void *args)
 			STARPU_OPENCL_REPORT_ERROR(err);
 			STARPU_OPENCL_REPORT_ERROR(err);
                 if (local > global)
                 if (local > global)
 			local=global;
 			local=global;
+                else
+                        global = (global + local-1) / local * local;
 
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)
 		if (err != CL_SUCCESS)

+ 2 - 0
tests/main/display_binding.c

@@ -38,6 +38,8 @@ int main(void)
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
+	starpu_shutdown();
+
 	return EXIT_SUCCESS;
 	return EXIT_SUCCESS;
 }
 }
 #endif
 #endif

+ 14 - 2
tests/perfmodels/opencl_memset.c

@@ -22,7 +22,7 @@
 
 
 extern struct starpu_opencl_program opencl_program;
 extern struct starpu_opencl_program opencl_program;
 
 
-void memset_opencl(void *buffers[], void *args)
+void _memset_opencl(void *buffers[], void *args, const char *name)
 {
 {
 	(void) args;
 	(void) args;
 	int id, devid;
 	int id, devid;
@@ -36,7 +36,7 @@ void memset_opencl(void *buffers[], void *args)
 	id = starpu_worker_get_id_check();
 	id = starpu_worker_get_id_check();
 	devid = starpu_worker_get_devid(id);
 	devid = starpu_worker_get_devid(id);
 
 
-	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_memset_opencl", devid);
+	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, name, devid);
 	if (err != CL_SUCCESS)
 	if (err != CL_SUCCESS)
 		STARPU_OPENCL_REPORT_ERROR(err);
 		STARPU_OPENCL_REPORT_ERROR(err);
 
 
@@ -58,6 +58,8 @@ void memset_opencl(void *buffers[], void *args)
 			STARPU_OPENCL_REPORT_ERROR(err);
 			STARPU_OPENCL_REPORT_ERROR(err);
                 if (local > global)
                 if (local > global)
 			local=global;
 			local=global;
+                else
+                        global = (global + local-1) / local * local;
 
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)
 		if (err != CL_SUCCESS)
@@ -65,3 +67,13 @@ void memset_opencl(void *buffers[], void *args)
 	}
 	}
 	starpu_opencl_release_kernel(kernel);
 	starpu_opencl_release_kernel(kernel);
 }
 }
+
+void memset_opencl(void *buffers[], void *args, const char *kernel)
+{
+	_memset_opencl(buffers, args, "_memset_opencl");
+}
+
+void memset0_opencl(void *buffers[], void *args, const char *kernel)
+{
+	_memset_opencl(buffers, args, "_memset0_opencl");
+}

+ 7 - 0
tests/perfmodels/opencl_memset_kernel.cl

@@ -14,6 +14,13 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
  */
 
 
+ __kernel void _memset0_opencl(__global int *val, int nx)
+{
+        const int i = get_global_id(0);
+        if (i < nx)
+                val[0] += i;
+}
+
 __kernel void _memset_opencl(__global int *val, int nx)
 __kernel void _memset_opencl(__global int *val, int nx)
 {
 {
         const int i = get_global_id(0);
         const int i = get_global_id(0);

+ 0 - 31
tests/perfmodels/opencl_memset_kernel_01.cl

@@ -1,31 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2012-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
- __kernel void _memset0_opencl(__global int *val, int nx)
-{
-        const int i = get_global_id(0);
-        if (i < nx)
-                val[0] += i;
-}
-
-__kernel void _memset_opencl(__global int *val, int nx)
-{
-        const int i = get_global_id(0);
-        if (i < nx)
-                val[i] = 42;
-}
-
-

+ 5 - 1
tests/perfmodels/regression_based.c

@@ -23,6 +23,7 @@
  * Benchmark memset with a linear regression
  * Benchmark memset with a linear regression
  */
  */
 
 
+#define STARTlin 1024
 #define START 1024
 #define START 1024
 #ifdef STARPU_QUICK_CHECK
 #ifdef STARPU_QUICK_CHECK
 #define END 1048576
 #define END 1048576
@@ -184,11 +185,14 @@ int main(int argc, char **argv)
 #endif
 #endif
 
 
 	int size;
 	int size;
-	for (size = START; size < END; size *= 2)
+	for (size = STARTlin; size < END; size *= 2)
 	{
 	{
 		/* Use a linear regression */
 		/* Use a linear regression */
 		test_memset(size, &memset_cl);
 		test_memset(size, &memset_cl);
+	}
 
 
+	for (size = START; size < END; size *= 2)
+	{
 		/* Use a non-linear regression */
 		/* Use a non-linear regression */
 		test_memset(size, &nl_memset_cl);
 		test_memset(size, &nl_memset_cl);
 	}
 	}

+ 152 - 165
tests/perfmodels/regression_based_01.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2011-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2011       Télécom-SudParis
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,23 +15,20 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
  */
 
 
-/*
- * Dans ce benchmark:
- - calibrer le modèle linéaire seulement pour des grandes tailles STARTlin 1048576
- - séparer la boucle test_memset en deux boucles:
-        *linéaire: démarrer à partir de 1 048 576
-        *non linéaire: conserver le démarrage à 1024
- */
-
 #include <starpu.h>
 #include <starpu.h>
 #include <assert.h>
 #include <assert.h>
 #include <starpu_scheduler.h>
 #include <starpu_scheduler.h>
 #include <unistd.h>
 #include <unistd.h>
 #include "../helper.h"
 #include "../helper.h"
 
 
-#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+/*
+ * - Calibrate the linear model only for large sizes, starting at STARTlin (131072)
+ * - Separate the test_memset loop into two loops:
+ *   - linear: start from STARTlin
+ *   - non-linear: keep the start at START (1024)
+ */
 
 
-#define STARTlin 1048576
+#define STARTlin 131072
 #define START 1024
 #define START 1024
 #ifdef STARPU_QUICK_CHECK
 #ifdef STARPU_QUICK_CHECK
 #define END 1048576
 #define END 1048576
@@ -38,168 +36,159 @@
 #define END 16777216
 #define END 16777216
 #endif
 #endif
 
 
-int ret;
-
 
 
 void memset_cpu(void *descr[], void *arg)
 void memset_cpu(void *descr[], void *arg)
 {
 {
-    (void)arg;
-    STARPU_SKIP_IF_VALGRIND;
+	(void)arg;
+	STARPU_SKIP_IF_VALGRIND;
 
 
-    int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
-    unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+	unsigned i;
 
 
-    usleep(1000);
-    int i;
+	usleep(1000);
 
 
-    for (i=0; i<n ; i++)
-    {
+	for (i=0; i<n ; i++)
+	{
 
 
-        ptr[0] += i;
-    }
+		ptr[0] += i;
+	}
 }
 }
 
 
-
 static struct starpu_perfmodel model =
 static struct starpu_perfmodel model =
 {
 {
-    .type = STARPU_REGRESSION_BASED,
-    .symbol = "memset_regression_based"
+	.type = STARPU_REGRESSION_BASED,
+	.symbol = "memset_regression_based"
 };
 };
 
 
 static struct starpu_perfmodel nl_model =
 static struct starpu_perfmodel nl_model =
 {
 {
-    .type = STARPU_NL_REGRESSION_BASED,
-    .symbol = "non_linear_memset_regression_based"
+	.type = STARPU_NL_REGRESSION_BASED,
+	.symbol = "non_linear_memset_regression_based"
 };
 };
 
 
 static struct starpu_codelet memset_cl =
 static struct starpu_codelet memset_cl =
 {
 {
-    .cpu_funcs = {memset_cpu},
-    .cpu_funcs_name = {"memset_cpu"},
-    .model = &model,
-    .nbuffers = 1,
-    .modes = {STARPU_W}
+	.cpu_funcs = {memset_cpu},
+	.cpu_funcs_name = {"memset_cpu"},
+	.model = &model,
+	.nbuffers = 1,
+	.modes = {STARPU_W}
 };
 };
 
 
 static struct starpu_codelet nl_memset_cl =
 static struct starpu_codelet nl_memset_cl =
 {
 {
-    .cpu_funcs = {memset_cpu},
-    .cpu_funcs_name = {"memset_cpu"},
-    .model = &nl_model,
-    .nbuffers = 1,
-    .modes = {STARPU_W}
+	.cpu_funcs = {memset_cpu},
+	.cpu_funcs_name = {"memset_cpu"},
+	.model = &nl_model,
+	.nbuffers = 1,
+	.modes = {STARPU_W}
 };
 };
 
 
-
 static void test_memset(int nelems, struct starpu_codelet *codelet)
 static void test_memset(int nelems, struct starpu_codelet *codelet)
 {
 {
-    int nloops = 100;
-    int loop;
-    starpu_data_handle_t handle;
+	int nloops = 100;
+	int loop;
+	starpu_data_handle_t handle;
 
 
-    starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, nelems, sizeof(int));
-    for (loop = 0; loop < nloops; loop++)
-    {
-        struct starpu_task *task = starpu_task_create();
+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, nelems, sizeof(int));
+	for (loop = 0; loop < nloops; loop++)
+	{
+		struct starpu_task *task = starpu_task_create();
 
 
-        task->cl = codelet;
-        task->handles[0] = handle;
+		task->cl = codelet;
+		task->handles[0] = handle;
 
 
-        int ret = starpu_task_submit(task);
-        if (ret == -ENODEV)
-            exit(STARPU_TEST_SKIPPED);
-        STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-    }
+		int ret = starpu_task_submit(task);
+		if (ret == -ENODEV)
+			exit(STARPU_TEST_SKIPPED);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
 
 
-    starpu_data_unregister(handle);
+	starpu_data_unregister(handle);
 }
 }
 
 
-
-static void compare_performance(int size, struct starpu_codelet *codelet, struct starpu_task *task)
+static void compare_performance(int size, struct starpu_codelet *codelet, struct starpu_task *compar_task)
 {
 {
-    unsigned i;
-    int niter = 100;
-    starpu_data_handle_t handle;
-
-    starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
+	unsigned i;
+	unsigned niter = 100;
+	starpu_data_handle_t handle;
 
 
-    struct starpu_task **tasks = (struct starpu_task **) malloc(niter*sizeof(struct starpu_task *));
-    assert(tasks);
+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
 
 
-    for (i = 0; i < niter; i++)
-    {
-        struct starpu_task *task = starpu_task_create();
+	struct starpu_task **tasks = (struct starpu_task **) malloc(niter*sizeof(struct starpu_task *));
+	assert(tasks);
 
 
-        task->cl = codelet;
-        task->handles[0] = handle;
+	for (i = 0; i < niter; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
 
 
-        /* create a synchronous task: any call to starpu_task_submit will block
-         * until it is terminated */
+		task->cl = codelet;
+		task->handles[0] = handle;
 
 
-        task->synchronous = 1;
+		task->synchronous = 1;
 
 
-        /* We will destroy the task structure by hand so that we can
-         * query the profiling info before the task is destroyed. */
-        task->destroy = 0;
+		/* We will destroy the task structure by hand so that we can
+		 * query the profiling info before the task is destroyed. */
+		task->destroy = 0;
 
 
-        tasks[i] = task;
+		tasks[i] = task;
 
 
-        ret = starpu_task_submit(task);
+		int ret = starpu_task_submit(task);
 
 
-        if (STARPU_UNLIKELY(ret == -ENODEV))
-        {
-            FPRINTF(stderr, "No worker may execute this task\n");
-            exit(0);
-        }
-    }
+		if (STARPU_UNLIKELY(ret == -ENODEV))
+		{
+			FPRINTF(stderr, "No worker may execute this task\n");
+			exit(0);
+		}
+	}
 
 
-    starpu_data_unregister(handle);
+	starpu_data_unregister(handle);
 
 
-    starpu_task_wait_for_all();
+	starpu_task_wait_for_all();
 
 
-    double length_sum = 0.0;
+	double length_sum = 0.0;
 
 
-    for (i = 0; i < niter; i++)
-    {
-        struct starpu_task *task = tasks[i];
-        struct starpu_profiling_task_info *info = task->profiling_info;
+	for (i = 0; i < niter; i++)
+	{
+		struct starpu_task *task = tasks[i];
+		struct starpu_profiling_task_info *info = task->profiling_info;
 
 
 
 
-        /* How long was the task execution ? */
-        length_sum += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
+		/* How long was the task execution ? */
+		length_sum += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
 
 
-        /* We don't need the task structure anymore */
-        starpu_task_destroy(task);
-    }
+		/* We don't need the task structure anymore */
+		starpu_task_destroy(task);
+	}
 
 
 
 
-    /* Display the occupancy of all workers during the test */
-    unsigned worker;
-    for (worker = 0; worker < starpu_worker_get_count(); worker++)
-    {
-        struct starpu_profiling_worker_info worker_info;
-        ret = starpu_profiling_worker_get_info(worker, &worker_info);
-        STARPU_ASSERT(!ret);
+	/* Display the occupancy of all workers during the test */
+	unsigned worker;
+	for (worker = 0; worker < starpu_worker_get_count(); worker++)
+	{
+		struct starpu_profiling_worker_info worker_info;
+		int ret = starpu_profiling_worker_get_info(worker, &worker_info);
+		STARPU_ASSERT(!ret);
 
 
-        char workername[128];
-        starpu_worker_get_name(worker, workername, sizeof(workername));
-        unsigned nimpl;
+		char workername[128];
+		starpu_worker_get_name(worker, workername, sizeof(workername));
+		unsigned nimpl;
 
 
 
 
-        if (starpu_worker_get_type(worker)==STARPU_CPU_WORKER)
-        {
-            FPRINTF(stdout, "\n Worker :%s ::::::::::\n\n", workername);
+		if (starpu_worker_get_type(worker)==STARPU_CPU_WORKER)
+		{
+			FPRINTF(stdout, "\n Worker :%s ::::::::::\n\n", workername);
 
 
-            for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-            {
+			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+			{
 
 
-                FPRINTF(stdout, "Expected time for %d on %s (impl %u): %f, Measured time: %f\n",
-                        size, workername, nimpl,starpu_task_expected_length(task, starpu_worker_get_perf_archtype(worker, task->sched_ctx), nimpl), ((length_sum)/niter));
+				FPRINTF(stdout, "Expected time for %d on %s (impl %u): %f, Measured time: %f\n",
+						size, workername, nimpl,starpu_task_expected_length(compar_task, starpu_worker_get_perf_archtype(worker, compar_task->sched_ctx), nimpl), ((length_sum)/niter));
 
 
-            }
-        }
-
-    }
+			}
+		}
+	}
 
 
 
 
 }
 }
@@ -207,78 +196,76 @@ static void compare_performance(int size, struct starpu_codelet *codelet, struct
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-    /* Enable profiling */
-    starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
-
-    struct starpu_conf conf;
-    starpu_data_handle_t handle;
-    int ret;
-
-    starpu_conf_init(&conf);
+	/* Enable profiling */
+	starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
 
 
-    conf.sched_policy_name = "eager";
-    conf.calibrate = 2;
+	struct starpu_conf conf;
+	starpu_data_handle_t handle;
+	int ret;
 
 
-    ret = starpu_initialize(&conf, &argc, &argv);
-    if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	starpu_conf_init(&conf);
 
 
-    int size;
-    for (size = STARTlin; size < END; size *= 2)
-    {
-        /* Use a linear regression */
-        test_memset(size, &memset_cl);
-    }
+	conf.sched_policy_name = "eager";
+	conf.calibrate = 2;
 
 
-    for (size = START; size < END; size *= 2)
-    {
-        /* Use a non-linear regression */
-        test_memset(size, &nl_memset_cl);
-    }
+	ret = starpu_initialize(&conf, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
-    ret = starpu_task_wait_for_all();
-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+	int size;
+	for (size = STARTlin; size < END; size *= 2)
+	{
+		/* Use a linear regression */
+		test_memset(size, &memset_cl);
+	}
 
 
-    starpu_shutdown();
+	for (size = START; size < END; size *= 2)
+	{
+		/* Use a non-linear regression */
+		test_memset(size, &nl_memset_cl);
+	}
 
 
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
 
 
-    /* Test Phase */
-    starpu_conf_init(&conf);
+	starpu_shutdown();
 
 
-    conf.sched_policy_name = "eager";
-    conf.calibrate = 0;
 
 
-    ret = starpu_initialize(&conf, &argc, &argv);
-    if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	/* Test Phase */
+	starpu_conf_init(&conf);
 
 
-    /* Now create a dummy task just to estimate its duration according to the regression */
+	conf.sched_policy_name = "eager";
+	conf.calibrate = 0;
 
 
-    size = 1234567;
+	ret = starpu_initialize(&conf, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
-    starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
+	/* Now create a dummy task just to estimate its duration according to the regression */
 
 
-    struct starpu_task *task = starpu_task_create();
-    task->cl = &memset_cl;
-    task->handles[0] = handle;
-    task->destroy = 0;
+	size = 1234567;
 
 
-    FPRINTF(stdout, "\n ////linear regression results////\n");
-    compare_performance(size, &memset_cl,task);
+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
 
 
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &memset_cl;
+	task->handles[0] = handle;
+	task->destroy = 0;
 
 
-    task->cl = &nl_memset_cl;
+	FPRINTF(stdout, "\n ////linear regression results////\n");
+	compare_performance(size, &memset_cl, task);
 
 
-    FPRINTF(stdout, "\n ////non linear regression results////\n");
+	task->cl = &nl_memset_cl;
 
 
-    compare_performance(size, &nl_memset_cl,task);
+	FPRINTF(stdout, "\n ////non linear regression results////\n");
 
 
+	compare_performance(size, &nl_memset_cl, task);
 
 
-    starpu_task_destroy(task);
+	starpu_task_destroy(task);
 
 
-    starpu_data_unregister(handle);
+	starpu_data_unregister(handle);
 
 
-    starpu_shutdown();
+	starpu_shutdown();
 
 
-    return 0;
+	return EXIT_SUCCESS;
 }
 }

+ 167 - 171
tests/perfmodels/regression_based_02.c

@@ -33,202 +33,197 @@
 #define END 16777216
 #define END 16777216
 #endif
 #endif
 
 
-int ret;
-
-//1er implémentation avec un delai initial (100 us)
+// first implementation with an initial delay (100 us)
 void memset0_cpu(void *descr[], void *arg)
 void memset0_cpu(void *descr[], void *arg)
 {
 {
-    (void)arg;
-    STARPU_SKIP_IF_VALGRIND;
+	(void)arg;
+	STARPU_SKIP_IF_VALGRIND;
 
 
-    int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
-    unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
-    int i;
+	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+	unsigned i;
 
 
-    //usleep(100);
+	usleep(100);
 
 
-    for (i=0; i<n ; i++)
-    {
-        ptr[0] += i;
-    }
+	for (i=0; i<n ; i++)
+	{
+		ptr[0] += i;
+	}
 }
 }
 
 
-//deuxième implémentation sans delai initial usleep() et fait 2.5 plus de tours de boucles
+// second implementation without initial delay but 2.5 more loops
 void memset_cpu(void *descr[], void *arg)
 void memset_cpu(void *descr[], void *arg)
 {
 {
-    (void)arg;
-    STARPU_SKIP_IF_VALGRIND;
+	(void)arg;
+	STARPU_SKIP_IF_VALGRIND;
 
 
-    int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
-    unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
-    int i;
+	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+	int i;
 
 
-    for (i=0; i<6.5*n ; i++)
-    {
-        ptr[0] += i;
-    }
+	for (i=0; i<6.5*n ; i++)
+	{
+		ptr[0] += i;
+	}
 }
 }
 
 
 //fonction pour mesurer l'energie
 //fonction pour mesurer l'energie
 double energy_function(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
 double energy_function(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
 
 
 {
 {
-    double energy;
-    int factor;
-    if (nimpl == 0)
-        factor = 10;
-    else
-        factor = 1;
+	double energy;
+	int factor;
+	if (nimpl == 0)
+		factor = 10;
+	else
+		factor = 1;
 
 
-    energy=starpu_task_expected_length(task, arch, nimpl)*factor;
+	energy=starpu_task_expected_length(task, arch, nimpl)*factor;
 
 
-    return energy;
+	return energy;
 }
 }
 
 
 static struct starpu_perfmodel model =
 static struct starpu_perfmodel model =
 {
 {
-    .type = STARPU_REGRESSION_BASED,
-    .symbol = "memset_regression_based"
+	.type = STARPU_REGRESSION_BASED,
+	.symbol = "memset_regression_based"
 };
 };
 
 
 static struct starpu_perfmodel nl_model =
 static struct starpu_perfmodel nl_model =
 {
 {
-    .type = STARPU_NL_REGRESSION_BASED,
-    .symbol = "non_linear_memset_regression_based"
+	.type = STARPU_NL_REGRESSION_BASED,
+	.symbol = "non_linear_memset_regression_based"
 };
 };
 
 
 static struct starpu_perfmodel nl_energy_model=
 static struct starpu_perfmodel nl_energy_model=
 {
 {
-    .type = STARPU_PER_ARCH,
-    .symbol = "non_linear_energy_model",
-    .arch_cost_function={energy_function},
+	.type = STARPU_PER_ARCH,
+	.symbol = "non_linear_energy_model",
+	.arch_cost_function=energy_function,
 };
 };
 
 
 static struct starpu_codelet memset_cl =
 static struct starpu_codelet memset_cl =
 {
 {
-    .cpu_funcs = {memset0_cpu, memset_cpu},
-    .cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
-    .model = &model,
-    .nbuffers = 1,
-    .modes = {STARPU_W}
+	.cpu_funcs = {memset0_cpu, memset_cpu},
+	.cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
+	.model = &model,
+	.nbuffers = 1,
+	.modes = {STARPU_W}
 };
 };
 
 
 static struct starpu_codelet nl_memset_cl =
 static struct starpu_codelet nl_memset_cl =
 {
 {
-    .cpu_funcs = {memset0_cpu, memset_cpu},
-    .cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
-    .model = &nl_model,
-    .energy_model = &nl_energy_model,
-    .nbuffers = 1,
-    .modes = {STARPU_W}
+	.cpu_funcs = {memset0_cpu, memset_cpu},
+	.cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
+	.model = &nl_model,
+	.energy_model = &nl_energy_model,
+	.nbuffers = 1,
+	.modes = {STARPU_W}
 };
 };
 
 
 static void test_memset(int nelems, struct starpu_codelet *codelet)
 static void test_memset(int nelems, struct starpu_codelet *codelet)
 {
 {
-    int nloops = 100;
-    int loop;
-    starpu_data_handle_t handle;
+	int nloops = 100;
+	int loop;
+	starpu_data_handle_t handle;
 
 
-    starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, nelems, sizeof(int));
-    for (loop = 0; loop < nloops; loop++)
-    {
-        struct starpu_task *task = starpu_task_create();
+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, nelems, sizeof(int));
+	for (loop = 0; loop < nloops; loop++)
+	{
+		struct starpu_task *task = starpu_task_create();
 
 
-        task->cl = codelet;
-        task->handles[0] = handle;
+		task->cl = codelet;
+		task->handles[0] = handle;
 
 
-        int ret = starpu_task_submit(task);
-        if (ret == -ENODEV)
-            exit(STARPU_TEST_SKIPPED);
-        STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-    }
+		int ret = starpu_task_submit(task);
+		if (ret == -ENODEV)
+			exit(STARPU_TEST_SKIPPED);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
 
 
-    starpu_data_unregister(handle);
+	starpu_data_unregister(handle);
 }
 }
 
 
-static void compare_performance(int size, struct starpu_codelet *codelet, struct starpu_task *task)
+static void compare_performance(int size, struct starpu_codelet *codelet, struct starpu_task *compar_task)
 {
 {
-    unsigned i;
-    int niter = 100;
-    starpu_data_handle_t handle;
-
-    starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
+	unsigned i;
+	unsigned niter = 100;
+	starpu_data_handle_t handle;
 
 
-    struct starpu_task **tasks = (struct starpu_task **) malloc(niter*sizeof(struct starpu_task *));
-    assert(tasks);
+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
 
 
-    for (i = 0; i < niter; i++)
-    {
-        //fabriquer la tache
-        struct starpu_task *task = starpu_task_create();
+	struct starpu_task **tasks = (struct starpu_task **) malloc(niter*sizeof(struct starpu_task *));
+	assert(tasks);
 
 
-        task->cl = codelet;
-        task->handles[0] = handle;
+	for (i = 0; i < niter; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
 
 
-        task->synchronous = 1;
+		task->cl = codelet;
+		task->handles[0] = handle;
 
 
-        /* We will destroy the task structure by hand so that we can
-         * query the profiling info before the task is destroyed. */
-        task->destroy = 0;
+		task->synchronous = 1;
 
 
-        tasks[i] = task;
+		/* We will destroy the task structure by hand so that we can
+		 * query the profiling info before the task is destroyed. */
+		task->destroy = 0;
 
 
-        //soumettre la tache
-        ret = starpu_task_submit(task);
+		tasks[i] = task;
 
 
-        if (STARPU_UNLIKELY(ret == -ENODEV))
-        {
-            FPRINTF(stderr, "No worker may execute this task\n");
-            exit(0);
-        }
-    }
+		int ret = starpu_task_submit(task);
 
 
-    starpu_data_unregister(handle);
+		if (STARPU_UNLIKELY(ret == -ENODEV))
+		{
+			FPRINTF(stderr, "No worker may execute this task\n");
+			exit(0);
+		}
+	}
 
 
-    starpu_task_wait_for_all();
+	starpu_data_unregister(handle);
 
 
-    double length_sum = 0.0;
+	starpu_task_wait_for_all();
 
 
-    for (i = 0; i < niter; i++)
-    {
-        struct starpu_task *task = tasks[i];
+	double length_sum = 0.0;
 
 
-        struct starpu_profiling_task_info *info = task->profiling_info;
+	for (i = 0; i < niter; i++)
+	{
+		struct starpu_task *task = tasks[i];
+		struct starpu_profiling_task_info *info = task->profiling_info;
 
 
 
 
-        /* How long was the task execution ? */
-        length_sum += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
+		/* How long was the task execution ? */
+		length_sum += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
 
 
-        /* We don't need the task structure anymore */
-        starpu_task_destroy(task);
-    }
+		/* We don't need the task structure anymore */
+		starpu_task_destroy(task);
+	}
 
 
 
 
-    /* Display the occupancy of all workers during the test */
-    unsigned worker;
-    for (worker = 0; worker < starpu_worker_get_count(); worker++)
-    {
-        struct starpu_profiling_worker_info worker_info;
-        ret = starpu_profiling_worker_get_info(worker, &worker_info);
-        STARPU_ASSERT(!ret);
+	/* Display the occupancy of all workers during the test */
+	unsigned worker;
+	for (worker = 0; worker < starpu_worker_get_count(); worker++)
+	{
+		struct starpu_profiling_worker_info worker_info;
+		int ret = starpu_profiling_worker_get_info(worker, &worker_info);
+		STARPU_ASSERT(!ret);
 
 
-        char workername[128];
-        starpu_worker_get_name(worker, workername, sizeof(workername));
-        unsigned nimpl;
+		char workername[128];
+		starpu_worker_get_name(worker, workername, sizeof(workername));
+		unsigned nimpl;
 
 
-        if (starpu_worker_get_type(worker)==STARPU_CPU_WORKER)
-        {
-            FPRINTF(stdout, "\n Worker :%s ::::::::::\n\n", workername);
+		if (starpu_worker_get_type(worker)==STARPU_CPU_WORKER)
+		{
+			FPRINTF(stdout, "\n Worker :%s ::::::::::\n\n", workername);
 
 
-            for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-            {
+			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+			{
 
 
-                FPRINTF(stdout, "Expected time for %d on %s (impl %u): %f, Measured time: %f, Expected energy: %f\n",
-                        size, workername, nimpl,starpu_task_expected_length(task, starpu_worker_get_perf_archtype(worker, task->sched_ctx), nimpl), ((length_sum)/niter),
-                        starpu_task_expected_energy(task, starpu_worker_get_perf_archtype(worker, task->sched_ctx), nimpl));
-            }
-        }
-    }
+				FPRINTF(stdout, "Expected time for %d on %s (impl %u): %f, Measured time: %f, Expected energy: %f\n",
+						size, workername, nimpl,starpu_task_expected_length(compar_task, starpu_worker_get_perf_archtype(worker, compar_task->sched_ctx), nimpl), ((length_sum)/niter),
+						starpu_task_expected_energy(compar_task, starpu_worker_get_perf_archtype(worker, compar_task->sched_ctx), nimpl));
+			}
+		}
+	}
 
 
 
 
 }
 }
@@ -236,74 +231,75 @@ static void compare_performance(int size, struct starpu_codelet *codelet, struct
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
+	/* Enable profiling */
+	starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
 
 
-    /* Enable profiling */
-    starpu_profiling_status_set(1);
+	struct starpu_conf conf;
+	starpu_data_handle_t handle;
+	int ret;
 
 
-    struct starpu_conf conf;
-    starpu_data_handle_t handle;
-    int ret;
+	starpu_conf_init(&conf);
 
 
-    starpu_conf_init(&conf);
+	conf.sched_policy_name = "dmda";
+	conf.calibrate = 2;
 
 
-    conf.sched_policy_name = "dmda";
-    conf.calibrate = 2;
+	ret = starpu_initialize(&conf, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
-    ret = starpu_initialize(&conf, &argc, &argv);
-    if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-
-    int size;
-    /*for (size = STARTlin; size < END; size *= 2)
-    {
-        /* Use a linear regression */
-    //test_memset(size, &memset_cl);
-    //}
+	int size;
+#if 0
+	for (size = STARTlin; size < END; size *= 2)
+	{
+		/* Use a linear regression */
+		test_memset(size, &memset_cl);
+	}
+#endif
 
 
-    for (size = START; size < END; size *= 2)
-    {
-        /* Use a non-linear regression */
-        test_memset(size, &nl_memset_cl);
-    }
+	for (size = START; size < END; size *= 2)
+	{
+		/* Use a non-linear regression */
+		test_memset(size, &nl_memset_cl);
+	}
 
 
-    ret = starpu_task_wait_for_all();
-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
 
 
-    starpu_shutdown();
+	starpu_shutdown();
 
 
 
 
-    /* Test Phase */
-    starpu_conf_init(&conf);
+	/* Test Phase */
+	starpu_conf_init(&conf);
 
 
-    conf.sched_policy_name = "dmda";
-    conf.calibrate = 0;
+	conf.sched_policy_name = "dmda";
+	conf.calibrate = 0;
 
 
-    ret = starpu_initialize(&conf, &argc, &argv);
-    if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_initialize(&conf, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
-    /* Now create a dummy task just to estimate its duration according to the regression */
+	/* Now create a dummy task just to estimate its duration according to the regression */
 
 
-    size = 1234567;
+	size = 1234567;
 
 
-    starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
 
 
-    struct starpu_task *task = starpu_task_create();
-    task->cl = &memset_cl;
-    task->handles[0] = handle;
-    task->destroy = 0;
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &memset_cl;
+	task->handles[0] = handle;
+	task->destroy = 0;
 
 
-    task->cl = &nl_memset_cl;
+	task->cl = &nl_memset_cl;
 
 
-    FPRINTF(stdout, "\n ////non linear regression results////\n");
+	FPRINTF(stdout, "\n ////non linear regression results////\n");
 
 
-    compare_performance(size, &nl_memset_cl,task);
+	compare_performance(size, &nl_memset_cl, task);
 
 
-    starpu_task_destroy(task);
+	starpu_task_destroy(task);
 
 
-    starpu_data_unregister(handle);
+	starpu_data_unregister(handle);
 
 
-    starpu_shutdown();
+	starpu_shutdown();
 
 
-    return EXIT_SUCCESS;
+	return EXIT_SUCCESS;
 }
 }

+ 165 - 173
tests/perfmodels/regression_based_03.c

@@ -34,204 +34,197 @@
 #define END 16777216
 #define END 16777216
 #endif
 #endif
 
 
-
-int ret;
-
-//1er implémentation avec un delai initial (100 us)
+// first implementation with an initial delay (100 us)
 void memset0_cpu(void *descr[], void *arg)
 void memset0_cpu(void *descr[], void *arg)
 {
 {
-    (void)arg;
-    STARPU_SKIP_IF_VALGRIND;
+	(void)arg;
+	STARPU_SKIP_IF_VALGRIND;
 
 
-    int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
-    unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
-    int i;
+	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+	unsigned i;
 
 
-    //usleep () function
-    //usleep(100);
+	usleep(100);
 
 
-    for (i=0; i<n ; i++)
-    {
-        ptr[0] += i;
-    }
+	for (i=0; i<n ; i++)
+	{
+		ptr[0] += i;
+	}
 }
 }
 
 
-//deuxième implémentation sans delai initial usleep() et fait 2.5 plus de tours de boucles
+// second implementation without initial delay but 2.5 more loops
 void memset_cpu(void *descr[], void *arg)
 void memset_cpu(void *descr[], void *arg)
 {
 {
-    (void)arg;
-    STARPU_SKIP_IF_VALGRIND;
+	(void)arg;
+	STARPU_SKIP_IF_VALGRIND;
 
 
-    int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
-    unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
-    int i;
+	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+	int i;
 
 
-    for (i=0; i<6.5*n ; i++)
-    {
-        ptr[0] += i;
-    }
+	for (i=0; i<6.5*n ; i++)
+	{
+		ptr[0] += i;
+	}
 }
 }
 
 
 //fonction pour mesurer l'energie
 //fonction pour mesurer l'energie
 double energy_function(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
 double energy_function(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
 
 
 {
 {
-    double energy;
-    int factor;
-    if (nimpl == 0)
-        factor = 10;
-    else
-        factor = 1;
+	double energy;
+	int factor;
+	if (nimpl == 0)
+		factor = 10;
+	else
+		factor = 1;
 
 
-    energy=starpu_task_expected_length(task, arch, nimpl)*factor;
+	energy=starpu_task_expected_length(task, arch, nimpl)*factor;
 
 
-    return energy;
+	return energy;
 }
 }
 
 
 static struct starpu_perfmodel model =
 static struct starpu_perfmodel model =
 {
 {
-    .type = STARPU_REGRESSION_BASED,
-    .symbol = "memset_regression_based"
+	.type = STARPU_REGRESSION_BASED,
+	.symbol = "memset_regression_based"
 };
 };
 
 
 static struct starpu_perfmodel nl_model =
 static struct starpu_perfmodel nl_model =
 {
 {
-    .type = STARPU_NL_REGRESSION_BASED,
-    .symbol = "non_linear_memset_regression_based"
+	.type = STARPU_NL_REGRESSION_BASED,
+	.symbol = "non_linear_memset_regression_based"
 };
 };
 
 
 static struct starpu_perfmodel nl_energy_model=
 static struct starpu_perfmodel nl_energy_model=
 {
 {
-    .type = STARPU_PER_ARCH,
-    .symbol = "non_linear_energy_model",
-    .arch_cost_function={energy_function},
+	.type = STARPU_PER_ARCH,
+	.symbol = "non_linear_energy_model",
+	.arch_cost_function=energy_function,
 };
 };
 
 
 static struct starpu_codelet memset_cl =
 static struct starpu_codelet memset_cl =
 {
 {
-    .cpu_funcs = {memset0_cpu, memset_cpu},
-    .cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
-    .model = &model,
-    .nbuffers = 1,
-    .modes = {STARPU_W}
+	.cpu_funcs = {memset0_cpu, memset_cpu},
+	.cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
+	.model = &model,
+	.nbuffers = 1,
+	.modes = {STARPU_W}
 };
 };
 
 
 static struct starpu_codelet nl_memset_cl =
 static struct starpu_codelet nl_memset_cl =
 {
 {
-    .cpu_funcs = {memset0_cpu, memset_cpu},
-    .cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
-    .model = &nl_model,
-    .energy_model = &nl_energy_model,
-    .nbuffers = 1,
-    .modes = {STARPU_W}
+	.cpu_funcs = {memset0_cpu, memset_cpu},
+	.cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
+	.model = &nl_model,
+	.energy_model = &nl_energy_model,
+	.nbuffers = 1,
+	.modes = {STARPU_W}
 };
 };
 
 
 static void test_memset(int nelems, struct starpu_codelet *codelet)
 static void test_memset(int nelems, struct starpu_codelet *codelet)
 {
 {
-    int nloops = 100;
-    int loop;
-    starpu_data_handle_t handle;
+	int nloops = 100;
+	int loop;
+	starpu_data_handle_t handle;
 
 
-    starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, nelems, sizeof(int));
-    for (loop = 0; loop < nloops; loop++)
-    {
-        struct starpu_task *task = starpu_task_create();
+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, nelems, sizeof(int));
+	for (loop = 0; loop < nloops; loop++)
+	{
+		struct starpu_task *task = starpu_task_create();
 
 
-        task->cl = codelet;
-        task->handles[0] = handle;
+		task->cl = codelet;
+		task->handles[0] = handle;
 
 
-        int ret = starpu_task_submit(task);
-        if (ret == -ENODEV)
-            exit(STARPU_TEST_SKIPPED);
-        STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-    }
+		int ret = starpu_task_submit(task);
+		if (ret == -ENODEV)
+			exit(STARPU_TEST_SKIPPED);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
 
 
-    starpu_data_unregister(handle);
+	starpu_data_unregister(handle);
 }
 }
 
 
-static void compare_performance(int size, struct starpu_codelet *codelet, struct starpu_task *task)
+static void compare_performance(int size, struct starpu_codelet *codelet, struct starpu_task *compar_task)
 {
 {
-    unsigned i;
-    int niter = 100;
-    starpu_data_handle_t handle;
+	unsigned i;
+	unsigned niter = 100;
+	starpu_data_handle_t handle;
 
 
-    starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
 
 
-    struct starpu_task **tasks = (struct starpu_task **) malloc(niter*sizeof(struct starpu_task *));
-    assert(tasks);
+	struct starpu_task **tasks = (struct starpu_task **) malloc(niter*sizeof(struct starpu_task *));
+	assert(tasks);
 
 
-    for (i = 0; i < niter; i++)
-    {
-        //fabriquer la tache
-        struct starpu_task *task = starpu_task_create();
+	for (i = 0; i < niter; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
 
 
-        task->cl = codelet;
-        task->handles[0] = handle;
+		task->cl = codelet;
+		task->handles[0] = handle;
 
 
-        task->synchronous = 1;
+		task->synchronous = 1;
 
 
-        /* We will destroy the task structure by hand so that we can
-         * query the profiling info before the task is destroyed. */
-        task->destroy = 0;
+		/* We will destroy the task structure by hand so that we can
+		 * query the profiling info before the task is destroyed. */
+		task->destroy = 0;
 
 
-        tasks[i] = task;
+		tasks[i] = task;
 
 
-        //soumettre la tache
-        ret = starpu_task_submit(task);
+		int ret = starpu_task_submit(task);
 
 
-        if (STARPU_UNLIKELY(ret == -ENODEV))
-        {
-            FPRINTF(stderr, "No worker may execute this task\n");
-            exit(0);
-        }
-    }
+		if (STARPU_UNLIKELY(ret == -ENODEV))
+		{
+			FPRINTF(stderr, "No worker may execute this task\n");
+			exit(0);
+		}
+	}
 
 
-    starpu_data_unregister(handle);
+	starpu_data_unregister(handle);
 
 
-    starpu_task_wait_for_all();
+	starpu_task_wait_for_all();
 
 
-    double length_sum = 0.0;
+	double length_sum = 0.0;
 
 
-    for (i = 0; i < niter; i++)
-    {
-        struct starpu_task *task = tasks[i];
+	for (i = 0; i < niter; i++)
+	{
+		struct starpu_task *task = tasks[i];
+		struct starpu_profiling_task_info *info = task->profiling_info;
 
 
-        struct starpu_profiling_task_info *info = task->profiling_info;
 
 
+		/* How long was the task execution ? */
+		length_sum += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
 
 
-        /* How long was the task execution ? */
-        length_sum += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
+		/* We don't need the task structure anymore */
+		starpu_task_destroy(task);
+	}
 
 
-        /* We don't need the task structure anymore */
-        starpu_task_destroy(task);
-    }
 
 
+	/* Display the occupancy of all workers during the test */
+	unsigned worker;
+	for (worker = 0; worker < starpu_worker_get_count(); worker++)
+	{
+		struct starpu_profiling_worker_info worker_info;
+		int ret = starpu_profiling_worker_get_info(worker, &worker_info);
+		STARPU_ASSERT(!ret);
 
 
-    /* Display the occupancy of all workers during the test */
-    unsigned worker;
-    for (worker = 0; worker < starpu_worker_get_count(); worker++)
-    {
-        struct starpu_profiling_worker_info worker_info;
-        ret = starpu_profiling_worker_get_info(worker, &worker_info);
-        STARPU_ASSERT(!ret);
+		char workername[128];
+		starpu_worker_get_name(worker, workername, sizeof(workername));
+		unsigned nimpl;
 
 
-        char workername[128];
-        starpu_worker_get_name(worker, workername, sizeof(workername));
-        unsigned nimpl;
+		if (starpu_worker_get_type(worker)==STARPU_CPU_WORKER)
+		{
+			FPRINTF(stdout, "\n Worker :%s ::::::::::\n\n", workername);
 
 
-        if (starpu_worker_get_type(worker)==STARPU_CPU_WORKER)
-        {
-            FPRINTF(stdout, "\n Worker :%s ::::::::::\n\n", workername);
+			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+			{
 
 
-            for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-            {
-
-                FPRINTF(stdout, "Expected time for %d on %s (impl %u): %f, Measured time: %f, Expected energy: %f\n",
-                        size, workername, nimpl,starpu_task_expected_length(task, starpu_worker_get_perf_archtype(worker, task->sched_ctx), nimpl), ((length_sum)/niter),
-                        starpu_task_expected_energy(task, starpu_worker_get_perf_archtype(worker, task->sched_ctx), nimpl));
-            }
-        }
-    }
+				FPRINTF(stdout, "Expected time for %d on %s (impl %u): %f, Measured time: %f, Expected energy: %f\n",
+						size, workername, nimpl,starpu_task_expected_length(compar_task, starpu_worker_get_perf_archtype(worker, compar_task->sched_ctx), nimpl), ((length_sum)/niter),
+						starpu_task_expected_energy(compar_task, starpu_worker_get_perf_archtype(worker, compar_task->sched_ctx), nimpl));
+			}
+		}
+	}
 
 
 
 
 }
 }
@@ -239,74 +232,73 @@ static void compare_performance(int size, struct starpu_codelet *codelet, struct
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
+	/* Enable profiling */
+	starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
 
 
-    /* Enable profiling */
-    starpu_profiling_status_set(1);
-
-    struct starpu_conf conf;
-    starpu_data_handle_t handle;
-    int ret;
+	struct starpu_conf conf;
+	starpu_data_handle_t handle;
+	int ret;
 
 
-    starpu_conf_init(&conf);
+	starpu_conf_init(&conf);
 
 
-    conf.sched_policy_name = "dmda";
-    conf.calibrate = 2;
+	conf.sched_policy_name = "dmda";
+	conf.calibrate = 2;
 
 
-    ret = starpu_initialize(&conf, &argc, &argv);
-    if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_initialize(&conf, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
-    int size;
-    for (size = STARTlin; size < END; size *= 2)
-    {
-        /* Use a linear regression */
-        test_memset(size, &memset_cl);
-    }
+	int size;
+	for (size = STARTlin; size < END; size *= 2)
+	{
+		/* Use a linear regression */
+		test_memset(size, &memset_cl);
+	}
 
 
-    for (size = START; size < END; size *= 2)
-    {
-        /* Use a non-linear regression */
-        test_memset(size, &nl_memset_cl);
-    }
+	for (size = START; size < END; size *= 2)
+	{
+		/* Use a non-linear regression */
+		test_memset(size, &nl_memset_cl);
+	}
 
 
-    ret = starpu_task_wait_for_all();
-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
 
 
-    starpu_shutdown();
+	starpu_shutdown();
 
 
 
 
-    /* Test Phase */
-    starpu_conf_init(&conf);
+	/* Test Phase */
+	starpu_conf_init(&conf);
 
 
-    conf.sched_policy_name = "dmda";
-    conf.calibrate = 0;
+	conf.sched_policy_name = "dmda";
+	conf.calibrate = 0;
 
 
-    ret = starpu_initialize(&conf, &argc, &argv);
-    if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_initialize(&conf, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
-    /* Now create a dummy task just to estimate its duration according to the regression */
+	/* Now create a dummy task just to estimate its duration according to the regression */
 
 
-    size = 1234567;
+	size = 1234567;
 
 
-    starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
 
 
-    struct starpu_task *task = starpu_task_create();
-    task->cl = &memset_cl;
-    task->handles[0] = handle;
-    task->destroy = 0;
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &memset_cl;
+	task->handles[0] = handle;
+	task->destroy = 0;
 
 
-    task->cl = &nl_memset_cl;
+	task->cl = &nl_memset_cl;
 
 
-    FPRINTF(stdout, "\n ////non linear regression results////\n");
+	FPRINTF(stdout, "\n ////non linear regression results////\n");
 
 
-    compare_performance(size, &nl_memset_cl,task);
+	compare_performance(size, &nl_memset_cl, task);
 
 
-    starpu_task_destroy(task);
+	starpu_task_destroy(task);
 
 
-    starpu_data_unregister(handle);
+	starpu_data_unregister(handle);
 
 
-    starpu_shutdown();
+	starpu_shutdown();
 
 
-    return EXIT_SUCCESS;
+	return EXIT_SUCCESS;
 }
 }

+ 387 - 0
tests/perfmodels/regression_based_04.c

@@ -0,0 +1,387 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2011       Télécom-SudParis
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_scheduler.h>
+#include "../helper.h"
+
+/*
+ * A multi-implementation benchmark with the dmda scheduler.
+ * We aim to test OpenCL workers and compute the estimated time for each type of worker (CPU, OpenCL or CUDA).
+ * dmda chooses OpenCL workers for large sizes (the size argument of compare_performance), e.g. size=1234567
+ * dmda chooses CPU workers for small sizes (e.g. size=1234)
+ */
+
+/* First vector size (in elements) of the sweep main() runs with the linear-regression codelet */
+#define STARTlin (512*1024)
+/* Base size for the non-linear sweep; main() starts that sweep at 1.5 times this value */
+#define START 1024
+/* Exclusive upper bound of both sweeps; a smaller bound is used when STARPU_QUICK_CHECK is defined */
+#ifdef STARPU_QUICK_CHECK
+#define END 1048576
+#else
+#define END 16777216
+#endif
+
+#ifdef STARPU_USE_CUDA
+/*
+ * CUDA implementation of the codelets: queues a memset of the whole vector
+ * (every byte set to 42) on the stream returned by
+ * starpu_cuda_get_local_stream(). The call is asynchronous and its return
+ * value is not checked.
+ * NOTE(review): STARPU_SKIP_IF_VALGRIND presumably returns early when running
+ * under Valgrind -- confirm against its definition.
+ */
+static void memset_cuda(void *descr[], void *arg)
+{
+	(void)arg;
+	STARPU_SKIP_IF_VALGRIND;
+
+	int *vec = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
+	size_t nbytes = nx * sizeof(*vec);
+
+	cudaMemsetAsync(vec, 42, nbytes, starpu_cuda_get_local_stream());
+}
+#endif
+
+#ifdef STARPU_USE_OPENCL
+/* OpenCL implementations referenced by the codelets below. They are only
+ * declared here; their definitions live outside this file. */
+extern void memset0_opencl(void *buffers[], void *args);
+extern void memset_opencl(void *buffers[], void *args);
+#endif
+
+/*
+ * First CPU implementation of the codelets. Despite its name it does not
+ * fill the vector: it adds every index 0..n-1 to the first element only, so
+ * the amount of work grows with the vector length n.
+ * NOTE(review): STARPU_SKIP_IF_VALGRIND presumably returns early when running
+ * under Valgrind -- confirm against its definition.
+ */
+void memset0_cpu(void *descr[], void *arg)
+{
+	(void)arg;
+	STARPU_SKIP_IF_VALGRIND;
+
+	int *vec = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
+	unsigned idx = 0;
+
+	while (idx < nx)
+	{
+		vec[0] += idx;
+		idx++;
+	}
+}
+
+/*
+ * Second CPU implementation of the codelets: sets every byte of the vector
+ * to 42 with a single memset() call.
+ * NOTE(review): STARPU_SKIP_IF_VALGRIND presumably returns early when running
+ * under Valgrind -- confirm against its definition.
+ */
+void memset_cpu(void *descr[], void *arg)
+{
+	(void)arg;
+	STARPU_SKIP_IF_VALGRIND;
+
+	int *vec = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
+	size_t nbytes = nx * sizeof(*vec);
+
+	memset(vec, 42, nbytes);
+}
+
+/* Performance model attached to memset_cl: regression-based
+ * (STARPU_REGRESSION_BASED), identified by the symbol
+ * "memset_regression_based". */
+static struct starpu_perfmodel model =
+{
+	.symbol = "memset_regression_based",
+	.type = STARPU_REGRESSION_BASED,
+};
+
+/* Performance model attached to nl_memset_cl: non-linear regression
+ * (STARPU_NL_REGRESSION_BASED), identified by the symbol
+ * "non_linear_memset_regression_based". */
+static struct starpu_perfmodel nl_model =
+{
+	.symbol = "non_linear_memset_regression_based",
+	.type = STARPU_NL_REGRESSION_BASED,
+};
+
+/*
+ * Codelet bound to the linear regression model. It writes one buffer
+ * (STARPU_W) and provides two CPU implementations, plus two OpenCL
+ * implementations and one CUDA implementation when the matching backend is
+ * compiled in.
+ * NOTE(review): opencl_flags lists a single STARPU_OPENCL_ASYNC entry for two
+ * OpenCL functions, and cuda_funcs has one entry where the CPU side has two;
+ * confirm this asymmetry is intended.
+ */
+static struct starpu_codelet memset_cl =
+{
+	.model = &model,
+	.nbuffers = 1,
+	.modes = {STARPU_W},
+	.cpu_funcs = {memset0_cpu, memset_cpu},
+	.cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {memset0_opencl, memset_opencl},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
+#endif
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {memset_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+};
+
+/*
+ * Codelet bound to the non-linear regression model. Apart from the model it
+ * points to, it lists the same implementations, buffer count and access mode
+ * as memset_cl.
+ * NOTE(review): opencl_flags lists a single STARPU_OPENCL_ASYNC entry for two
+ * OpenCL functions, and cuda_funcs has one entry where the CPU side has two;
+ * confirm this asymmetry is intended.
+ */
+static struct starpu_codelet nl_memset_cl =
+{
+	.model = &nl_model,
+	.nbuffers = 1,
+	.modes = {STARPU_W},
+	.cpu_funcs = {memset0_cpu, memset_cpu},
+	.cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {memset0_opencl, memset_opencl},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
+#endif
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {memset_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+};
+
+/*
+ * Submit 100 tasks running `codelet` on a freshly registered vector of
+ * `nelems` ints, then unregister the vector.
+ *
+ * nelems:  number of int elements in the vector
+ * codelet: codelet assigned to every submitted task
+ *
+ * Exits the process with STARPU_TEST_SKIPPED if a submission returns -ENODEV;
+ * any other submission error goes through STARPU_CHECK_RETURN_VALUE.
+ */
+static void test_memset(int nelems, struct starpu_codelet *codelet)
+{
+	const int nloops = 100;
+	starpu_data_handle_t handle;
+	int loop = 0;
+
+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, nelems, sizeof(int));
+
+	while (loop < nloops)
+	{
+		struct starpu_task *task = starpu_task_create();
+		int ret;
+
+		task->handles[0] = handle;
+		task->cl = codelet;
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV)
+			exit(STARPU_TEST_SKIPPED);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		loop++;
+	}
+
+	starpu_data_unregister(handle);
+}
+
+/*
+ * Run `codelet` 100 times, one synchronous task at a time, on a vector of
+ * `size` ints, then print for every worker and every implementation slot the
+ * duration predicted by the performance model next to the measured one.
+ *
+ * size:        number of int elements of the benchmarked vector
+ * codelet:     codelet executed by the measured tasks
+ * compar_task: task handed to starpu_task_expected_length() to query the
+ *              model; it is not submitted by this function
+ *
+ * Measured durations come from each task's profiling info and are averaged
+ * per architecture class over the tasks that actually ran there (CPU on one
+ * side, OpenCL and CUDA together on the other). A class that ran no task
+ * reports 0.
+ *
+ * Exits the process with STARPU_TEST_SKIPPED if a submission returns -ENODEV,
+ * like test_memset() does.
+ */
+static void compare_performance(int size, struct starpu_codelet *codelet, struct starpu_task *compar_task)
+{
+	unsigned i;
+	unsigned niter = 100;
+	starpu_data_handle_t handle;
+	int ret;
+
+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
+
+	struct starpu_task **tasks = malloc(niter * sizeof(*tasks));
+	assert(tasks);
+
+	for (i = 0; i < niter; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = codelet;
+		task->handles[0] = handle;
+
+		task->synchronous = 1;
+
+		/* We will destroy the task structure by hand so that we can
+		 * query the profiling info before the task is destroyed. */
+		task->destroy = 0;
+
+		tasks[i] = task;
+
+		ret = starpu_task_submit(task);
+		if (STARPU_UNLIKELY(ret == -ENODEV))
+		{
+			FPRINTF(stderr, "No worker may execute this task\n");
+			exit(STARPU_TEST_SKIPPED);
+		}
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	starpu_data_unregister(handle);
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	double length_cpu_sum = 0.0;
+	double length_gpu_sum = 0.0;
+	/* Number of tasks that contributed to each sum */
+	unsigned ncpu_tasks = 0;
+	unsigned ngpu_tasks = 0;
+
+	for (i = 0; i < niter; i++)
+	{
+		struct starpu_task *task = tasks[i];
+		struct starpu_profiling_task_info *info = task->profiling_info;
+
+		/* The profiling info is dereferenced below; main() enables
+		 * profiling before any task is submitted. */
+		STARPU_ASSERT(info);
+
+		switch (starpu_worker_get_type(info->workerid))
+		{
+		case STARPU_CPU_WORKER:
+			FPRINTF(stdout, "cpuuu\n");
+			/* How long was the task execution ? */
+			length_cpu_sum += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
+			ncpu_tasks++;
+			break;
+
+		case STARPU_OPENCL_WORKER:
+			FPRINTF(stdout, "openclllllll\n");
+			/* How long was the task execution ? */
+			length_gpu_sum += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
+			ngpu_tasks++;
+			break;
+
+		case STARPU_CUDA_WORKER:
+			FPRINTF(stdout, "cudaaaaaa\n");
+			/* How long was the task execution ? */
+			length_gpu_sum += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
+			ngpu_tasks++;
+			break;
+
+		default:
+			FPRINTF(stdout, "unsupported!\n");
+			break;
+		}
+
+		/* We don't need the task structure anymore */
+		starpu_task_destroy(task);
+	}
+
+	/* Every task has been destroyed above; release the array itself */
+	free(tasks);
+
+	/* Average over the tasks that ran on each class, not over niter:
+	 * the scheduler is free to split the tasks between CPUs and GPUs. */
+	double length_cpu_mean = ncpu_tasks ? length_cpu_sum / ncpu_tasks : 0.0;
+	double length_gpu_mean = ngpu_tasks ? length_gpu_sum / ngpu_tasks : 0.0;
+
+	unsigned worker;
+
+	/* Report the worker counts, then expected vs. measured time per worker */
+	unsigned ncpus = starpu_cpu_worker_get_count();
+	unsigned ngpus = starpu_opencl_worker_get_count() + starpu_cuda_worker_get_count();
+
+	FPRINTF(stderr, "ncpus %u \n", ncpus);
+	FPRINTF(stderr, "ngpus %u \n", ngpus);
+	for (worker = 0; worker < starpu_worker_get_count(); worker++)
+	{
+		/* The profiling info itself is not printed; only the success
+		 * of the query is checked. */
+		struct starpu_profiling_worker_info worker_info;
+		ret = starpu_profiling_worker_get_info(worker, &worker_info);
+		STARPU_ASSERT(!ret);
+
+		char workername[128];
+		starpu_worker_get_name(worker, workername, sizeof(workername));
+		enum starpu_worker_archtype archi = starpu_worker_get_type(worker);
+		unsigned nimpl;
+
+		FPRINTF(stdout, "\n Worker :%s ::::::::::\n\n", workername);
+
+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+		{
+			switch (archi)
+			{
+			case STARPU_CPU_WORKER:
+				FPRINTF(stdout, "Expected time for %d on %s (impl %u): %f, Measured time: %f \n",
+						size, workername, nimpl, starpu_task_expected_length(compar_task, starpu_worker_get_perf_archtype(worker, compar_task->sched_ctx), nimpl), length_cpu_mean);
+				break;
+
+			case STARPU_OPENCL_WORKER:
+				FPRINTF(stdout, "Expectedd time for %d on %s (impl %u): %f, Measuredd time: %f \n",
+						size, workername, nimpl, starpu_task_expected_length(compar_task, starpu_worker_get_perf_archtype(worker, compar_task->sched_ctx), nimpl), length_gpu_mean);
+				break;
+
+			case STARPU_CUDA_WORKER:
+				FPRINTF(stdout, "Expectedd time for %d on %s (impl %u): %f, Measuredd time: %f \n",
+						size, workername, nimpl, starpu_task_expected_length(compar_task, starpu_worker_get_perf_archtype(worker, compar_task->sched_ctx), nimpl), length_gpu_mean);
+				break;
+
+			default:
+				FPRINTF(stdout, "unsupported!\n");
+				break;
+			}
+		}
+	}
+}
+
+#ifdef STARPU_USE_OPENCL
+/* OpenCL program handle filled by starpu_opencl_load_opencl_from_file() in
+ * main(). It has external linkage; presumably the OpenCL codelet
+ * implementations defined elsewhere refer to it -- confirm. */
+struct starpu_opencl_program opencl_program;
+#endif
+
+/*
+ * Two-phase test driver.
+ *
+ * Calibration phase: StarPU is started with the dmda scheduler and
+ * conf.calibrate = 2, then both codelets are run over a sweep of vector
+ * sizes (doubling from STARTlin, resp. 1.5 * START, up to END excluded).
+ *
+ * Test phase: StarPU is restarted with conf.calibrate = 0 and
+ * compare_performance() prints expected vs. measured durations of the
+ * non-linear codelet for 1234567 elements, a size that is in neither sweep.
+ * The linear model is calibrated but not compared.
+ *
+ * When OpenCL support is compiled in, the kernel file is loaded after each
+ * starpu_initialize() and unloaded before the matching starpu_shutdown().
+ *
+ * Returns EXIT_SUCCESS, or STARPU_TEST_SKIPPED when starpu_initialize()
+ * returns -ENODEV.
+ */
+int main(int argc, char **argv)
+{
+	/* Enable profiling */
+	starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
+
+	struct starpu_conf conf;
+	starpu_data_handle_t handle;
+	int ret;
+
+	/* Calibration Phase */
+	starpu_conf_init(&conf);
+
+	conf.sched_policy_name = "dmda";
+	conf.calibrate = 2;
+
+	ret = starpu_initialize(&conf, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("tests/perfmodels/opencl_memset_kernel.cl",
+			&opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+
+	int size;
+	for (size = STARTlin; size < END; size *= 2)
+	{
+		/* Use a linear regression */
+		test_memset(size, &memset_cl);
+	}
+
+	/* Integer form of START * 1.5 */
+	for (size = (START * 3) / 2; size < END; size *= 2)
+	{
+		/* Use a non-linear regression */
+		test_memset(size, &nl_memset_cl);
+	}
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+#ifdef STARPU_USE_OPENCL
+	/* Release the program before shutting down: it is loaded again into
+	 * the same structure in the test phase. */
+	ret = starpu_opencl_unload_opencl(&opencl_program);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
+	starpu_shutdown();
+
+
+	/* Test Phase */
+	starpu_conf_init(&conf);
+
+	conf.sched_policy_name = "dmda";
+	conf.calibrate = 0;
+
+	ret = starpu_initialize(&conf, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_OPENCL
+	/* opencl_program only exists when OpenCL support is compiled in */
+	ret = starpu_opencl_load_opencl_from_file("tests/perfmodels/opencl_memset_kernel.cl",
+			&opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+
+	/* Now create a dummy task just to estimate its duration according to the regression */
+
+	size = 1234567;
+
+	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
+
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &nl_memset_cl;
+	task->handles[0] = handle;
+	/* The task is never submitted; it is destroyed by hand below */
+	task->destroy = 0;
+
+	FPRINTF(stdout, "\n ////non linear regression results////\n");
+
+	compare_performance(size, &nl_memset_cl, task);
+
+	starpu_task_destroy(task);
+
+	starpu_data_unregister(handle);
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_unload_opencl(&opencl_program);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
+	starpu_shutdown();
+
+	return EXIT_SUCCESS;
+}