
merge trunk@7104:7182

Nathalie Furmento, 12 years ago
commit 59cbfcd5f3
57 changed files with 568 additions and 291 deletions
  1. ChangeLog (+1 -0)
  2. configure.ac (+61 -3)
  3. doc/chapters/basic-api.texi (+30 -1)
  4. doc/chapters/configuration.texi (+64 -93)
  5. doc/chapters/mpi-support.texi (+5 -4)
  6. examples/Makefile.am (+2 -2)
  7. examples/basic_examples/vector_scal_cpu.c (+3 -62)
  8. examples/basic_examples/vector_scal_cpu_icc.icc (+0 -1)
  9. examples/basic_examples/vector_scal_cpu_icc.icc (+26 -0)
  10. examples/basic_examples/vector_scal_cpu_template.h (+93 -0)
  11. examples/cholesky/cholesky.h (+2 -2)
  12. examples/incrementer/incrementer.c (+1 -1)
  13. examples/lu/xlu_pivot.c (+0 -1)
  14. examples/mult/xgemm.c (+1 -1)
  15. examples/pi/pi.c (+1 -1)
  16. examples/pi/pi_redux.c (+8 -8)
  17. examples/profiling/profiling.c (+3 -3)
  18. examples/reductions/dot_product.c (+4 -4)
  19. examples/stencil/stencil.c (+2 -2)
  20. include/starpu.h (+7 -1)
  21. include/starpu_profiling.h (+3 -3)
  22. mpi/src/starpu_mpi_insert_task.c (+40 -2)
  23. src/common/fxt.c (+14 -2)
  24. src/common/fxt.h (+14 -2)
  25. src/core/combined_workers.c (+1 -7)
  26. src/core/perfmodel/perfmodel_bus.c (+3 -5)
  27. src/core/perfmodel/perfmodel_history.c (+2 -3)
  28. src/core/perfmodel/perfmodel_print.c (+1 -1)
  29. src/core/topology.c (+3 -9)
  30. src/core/workers.c (+32 -0)
  31. src/core/workers.h (+3 -3)
  32. src/datawizard/interfaces/data_interface.c (+23 -0)
  33. src/debug/traces/starpu_fxt.h (+1 -1)
  34. src/drivers/cuda/driver_cuda.c (+15 -8)
  35. src/sched_policies/heft.c (+1 -0)
  36. starpufft/examples/testx.c (+5 -2)
  37. starpufft/starpufft.h (+3 -3)
  38. starpufft/starpufftx.c (+10 -4)
  39. starpufft/starpufftx1d.c (+8 -0)
  40. starpufft/starpufftx2d.c (+8 -0)
  41. tests/datawizard/increment_redux.c (+1 -1)
  42. tests/datawizard/increment_redux_lazy.c (+1 -1)
  43. tests/datawizard/interfaces/test_interfaces.c (+4 -2)
  44. tests/datawizard/manual_reduction.c (+2 -2)
  45. tests/datawizard/wt_host.c (+1 -1)
  46. tests/errorcheck/starpu_init_noworker.c (+3 -3)
  47. tests/loader.c (+16 -5)
  48. tests/main/execute_on_a_specific_worker.c (+1 -1)
  49. tests/main/starpu_init.c (+8 -6)
  50. tests/microbenchs/async_tasks_overhead.c (+1 -1)
  51. tests/microbenchs/matrix_as_vector.c (+12 -9)
  52. tests/microbenchs/prefetch_data_on_node.c (+2 -2)
  53. tests/overlap/overlap.c (+1 -1)
  54. tests/parallel_tasks/parallel_kernels.c (+1 -1)
  55. tests/sched_policies/execute_all_tasks.c (+6 -1)
  56. tools/starpu_machine_display.c (+3 -3)
  57. tools/starpu_perfmodel_display.c (+1 -6)

+ 1 - 0
ChangeLog

@@ -51,6 +51,7 @@ Changes:
   * Add tag dependency in trace-generated DAG.
   * Fix CPU binding for optimized CPU-GPU transfers.
   * Fix parallel tasks CPU binding and combined worker generation.
+  * Fix generating FXT traces bigger than 64MiB.
 
 StarPU 1.0.1 (svn revision 6659)
 ==============================================

+ 61 - 3
configure.ac

@@ -271,7 +271,6 @@ AC_ARG_ENABLE(cpu, [AS_HELP_STRING([--disable-cpu],
 			enable_cpu=$enableval, enable_cpu=yes)
 AC_MSG_RESULT($enable_cpu)
 AC_SUBST(STARPU_USE_CPU, $enable_cpu)
-
 AM_CONDITIONAL(STARPU_USE_CPU, test x$enable_cpu = xyes)
 
 if test x$enable_cpu = xyes; then
@@ -733,6 +732,51 @@ fi
 
 ###############################################################################
 #                                                                             #
+# General GPU settings                                                        #
+#                                                                             #
+###############################################################################
+AC_MSG_CHECKING(whether asynchronous copy should be disabled)
+AC_ARG_ENABLE(asynchronous-copy, [AS_HELP_STRING([--disable-asynchronous-copy],
+			[disable asynchronous copy between CPU and GPU])],
+			enable_asynchronous_copy=$enableval, enable_asynchronous_copy=yes)
+disable_asynchronous_copy=no
+if test x$enable_asynchronous_copy = xno ; then
+   disable_asynchronous_copy=yes
+fi
+AC_MSG_RESULT($disable_asynchronous_copy)
+if test x$disable_asynchronous_copy = xyes ; then
+   AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and GPU devices])
+fi
+
+AC_MSG_CHECKING(whether asynchronous CUDA copy should be disabled)
+AC_ARG_ENABLE(asynchronous-cudacopy, [AS_HELP_STRING([--disable-asynchronous-cuda-copy],
+			[disable asynchronous copy between CPU and CUDA devices])],
+			enable_asynchronous_cuda_copy=$enableval, enable_asynchronous_cuda_copy=yes)
+disable_asynchronous_cuda_copy=no
+if test x$enable_asynchronous_cuda_copy = xno ; then
+   disable_asynchronous_cuda_copy=yes
+fi
+AC_MSG_RESULT($disable_asynchronous_cuda_copy)
+if test x$disable_asynchronous_cuda_copy = xyes ; then
+   AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and CUDA devices])
+fi
+
+AC_MSG_CHECKING(whether asynchronous OpenCL copy should be disabled)
+AC_ARG_ENABLE(asynchronous-openclcopy, [AS_HELP_STRING([--disable-asynchronous-opencl-copy],
+			[disable asynchronous copy between CPU and OPENCL devices])],
+			enable_asynchronous_opencl_copy=$enableval, enable_asynchronous_opencl_copy=yes)
+disable_asynchronous_opencl_copy=no
+if test x$enable_asynchronous_opencl_copy = xno ; then
+   disable_asynchronous_opencl_copy=yes
+fi
+AC_MSG_RESULT($disable_asynchronous_opencl_copy)
+if test x$disable_asynchronous_opencl_copy = xyes ; then
+   AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and OpenCL devices])
+fi
+
+
+###############################################################################
+#                                                                             #
 #                                 Cell settings                               #
 #                                                                             #
 ###############################################################################
@@ -897,11 +941,25 @@ if test x$use_fxt = xyes; then
 	AC_DEFINE(CONFIG_FUT, [1], [enable FUT traces])
 
 	if test x$use_fxt_from_system = xno; then
-		CPPFLAGS="${CPPFLAGS} -I$fxtdir/include/ "
-		LDFLAGS="${LDFLAGS} -L$fxtdir/lib/ -lfxt"
+		FXT_CFLAGS="-I$fxtdir/include/ "
+		FXT_LDFLAGS="-L$fxtdir/lib/"
+		FXT_LIBS="-lfxt"
 	else
 	    PKG_CHECK_MODULES([FXT],  [fxt])
 	fi
+	save_LIBS="$LIBS"
+	LIBS="$LIBS $FXT_LIBS"
+	save_LDFLAGS="$LDFLAGS"
+	LDFLAGS="$LDFLAGS $FXT_LDFLAGS"
+   	AC_CHECK_FUNCS([enable_fut_flush])
+   	AC_CHECK_FUNCS([fut_set_filename])
+	LDFLAGS="$save_LDFLAGS"
+	LIBS="$save_LIBS"
+	save_CFLAGS="$CFLAGS"
+	CFLAGS="$CFLAGS $FXT_CFLAGS"
+	AC_CHECK_DECLS([enable_fut_flush])
+	AC_CHECK_DECLS([fut_set_filename])
+	CFLAGS="$save_CFLAGS"
 fi
 
 AC_MSG_CHECKING(whether performance debugging should be enabled)
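
The new configure switches above only define preprocessor macros; the behaviour change happens where those macros are tested. A minimal sketch (not part of this commit; the config header name is an assumption) of the pattern the src/core/workers.c hunk below uses to turn a configure-time choice into a runtime default:

#include <config.h>  /* assumed generated header carrying the AC_DEFINEs above */

/* Returns 1 when asynchronous CUDA copies were disabled at configure
 * time with --disable-asynchronous-cuda-copy, 0 otherwise. */
static int cuda_async_copy_disabled_at_build_time(void)
{
#ifdef STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY
	return 1;
#else
	return 0;
#endif
}

Likewise, the AC_CHECK_FUNCS and AC_CHECK_DECLS calls yield HAVE_ENABLE_FUT_FLUSH, HAVE_FUT_SET_FILENAME, HAVE_DECL_ENABLE_FUT_FLUSH and HAVE_DECL_FUT_SET_FILENAME, which the src/common/fxt.c and src/common/fxt.h hunks below test.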

+ 30 - 1
doc/chapters/basic-api.texi

@@ -143,11 +143,30 @@ This can also be specified with the @code{STARPU_SINGLE_COMBINED_WORKER} environ
 
 @item @code{int disable_asynchronous_copy} (default = 0)
 This flag should be set to 1 to disable asynchronous copies between
-CPUs and accelerators. This can also be specified with the
+CPUs and all accelerators. This can also be specified with the
 @code{STARPU_DISABLE_ASYNCHRONOUS_COPY} environment variable.
 The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
+This can also be specified at compilation time by passing the option
+@code{--disable-asynchronous-copy} to the configure script.
+
+@item @code{int disable_cuda_asynchronous_copy} (default = 0)
+This flag should be set to 1 to disable asynchronous copies between
+CPUs and CUDA accelerators. This can also be specified with the
+@code{STARPU_DISABLE_CUDA_ASYNCHRONOUS_COPY} environment variable.
+This can also be specified at compilation time by passing the option
+@code{--disable-asynchronous-cuda-copy} to the configure script.
+
+@item @code{int disable_opencl_asynchronous_copy} (default = 0)
+This flag should be set to 1 to disable asynchronous copies between
+CPUs and OpenCL accelerators. This can also be specified with the
+@code{STARPU_DISABLE_OPENCL_ASYNCHRONOUS_COPY} environment variable.
+The AMD implementation of OpenCL is known to
+fail when copying data asynchronously. When using this implementation,
+it is therefore necessary to disable asynchronous data transfers.
+This can also be specified at compilation time by passing the option
+@code{--disable-asynchronous-opencl-copy} to the configure script.
 
 @item @code{int *cuda_opengl_interoperability} (default = NULL)
 This can be set to an array of CUDA device identifiers for which
@@ -190,6 +209,16 @@ Return 1 if asynchronous data transfers between CPU and accelerators
 are disabled.
 @end deftypefun
 
+@deftypefun int starpu_asynchronous_cuda_copy_disabled ()
+Return 1 if asynchronous data transfers between CPU and CUDA accelerators
+are disabled.
+@end deftypefun
+
+@deftypefun int starpu_asynchronous_opencl_copy_disabled ()
+Return 1 if asynchronous data transfers between CPU and OpenCL accelerators
+are disabled.
+@end deftypefun
+
 @node Workers' Properties
 @section Workers' Properties
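
As a usage sketch for the fields and query functions documented above (hypothetical application code, not part of this commit):

#include <errno.h>
#include <starpu.h>

int main(void)
{
	struct starpu_conf conf;
	starpu_conf_init(&conf);

	/* Disable asynchronous copies for CUDA devices only; OpenCL
	 * transfers keep their default behaviour. */
	conf.disable_cuda_asynchronous_copy = 1;

	int ret = starpu_init(&conf);
	if (ret == -ENODEV)
		return 77;

	/* The query function returns 1 whatever the origin of the
	 * setting: starpu_conf field, environment variable, or
	 * configure option. */
	STARPU_ASSERT(starpu_asynchronous_cuda_copy_disabled() == 1);

	starpu_shutdown();
	return 0;
}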
 

+ 64 - 93
doc/chapters/configuration.texi

@@ -123,6 +123,20 @@ Allow for at most @var{count} scheduling contexts
 This information is then available as the
 @code{STARPU_NMAX_SCHED_CTXS} macro.
 
+@item --disable-asynchronous-copy
+Disable asynchronous copies between CPU and GPU devices.
+The AMD implementation of OpenCL is known to
+fail when copying data asynchronously. When using this implementation,
+it is therefore necessary to disable asynchronous data transfers.
+
+@item --disable-asynchronous-cuda-copy
+Disable asynchronous copies between CPU and CUDA devices.
+
+@item --disable-asynchronous-opencl-copy
+Disable asynchronous copies between CPU and OpenCL devices.
+The AMD implementation of OpenCL is known to
+fail when copying data asynchronously. When using this implementation,
+it is therefore necessary to disable asynchronous data transfers.
 @end table
 
 @node Extension configuration
@@ -222,6 +236,7 @@ Enables the Scheduling Context Hypervisor plugin(@pxref{Scheduling Context Hyper
 By default, it is disabled.
 
 @end table
+
 @node Execution configuration through environment variables
 @section Execution configuration through environment variables
 
@@ -234,55 +249,31 @@ By default, it is disabled.
 @node Workers
 @subsection Configuring workers
 
-@menu
-* STARPU_NCPU::                	Number of CPU workers
-* STARPU_NCUDA::                	Number of CUDA workers
-* STARPU_NOPENCL::              	Number of OpenCL workers
-* STARPU_NGORDON::              	Number of SPU workers (Cell)
-* STARPU_WORKERS_NOBIND::       	Do not bind workers
-* STARPU_WORKERS_CPUID::        	Bind workers to specific CPUs
-* STARPU_WORKERS_CUDAID::       	Select specific CUDA devices
-* STARPU_WORKERS_OPENCLID::     	Select specific OpenCL devices
-* STARPU_SINGLE_COMBINED_WORKER:: 	Do not use concurrent workers
-* STARPU_MIN_WORKERSIZE::	 	Minimum size of the combined workers
-* STARPU_MAX_WORKERSIZE:: 		Maximum size of the combined workers
-@end menu
-
-@node STARPU_NCPU
-@subsubsection @code{STARPU_NCPU} -- Number of CPU workers
+@table @code
 
+@item @code{STARPU_NCPU}
 Specify the number of CPU workers (thus not including workers dedicated to controlling accelerators). Note that by default, StarPU will not allocate
 more CPU workers than there are physical CPUs, and that some CPUs are used to control
 the accelerators.
 
-@node STARPU_NCUDA
-@subsubsection @code{STARPU_NCUDA} -- Number of CUDA workers
-
+@item @code{STARPU_NCUDA}
 Specify the number of CUDA devices that StarPU can use. If
 @code{STARPU_NCUDA} is lower than the number of physical devices, it is
 possible to select which CUDA devices should be used by the means of the
 @code{STARPU_WORKERS_CUDAID} environment variable. By default, StarPU will
 create as many CUDA workers as there are CUDA devices.
 
-@node STARPU_NOPENCL
-@subsubsection @code{STARPU_NOPENCL} -- Number of OpenCL workers
-
+@item @code{STARPU_NOPENCL}
 OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
 
-@node STARPU_NGORDON
-@subsubsection @code{STARPU_NGORDON} -- Number of SPU workers (Cell)
-
+@item @code{STARPU_NGORDON}
 Specify the number of SPUs that StarPU can use.
 
-@node STARPU_WORKERS_NOBIND
-@subsubsection @code{STARPU_WORKERS_NOBIND} -- Do not bind workers to specific CPUs
-
+@item @code{STARPU_WORKERS_NOBIND}
 Setting it to non-zero will prevent StarPU from binding its threads to
 CPUs. This is for instance useful when running the testsuite in parallel.
 
-@node STARPU_WORKERS_CPUID
-@subsubsection @code{STARPU_WORKERS_CPUID} -- Bind workers to specific CPUs
-
+@item @code{STARPU_WORKERS_CPUID}
 Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}
 specifies on which logical CPU the different workers should be
 bound. For instance, if @code{STARPU_WORKERS_CPUID = "0 1 4 5"}, the first
@@ -306,9 +297,7 @@ third (resp. second and fourth) workers will be put on CPU #0 (resp. CPU #1).
 This variable is ignored if the @code{use_explicit_workers_bindid} flag of the
 @code{starpu_conf} structure passed to @code{starpu_init} is set.
 
-@node STARPU_WORKERS_CUDAID
-@subsubsection @code{STARPU_WORKERS_CUDAID} -- Select specific CUDA devices
-
+@item @code{STARPU_WORKERS_CUDAID}
 Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it is
 possible to select which CUDA devices should be used by StarPU. On a machine
 equipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and
@@ -319,56 +308,57 @@ the one reported by CUDA).
 This variable is ignored if the @code{use_explicit_workers_cuda_gpuid} flag of
 the @code{starpu_conf} structure passed to @code{starpu_init} is set.
 
-@node STARPU_WORKERS_OPENCLID
-@subsubsection @code{STARPU_WORKERS_OPENCLID} -- Select specific OpenCL devices
-
+@item @code{STARPU_WORKERS_OPENCLID}
 OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.
 
 This variable is ignored if the @code{use_explicit_workers_opencl_gpuid} flag of
 the @code{starpu_conf} structure passed to @code{starpu_init} is set.
 
-@node STARPU_SINGLE_COMBINED_WORKER
-@subsubsection @code{STARPU_SINGLE_COMBINED_WORKER} -- Do not use concurrent workers
-
+@item @code{STARPU_SINGLE_COMBINED_WORKER}
 If set, StarPU will create several workers which won't be able to work
 concurrently. It will create combined workers whose sizes range from 1 to the
 total number of CPU workers in the system.
 
-@node STARPU_MIN_WORKERSIZE
-@subsubsection @code{STARPU_MIN_WORKERSIZE} -- Minimum size of the combined workers
+@item @code{SYNTHESIZE_ARITY_COMBINED_WORKER}
 
+@item @code{STARPU_MIN_WORKERSIZE}
 Let the user give a hint to StarPU about the minimum number of
 workers the combined workers should contain.
 
-@node STARPU_MAX_WORKERSIZE
-@subsubsection @code{STARPU_MAX_WORKERSIZE} -- Maximum size of the combined workers
-
+@item @code{STARPU_MAX_WORKERSIZE}
 Let the user give a hint to StarPU about the maximum number of
 workers the combined workers should contain.
 
+@item @code{STARPU_DISABLE_ASYNCHRONOUS_COPY}
+Disable asynchronous copies between CPU and GPU devices.
+The AMD implementation of OpenCL is known to
+fail when copying data asynchronously. When using this implementation,
+it is therefore necessary to disable asynchronous data transfers.
+
+@item @code{STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY}
+Disable asynchronous copies between CPU and CUDA devices.
+
+@item @code{STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY}
+Disable asynchronous copies between CPU and OpenCL devices.
+The AMD implementation of OpenCL is known to
+fail when copying data asynchronously. When using this implementation,
+it is therefore necessary to disable asynchronous data transfers.
+
+@end table
+
 @node Scheduling
 @subsection Configuring the Scheduling engine
 
-@menu
-* STARPU_SCHED::                Scheduling policy
-* STARPU_CALIBRATE::            Calibrate performance models
-* STARPU_BUS_CALIBRATE::        Calibrate bus
-* STARPU_PREFETCH::             Use data prefetch
-* STARPU_SCHED_ALPHA::          Computation factor
-* STARPU_SCHED_BETA::           Communication factor
-@end menu
-
-@node STARPU_SCHED
-@subsubsection @code{STARPU_SCHED} -- Scheduling policy
+@table @code
 
+@item @code{STARPU_SCHED}
 Choose between the different scheduling policies proposed by StarPU: random,
 work stealing, greedy, with performance models, etc.
 
 Use @code{STARPU_SCHED=help} to get the list of available schedulers.
 
-@node STARPU_CALIBRATE
-@subsubsection @code{STARPU_CALIBRATE} -- Calibrate performance models
-
+@item @code{STARPU_CALIBRATE}
 If this variable is set to 1, the performance models are calibrated during
 the execution. If it is set to 2, the previous values are dropped to restart
 calibration from scratch. Setting this variable to 0 disables calibration, which
@@ -376,14 +366,11 @@ is the default behaviour.
 
 Note: this currently only applies to @code{dm}, @code{dmda} and @code{heft} scheduling policies.
 
-@node STARPU_BUS_CALIBRATE
-@subsubsection @code{STARPU_BUS_CALIBRATE} -- Calibrate bus
-
+@item @code{STARPU_BUS_CALIBRATE}
 If this variable is set to 1, the bus is recalibrated during initialization.
 
-@node STARPU_PREFETCH
-@subsubsection @code{STARPU_PREFETCH} -- Use data prefetch
-
+@item @code{STARPU_PREFETCH}
+@anchor{STARPU_PREFETCH}
 This variable indicates whether data prefetching should be enabled (0 means
 that it is disabled). If prefetching is enabled, when a task is scheduled to be
 executed e.g. on a GPU, StarPU will request an asynchronous transfer in
@@ -391,58 +378,42 @@ advance, so that data is already present on the GPU when the task starts. As a
 result, computation and data transfers are overlapped.
 Note that prefetching is enabled by default in StarPU.
 
-@node STARPU_SCHED_ALPHA
-@subsubsection @code{STARPU_SCHED_ALPHA} -- Computation factor
-
+@item @code{STARPU_SCHED_ALPHA}
 To estimate the cost of a task StarPU takes into account the estimated
 computation time (obtained thanks to performance models). The alpha factor is
 the coefficient to be applied to it before adding it to the communication part.
 
-@node STARPU_SCHED_BETA
-@subsubsection @code{STARPU_SCHED_BETA} -- Communication factor
-
+@item @code{STARPU_SCHED_BETA}
 To estimate the cost of a task StarPU takes into account the estimated
 data transfer time (obtained thanks to performance models). The beta factor is
 the coefficient to be applied to it before adding it to the computation part.
 
+@end table
+
 @node Misc
 @subsection Miscellaneous and debug
 
-@menu
-* STARPU_SILENT::               Disable verbose mode
-* STARPU_LOGFILENAME::          Select debug file name
-* STARPU_FXT_PREFIX::           FxT trace location
-* STARPU_LIMIT_GPU_MEM::        Restrict memory size on the GPUs
-* STARPU_GENERATE_TRACE::       Generate a Paje trace when StarPU is shut down
-@end menu
-
-@node STARPU_SILENT
-@subsubsection @code{STARPU_SILENT} -- Disable verbose mode
+@table @code
 
+@item @code{STARPU_SILENT}
 This variable allows disabling verbose mode at runtime when StarPU
 has been configured with the option @code{--enable-verbose}.
 
-@node STARPU_LOGFILENAME
-@subsubsection @code{STARPU_LOGFILENAME} -- Select debug file name
-
+@item @code{STARPU_LOGFILENAME}
 This variable specifies the file to which the debugging output should be saved.
 
-@node STARPU_FXT_PREFIX
-@subsubsection @code{STARPU_FXT_PREFIX} -- FxT trace location
-
+@item @code{STARPU_FXT_PREFIX}
 This variable specifies in which directory to save the trace generated if FxT is enabled. It needs to have a trailing '/' character.
 
-@node STARPU_LIMIT_GPU_MEM
-@subsubsection @code{STARPU_LIMIT_GPU_MEM} -- Restrict memory size on the GPUs
-
+@item @code{STARPU_LIMIT_GPU_MEM}
 This variable specifies the maximum number of megabytes that should be
 available to the application on each GPU. In case this value is smaller than
 the size of the memory of a GPU, StarPU pre-allocates a buffer to waste memory
 on the device. This variable is intended to be used for experimental purposes
 as it emulates devices that have a limited amount of memory.
 
-@node STARPU_GENERATE_TRACE
-@subsubsection @code{STARPU_GENERATE_TRACE} -- Generate a Paje trace when StarPU is shut down
-
+@item @code{STARPU_GENERATE_TRACE}
 When set to 1, this variable indicates that StarPU should automatically
 generate a Paje trace when starpu_shutdown is called.
+
+@end table
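
The variables above are read when starpu_init is called, so a test program can also set them programmatically before initialization; a minimal sketch (not part of this commit):

#include <errno.h>
#include <stdlib.h>
#include <starpu.h>

int main(void)
{
	/* Equivalent to exporting these variables before running. */
	setenv("STARPU_SCHED", "heft", 1);
	setenv("STARPU_CALIBRATE", "1", 1);
	setenv("STARPU_DISABLE_CUDA_ASYNCHRONOUS_COPY", "1", 1);

	if (starpu_init(NULL) == -ENODEV)
		return 77;
	starpu_shutdown();
	return 0;
}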

+ 5 - 4
doc/chapters/mpi-support.texi

@@ -311,10 +311,11 @@ static struct starpu_data_interface_ops interface_complex_ops =
 To save the programmer from having to explicit all communications, StarPU
 provides an "MPI Insert Task Utility". The principle is that the application
 decides a distribution of the data over the MPI nodes by allocating it and
-notifying StarPU of that decision, i.e. tell StarPU which MPI node "owns" which
-data. All MPI nodes then process the whole task graph, and StarPU automatically
-determines which node actually execute which task, as well as the required MPI
-transfers.
+notifying StarPU of that decision, i.e. telling StarPU which MPI node "owns"
+which data. It also decides, for each handle, an MPI tag which will be used to
+exchange the content of the handle. All MPI nodes then process the whole task
+graph, and StarPU automatically determines which node actually executes which
+task, and triggers the required MPI transfers.
 
 @deftypefun int starpu_data_set_tag (starpu_data_handle_t @var{handle}, int @var{tag})
 Tell StarPU-MPI which MPI tag to use when exchanging the data.
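
A sketch of the owner/tag declaration described above (hypothetical codelet and data, not part of this commit; both calls are mandatory, as the checks added in mpi/src/starpu_mpi_insert_task.c below now enforce):

#include <starpu.h>
#include <starpu_mpi.h>

extern struct starpu_codelet cl;  /* hypothetical codelet defined elsewhere */

void submit_on_owner(float *vec, unsigned nx, int owner_rank)
{
	starpu_data_handle_t handle;
	starpu_vector_data_register(&handle, 0, (uintptr_t)vec, nx, sizeof(vec[0]));

	starpu_data_set_rank(handle, owner_rank); /* which MPI node "owns" the data */
	starpu_data_set_tag(handle, 42);          /* MPI tag used for its transfers */

	/* Every node runs this; StarPU decides which node executes the
	 * task and triggers the needed MPI transfers. */
	starpu_mpi_insert_task(MPI_COMM_WORLD, &cl, STARPU_RW, handle, 0);

	starpu_data_unregister(handle);
}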

+ 2 - 2
examples/Makefile.am

@@ -144,7 +144,8 @@ noinst_HEADERS = 				\
 	pi/SobolQRNG/sobol_gold.h		\
 	pi/SobolQRNG/sobol_gpu.h		\
 	pi/SobolQRNG/sobol_primitives.h         \
-	reductions/dot_product.h
+	reductions/dot_product.h                \
+	basic_examples/vector_scal_cpu_template.h
 
 #####################################
 # What to install and what to check #
@@ -315,7 +316,6 @@ basic_examples_vector_scal_SOURCES =		\
 if STARPU_HAVE_ICC
 basic_examples_vector_scal_SOURCES +=		\
 	basic_examples/vector_scal_cpu_icc.icc
-basic_examples/vector_scal_cpu_icc.o: CFLAGS += -Dscal_cpu_func=scal_cpu_func_icc -Dscal_sse_func=scal_sse_func_icc
 endif
 
 if STARPU_USE_CUDA

+ 3 - 62
examples/basic_examples/vector_scal_cpu.c

@@ -18,67 +18,8 @@
  * This example complements vector_scale.c: here we implement a CPU version.
  */
 
-#include <starpu.h>
-#ifdef __SSE__
-#include <xmmintrin.h>
-#endif
+#include "vector_scal_cpu_template.h"
 
-/* This kernel takes a buffer and scales it by a constant factor */
-void scal_cpu_func(void *buffers[], void *cl_arg)
-{
-	unsigned i;
-	float *factor = (float *) cl_arg;
+VECTOR_SCAL_CPU_FUNC(scal_cpu_func)
+VECTOR_SCAL_SSE_FUNC(scal_sse_func)
 
-	/*
-	 * The "buffers" array matches the task->handles array: for instance
-	 * task->handles[0] is a handle that corresponds to a data with
-	 * vector "interface", so that the first entry of the array in the
-	 * codelet  is a pointer to a structure describing such a vector (ie.
-	 * struct starpu_vector_interface *). Here, we therefore manipulate
-	 * the buffers[0] element as a vector: nx gives the number of elements
-	 * in the array, ptr gives the location of the array (that was possibly
-	 * migrated/replicated), and elemsize gives the size of each elements.
-	 */
-
-	struct starpu_vector_interface *vector = (struct starpu_vector_interface *) buffers[0];
-
-	/* length of the vector */
-	unsigned n = STARPU_VECTOR_GET_NX(vector);
-
-	/* get a pointer to the local copy of the vector : note that we have to
-	 * cast it in (float *) since a vector could contain any type of
-	 * elements so that the .ptr field is actually a uintptr_t */
-	float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
-
-	/* scale the vector */
-	for (i = 0; i < n; i++)
-		val[i] *= *factor;
-}
-
-#ifdef __SSE__
-void scal_sse_func(void *buffers[], void *cl_arg)
-{
-	float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
-	unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
-	unsigned int n_iterations = n/4;
-
-	__m128 *VECTOR = (__m128*) vector;
-	__m128 FACTOR __attribute__((aligned(16)));
-	float factor = *(float *) cl_arg;
-	FACTOR = _mm_set1_ps(factor);
-
-	unsigned int i;	
-	for (i = 0; i < n_iterations; i++)
-		VECTOR[i] = _mm_mul_ps(FACTOR, VECTOR[i]);
-
-	unsigned int remainder = n%4;
-	if (remainder != 0)
-	{
-		unsigned int start = 4 * n_iterations;
-		for (i = start; i < start+remainder; ++i)
-		{
-			vector[i] = factor * vector[i];
-		}
-	}
-}
-#endif

+ 0 - 1
examples/basic_examples/vector_scal_cpu_icc.icc

@@ -1 +0,0 @@
-vector_scal_cpu.c

+ 26 - 0
examples/basic_examples/vector_scal_cpu_icc.icc

@@ -0,0 +1,26 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example complements vector_scale.c: here we implement a CPU version,
+ * meant to be compiled by icc.
+ */
+
+#include "vector_scal_cpu_template.h"
+
+VECTOR_SCAL_CPU_FUNC(scal_cpu_func_icc)
+VECTOR_SCAL_SSE_FUNC(scal_sse_func_icc)
+

+ 93 - 0
examples/basic_examples/vector_scal_cpu_template.h

@@ -0,0 +1,93 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example complements vector_scale.c: here we implement a CPU version.
+ */
+
+#ifndef __VECTOR_SCAL_CPU_TEMPLATE_H__
+#define __VECTOR_SCAL_CPU_TEMPLATE_H__
+
+#include <starpu.h>
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
+
+/* This kernel takes a buffer and scales it by a constant factor */
+#define VECTOR_SCAL_CPU_FUNC(func_name)                                        \
+void func_name(void *buffers[], void *cl_arg)                                  \
+{                                                                              \
+	unsigned i;                                                            \
+	float *factor = (float *) cl_arg;                                      \
+                                                                               \
+	/*                                                                     \
+	 * The "buffers" array matches the task->handles array: for instance   \
+	 * task->handles[0] is a handle that corresponds to a data with        \
+	 * vector "interface", so that the first entry of the array in the     \
+	 * codelet  is a pointer to a structure describing such a vector (ie.  \
+	 * struct starpu_vector_interface *). Here, we therefore manipulate    \
+	 * the buffers[0] element as a vector: nx gives the number of elements \
+	 * in the array, ptr gives the location of the array (that was possibly \
+	 * migrated/replicated), and elemsize gives the size of each elements.  \
+	 */                                                                    \
+                                                                               \
+	struct starpu_vector_interface *vector = (struct starpu_vector_interface *) buffers[0]; \
+                                                                               \
+	/* length of the vector */                                             \
+	unsigned n = STARPU_VECTOR_GET_NX(vector);                             \
+                                                                               \
+	/* get a pointer to the local copy of the vector : note that we have to \
+	 * cast it in (float *) since a vector could contain any type of       \
+	 * elements so that the .ptr field is actually a uintptr_t */          \
+	float *val = (float *)STARPU_VECTOR_GET_PTR(vector);                   \
+                                                                               \
+	/* scale the vector */                                                 \
+	for (i = 0; i < n; i++)                                                \
+		val[i] *= *factor;                                             \
+}
+
+#ifdef __SSE__
+#define VECTOR_SCAL_SSE_FUNC(func_name)                                        \
+void func_name(void *buffers[], void *cl_arg)                                  \
+{                                                                              \
+	float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);           \
+	unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);                     \
+	unsigned int n_iterations = n/4;                                       \
+                                                                               \
+	__m128 *VECTOR = (__m128*) vector;                                     \
+	__m128 FACTOR __attribute__((aligned(16)));                            \
+	float factor = *(float *) cl_arg;                                      \
+	FACTOR = _mm_set1_ps(factor);                                          \
+                                                                               \
+	unsigned int i;	                                                       \
+	for (i = 0; i < n_iterations; i++)                                     \
+		VECTOR[i] = _mm_mul_ps(FACTOR, VECTOR[i]);                     \
+                                                                               \
+	unsigned int remainder = n%4;                                          \
+	if (remainder != 0)                                                    \
+	{                                                                      \
+		unsigned int start = 4 * n_iterations;                         \
+		for (i = start; i < start+remainder; ++i)                      \
+		{                                                              \
+			vector[i] = factor * vector[i];                        \
+		}                                                              \
+	}                                                                      \
+}
+#else /* !__SSE__ */
+#define VECTOR_SCAL_SSE_FUNC(func_name)
+#endif /* !__SSE__ */
+
+#endif /* !__VECTOR_SCAL_CPU_TEMPLATE_H__ */
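
These macros replace the per-object -Dscal_cpu_func=... renaming that the Makefile.am hunk above removes: each compiler (gcc, icc) instantiates the same kernels under its own names. A sketch of instantiating them and wiring them into a codelet (hypothetical names; field layout as in StarPU 1.0):

#include <starpu.h>
#include "vector_scal_cpu_template.h"

/* Instantiate the generic kernels under new names. */
VECTOR_SCAL_CPU_FUNC(scal_cpu_func_generic)
VECTOR_SCAL_SSE_FUNC(scal_sse_func_generic)

static struct starpu_codelet cl =
{
	.where = STARPU_CPU,
	.cpu_funcs = {
		scal_cpu_func_generic,
#ifdef __SSE__
		scal_sse_func_generic, /* SSE variant only exists when __SSE__ is defined */
#endif
		NULL},
	.nbuffers = 1,
	.modes = {STARPU_RW},
};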

+ 2 - 2
examples/cholesky/cholesky.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -147,7 +147,7 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 			check = 1;
 		}
 
-		if (strcmp(argv[i], "-h") == 0)
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i],"--help") == 0)
 		{
 			printf("usage : %s [-pin] [-size size] [-nblocks nblocks] [-check]\n", argv[0]);
 		}

+ 1 - 1
examples/incrementer/incrementer.c

@@ -94,7 +94,7 @@ int main(int argc, char **argv)
 
 		task->handles[0] = float_array_handle;
 
-		int ret = starpu_task_submit(task);
+		ret = starpu_task_submit(task);
 		if (STARPU_UNLIKELY(ret == -ENODEV))
 		{
 			FPRINTF(stderr, "No worker may execute this task\n");

+ 0 - 1
examples/lu/xlu_pivot.c

@@ -312,7 +312,6 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 	gettimeofday(&start, NULL);
 	ret = starpu_task_submit(entry_task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-	return ret;
 
 	/* stall the application until the end of computations */
 	starpu_tag_wait_array(ndeps, tags);

+ 1 - 1
examples/mult/xgemm.c

@@ -304,7 +304,7 @@ int main(int argc, char **argv)
 			task->handles[1] = starpu_data_get_sub_data(B_handle, 1, x);
 			task->handles[2] = starpu_data_get_sub_data(C_handle, 2, x, y);
 
-			int ret = starpu_task_submit(task);
+			ret = starpu_task_submit(task);
 			if (ret == -ENODEV)
 			{
 			     ret = 77;

+ 1 - 1
examples/pi/pi.c

@@ -155,7 +155,7 @@ int main(int argc, char **argv)
 		task->handles[0] = sobol_qrng_direction_handle;
 		task->handles[1] = starpu_data_get_sub_data(cnt_array_handle, 1, i);
 
-		int ret = starpu_task_submit(task);
+		ret = starpu_task_submit(task);
 		STARPU_ASSERT(!ret);
 	}
 

+ 8 - 8
examples/pi/pi_redux.c

@@ -46,7 +46,7 @@ static unsigned do_warmup = 0;
 #ifdef STARPU_HAVE_CURAND
 /* RNG for the CURAND library */
 static curandGenerator_t curandgens[STARPU_NMAXWORKERS];
-#endif 
+#endif
 
 /* state for the erand48 function : note the huge padding to avoid false-sharing */
 #define PADDING	1024
@@ -132,7 +132,7 @@ static void pi_func_cpu(void *descr[], void *cl_arg __attribute__ ((unused)))
 
 	unsigned short *worker_xsub;
 	worker_xsub = &xsubi[PADDING*workerid];
-	
+
 	struct drand48_data *buffer;
 	buffer = &randbuffer[PADDING*workerid];
 
@@ -165,7 +165,7 @@ extern void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned long *
 #ifdef STARPU_HAVE_CURAND
 static void pi_func_cuda(void *descr[], void *cl_arg __attribute__ ((unused)))
 {
-	curandStatus_t res;	
+	curandStatus_t res;
 
 	int workerid = starpu_worker_get_id();
 
@@ -260,7 +260,7 @@ static void redux_cuda_func(void *descr[], void *cl_arg)
 	unsigned long *d_b = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
 	unsigned long h_a, h_b;
-	
+
 	cudaMemcpyAsync(&h_a, d_a, sizeof(h_a), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	cudaMemcpyAsync(&h_b, d_b, sizeof(h_b), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
@@ -269,7 +269,7 @@ static void redux_cuda_func(void *descr[], void *cl_arg)
 
 	cudaMemcpyAsync(d_a, &h_a, sizeof(h_a), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
-};
+}
 #endif
 
 static void redux_cpu_func(void *descr[], void *cl_arg)
@@ -278,7 +278,7 @@ static void redux_cpu_func(void *descr[], void *cl_arg)
 	unsigned long *b = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
 	*a = *a + *b;
-};
+}
 
 static struct starpu_codelet redux_codelet =
 {
@@ -340,7 +340,7 @@ int main(int argc, char **argv)
 		task->handles[0] = xy_scratchpad_handle;
 		task->handles[1] = shot_cnt_handle;
 
-		int ret = starpu_task_submit(task);
+		ret = starpu_task_submit(task);
 		STARPU_ASSERT(!ret);
 	}
 
@@ -356,7 +356,7 @@ int main(int argc, char **argv)
 		task->handles[0] = xy_scratchpad_handle;
 		task->handles[1] = shot_cnt_handle;
 
-		int ret = starpu_task_submit(task);
+		ret = starpu_task_submit(task);
 		STARPU_ASSERT(!ret);
 	}
 

+ 3 - 3
examples/profiling/profiling.c

@@ -70,10 +70,10 @@ int main(int argc, char **argv)
 		/* We will destroy the task structure by hand so that we can
 		 * query the profiling info before the task is destroyed. */
 		task->destroy = 0;
-		
+
 		tasks[i] = task;
 
-		int ret = starpu_task_submit(task);
+		ret = starpu_task_submit(task);
 		if (STARPU_UNLIKELY(ret == -ENODEV))
 		{
 			FPRINTF(stderr, "No worker may execute this task\n");
@@ -111,7 +111,7 @@ int main(int argc, char **argv)
 	for (worker = 0; worker < starpu_worker_get_count(); worker++)
 	{
 		struct starpu_worker_profiling_info worker_info;
-		int ret = starpu_worker_get_profiling_info(worker, &worker_info);
+		ret = starpu_worker_get_profiling_info(worker, &worker_info);
 		STARPU_ASSERT(!ret);
 
 		double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);

+ 4 - 4
examples/reductions/dot_product.c

@@ -347,7 +347,7 @@ int main(int argc, char **argv)
 	assert(x && y);
 
         starpu_srand48(0);
-	
+
 	DOT_TYPE reference_dot = 0.0;
 
 	unsigned long i;
@@ -357,8 +357,8 @@ int main(int argc, char **argv)
 		y[i] = (float)starpu_drand48();
 
 		reference_dot += (DOT_TYPE)x[i]*(DOT_TYPE)y[i];
-	} 
-	
+	}
+
 	unsigned block;
 	for (block = 0; block < nblocks; block++)
 	{
@@ -386,7 +386,7 @@ int main(int argc, char **argv)
 		task->handles[1] = y_handles[block];
 		task->handles[2] = dot_handle;
 
-		int ret = starpu_task_submit(task);
+		ret = starpu_task_submit(task);
 		if (ret == -ENODEV) goto enodev;
 		STARPU_ASSERT(!ret);
 	}

+ 2 - 2
examples/stencil/stencil.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -104,7 +104,7 @@ static void parse_args(int argc, char **argv)
 			ticks = atoi(argv[++i]);
 		}
 
-		if (strcmp(argv[i], "-h") == 0)
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
 		{
 			 fprintf(stderr, "Usage : %s [options...]\n", argv[0]);
 			 fprintf(stderr, "\n");

+ 7 - 1
include/starpu.h

@@ -130,9 +130,15 @@ struct starpu_conf
 	/* Create only one combined worker, containing all CPU workers */
 	int single_combined_worker;
 
-        /* indicate if the asynchronous copies should be disabled */
+        /* indicate if all asynchronous copies should be disabled */
 	int disable_asynchronous_copy;
 
+        /* indicate if asynchronous copies to CUDA devices should be disabled */
+	int disable_cuda_asynchronous_copy;
+
+        /* indicate if asynchronous copies to OpenCL devices should be disabled */
+	int disable_opencl_asynchronous_copy;
+
 	/* Enable CUDA/OpenGL interoperation on these CUDA devices */
 	int *cuda_opengl_interoperability;
 	unsigned n_cuda_opengl_interoperability;

+ 3 - 3
include/starpu_profiling.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -150,8 +150,8 @@ static inline void starpu_timespec_accumulate(struct timespec *result,
 }
 
 /* Computes result = a - b */
-static inline void starpu_timespec_sub(struct timespec *a,
-					struct timespec *b,
+static inline void starpu_timespec_sub(const struct timespec *a,
+					const struct timespec *b,
 					struct timespec *result)
 {
 	result->tv_sec = a->tv_sec - b->tv_sec;

+ 40 - 2
mpi/src/starpu_mpi_insert_task.c

@@ -113,7 +113,14 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 	if (data && mode & STARPU_R) {
 		int mpi_rank = starpu_data_get_rank(data);
 		int mpi_tag = starpu_data_get_tag(data);
-		STARPU_ASSERT(mpi_tag >= 0 && "StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank");
+		if(mpi_rank == -1) {
+			fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
+			STARPU_ABORT();
+		}
+		if(mpi_tag == -1) {
+			fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
+			STARPU_ABORT();
+		}
 		/* The task needs to read this data */
 		if (do_execute && mpi_rank != me && mpi_rank != -1) {
 			/* I will have to execute but I don't have the data, receive */
@@ -164,7 +171,14 @@ void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum s
 	if (mode & STARPU_W) {
 		int mpi_rank = starpu_data_get_rank(data);
 		int mpi_tag = starpu_data_get_tag(data);
-		STARPU_ASSERT(mpi_tag >= 0 && "StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank");
+		if(mpi_rank == -1) {
+			fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
+			STARPU_ABORT();
+		}
+		if(mpi_tag == -1) {
+			fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
+			STARPU_ABORT();
+		}
 		if (mpi_rank == me) {
 			if (xrank != -1 && me != xrank) {
 				_STARPU_MPI_DEBUG("Receive data %p back from the task %d which executed the codelet ...\n", data, dest);
@@ -520,6 +534,14 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 
 	rank = starpu_data_get_rank(data_handle);
 	tag = starpu_data_get_tag(data_handle);
+	if(rank == -1) {
+		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
+		STARPU_ABORT();
+	}
+	if(tag == -1) {
+		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
+		STARPU_ABORT();
+	}
 	MPI_Comm_rank(comm, &me);
 
 	if (node == rank) return;
@@ -540,6 +562,14 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 
 	rank = starpu_data_get_rank(data_handle);
 	tag = starpu_data_get_tag(data_handle);
+	if(rank == -1) {
+		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
+		STARPU_ABORT();
+	}
+	if(tag == -1) {
+		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
+		STARPU_ABORT();
+	}
 	MPI_Comm_rank(comm, &me);
 
 	if (node == rank) return;
@@ -561,6 +591,14 @@ void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 
 	rank = starpu_data_get_rank(data_handle);
 	tag = starpu_data_get_tag(data_handle);
+	if(rank == -1) {
+		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
+		STARPU_ABORT();
+	}
+	if(tag == -1) {
+		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
+		STARPU_ABORT();
+	}
 
 	MPI_Comm_rank(comm, &me);
 	MPI_Comm_size(comm, &nb_nodes);

+ 14 - 2
src/common/fxt.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -97,6 +97,18 @@ void _starpu_start_fxt_profiling(void)
 		_starpu_profile_set_tracefile(NULL);
 	}
 
+#ifdef HAVE_FUT_SET_FILENAME
+	fut_set_filename(_STARPU_PROF_FILE_USER);
+#endif
+#ifdef HAVE_ENABLE_FUT_FLUSH
+	// when the event buffer is full, fxt stops recording events.
+	// The trace may thus be incomplete.
+	// Enable the fut_flush function which is called when the
+	// fxt event buffer is full to flush the buffer to disk,
+	// thus allowing the remaining events to be recorded.
+	enable_fut_flush();
+#endif
+
 	threadid = _starpu_gettid();
 
 	atexit(_starpu_stop_fxt_profiling);

+ 14 - 2
src/common/fxt.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011 Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -106,6 +106,18 @@
 #include <fxt/fxt.h>
 #include <fxt/fut.h>
 
+/* Some versions of FxT do not include the declaration of the function */
+#ifdef HAVE_ENABLE_FUT_FLUSH
+#if !HAVE_DECL_ENABLE_FUT_FLUSH
+void enable_fut_flush();
+#endif
+#endif
+#ifdef HAVE_FUT_SET_FILENAME
+#if !HAVE_DECL_FUT_SET_FILENAME
+void fut_set_filename(char *filename);
+#endif
+#endif
+
 long _starpu_gettid(void);
 
 /* Initialize the FxT library. */

+ 1 - 7
src/core/combined_workers.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -29,12 +29,6 @@
 #include <windows.h>
 #endif
 
-#ifndef HWLOC_BITMAP_H
-/* hwloc <1.1 does not offer the bitmap API yet */
-#define hwloc_bitmap_alloc hwloc_cpuset_alloc
-#define hwloc_bitmap_or hwloc_cpuset_or
-#endif
-
 static int compar_int(const void *pa, const void *pb)
 {
 	int a = *((int *)pa);

+ 3 - 5
src/core/perfmodel/perfmodel_bus.c

@@ -79,12 +79,12 @@ static double opencldev_timing_dtoh[STARPU_MAXNODES] = {0.0};
 static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*STARPU_MAXCPUS];
 #endif
 
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
-
 #ifdef STARPU_HAVE_HWLOC
 static hwloc_topology_t hwtopology;
 #endif
 
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
+
 #ifdef STARPU_USE_CUDA
 
 static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
@@ -516,7 +516,6 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
 
 static void benchmark_all_gpu_devices(void)
 {
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 	int i;
 #ifdef HAVE_CUDA_MEMCPY_PEER
 	int j;
@@ -530,7 +529,7 @@ static void benchmark_all_gpu_devices(void)
 #endif
 
 #ifdef STARPU_HAVE_HWLOC
-	hwloc_cpuset_t former_cpuset = hwloc_bitmap_alloc();
+	hwloc_bitmap_t former_cpuset = hwloc_bitmap_alloc();
 	hwloc_get_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
 #elif __linux__
 	/* Save the current cpu binding */
@@ -595,7 +594,6 @@ static void benchmark_all_gpu_devices(void)
 #endif
 
 	_STARPU_DEBUG("Benchmarking the speed of the bus is done.\n");
-#endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */
 
 	was_benchmarked = 1;
 }

+ 2 - 3
src/core/perfmodel/perfmodel_history.c

@@ -865,13 +865,12 @@ int starpu_perfmodel_list(FILE *output)
                                 fprintf(output, "file: <%s>\n", ep->d_name);
                 }
                 closedir (dp);
-                return 0;
         }
         else
 	{
-                perror("Couldn't open the directory");
-                return 1;
+		_STARPU_DISP("Could not open the perfmodel directory <%s>\n", path);
         }
+	return 0;
 }
 
 /* This function is intended to be used by external tools that should read the

+ 1 - 1
src/core/perfmodel/perfmodel_print.c

@@ -77,7 +77,7 @@ void starpu_perfmodel_print(struct starpu_perfmodel *model, enum starpu_perf_arc
 		/* no specific parameter was requested, so we display everything */
 		if (arch_model->regression.nsample)
 		{
-			fprintf(output, "\tRegression : #sample = %d\n", arch_model->regression.nsample);
+			fprintf(output, "\tRegression : #sample = %u\n", arch_model->regression.nsample);
 		}
 
 		/* Only display the regression model if we could actually build a model */

+ 3 - 9
src/core/topology.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  INRIA
  *
@@ -36,12 +36,6 @@
 #ifdef STARPU_HAVE_WINDOWS
 #include <windows.h>
 #endif
-#ifndef HWLOC_BITMAP_H
-/* hwloc <1.1 does not offer the bitmap API yet */
-#define hwloc_bitmap_alloc hwloc_cpuset_alloc
-#define hwloc_bitmap_only hwloc_cpuset_cpu
-#define hwloc_bitmap_singlify hwloc_cpuset_singlify
-#endif
 
 static unsigned topology_is_initialized = 0;
 
@@ -655,7 +649,7 @@ void _starpu_bind_thread_on_cpu(struct _starpu_machine_config *config STARPU_ATT
 	if (support->cpubind->set_thisthread_cpubind)
 	{
 		hwloc_obj_t obj = hwloc_get_obj_by_depth(config->topology.hwtopology, config->cpu_depth, cpuid);
-		hwloc_cpuset_t set = obj->cpuset;
+		hwloc_bitmap_t set = obj->cpuset;
 		int ret;
 
 		hwloc_bitmap_singlify(set);
@@ -706,7 +700,7 @@ void _starpu_bind_thread_on_cpus(struct _starpu_machine_config *config STARPU_AT
 	support = hwloc_topology_get_support(config->topology.hwtopology);
 	if (support->cpubind->set_thisthread_cpubind)
 	{
-		hwloc_cpuset_t set = combined_worker->hwloc_cpu_set;
+		hwloc_bitmap_t set = combined_worker->hwloc_cpu_set;
 		int ret;
 
 		ret = hwloc_set_cpubind(config->topology.hwtopology, set, HWLOC_CPUBIND_THREAD);

+ 32 - 0
src/core/workers.c

@@ -500,9 +500,29 @@ int starpu_conf_init(struct starpu_conf *conf)
 	if (conf->single_combined_worker == -1)
 	     conf->single_combined_worker = 0;
 
+#if defined(STARPU_DISABLE_ASYNCHRONOUS_COPY)
+	conf->disable_asynchronous_copy = 1;
+#else
 	conf->disable_asynchronous_copy = starpu_get_env_number("STARPU_DISABLE_ASYNCHRONOUS_COPY");
 	if (conf->disable_asynchronous_copy == -1)
 		conf->disable_asynchronous_copy = 0;
+#endif
+
+#if defined(STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY)
+	conf->disable_cuda_asynchronous_copy = 1;
+#else
+	conf->disable_cuda_asynchronous_copy = starpu_get_env_number("STARPU_DISABLE_CUDA_ASYNCHRONOUS_COPY");
+	if (conf->disable_cuda_asynchronous_copy == -1)
+		conf->disable_cuda_asynchronous_copy = 0;
+#endif
+
+#if defined(STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY)
+	conf->disable_opencl_asynchronous_copy = 1;
+#else
+	conf->disable_opencl_asynchronous_copy = starpu_get_env_number("STARPU_DISABLE_OPENCL_ASYNCHRONOUS_COPY");
+	if (conf->disable_opencl_asynchronous_copy == -1)
+		conf->disable_opencl_asynchronous_copy = 0;
+#endif
 
 	return 0;
 }
@@ -534,6 +554,8 @@ static void _starpu_conf_check_environment(struct starpu_conf *conf)
 	_starpu_conf_set_value_against_environment("STARPU_BUS_CALIBRATE", &conf->bus_calibrate);
 	_starpu_conf_set_value_against_environment("STARPU_SINGLE_COMBINED_WORKER", &conf->single_combined_worker);
 	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_COPY", &conf->disable_asynchronous_copy);
+	_starpu_conf_set_value_against_environment("STARPU_DISABLE_CUDA_ASYNCHRONOUS_COPY", &conf->disable_cuda_asynchronous_copy);
+	_starpu_conf_set_value_against_environment("STARPU_DISABLE_OPENCL_ASYNCHRONOUS_COPY", &conf->disable_opencl_asynchronous_copy);
 }
 
 int starpu_init(struct starpu_conf *user_conf)
@@ -903,6 +925,16 @@ int starpu_asynchronous_copy_disabled()
 	return config.conf->disable_asynchronous_copy;
 }
 
+int starpu_asynchronous_cuda_copy_disabled()
+{
+	return config.conf->disable_cuda_asynchronous_copy;
+}
+
+int starpu_asynchronous_opencl_copy_disabled()
+{
+	return config.conf->disable_opencl_asynchronous_copy;
+}
+
 /* When analyzing performance, it is useful to see what is the processing unit
  * that actually performed the task. This function returns the id of the
  * processing unit actually executing it, therefore it makes no sense to use it

+ 3 - 3
src/core/workers.h

@@ -99,8 +99,8 @@ struct _starpu_worker
 	cpu_set_t current_cpu_set;
 #endif /* __GLIBC__ */
 #ifdef STARPU_HAVE_HWLOC
-	hwloc_cpuset_t initial_hwloc_cpu_set;
-	hwloc_cpuset_t current_hwloc_cpu_set;
+	hwloc_bitmap_t initial_hwloc_cpu_set;
+	hwloc_bitmap_t current_hwloc_cpu_set;
 #endif
 };
 
@@ -116,7 +116,7 @@ struct _starpu_combined_worker
 	cpu_set_t cpu_set;
 #endif /* __GLIBC__ */
 #ifdef STARPU_HAVE_HWLOC
-	hwloc_cpuset_t hwloc_cpu_set;
+	hwloc_bitmap_t hwloc_cpu_set;
 #endif
 };
 

+ 23 - 0
src/datawizard/interfaces/data_interface.c

@@ -274,6 +274,9 @@ static starpu_data_handle_t _starpu_data_handle_allocate(struct starpu_data_inte
 
 	}
 
+	handle->tag = -1;
+	handle->rank = -1;
+
 	return handle;
 }
 
@@ -303,6 +306,26 @@ void starpu_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
 #endif
 	}
 
+#ifdef STARPU_USE_CUDA
+	int asynchronous_cuda_copy_disabled = starpu_asynchronous_cuda_copy_disabled();
+	if (STARPU_UNLIKELY(asynchronous_cuda_copy_disabled))
+	{
+		ops->copy_methods->ram_to_cuda_async = NULL;
+		ops->copy_methods->cuda_to_ram_async = NULL;
+		ops->copy_methods->cuda_to_cuda_async = NULL;
+	}
+#endif
+
+#ifdef STARPU_USE_OPENCL
+	int asynchronous_opencl_copy_disabled = starpu_asynchronous_opencl_copy_disabled();
+	if (STARPU_UNLIKELY(asynchronous_opencl_copy_disabled))
+	{
+		ops->copy_methods->ram_to_opencl_async = NULL;
+		ops->copy_methods->opencl_to_ram_async = NULL;
+		ops->copy_methods->opencl_to_opencl_async = NULL;
+	}
+#endif
+
 	/* fill the interface fields with the appropriate method */
 	STARPU_ASSERT(ops->register_data_handle);
 	ops->register_data_handle(handle, home_node, data_interface);

+ 1 - 1
src/debug/traces/starpu_fxt.h

@@ -34,7 +34,7 @@
 
 #include <common/fxt.h>
 #include <common/list.h>
-#include "../mpi/starpu_mpi_fxt.h"
+#include "../mpi/src/starpu_mpi_fxt.h"
 #include <starpu.h>
 #include "../../../include/starpu_fxt.h"
 

+ 15 - 8
src/drivers/cuda/driver_cuda.c

@@ -306,13 +306,20 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 static struct _starpu_worker*
 _starpu_get_worker_from_driver(struct starpu_driver *d)
 {
-	int workers[d->id.cuda_id + 1];
-	int nworkers;
-	nworkers = starpu_worker_get_ids_by_type(STARPU_CUDA_WORKER, workers, d->id.cuda_id+1);
-	if (nworkers >= 0 && (unsigned) nworkers < d->id.cuda_id)
-		return NULL; // No device was found.
-	
-	return _starpu_get_worker_struct(workers[d->id.cuda_id]);
+	unsigned nworkers = starpu_worker_get_count();
+	unsigned  workerid;
+	for (workerid = 0; workerid < nworkers; workerid++)
+	{
+		if (starpu_worker_get_type(workerid) == d->type)
+		{
+			struct _starpu_worker *worker;
+			worker = _starpu_get_worker_struct(workerid);
+			if (worker->devid == d->id.cuda_id)
+				return worker;
+		}
+	}
+
+	return NULL;
 }
 
 /* XXX Should this be merged with _starpu_init_cuda ? */
@@ -527,7 +534,7 @@ void starpu_cublas_report_error(const char *func, const char *file, int line, cu
 			errormsg = "unknown error";
 			break;
 	}
-	printf("oops in %s (%s:%u)... %d: %s \n", func, file, line, status, errormsg);
+	fprintf(stderr, "oops in %s (%s:%u)... %d: %s \n", func, file, line, status, errormsg);
 	STARPU_ASSERT(0);
 }
 

+ 1 - 0
src/sched_policies/heft.c

@@ -289,6 +289,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	return starpu_push_local_task(best_workerid, task, prio);
 }
 
+/* TODO: Correct the bugs in the scheduling !!! */
 /* TODO: factorize with dmda!! */
 static void compute_all_performance_predictions(struct starpu_task *task,
 						double (*local_task_length)[STARPU_MAXIMPLEMENTATIONS], 

+ 5 - 2
starpufft/examples/testx.c

@@ -127,6 +127,7 @@ int main(int argc, char *argv[])
 #endif
 
 	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	if (argc == 1)
@@ -221,7 +222,8 @@ int main(int argc, char *argv[])
 	printf("CUDA took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
 #endif
 
-	STARPUFFT(execute)(plan, in, out);
+	ret = STARPUFFT(execute)(plan, in, out);
+	if (ret == -1) return 77;
 	STARPUFFT(showstats)(stdout);
 
 #ifdef STARPU_HAVE_FFTW
@@ -235,7 +237,8 @@ int main(int argc, char *argv[])
 	starpu_vector_data_register(&in_handle, 0, (uintptr_t) in, size, sizeof(*in));
 	starpu_vector_data_register(&out_handle, 0, (uintptr_t) out, size, sizeof(*out));
 
-	STARPUFFT(execute_handle)(plan, in_handle, out_handle);
+	ret = STARPUFFT(execute_handle)(plan, in_handle, out_handle);
+	if (ret == -1) return 77;
 
 	starpu_data_unregister(in_handle);
 	starpu_data_unregister(out_handle);

+ 3 - 3
starpufft/starpufft.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -39,10 +39,10 @@ starpufft(plan) starpufft(plan_dft_c2r_1d)(int n, unsigned flags); \
 void *starpufft(malloc)(size_t n); \
 void starpufft(free)(void *p); \
 \
-void starpufft(execute)(starpufft(plan) p, void *in, void *out); \
+int starpufft(execute)(starpufft(plan) p, void *in, void *out); \
 struct starpu_task *starpufft(start)(starpufft(plan) p, void *in, void *out); \
 \
-void starpufft(execute_handle)(starpufft(plan) p, starpu_data_handle_t in, starpu_data_handle_t out); \
+int starpufft(execute_handle)(starpufft(plan) p, starpu_data_handle_t in, starpu_data_handle_t out); \
 struct starpu_task *starpufft(start_handle)(starpufft(plan) p, starpu_data_handle_t in, starpu_data_handle_t out); \
 \
 void starpufft(cleanup)(starpufft(plan) p); \

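With starpufft(execute) and starpufft(execute_handle) now returning an int, callers can detect that the plan's tasks could not be submitted (no suitable worker) instead of blocking forever. A sketch of the new calling convention, as used by testx.c above; 77 is the exit status the automake test harness interprets as "skipped":

	/* Sketch: honour the int now returned by the execute functions. */
	int ret = STARPUFFT(execute)(plan, in, out);
	if (ret == -1)
		return 77;	/* no device could run the FFT: skip the test */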
+ 10 - 4
starpufft/starpufftx.c

@@ -268,7 +268,7 @@ STARPUFFT(start_handle)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_da
 	return STARPUFFT(start1dC2C)(plan, in, out);
 }
 
-void
+int
 STARPUFFT(execute)(STARPUFFT(plan) plan, void *in, void *out)
 {
 	int ret;
@@ -280,22 +280,28 @@ STARPUFFT(execute)(STARPUFFT(plan) plan, void *in, void *out)
 
 	struct starpu_task *task = STARPUFFT(start)(plan, in, out);
 	gettimeofday(&submit_tasks, NULL);
-	ret = starpu_task_wait(task);
-	STARPU_ASSERT(ret == 0);
+	if (task)
+	{
+		ret = starpu_task_wait(task);
+		STARPU_ASSERT(ret == 0);
+	}
 
 	STARPUFFT(cleanup)(plan);
 
 	gettimeofday(&end, NULL);
+	return (task == NULL ? -1 : 0);
 }
 
-void
+int
 STARPUFFT(execute_handle)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
 {
 	int ret;
 
 	struct starpu_task *task = STARPUFFT(start_handle)(plan, in, out);
+	if (!task) return -1;
 	ret = starpu_task_wait(task);
 	STARPU_ASSERT(ret == 0);
+	return 0;
 }
 
 /* Destroy FFTW plans, unregister and free buffers, and free tags */

+ 8 - 0
starpufft/starpufftx1d.c

@@ -782,24 +782,31 @@ STARPUFFT(start1dC2C)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data
 if (PARALLEL) {
 	for (z=0; z < plan->totsize1; z++) {
 		ret = starpu_task_submit(plan->twist1_tasks[z]);
+		if (ret == -ENODEV) return NULL;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		ret = starpu_task_submit(plan->fft1_tasks[z]);
+		if (ret == -ENODEV) return NULL;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
 	ret = starpu_task_submit(plan->join_task);
+	if (ret == -ENODEV) return NULL;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 	for (z=0; z < plan->totsize3; z++) {
 		ret = starpu_task_submit(plan->twist2_tasks[z]);
+		if (ret == -ENODEV) return NULL;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		ret = starpu_task_submit(plan->fft2_tasks[z]);
+		if (ret == -ENODEV) return NULL;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		ret = starpu_task_submit(plan->twist3_tasks[z]);
+		if (ret == -ENODEV) return NULL;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
 	ret = starpu_task_submit(plan->end_task);
+	if (ret == -ENODEV) return NULL;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 	return plan->end_task;
@@ -815,6 +822,7 @@ if (PARALLEL) {
 	task->cl_arg = plan;
 
 	ret = starpu_task_submit(task);
+	if (ret == -ENODEV) return NULL;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	return task;
 }

+ 8 - 0
starpufft/starpufftx2d.c

@@ -780,24 +780,31 @@ STARPUFFT(start2dC2C)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data
 if (PARALLEL) {
 	for (z=0; z < plan->totsize1; z++) {
 		ret = starpu_task_submit(plan->twist1_tasks[z]);
+		if (ret == -ENODEV) return NULL;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		ret = starpu_task_submit(plan->fft1_tasks[z]);
+		if (ret == -ENODEV) return NULL;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
 	ret = starpu_task_submit(plan->join_task);
+	if (ret == -ENODEV) return NULL;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 	for (z=0; z < plan->totsize3; z++) {
 		ret = starpu_task_submit(plan->twist2_tasks[z]);
+		if (ret == -ENODEV) return NULL;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		ret = starpu_task_submit(plan->fft2_tasks[z]);
+		if (ret == -ENODEV) return NULL;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		ret = starpu_task_submit(plan->twist3_tasks[z]);
+		if (ret == -ENODEV) return NULL;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
 	ret = starpu_task_submit(plan->end_task);
+	if (ret == -ENODEV) return NULL;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 	return plan->end_task;
@@ -813,6 +820,7 @@ if (PARALLEL) {
 	task->cl_arg = plan;
 
 	ret = starpu_task_submit(task);
+	if (ret == -ENODEV) return NULL;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	return task;
 }

+ 1 - 1
tests/datawizard/increment_redux.c

@@ -235,7 +235,7 @@ int main(int argc, char **argv)
 			task->cl = &increment_cl;
 			task->handles[0] = handle;
 
-			int ret = starpu_task_submit(task);
+			ret = starpu_task_submit(task);
 			if (ret == -ENODEV) goto enodev;
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}

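This hunk, repeated in increment_redux_lazy.c, manual_reduction.c, wt_host.c, overlap.c and others below, removes an inner declaration that shadowed the function-level ret; keeping a single status variable avoids -Wshadow warnings and any accidental read of a stale outer value. A sketch of the shadowing pitfall being removed:

	/* Sketch: the inner declaration hides the outer ret, so code after
	 * the loop sees 0 whatever starpu_task_submit() returned. */
	int ret = 0;
	for (i = 0; i < ntasks; i++)
	{
		int ret = starpu_task_submit(tasks[i]);	/* shadows the outer ret */
		(void) ret;	/* checked here, lost after the loop */
	}
	return ret;	/* always 0 here */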
+ 1 - 1
tests/datawizard/increment_redux_lazy.c

@@ -216,7 +216,7 @@ int main(int argc, char **argv)
 			task->cl = &increment_cl;
 			task->handles[0] = handle;
 
-			int ret = starpu_task_submit(task);
+			ret = starpu_task_submit(task);
 			if (ret == -ENODEV) goto enodev;
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}

+ 4 - 2
tests/datawizard/interfaces/test_interfaces.c

@@ -154,13 +154,15 @@ data_interface_test_summary_success(data_interface_test_summary *s)
 
 enum operation
 {
-	CPU_TO_CPU,
+	CPU_TO_CPU
 #ifdef STARPU_USE_CUDA
+	,
 	CPU_TO_CUDA,
 	CUDA_TO_CUDA,
-	CUDA_TO_CPU,
+	CUDA_TO_CPU
 #endif /* !STARPU_USE_CUDA */
 #ifdef STARPU_USE_OPENCL
+	,
 	CPU_TO_OPENCL,
 	OPENCL_TO_CPU
 #endif /* !STARPU_USE_OPENCL */

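Moving the commas to the head of each conditional member keeps the enum well formed for every combination of STARPU_USE_CUDA and STARPU_USE_OPENCL: the old layout left a trailing comma (invalid in C89 enums) whenever the last members were compiled out. The pattern in isolation:

	/* Sketch: with leading commas inside the #ifdef blocks, no member
	 * is ever followed by a dangling comma. */
	enum operation
	{
		CPU_TO_CPU
	#ifdef STARPU_USE_CUDA
		, CPU_TO_CUDA
	#endif
	#ifdef STARPU_USE_OPENCL
		, CPU_TO_OPENCL
	#endif
	};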
+ 2 - 2
tests/datawizard/manual_reduction.c

@@ -202,7 +202,7 @@ int main(int argc, char **argv)
 		task->execute_on_a_specific_worker = 1;
 		task->workerid = (unsigned)workerid;
 
-		int ret = starpu_task_submit(task);
+		ret = starpu_task_submit(task);
 		if (ret == -ENODEV) goto enodev;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
@@ -217,7 +217,7 @@ int main(int argc, char **argv)
 		task->handles[0] = variable_handle;
 		task->handles[1] = per_worker_handle[worker];
 
-		int ret = starpu_task_submit(task);
+		ret = starpu_task_submit(task);
 		if (ret == -ENODEV) goto enodev;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}

+ 1 - 1
tests/datawizard/wt_host.c

@@ -114,7 +114,7 @@ int main(int argc, char **argv)
 		task->cl = &increment_cl;
 		task->handles[0] = handle;
 
-		int ret = starpu_task_submit(task);
+		ret = starpu_task_submit(task);
 		if (ret == -ENODEV) goto enodev;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}

+ 3 - 3
tests/errorcheck/starpu_init_noworker.c

@@ -60,9 +60,9 @@ int main(int argc, char **argv)
 		unsigned ncuda = starpu_cuda_worker_get_count();
 		unsigned nopencl = starpu_opencl_worker_get_count();
 		FPRINTF(stderr, "StarPU has found :\n");
-		FPRINTF(stderr, "\t%d CPU cores\n", ncpu);
-		FPRINTF(stderr, "\t%d CUDA devices\n", ncuda);
-		FPRINTF(stderr, "\t%d OpenCL devices\n", nopencl);
+		FPRINTF(stderr, "\t%u CPU cores\n", ncpu);
+		FPRINTF(stderr, "\t%u CUDA devices\n", ncuda);
+		FPRINTF(stderr, "\t%u OpenCL devices\n", nopencl);
 		return EXIT_FAILURE;
 	}
 

+ 16 - 5
tests/loader.c

@@ -148,6 +148,10 @@ int main(int argc, char *argv[])
 	char *launcher_args;
 	char *top_srcdir;
 	struct sigaction sa;
+	int   ret;
+	struct timeval start;
+	struct timeval end;
+	double timing;
 
 	test_args = NULL;
 	timeout = 0;
@@ -230,7 +234,8 @@ int main(int argc, char *argv[])
 					argv[i] = strtok(NULL, " ");
 				}
 				argv[i] = test_name;
-				argv[i+1] = NULL;
+				argv[i+1] = test_args;
+				argv[i+2] = NULL;
 				execvp(*argv, argv);
 			}
 			else
@@ -253,6 +258,8 @@ int main(int argc, char *argv[])
 		exit(EXIT_FAILURE);
 	}
 
+	ret = EXIT_SUCCESS;
+	gettimeofday(&start, NULL);
 	alarm(timeout);
 	if (child_pid == waitpid(child_pid, &child_exit_status, 0))
 	{
@@ -268,7 +275,7 @@ int main(int argc, char *argv[])
 				if (status != AUTOTEST_SKIPPED_TEST)
 					fprintf(stdout, "`%s' exited with return code %d\n",
 						test_name, status);
-				return status;
+				ret = status;
 			}
 		}
 		else if (WIFSIGNALED(child_exit_status))
@@ -276,15 +283,19 @@ int main(int argc, char *argv[])
 			fprintf(stderr, "[error] `%s' killed with signal %d; test marked as failed\n",
 				test_name, WTERMSIG(child_exit_status));
 			launch_gdb(test_name);
-			return EXIT_FAILURE;
+			ret = EXIT_FAILURE;
 		}
 		else
 		{
 			fprintf(stderr, "[error] `%s' did not terminate normally; test marked as failed\n",
 				test_name);
-			return EXIT_FAILURE;
+			ret = EXIT_FAILURE;
 		}
 	}
 
-	return EXIT_SUCCESS;
+	gettimeofday(&end, NULL);
+	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	fprintf(stderr, "Execution of test '%s' took %f s\n", test_name, timing/1000000);
+
+	return ret;
 }

+ 1 - 1
tests/main/execute_on_a_specific_worker.c

@@ -138,7 +138,7 @@ int main(int argc, char **argv)
 			task->execute_on_a_specific_worker = 1;
 			task->workerid = worker;
 
-			int ret = starpu_task_submit(task);
+			ret = starpu_task_submit(task);
 			if (ret == -ENODEV) goto enodev;
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}

+ 8 - 6
tests/main/starpu_init.c

@@ -30,12 +30,13 @@ int main(int argc, char **argv)
 static int check_cpu(int env_cpu, int conf_cpu, int expected_cpu, int *cpu)
 {
 	int ret;
+	char *string;
 
 	FPRINTF(stderr, "Testing with env=%d - conf=%d\n", env_cpu, conf_cpu);
 
 	if (env_cpu != -1)
 	{
-		char string[50];
+		string = malloc(50);
 		sprintf(string, "STARPU_NCPUS=%d", env_cpu);
 		putenv(string);
 	}
@@ -48,17 +49,18 @@ static int check_cpu(int env_cpu, int conf_cpu, int expected_cpu, int *cpu)
 	}
 	ret = starpu_init(&user_conf);
 
+	if (env_cpu != -1)
+	{
+		unsetenv("STARPU_NCPUS");
+		free(string);
+	}
+
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	*cpu = starpu_cpu_worker_get_count();
 	starpu_shutdown();
 
-	if (env_cpu != -1)
-	{
-		unsetenv("STARPU_NCPUS");
-	}
-
 	if (expected_cpu == -1)
 	{
 		FPRINTF(stderr, "Number of CPUS: %3d\n", *cpu);

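The switch from a stack array to malloc() matters because putenv() does not copy its argument: the string itself becomes part of the environment and must stay valid until the variable is removed. The hunk therefore frees the buffer only after unsetenv(), and does both right after starpu_init() so the variable cannot leak into a later check_cpu() call. The idiom, sketched:

	/* Sketch: putenv() keeps a pointer to the buffer, so it may only be
	 * freed once the variable has been withdrawn with unsetenv(). */
	char *s = malloc(50);
	snprintf(s, 50, "STARPU_NCPUS=%d", env_cpu);
	putenv(s);
	/* ... starpu_init() reads STARPU_NCPUS here ... */
	unsetenv("STARPU_NCPUS");
	free(s);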
+ 1 - 1
tests/microbenchs/async_tasks_overhead.c

@@ -136,7 +136,7 @@ int main(int argc, char **argv)
 	gettimeofday(&start, NULL);
 	for (i = 0; i < ntasks; i++)
 	{
-		int ret = starpu_task_submit(tasks[i]);
+		ret = starpu_task_submit(tasks[i]);
 		if (ret == -ENODEV) goto enodev;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}

+ 12 - 9
tests/microbenchs/matrix_as_vector.c

@@ -88,15 +88,21 @@ int check_size(int nx, struct starpu_codelet *vector_codelet, struct starpu_code
 {
 	float *matrix, mean;
 	starpu_data_handle_t vector_handle, matrix_handle;
-	int ret, i, loop;
+	int ret, i, loop, maxloops;
 	double vector_timing, matrix_timing;
 	struct timeval start;
 	struct timeval end;
 
 	matrix = malloc(nx*sizeof(matrix[0]));
+	maxloops = LOOPS;
+#ifdef STARPU_HAVE_VALGRIND_H
+	if (RUNNING_ON_VALGRIND)
+		/* computations are skipped when running on valgrind; no need for several loops */
+		maxloops = 1;
+#endif /* STARPU_HAVE_VALGRIND_H */
 
 	gettimeofday(&start, NULL);
-	for(loop=1 ; loop<=LOOPS ; loop++)
+	for(loop=1 ; loop<=maxloops ; loop++)
 	{
 		for(i=0 ; i<nx ; i++) matrix[i] = i;
 		starpu_vector_data_register(&vector_handle, 0, (uintptr_t)matrix, nx, sizeof(matrix[0]));
@@ -107,11 +113,11 @@ int check_size(int nx, struct starpu_codelet *vector_codelet, struct starpu_code
 	gettimeofday(&end, NULL);
 
 	vector_timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	vector_timing /= LOOPS;
+	vector_timing /= maxloops;
 	mean = matrix[0];
 
 	gettimeofday(&start, NULL);
-	for(loop=1 ; loop<=LOOPS ; loop++)
+	for(loop=1 ; loop<=maxloops ; loop++)
 	{
 		for(i=0 ; i<nx ; i++) matrix[i] = i;
 		starpu_matrix_data_register(&matrix_handle, 0, (uintptr_t)matrix, nx/2, nx/2, 2, sizeof(matrix[0]));
@@ -122,7 +128,7 @@ int check_size(int nx, struct starpu_codelet *vector_codelet, struct starpu_code
 	gettimeofday(&end, NULL);
 
 	matrix_timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	matrix_timing /= LOOPS;
+	matrix_timing /= maxloops;
 
 	if (mean == matrix[0])
 	{
@@ -182,8 +188,7 @@ int check_size_on_device(uint32_t where, char *device_name)
 		if (ret != EXIT_SUCCESS) break;
 	}
 	return ret;
-
-};
+}
 
 int main(int argc, char **argv)
 {
@@ -204,9 +209,7 @@ int main(int argc, char **argv)
 	devices = starpu_cuda_worker_get_count();
 	if (devices)
 	{
-		starpu_helper_cublas_init();
 		ret = check_size_on_device(STARPU_CUDA, "STARPU_CUDA");
-		starpu_helper_cublas_shutdown();
 		if (ret) goto error;
 	}
 	devices = starpu_opencl_worker_get_count();

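RUNNING_ON_VALGRIND comes from <valgrind/valgrind.h> and evaluates to non-zero only when the binary actually runs under valgrind, so normal runs keep the full loop count. A sketch of the guard, assuming the STARPU_HAVE_VALGRIND_H configure check used above:

	/* Sketch: collapse an expensive benchmark loop under valgrind. */
	#ifdef STARPU_HAVE_VALGRIND_H
	#include <valgrind/valgrind.h>
	#endif

	static int nloops(void)
	{
		int maxloops = 100;
	#ifdef STARPU_HAVE_VALGRIND_H
		if (RUNNING_ON_VALGRIND)
			maxloops = 1;	/* computations are skipped anyway */
	#endif
		return maxloops;
	}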
+ 2 - 2
tests/microbenchs/prefetch_data_on_node.c

@@ -123,7 +123,7 @@ int main(int argc, char **argv)
 			task->cl = select_codelet_with_random_mode();
 			task->synchronous = 1;
 
-			int ret = starpu_task_submit(task);
+			ret = starpu_task_submit(task);
 			if (ret == -ENODEV) goto enodev;
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
@@ -148,7 +148,7 @@ int main(int argc, char **argv)
 
 			task->synchronous = 0;
 
-			int ret = starpu_task_submit(task);
+			ret = starpu_task_submit(task);
 			if (ret == -ENODEV) goto enodev;
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}

+ 1 - 1
tests/overlap/overlap.c

@@ -120,7 +120,7 @@ int main(int argc, char **argv)
 		task->callback_func = callback;
 		task->callback_arg = NULL;
 
-		int ret = starpu_task_submit(task);
+		ret = starpu_task_submit(task);
 		if (ret == -ENODEV) goto enodev;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}

+ 1 - 1
tests/parallel_tasks/parallel_kernels.c

@@ -89,7 +89,7 @@ int main(int argc, char **argv)
 
 			task->handles[0] = v_handle;
 
-			int ret = starpu_task_submit(task);
+			ret = starpu_task_submit(task);
 			if (ret == -ENODEV) goto enodev;
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}

+ 6 - 1
tests/sched_policies/execute_all_tasks.c

@@ -67,7 +67,12 @@ static int
 run(struct starpu_sched_policy *p)
 {
 	int ret;
-	ret = starpu_init(NULL);
+	struct starpu_conf conf;
+
+	(void) starpu_conf_init(&conf);
+	conf.sched_policy = p;
+
+	ret = starpu_init(&conf);
 	if (ret == -ENODEV)
 		exit(STARPU_TEST_SKIPPED);
 

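Initializing with an explicit struct starpu_conf is what actually exercises the policy handed to run(); the previous starpu_init(NULL) silently ignored p. A sketch of the two ways a scheduler can be selected, assuming the sched_policy_name field of struct starpu_conf:

	/* Sketch: selecting the scheduler at initialization time. */
	struct starpu_conf conf;
	starpu_conf_init(&conf);
	conf.sched_policy = p;	/* by structure, as above */
	/* or: conf.sched_policy_name = "eager";  by name */
	if (starpu_init(&conf) == -ENODEV)
		exit(STARPU_TEST_SKIPPED);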
+ 3 - 3
tools/starpu_machine_display.c

@@ -142,13 +142,13 @@ int main(int argc, char **argv)
 
 	fprintf(stdout, "StarPU has found :\n");
 
-	fprintf(stdout, "\t%d CPU cores\n", ncpu);
+	fprintf(stdout, "\t%u CPU cores\n", ncpu);
 	display_worker_names(STARPU_CPU_WORKER);
 
-	fprintf(stdout, "\t%d CUDA devices\n", ncuda);
+	fprintf(stdout, "\t%u CUDA devices\n", ncuda);
 	display_worker_names(STARPU_CUDA_WORKER);
 
-	fprintf(stdout, "\t%d OpenCL devices\n", nopencl);
+	fprintf(stdout, "\t%u OpenCL devices\n", nopencl);
 	display_worker_names(STARPU_OPENCL_WORKER);
 
 	display_all_combined_workers();

+ 1 - 6
tools/starpu_perfmodel_display.c

@@ -143,12 +143,7 @@ int main(int argc, char **argv)
 
         if (plist)
 	{
-                int ret = starpu_perfmodel_list(stdout);
-                if (ret)
-		{
-                        fprintf(stderr, "The performance model directory is invalid\n");
-                        return 1;
-                }
+                starpu_perfmodel_list(stdout);
         }
         else
 	{