Marc Sergent před 13 roky
rodič
revize
f4fb7d745d
100 změnil soubory, kde provedl 965 přidání a 1272 odebrání
  1. 12 9
      ChangeLog
  2. 6 6
      Makefile.am
  3. 21 1
      configure.ac
  4. 7 4
      doc/Makefile.am
  5. 53 60
      doc/chapters/advanced-api.texi
  6. 19 9
      doc/chapters/basic-api.texi
  7. 14 2
      doc/chapters/configuration.texi
  8. 44 1
      doc/chapters/mpi-support.texi
  9. 17 7
      doc/chapters/perf-feedback.texi
  10. 6 1
      doc/chapters/perf-optimization.texi
  11. 4 4
      doc/chapters/sched_ctx_hypervisor.texi
  12. 1 1
      examples/Makefile.am
  13. 2 2
      examples/basic_examples/block_cpu.c
  14. 58 0
      examples/cholesky/cholesky.h
  15. 10 1
      examples/cholesky/cholesky_grain_tag.c
  16. 6 2
      examples/cholesky/cholesky_implicit.c
  17. 10 1
      examples/cholesky/cholesky_tag.c
  18. 10 1
      examples/cholesky/cholesky_tile_tag.c
  19. 2 2
      examples/filters/custom_mf/custom_conversion_codelets.c
  20. 4 3
      examples/filters/custom_mf/custom_interface.c
  21. 2 2
      examples/filters/fblock_cpu.c
  22. 8 8
      examples/filters/fblock_opencl.c
  23. 3 3
      examples/filters/fmatrix.c
  24. 2 2
      examples/filters/fvector.c
  25. 3 3
      examples/filters/shadow.c
  26. 19 128
      examples/interface/complex_interface.c
  27. 3 3
      examples/ppm_downscaler/ppm_downscaler.c
  28. 2 2
      examples/profiling/profiling.c
  29. 2 2
      examples/sched_ctx/sched_ctx.c
  30. 1 1
      examples/sched_ctx_utils/sched_ctx_utils.c
  31. 4 4
      examples/scheduler/dummy_sched.c
  32. 2 2
      examples/stencil/Makefile.am
  33. 1 1
      examples/stencil/life.c
  34. 5 5
      examples/stencil/stencil-blocks.c
  35. 11 11
      examples/stencil/stencil-tasks.c
  36. 4 4
      examples/stencil/stencil.c
  37. 6 6
      examples/stencil/stencil.h
  38. 1 1
      examples/tag_example/tag_example2.c
  39. 1 2
      include/starpu_cuda.h
  40. 1 1
      include/starpu_data.h
  41. 6 1
      include/starpu_data_interfaces.h
  42. 2 3
      include/starpu_opencl.h
  43. 7 1
      include/starpu_perfmodel.h
  44. 21 25
      include/starpu_sched_ctx.h
  45. 3 0
      include/starpu_task.h
  46. 2 2
      include/starpu_task_util.h
  47. 4 4
      include/starpu_util.h
  48. 1 1
      mpi/examples/Makefile.am
  49. 1 2
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
  50. 2 2
      mpi/examples/matrix_decomposition/mpi_decomposition_params.c
  51. 3 3
      mpi/examples/mpi_lu/plu_example.c
  52. 2 2
      mpi/examples/mpi_lu/pxlu.c
  53. 1 1
      mpi/src/Makefile.am
  54. 14 15
      mpi/src/starpu_mpi.c
  55. 2 2
      mpi/src/starpu_mpi_stats.c
  56. 1 1
      mpi/tests/Makefile.am
  57. 3 3
      mpi/tests/mpi_detached_tag.c
  58. 3 3
      mpi/tests/mpi_irecv.c
  59. 3 3
      mpi/tests/mpi_irecv_detached.c
  60. 3 3
      mpi/tests/mpi_isend.c
  61. 3 3
      mpi/tests/mpi_isend_detached.c
  62. 3 3
      mpi/tests/mpi_probe.c
  63. 3 3
      mpi/tests/mpi_test.c
  64. 3 3
      mpi/tests/pingpong.c
  65. 8 8
      mpi/tests/ring.c
  66. 8 8
      mpi/tests/ring_async.c
  67. 8 8
      mpi/tests/ring_async_implicit.c
  68. 2 2
      sched_ctx_hypervisor/examples/Makefile.am
  69. 3 3
      sched_ctx_hypervisor/examples/app_driven_test/app_driven_test.c
  70. 4 4
      sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c
  71. 1 0
      sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.h
  72. 7 1
      sched_ctx_hypervisor/include/sched_ctx_hypervisor.h
  73. 2 4
      sched_ctx_hypervisor/src/Makefile.am
  74. 9 9
      sched_ctx_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c
  75. 1 1
      sched_ctx_hypervisor/src/hypervisor_policies/ispeed_policy.c
  76. 13 6
      sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.c
  77. 26 19
      sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c
  78. 25 22
      sched_ctx_hypervisor/src/sched_ctx_hypervisor.c
  79. 1 1
      socl/examples/Makefile.am
  80. 7 4
      socl/src/cl_enqueuendrangekernel.c
  81. 1 1
      src/Makefile.am
  82. 10 2
      src/core/jobs.c
  83. 14 4
      src/core/perfmodel/perfmodel_bus.c
  84. 31 14
      src/core/perfmodel/perfmodel_history.c
  85. 4 4
      src/core/perfmodel/perfmodel_print.c
  86. 35 30
      src/core/sched_ctx.c
  87. 2 2
      src/core/sched_ctx.h
  88. 15 16
      src/core/sched_policy.c
  89. 12 2
      src/core/task.c
  90. 24 24
      src/core/workers.c
  91. 1 0
      src/datawizard/coherency.c
  92. 147 35
      src/datawizard/copy_driver.c
  93. 14 127
      src/datawizard/interfaces/bcsr_interface.c
  94. 9 7
      src/datawizard/interfaces/block_interface.c
  95. 19 247
      src/datawizard/interfaces/coo_interface.c
  96. 13 220
      src/datawizard/interfaces/csr_interface.c
  97. 2 36
      src/datawizard/interfaces/data_interface.c
  98. 3 3
      src/datawizard/interfaces/matrix_interface.c
  99. 1 1
      src/datawizard/interfaces/multiformat_interface.c
  100. 0 0
      src/datawizard/interfaces/variable_interface.c

+ 12 - 9
ChangeLog

@@ -93,6 +93,18 @@ New features:
   * Introduce new variables STARPU_LIMIT_CUDA_devid_MEM and
   * Introduce new variables STARPU_LIMIT_CUDA_devid_MEM and
     STARPU_LIMIT_OPENCL_devid_MEM to limit memory per specific device
     STARPU_LIMIT_OPENCL_devid_MEM to limit memory per specific device
 
 
+Small features:
+  * Add starpu_worker_get_by_type and starpu_worker_get_by_devid
+  * Add starpu_fxt_stop_profiling/starpu_fxt_start_profiling which permits to
+    pause trace recording.
+  * Add trace_buffer_size configuration field to permit to specify the tracing
+    buffer size.
+  * Add starpu_codelet_profile and starpu_codelet_histo_profile, tools which draw
+    the profile of a codelet.
+  * File STARPU-REVISION --- containing the SVN revision number from which
+    StarPU was compiled --- is installed in the share/doc/starpu directory
+  * starpu_perfmodel_plot can now directly draw GFlops curves.
+
 Changes:
 Changes:
   * Fix the block filter functions.
   * Fix the block filter functions.
   * Fix StarPU-MPI on Darwin.
   * Fix StarPU-MPI on Darwin.
@@ -127,15 +139,6 @@ Changes:
   * StarPU can now use poti to generate paje traces.
   * StarPU can now use poti to generate paje traces.
   * Rename scheduling policy "parallel greedy" to "parallel eager"
   * Rename scheduling policy "parallel greedy" to "parallel eager"
 
 
-Small features:
-  * Add starpu_worker_get_by_type and starpu_worker_get_by_devid
-  * Add starpu_fxt_stop_profiling/starpu_fxt_start_profiling which permits to
-  pause trace recording.
-  * Add trace_buffer_size configuration field to permit to specify the tracing
-  buffer size.
-  * Add starpu_codelet_profile and starpu_codelet_histo_profile, tools which draw
-  the profile of a codelet.
-
 Small changes:
 Small changes:
   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is
   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is
 	still available for compatibility reasons.
 	still available for compatibility reasons.

+ 6 - 6
Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
 # Copyright (C) 2009-2012  Université de Bordeaux 1
 # Copyright (C) 2009-2012  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
 # it under the terms of the GNU Lesser General Public License as published by
@@ -91,7 +91,7 @@ all-local:
 	cd starpu-top ; $(QMAKE) ; $(MAKE)
 	cd starpu-top ; $(QMAKE) ; $(MAKE)
 clean-local:
 clean-local:
 	cd starpu-top ; $(QMAKE) ; $(MAKE) clean ; $(RM) Makefile
 	cd starpu-top ; $(QMAKE) ; $(MAKE) clean ; $(RM) Makefile
-	$(RM) starpu_top.1 starpu-top/starpu_top
+	$(RM) starpu-top/starpu_top.1 starpu-top/starpu_top
 # TODO: resources
 # TODO: resources
 install-exec-local:
 install-exec-local:
 	$(MKDIR_P) $(DESTDIR)$(bindir)
 	$(MKDIR_P) $(DESTDIR)$(bindir)
@@ -102,10 +102,10 @@ uninstall-local:
 	$(RM) starpu-top/Makefile
 	$(RM) starpu-top/Makefile
 
 
 if STARPU_HAVE_HELP2MAN
 if STARPU_HAVE_HELP2MAN
-starpu_top.1: starpu-top/starpu_top$(EXEEXT)
+starpu-top/starpu_top.1: starpu-top/starpu_top$(EXEEXT)
 	help2man --no-discard-stderr -N --output=$@ starpu-top/starpu_top$(EXEEXT)
 	help2man --no-discard-stderr -N --output=$@ starpu-top/starpu_top$(EXEEXT)
 dist_man1_MANS =\
 dist_man1_MANS =\
-	starpu_top.1
+	starpu-top/starpu_top.1
 endif
 endif
 endif
 endif
 
 
@@ -114,8 +114,8 @@ txtdir = ${prefix}
 else
 else
 txtdir = ${docdir}
 txtdir = ${docdir}
 endif
 endif
-txt_DATA = AUTHORS COPYING.LGPL README
-EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION build-aux/svn2cl.xsl
+txt_DATA = AUTHORS COPYING.LGPL README STARPU-REVISION
+EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION STARPU-REVISION build-aux/svn2cl.xsl
 
 
 include starpu-top/extradist
 include starpu-top/extradist
 
 

+ 21 - 1
configure.ac

@@ -81,6 +81,21 @@ AC_C_RESTRICT
 # Check if bash is available
 # Check if bash is available
 AC_CHECK_PROGS([BASH], [bash])
 AC_CHECK_PROGS([BASH], [bash])
 
 
+# Check whether subversion is installed
+AC_PATH_PROG(svnversioncommand, svnversion)
+
+# use svnversion to record the current repository revision only if
+# subversion is installed and we are in a working copy
+if test "$svnversioncommand" = "" || test `LC_ALL=C $svnversioncommand -n $srcdir` = "exported" ; then
+   if test -f $srcdir/STARPU-REVISION ; then
+      cp $srcdir/STARPU-REVISION .
+   else
+      echo "unknown" > ./STARPU-REVISION
+   fi
+else
+   LC_ALL=C svnversion $srcdir > ./STARPU-REVISION
+fi
+
 AC_PATH_PROGS([STARPU_MS_LIB], [lib])
 AC_PATH_PROGS([STARPU_MS_LIB], [lib])
 AC_ARG_VAR([STARPU_MS_LIB], [Path to Microsoft's Visual Studio `lib' tool])
 AC_ARG_VAR([STARPU_MS_LIB], [Path to Microsoft's Visual Studio `lib' tool])
 AM_CONDITIONAL([STARPU_HAVE_MS_LIB], [test "x$STARPU_MS_LIB" != "x"])
 AM_CONDITIONAL([STARPU_HAVE_MS_LIB], [test "x$STARPU_MS_LIB" != "x"])
@@ -1327,12 +1342,17 @@ AC_DEFUN([IS_SUPPORTED_CFLAG],
 IS_SUPPORTED_CFLAG(-W)
 IS_SUPPORTED_CFLAG(-W)
 IS_SUPPORTED_CFLAG(-Wall)
 IS_SUPPORTED_CFLAG(-Wall)
 IS_SUPPORTED_CFLAG(-Wextra)
 IS_SUPPORTED_CFLAG(-Wextra)
-AC_SUBST(GLOBAL_AM_CFLAGS)
+IS_SUPPORTED_CFLAG(-Werror=implicit)
 
 
 if test "x$STARPU_DEVEL" != x; then
 if test "x$STARPU_DEVEL" != x; then
 	AC_DEFINE(STARPU_DEVEL, [1], [enable developer warnings])
 	AC_DEFINE(STARPU_DEVEL, [1], [enable developer warnings])
+	IS_SUPPORTED_CFLAG(-Wunused)
+	IS_SUPPORTED_CFLAG(-Wundef)
+	IS_SUPPORTED_CFLAG(-Wshadow)
 fi
 fi
 
 
+AC_SUBST(GLOBAL_AM_CFLAGS)
+
 # Same value as Automake's, for use in other places.
 # Same value as Automake's, for use in other places.
 pkglibdir="\${libdir}/$PACKAGE"
 pkglibdir="\${libdir}/$PACKAGE"
 AC_SUBST([pkglibdir])
 AC_SUBST([pkglibdir])

+ 7 - 4
doc/Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
 # Copyright (C) 2009, 2011  Université de Bordeaux 1
 # Copyright (C) 2009, 2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 #
 #
 # Permission is granted to copy, distribute and/or modify this document
 # Permission is granted to copy, distribute and/or modify this document
 # under the terms of the GNU Free Documentation License, Version 1.3
 # under the terms of the GNU Free Documentation License, Version 1.3
@@ -12,7 +12,7 @@
 
 
 info_TEXINFOS = starpu.texi
 info_TEXINFOS = starpu.texi
 
 
-starpu_TEXINFOS = chapters/advanced-api.texi \
+chapters =	chapters/advanced-api.texi \
 	chapters/benchmarks.texi \
 	chapters/benchmarks.texi \
 	chapters/configuration.texi \
 	chapters/configuration.texi \
 	chapters/perf-feedback.texi \
 	chapters/perf-feedback.texi \
@@ -35,9 +35,12 @@ starpu_TEXINFOS = chapters/advanced-api.texi \
 	chapters/using.texi \
 	chapters/using.texi \
 	chapters/vector_scal_opencl.texi \
 	chapters/vector_scal_opencl.texi \
 	chapters/socl.texi \
 	chapters/socl.texi \
-	chapters/version.texi \
 	chapters/sched_ctx_hypervisor.texi
 	chapters/sched_ctx_hypervisor.texi
 
 
+starpu_TEXINFOS = 		\
+	chapters/version.texi 	\
+	$(chapters)
+
 MAINTAINERCLEANFILES = starpu.pdf starpu.html
 MAINTAINERCLEANFILES = starpu.pdf starpu.html
 
 
 EXTRA_DIST = starpu.css
 EXTRA_DIST = starpu.css
@@ -50,7 +53,7 @@ AM_MAKEINFOHTMLFLAGS = --css-include=$(top_srcdir)/doc/starpu.css --no-headers -
 uninstall-local:
 uninstall-local:
 	$(RM) $(DESTDIR)$(infodir)/dir
 	$(RM) $(DESTDIR)$(infodir)/dir
 
 
-chapters/version.texi:
+chapters/version.texi: $(chapters)
 	@-for f in $(starpu_TEXINFOS) ; do \
 	@-for f in $(starpu_TEXINFOS) ; do \
                 if test -f $(top_srcdir)/doc/$$f ; then stat --format=%Y $(top_srcdir)/doc/$$f 2>/dev/null ; fi \
                 if test -f $(top_srcdir)/doc/$$f ; then stat --format=%Y $(top_srcdir)/doc/$$f 2>/dev/null ; fi \
         done | sort -r | head -1 > timestamp
         done | sort -r | head -1 > timestamp

+ 53 - 60
doc/chapters/advanced-api.texi

@@ -38,7 +38,7 @@ The arguments following the codelets can be of the following types:
 @item
 @item
 the specific values @code{STARPU_VALUE}, @code{STARPU_CALLBACK},
 the specific values @code{STARPU_VALUE}, @code{STARPU_CALLBACK},
 @code{STARPU_CALLBACK_ARG}, @code{STARPU_CALLBACK_WITH_ARG},
 @code{STARPU_CALLBACK_ARG}, @code{STARPU_CALLBACK_WITH_ARG},
-@code{STARPU_PRIORITY}, @code{STARPU_TAG}, followed by the appropriated objects
+@code{STARPU_PRIORITY}, @code{STARPU_TAG}, @code{STARPU_FLOPS}, followed by the appropriated objects
 as defined below.
 as defined below.
 @end itemize
 @end itemize
 
 
@@ -85,6 +85,12 @@ this macro is used when calling @code{starpu_insert_task}, and must be
 followed by a tag.
 followed by a tag.
 @end defmac
 @end defmac
 
 
+@defmac STARPU_FLOPS
+this macro is used when calling @code{starpu_insert_task}, and must be followed
+by an amount of floating point operations, as a double. The user may have to
+explicitly cast into double, otherwise parameter passing will not work.
+@end defmac
+
 @deftypefun void starpu_codelet_pack_args ({char **}@var{arg_buffer}, {size_t *}@var{arg_buffer_size}, ...)
 @deftypefun void starpu_codelet_pack_args ({char **}@var{arg_buffer}, {size_t *}@var{arg_buffer_size}, ...)
 Pack arguments of type @code{STARPU_VALUE} into a buffer which can be
 Pack arguments of type @code{STARPU_VALUE} into a buffer which can be
 given to a codelet and later unpacked with the function
 given to a codelet and later unpacked with the function
@@ -165,24 +171,6 @@ to the world size. Communications statistics must be enabled
 @node Communication
 @node Communication
 @subsection Communication
 @subsection Communication
 
 
-The standard point to point communications of MPI have been
-implemented. The semantic is similar to the MPI one, but adapted to
-the DSM provided by StarPU. A MPI request will only be submitted when
-the data is available in the main memory of the node submitting the
-request.
-
-There is two types of asynchronous communications: the classic
-asynchronous communications and the detached communications. The
-classic asynchronous communications (@code{starpu_mpi_isend} and
-@code{starpu_mpi_irecv}) need to be followed by a call to
-@code{starpu_mpi_wait} or to @code{starpu_mpi_test} to wait for or to
-test the completion of the communication. Waiting for or testing the
-completion of detached communications is not possible, this is done
-internally by StarPU-MPI, on completion, the resources are
-automatically released. This mechanism is similar to the pthread
-detach state attribute which determines whether a thread will be
-created in a joinable or a detached state.
-
 @deftypefun int starpu_mpi_send (starpu_data_handle_t @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm})
 @deftypefun int starpu_mpi_send (starpu_data_handle_t @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm})
 Performs a standard-mode, blocking send of @var{data_handle} to the
 Performs a standard-mode, blocking send of @var{data_handle} to the
 node @var{dest} using the message tag @code{mpi_tag} within the
 node @var{dest} using the message tag @code{mpi_tag} within the
@@ -354,51 +342,56 @@ Unpack the data handle from the contiguous buffer at the address @code{ptr} of s
 @end deftp
 @end deftp
 
 
 @deftp {Data Type} {struct starpu_data_copy_methods}
 @deftp {Data Type} {struct starpu_data_copy_methods}
-Defines the per-interface methods.
+Defines the per-interface methods. If the @code{any_to_any} method is provided,
+it will be used by default if no more specific method is provided. It can still
+be useful to provide more specific method in case of e.g. available particular
+CUDA or OpenCL support.
+
 @table @asis
 @table @asis
-@item @code{int @{ram,cuda,opencl@}_to_@{ram,cuda,opencl@}(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
+@item @code{int (*@{ram,cuda,opencl@}_to_@{ram,cuda,opencl@})(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
 These 12 functions define how to copy data from the @var{src_interface}
 These 12 functions define how to copy data from the @var{src_interface}
 interface on the @var{src_node} node to the @var{dst_interface} interface
 interface on the @var{src_node} node to the @var{dst_interface} interface
 on the @var{dst_node} node. They return 0 on success.
 on the @var{dst_node} node. They return 0 on success.
 
 
-@item @code{int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
-Define how to copy data from the @var{src_interface} interface on the
-@var{src_node} node (in RAM) to the @var{dst_interface} interface on the
-@var{dst_node} node (on a CUDA device), using the given @var{stream}. Return 0
-on success.
-
-@item @code{int (*cuda_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
-Define how to copy data from the @var{src_interface} interface on the
-@var{src_node} node (on a CUDA device) to the @var{dst_interface} interface on the
-@var{dst_node} node (in RAM), using the given @var{stream}. Return 0
-on success.
-
-@item @code{int (*cuda_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
+@item @code{int (*@{ram,cuda@}_to_@{ram,cuda@}_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
+These 3 functions (@code{ram_to_ram} is not among these) define how to copy
+data from the @var{src_interface} interface on the @var{src_node} node to the
+@var{dst_interface} interface on the @var{dst_node} node, using the given
+@var{stream}. Must return 0 if the transfer was actually completed completely
+synchronously, or -EAGAIN if at least some transfers are still ongoing and
+should be awaited for by the core.
+
+@item @code{int (*@{ram,opencl@}_to_@{ram,opencl@}_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
+These 3 functions (@code{ram_to_ram} is not among them) define how to copy
+data from the @var{src_interface} interface on the @var{src_node} node to the
+@var{dst_interface} interface on the @var{dst_node} node, by recording in
+@var{event}, a pointer to a cl_event, the event of the last submitted transfer.
+Must return 0 if the transfer was actually completed completely synchronously,
+or -EAGAIN if at least some transfers are still ongoing and should be awaited
+for by the core.
+
+@item @code{int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)}
 Define how to copy data from the @var{src_interface} interface on the
 Define how to copy data from the @var{src_interface} interface on the
-@var{src_node} node (on a CUDA device) to the @var{dst_interface} interface on
-the @var{dst_node} node (on another CUDA device), using the given @var{stream}.
-Return 0 on success.
+@var{src_node} node to the @var{dst_interface} interface on the @var{dst_node}
+node. This is meant to be implemented through the @var{starpu_interface_copy}
+helper, to which @var{async_data} should be passed as such, and will be used to
+manage asynchronicity. This must return -EAGAIN if any of the
+@var{starpu_interface_copy} calls has returned -EAGAIN (i.e. at least some
+transfer is still ongoing), and return 0 otherwise.
 
 
-@item @code{int (*ram_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
-Define how to copy data from the @var{src_interface} interface on the
-@var{src_node} node (in RAM) to the @var{dst_interface} interface on the
-@var{dst_node} node (on an OpenCL device), using @var{event}, a pointer to a
-cl_event. Return 0 on success.
-
-@item @code{int (*opencl_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
-Define how to copy data from the @var{src_interface} interface on the
-@var{src_node} node (on an OpenCL device) to the @var{dst_interface} interface
-on the @var{dst_node} node (in RAM), using the given @var{event}, a pointer to
-a cl_event. Return 0 on success.
-
-@item @code{int (*opencl_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
-Define how to copy data from the @var{src_interface} interface on the
-@var{src_node} node (on an OpenCL device) to the @var{dst_interface} interface
-on the @var{dst_node} node (on another OpenCL device), using the given
-@var{event}, a pointer to a cl_event. Return 0 on success.
 @end table
 @end table
 @end deftp
 @end deftp
 
 
+@deftypefun int starpu_interface_copy (uintptr_t @var{src}, size_t @var{src_offset}, unsigned @var{src_node}, uintptr_t @var{dst}, size_t @var{dst_offset}, unsigned @var{dst_node}, size_t @var{size}, {void *}@var{async_data})
+Copy @var{size} bytes from byte offset @var{src_offset} of @var{src} on
+@var{src_node} to byte offset @var{dst_offset} of @var{dst} on @var{dst_node}.
+This is to be used in the @var{any_to_any} copy method, which is provided with
+the @var{async_data} to be pased to @var{starpu_interface_copy}. this returns
+-EAGAIN if the transfer is still ongoing, or 0 if the transfer is already
+completed.
+@end deftypefun
+
+
 @deftypefun uint32_t starpu_crc32_be_n ({void *}@var{input}, size_t @var{n}, uint32_t @var{inputcrc})
 @deftypefun uint32_t starpu_crc32_be_n ({void *}@var{input}, size_t @var{n}, uint32_t @var{inputcrc})
 Compute the CRC of a byte buffer seeded by the inputcrc "current
 Compute the CRC of a byte buffer seeded by the inputcrc "current
 state". The return value should be considered as the new "current
 state". The return value should be considered as the new "current
@@ -457,7 +450,7 @@ void starpu_complex_data_register(starpu_data_handle_t *handle,
                 .nx = nx
                 .nx = nx
         @};
         @};
 
 
-        if (interface_complex_ops.interfaceid == -1)
+        if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
         @{
         @{
                 interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
                 interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
         @}
         @}
@@ -483,7 +476,7 @@ static struct starpu_data_interface_ops interface_complex_ops =
         .copy_methods = &complex_copy_methods,
         .copy_methods = &complex_copy_methods,
         .get_size = complex_get_size,
         .get_size = complex_get_size,
         .footprint = complex_footprint,
         .footprint = complex_footprint,
-        .interfaceid = -1,
+        .interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
         .interface_size = sizeof(struct starpu_complex_interface),
         .interface_size = sizeof(struct starpu_complex_interface),
 @};
 @};
 @end smallexample
 @end smallexample
@@ -837,7 +830,7 @@ The number of workerids
 @item @code{pthread_key_t cursor_key} (optional)
 @item @code{pthread_key_t cursor_key} (optional)
 The cursor needed to iterate the collection (depending on the data structure)
 The cursor needed to iterate the collection (depending on the data structure)
 @item @code{int type}
 @item @code{int type}
-The type of structure (currently STARPU_WORKER_LIST is the only one available)
+The type of structure (currently STARPU_SCHED_CTX_WORKER_LIST is the only one available)
 @item @code{unsigned (*has_next)(struct starpu_sched_ctx_worker_collection *workers)}
 @item @code{unsigned (*has_next)(struct starpu_sched_ctx_worker_collection *workers)}
 Checks if there is a next worker
 Checks if there is a next worker
 @item @code{int (*get_next)(struct starpu_sched_ctx_worker_collection *workers)}
 @item @code{int (*get_next)(struct starpu_sched_ctx_worker_collection *workers)}
@@ -870,15 +863,15 @@ Delete the worker collection of the specified scheduling context
 Return the worker collection managed by the indicated context
 Return the worker collection managed by the indicated context
 @end deftypefun
 @end deftypefun
 
 
-@deftypefun pthread_mutex_t* starpu_get_changing_ctx_mutex (unsigned @var{sched_ctx_id})
+@deftypefun pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex (unsigned @var{sched_ctx_id})
 TODO
 TODO
 @end deftypefun
 @end deftypefun
 
 
-@deftypefun void starpu_task_set_context (unsigned *@var{sched_ctx_id})
+@deftypefun void starpu_sched_ctx_set_context (unsigned *@var{sched_ctx_id})
 Set the scheduling context the subsequent tasks will be submitted to
 Set the scheduling context the subsequent tasks will be submitted to
 @end deftypefun
 @end deftypefun
 
 
-@deftypefun unsigned starpu_task_get_context (void)
+@deftypefun unsigned starpu_sched_ctx_get_context (void)
 Return the scheduling context the tasks are currently submitted to
 Return the scheduling context the tasks are currently submitted to
 @end deftypefun
 @end deftypefun
 
 

+ 19 - 9
doc/chapters/basic-api.texi

@@ -1849,6 +1849,11 @@ A pointer to the next task. This should only be used by StarPU.
 This is only used for tasks that use multiformat handle. This should only be
 This is only used for tasks that use multiformat handle. This should only be
 used by StarPU.
 used by StarPU.
 
 
+@item @code{double flops}
+This can be set to the number of floating points operations that the task
+will have to achieve. This is useful for easily getting GFlops curves from
+@code{starpu_perfmodel_plot}, and for the hypervisor load balancing.
+
 @item @code{void *starpu_private}
 @item @code{void *starpu_private}
 This is private to StarPU, do not modify. If the task is allocated by hand
 This is private to StarPU, do not modify. If the task is allocated by hand
 (without starpu_task_create), this field should be set to NULL.
 (without starpu_task_create), this field should be set to NULL.
@@ -1857,6 +1862,7 @@ This is private to StarPU, do not modify. If the task is allocated by hand
 This field is set when initializing a task. It prevents a task from being
 This field is set when initializing a task. It prevents a task from being
 submitted if it has not been properly initialized.
 submitted if it has not been properly initialized.
 @end table
 @end table
+
 @end deftp
 @end deftp
 
 
 @deftypefun void starpu_task_init ({struct starpu_task} *@var{task})
 @deftypefun void starpu_task_init ({struct starpu_task} *@var{task})
@@ -1939,6 +1945,18 @@ This function blocks until all the tasks that were submitted are terminated. It
 does not destroy these tasks.
 does not destroy these tasks.
 @end deftypefun
 @end deftypefun
 
 
+@deftypefun int starpu_task_nready (void)
+@end deftypefun
+
+@deftypefun int starpu_task_nsubmitted (void)
+Return the number of submitted tasks which have not completed yet.
+@end deftypefun
+
+@deftypefun int starpu_task_nready (void)
+Return the number of submitted tasks which are ready for execution are already
+executing. It thus does not include tasks waiting for dependencies.
+@end deftypefun
+
 @deftypefun {struct starpu_task *} starpu_task_get_current (void)
 @deftypefun {struct starpu_task *} starpu_task_get_current (void)
 This function returns the task currently executed by the worker, or
 This function returns the task currently executed by the worker, or
 NULL if it is called either from a thread that is not a task or simply
 NULL if it is called either from a thread that is not a task or simply
@@ -2489,10 +2507,6 @@ This function returns a pointer to device properties for worker @var{workerid}
 (assumed to be a CUDA worker).
 (assumed to be a CUDA worker).
 @end deftypefun
 @end deftypefun
 
 
-@deftypefun size_t starpu_cuda_get_global_mem_size (unsigned @var{devid})
-Return the size of the global memory of CUDA device @var{devid}.
-@end deftypefun
-
 @deftypefun void starpu_cuda_report_error ({const char *}@var{func}, {const char *}@var{file}, int @var{line}, cudaError_t @var{status})
 @deftypefun void starpu_cuda_report_error ({const char *}@var{func}, {const char *}@var{file}, int @var{line}, cudaError_t @var{status})
 Report a CUDA error.
 Report a CUDA error.
 @end deftypefun
 @end deftypefun
@@ -2560,10 +2574,6 @@ OpenCL as shown in @ref{Full source code for the 'Scaling a Vector' example}.
 @node Writing OpenCL kernels
 @node Writing OpenCL kernels
 @subsection Writing OpenCL kernels
 @subsection Writing OpenCL kernels
 
 
-@deftypefun size_t starpu_opencl_get_global_mem_size (int @var{devid})
-Return the size of global device memory in bytes.
-@end deftypefun
-
 @deftypefun void starpu_opencl_get_context (int @var{devid}, {cl_context *}@var{context})
 @deftypefun void starpu_opencl_get_context (int @var{devid}, {cl_context *}@var{context})
 Places the OpenCL context of the device designated by @var{devid} into @var{context}.
 Places the OpenCL context of the device designated by @var{devid} into @var{context}.
 @end deftypefun
 @end deftypefun
@@ -2780,7 +2790,7 @@ otherwise. The integer pointed to by @var{ret} is set to -EAGAIN if the asynchro
 was successful, or to 0 if event was NULL.
 was successful, or to 0 if event was NULL.
 @end deftypefun
 @end deftypefun
 
 
-@deftypefun cl_int starpu_opencl_copy_async_sync (cl_mem @var{src}, unsigned @var{src_node}, size_t @var{src_offset}, cl_mem @var{dst}, unsigned @var{dst_node}, size_t @var{dst_offset}, size_t @var{size}, {cl_event *}@var{event})
+@deftypefun cl_int starpu_opencl_copy_async_sync (uintptr_t @var{src}, size_t @var{src_offset}, unsigned @var{src_node}, uintptr_t @var{dst}, size_t @var{dst_offset}, unsigned @var{dst_node}, size_t @var{size}, {cl_event *}@var{event})
 Copy @var{size} bytes from byte offset @var{src_offset} of
 Copy @var{size} bytes from byte offset @var{src_offset} of
 @var{src} on @var{src_node} to byte offset @var{dst_offset} of @var{dst} on
 @var{src} on @var{src_node} to byte offset @var{dst_offset} of @var{dst} on
 @var{dst_node}. if @var{event} is NULL, the copy is synchronous, i.e the queue is
 @var{dst_node}. if @var{event} is NULL, the copy is synchronous, i.e the queue is

+ 14 - 2
doc/chapters/configuration.texi

@@ -417,8 +417,20 @@ the @code{starpu_conf} structure passed to @code{starpu_init} is set.
 
 
 @defvr {Environment variable} @code{STARPU_SINGLE_COMBINED_WORKER}
 @defvr {Environment variable} @code{STARPU_SINGLE_COMBINED_WORKER}
 If set, StarPU will create several workers which won't be able to work
 If set, StarPU will create several workers which won't be able to work
-concurrently. It will create combined workers which size goes from 1 to the
-total number of CPU workers in the system.
+concurrently. It will by default create combined workers which size goes from 1
+to the total number of CPU workers in the system. @code{STARPU_MIN_WORKERSIZE}
+and @code{STARPU_MAX_WORKERSIZE} can be used to change this default.
+@end defvr
+
+@defvr {Environment variable} @code{STARPU_MIN_WORKERSIZE}
+When @code{STARPU_SINGLE_COMBINED_WORKER} is set, @code{STARPU_MIN_WORKERSIZE}
+permits to specify the minimum size of the combined workers (instead of the default 1)
+@end defvr
+
+@defvr {Environment variable} @code{STARPU_MAX_WORKERSIZE}
+When @code{STARPU_SINGLE_COMBINED_WORKER} is set, @code{STARPU_MAX_WORKERSIZE}
+permits to specify the minimum size of the combined workers (instead of the
+number of CPU workers in the system)
 @end defvr
 @end defvr
 
 
 @defvr {Environment variable} STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER
 @defvr {Environment variable} STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER

+ 44 - 1
doc/chapters/mpi-support.texi

@@ -21,6 +21,7 @@ according to the task graph and an application-provided distribution.
 
 
 @menu
 @menu
 * Simple Example::
 * Simple Example::
+* Point to point communication::
 * Exchanging User Defined Data Interface::
 * Exchanging User Defined Data Interface::
 * MPI Insert Task Utility::
 * MPI Insert Task Utility::
 * MPI Collective Operations::
 * MPI Collective Operations::
@@ -120,7 +121,49 @@ int main(int argc, char **argv)
 @end smallexample
 @end smallexample
 @end cartouche
 @end cartouche
 
 
-@page
+@node Point to point communication
+@section Point to point communication
+
+The standard point to point communications of MPI have been
+implemented. The semantic is similar to the MPI one, but adapted to
+the DSM provided by StarPU. A MPI request will only be submitted when
+the data is available in the main memory of the node submitting the
+request.
+
+There is two types of asynchronous communications: the classic
+asynchronous communications and the detached communications. The
+classic asynchronous communications (@code{starpu_mpi_isend} and
+@code{starpu_mpi_irecv}) need to be followed by a call to
+@code{starpu_mpi_wait} or to @code{starpu_mpi_test} to wait for or to
+test the completion of the communication. Waiting for or testing the
+completion of detached communications is not possible, this is done
+internally by StarPU-MPI, on completion, the resources are
+automatically released. This mechanism is similar to the pthread
+detach state attribute which determines whether a thread will be
+created in a joinable or a detached state.
+
+For any communication, the call of the function will result in the
+creation of a StarPU-MPI request, the function
+@code{starpu_data_acquire_cb} is then called to asynchronously request
+StarPU to fetch the data in main memory; when the data is available in
+main memory, a StarPU-MPI function is called to put the new request in
+the list of the ready requests.
+
+The StarPU-MPI progression thread regularly polls this list of ready
+requests. For each new ready request, the appropriate function is
+called to post the corresponding MPI call. For example, calling
+@code{starpu_mpi_isend} will result in posting @code{MPI_Isend}. If
+the request is marked as detached, the request will be put in the list
+of detached requests.
+
+The StarPU-MPI progression thread also polls the list of detached
+requests. For each detached request, it regularly tests the completion
+of the MPI request by calling @code{MPI_Test}. On completion, the data
+handle is released, and if a callback was defined, it is called.
+
+@ref{Communication} gives the list of all the point to point
+communications defined in StarPU-MPI.
+
 @node Exchanging User Defined Data Interface
 @node Exchanging User Defined Data Interface
 @section Exchanging User Defined Data Interface
 @section Exchanging User Defined Data Interface
 
 

+ 17 - 7
doc/chapters/perf-feedback.texi

@@ -411,7 +411,7 @@ display the regression formula, and in the case of non-linear regression, the
 same performance log as for history-based performance models:
 same performance log as for history-based performance models:
 
 
 @example
 @example
-$ starpu_perfmodel_display -s non_linear_memset_regression_based.type
+$ starpu_perfmodel_display -s non_linear_memset_regression_based
 performance model for cpu_impl_0
 performance model for cpu_impl_0
 	Regression : #sample = 1400
 	Regression : #sample = 1400
 	Linear: y = alpha size ^ beta
 	Linear: y = alpha size ^ beta
@@ -429,15 +429,25 @@ a3d3725e	4096           	4.763200e+00   	7.650928e-01   	100
 ...
 ...
 @end example
 @end example
 
 
-The @code{starpu_perfmodel_plot} tool can be used to draw performance models.
-It writes a @code{.gp} file in the current directory, to be run in the
-@code{gnuplot} tool, which shows the corresponding curve.
-
 The same can also be achieved by using StarPU's library API, see
 The same can also be achieved by using StarPU's library API, see
 @ref{Performance Model API} and notably the @code{starpu_perfmodel_load_symbol}
 @ref{Performance Model API} and notably the @code{starpu_perfmodel_load_symbol}
 function. The source code of the @code{starpu_perfmodel_display} tool can be a
 function. The source code of the @code{starpu_perfmodel_display} tool can be a
 useful example.
 useful example.
 
 
+The @code{starpu_perfmodel_plot} tool can be used to draw performance models.
+It writes a @code{.gp} file in the current directory, to be run in the
+@code{gnuplot} tool, which shows the corresponding curve.
+
+When the @code{flops} field of tasks is set, @code{starpu_perfmodel_plot} can
+directly draw a GFlops curve, by simply adding the @code{-f} option:
+
+@example
+$ starpu_perfmodel_display -f -s chol_model_11
+@end example
+
+This will however disable displaying the regression model, for which we can not
+compute GFlops.
+
 When the FxT trace file @code{filename} has been generated, it is possible to
 When the FxT trace file @code{filename} has been generated, it is possible to
 get a profiling of each codelet by calling:
 get a profiling of each codelet by calling:
 @example
 @example
@@ -453,10 +463,10 @@ This is also available in the @code{starpu_perfmodel_plot} tool, by passing it
 the fxt trace:
 the fxt trace:
 
 
 @example
 @example
-$ starpu_perfmodel_display -s non_linear_memset_regression_based.type -i /tmp/prof_file_foo_0
+$ starpu_perfmodel_plot -s non_linear_memset_regression_based -i /tmp/prof_file_foo_0
 @end example
 @end example
 
 
-It willd produce a @code{.gp} file which contains both the performance model
+It will produce a @code{.gp} file which contains both the performance model
 curves, and the profiling measurements.
 curves, and the profiling measurements.
 
 
 If you have the R statistical tool installed, you can additionally use
 If you have the R statistical tool installed, you can additionally use

+ 6 - 1
doc/chapters/perf-optimization.texi

@@ -228,7 +228,7 @@ int workerids[3] = @{1, 3, 10@};
 int id_ctx = starpu_sched_ctx_create("heft", workerids, 3, "my_ctx");
 int id_ctx = starpu_sched_ctx_create("heft", workerids, 3, "my_ctx");
 
 
 /* @b{let StarPU know that the folowing tasks will be submitted to this context} */
 /* @b{let StarPU know that the folowing tasks will be submitted to this context} */
-starpu_task_set_context(id);
+starpu_sched_ctx_set_task_context(id);
 
 
 /* @b{submit the task to StarPU} */
 /* @b{submit the task to StarPU} */
 starpu_task_submit(task);
 starpu_task_submit(task);
@@ -548,6 +548,11 @@ The number of devices can be chosen as usual with @code{STARPU_NCPU},
 cpus can be arbitrarily chosen. The number of CUDA and OpenCL devices have to be
 cpus can be arbitrarily chosen. The number of CUDA and OpenCL devices have to be
 lower than the real number on the current machine.
 lower than the real number on the current machine.
 
 
+The amount of simulated GPU memory is for now unbound by default, but
+it can be chosen by hand through the @code{STARPU_LIMIT_CUDA_MEM},
+@code{STARPU_LIMIT_CUDA_devid_MEM}, @code{STARPU_LIMIT_OPENCL_MEM}, and
+@code{STARPU_LIMIT_OPENCL_devid_MEM} environment variables.
+
 The Simgrid default stack size is small; to increase it use the
 The Simgrid default stack size is small; to increase it use the
 parameter @code{--cfg=contexts/stack_size}, for example:
 parameter @code{--cfg=contexts/stack_size}, for example:
 
 

+ 4 - 4
doc/chapters/sched_ctx_hypervisor.texi

@@ -27,7 +27,7 @@ Basic strategies of resizing scheduling contexts already exist but a platform fo
 @section Managing the hypervisor
 @section Managing the hypervisor
 There is a single hypervisor that is in charge of resizing contexts and the resizing strategy is chosen at the initialization of the hypervisor. A single resize can be done at a time.
 There is a single hypervisor that is in charge of resizing contexts and the resizing strategy is chosen at the initialization of the hypervisor. A single resize can be done at a time.
 
 
-@deftypefun {struct starpu_performance_counters *} sched_ctx_hypervisor_init ({struct sched_ctx_hypervisor_policy *} @var{policy})
+@deftypefun {struct starpu_sched_ctx_performance_counters *} sched_ctx_hypervisor_init ({struct sched_ctx_hypervisor_policy *} @var{policy})
 Initializes the hypervisor to use the strategy provided as parameter and creates the performance counters (see @pxref{Performance Counters}).
 Initializes the hypervisor to use the strategy provided as parameter and creates the performance counters (see @pxref{Performance Counters}).
 These performance counters represent actually some callbacks that will be used by the contexts to notify the information needed by the hypervisor.
 These performance counters represent actually some callbacks that will be used by the contexts to notify the information needed by the hypervisor.
 @end deftypefun
 @end deftypefun
@@ -200,7 +200,7 @@ or
 @smallexample
 @smallexample
 starpu_insert_task(&codelet,
 starpu_insert_task(&codelet,
                     ...,
                     ...,
-                    STARPU_FLOPS, 100,
+                    STARPU_FLOPS, (double) 100,
                     0);
                     0);
 @end smallexample
 @end smallexample
 @end cartouche
 @end cartouche
@@ -210,8 +210,8 @@ starpu_insert_task(&codelet,
 
 
 The Scheduling Context Hypervisor Plugin provides a series of performance counters to StarPU. By incrementing them, StarPU can help the hypervisor in the resizing decision making process.
 The Scheduling Context Hypervisor Plugin provides a series of performance counters to StarPU. By incrementing them, StarPU can help the hypervisor in the resizing decision making process.
 
 
-@deftp {Data Type} {struct starpu_performance_counters}
-@anchor{struct starpu_performance_counters}
+@deftp {Data Type} {struct starpu_sched_ctx_performance_counters}
+@anchor{struct starpu_sched_ctx_performance_counters}
 
 
 @table @asis
 @table @asis
 @item @code{void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time)}
 @item @code{void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time)}

+ 1 - 1
examples/Makefile.am

@@ -16,7 +16,7 @@
 #
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 
-AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) -Werror=implicit
+AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
 AM_CXXFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
 AM_CXXFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include

+ 2 - 2
examples/basic_examples/block_cpu.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,7 +26,7 @@ void cpu_codelet(void *descr[], void *_args)
         unsigned ldy = STARPU_BLOCK_GET_LDY(descr[0]);
         unsigned ldy = STARPU_BLOCK_GET_LDY(descr[0]);
         unsigned ldz = STARPU_BLOCK_GET_LDZ(descr[0]);
         unsigned ldz = STARPU_BLOCK_GET_LDZ(descr[0]);
         float *multiplier = (float *)_args;
         float *multiplier = (float *)_args;
-        unsigned i, j, k;
+        int i, j, k;
 
 
         for(k=0; k<nz ; k++)
         for(k=0; k<nz ; k++)
 	{
 	{

+ 58 - 0
examples/cholesky/cholesky.h

@@ -55,6 +55,64 @@
 #define BLAS3_FLOP(n1,n2,n3)    \
 #define BLAS3_FLOP(n1,n2,n3)    \
         (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
         (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
 
 
+/* This is from magma
+
+  -- Innovative Computing Laboratory
+  -- Electrical Engineering and Computer Science Department
+  -- University of Tennessee
+  -- (C) Copyright 2009
+
+  Redistribution  and  use  in  source and binary forms, with or without
+  modification,  are  permitted  provided  that the following conditions
+  are met:
+
+  * Redistributions  of  source  code  must  retain  the above copyright
+    notice,  this  list  of  conditions  and  the  following  disclaimer.
+  * Redistributions  in  binary  form must reproduce the above copyright
+    notice,  this list of conditions and the following disclaimer in the
+    documentation  and/or other materials provided with the distribution.
+  * Neither  the  name of the University of Tennessee, Knoxville nor the
+    names of its contributors may be used to endorse or promote products
+    derived from this software without specific prior written permission.
+
+  THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
+  LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  */
+
+#define FMULS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) + 0.5) * (double)(__n) + (1. / 3.)))
+#define FADDS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n)      ) * (double)(__n) - (1. / 6.)))
+
+#define FLOPS_SPOTRF(__n) (     FMULS_POTRF((__n)) +       FADDS_POTRF((__n)) )
+
+#define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.))
+#define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.))
+
+#define FMULS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FMULS_TRMM_2((__m), (__n)) :*/ FMULS_TRMM_2((__n), (__m)) )
+#define FADDS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FADDS_TRMM_2((__m), (__n)) :*/ FADDS_TRMM_2((__n), (__m)) )
+
+#define FMULS_TRSM FMULS_TRMM
+#define FADDS_TRSM FMULS_TRMM
+
+#define FLOPS_STRSM(__m, __n) (     FMULS_TRSM((__m), (__n)) +       FADDS_TRSM((__m), (__n)) )
+
+
+#define FMULS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
+#define FADDS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
+
+#define FLOPS_SGEMM(__m, __n, __k) (     FMULS_GEMM((__m), (__n), (__k)) +       FADDS_GEMM((__m), (__n), (__k)) )
+
+/* End of magma code */
+
 static unsigned size = 4*1024;
 static unsigned size = 4*1024;
 static unsigned nblocks = 16;
 static unsigned nblocks = 16;
 static unsigned nbigblocks = 8;
 static unsigned nbigblocks = 8;

+ 10 - 1
examples/cholesky/cholesky_grain_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
@@ -68,6 +68,9 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 		starpu_tag_declare_deps(TAG11_AUX(k, reclevel), 1, TAG22_AUX(k-1, k, k, reclevel));
 		starpu_tag_declare_deps(TAG11_AUX(k, reclevel), 1, TAG22_AUX(k-1, k, k, reclevel));
 	}
 	}
 
 
+	int n = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SPOTRF(n);
+
 	return task;
 	return task;
 }
 }
 
 
@@ -110,6 +113,9 @@ static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, un
 		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 1, TAG11_AUX(k, reclevel));
 		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 1, TAG11_AUX(k, reclevel));
 	}
 	}
 
 
+	int n = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_STRSM(n, n);
+
 	ret = starpu_task_submit(task);
 	ret = starpu_task_submit(task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	return ret;
 	return ret;
@@ -157,6 +163,9 @@ static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, un
 		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 2, TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
 		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 2, TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
 	}
 	}
 
 
+	int n = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SGEMM(n, n, n);
+
 	ret = starpu_task_submit(task);
 	ret = starpu_task_submit(task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	return ret;
 	return ret;

+ 6 - 2
examples/cholesky/cholesky_implicit.c

@@ -85,6 +85,8 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	double end;
 	double end;
 
 
 	unsigned i,j,k;
 	unsigned i,j,k;
+	unsigned long n = starpu_matrix_get_nx(dataA);
+	unsigned long nn = n/nblocks;
 
 
 	int prio_level = noprio?STARPU_DEFAULT_PRIO:STARPU_MAX_PRIO;
 	int prio_level = noprio?STARPU_DEFAULT_PRIO:STARPU_MAX_PRIO;
 
 
@@ -101,6 +103,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 					 STARPU_PRIORITY, prio_level,
 					 STARPU_PRIORITY, prio_level,
 					 STARPU_RW, sdatakk,
 					 STARPU_RW, sdatakk,
 					 STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
 					 STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
+					 STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
 					 0);
 					 0);
 		if (ret == -ENODEV) return 77;
 		if (ret == -ENODEV) return 77;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
@@ -113,6 +116,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 						 STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
 						 STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
 						 STARPU_R, sdatakk,
 						 STARPU_R, sdatakk,
 						 STARPU_RW, sdatakj,
 						 STARPU_RW, sdatakj,
+						 STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
 						 0);
 						 0);
 			if (ret == -ENODEV) return 77;
 			if (ret == -ENODEV) return 77;
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
@@ -129,6 +133,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 								 STARPU_R, sdataki,
 								 STARPU_R, sdataki,
 								 STARPU_R, sdatakj,
 								 STARPU_R, sdatakj,
 								 STARPU_RW, sdataij,
 								 STARPU_RW, sdataij,
+								 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
 								 0);
 								 0);
 					if (ret == -ENODEV) return 77;
 					if (ret == -ENODEV) return 77;
 					STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 					STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
@@ -144,9 +149,8 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	end = starpu_timing_now();
 	end = starpu_timing_now();
 
 
 	double timing = end - start;
 	double timing = end - start;
-	unsigned long n = starpu_matrix_get_nx(dataA);
 
 
-	double flop = (1.0f*n*n*n)/3.0f;
+	double flop = FLOPS_SPOTRF(n);
 
 
 	if(with_ctxs || with_noctxs || chole1 || chole2)
 	if(with_ctxs || with_noctxs || chole1 || chole2)
 		update_sched_ctx_timing_results((flop/timing/1000.0f), (timing/1000000.0f));
 		update_sched_ctx_timing_results((flop/timing/1000.0f), (timing/1000000.0f));

+ 10 - 1
examples/cholesky/cholesky_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
@@ -69,6 +69,9 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
 		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
 	}
 	}
 
 
+	int n = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SPOTRF(n);
+
 	return task;
 	return task;
 }
 }
 
 
@@ -109,6 +112,9 @@ static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
 		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
 		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
 	}
 	}
 
 
+	int n = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_STRSM(n, n);
+
 	int ret = starpu_task_submit(task);
 	int ret = starpu_task_submit(task);
         if (STARPU_UNLIKELY(ret == -ENODEV))
         if (STARPU_UNLIKELY(ret == -ENODEV))
 	{
 	{
@@ -158,6 +164,9 @@ static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, u
 		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
 		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
 	}
 	}
 
 
+	int n = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SGEMM(n, n, n);
+
 	int ret = starpu_task_submit(task);
 	int ret = starpu_task_submit(task);
         if (STARPU_UNLIKELY(ret == -ENODEV))
         if (STARPU_UNLIKELY(ret == -ENODEV))
 	{
 	{

+ 10 - 1
examples/cholesky/cholesky_tile_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -71,6 +71,9 @@ static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
 		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
 	}
 	}
 
 
+	int n = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SPOTRF(n);
+
 	return task;
 	return task;
 }
 }
 
 
@@ -113,6 +116,9 @@ static int create_task_21(unsigned k, unsigned j)
 		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
 		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
 	}
 	}
 
 
+	int n = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_STRSM(n, n);
+
 	ret = starpu_task_submit(task);
 	ret = starpu_task_submit(task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	return ret;
 	return ret;
@@ -160,6 +166,9 @@ static int create_task_22(unsigned k, unsigned i, unsigned j)
 		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
 		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
 	}
 	}
 
 
+	int n = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SGEMM(n, n, n);
+
 	ret = starpu_task_submit(task);
 	ret = starpu_task_submit(task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	return ret;
 	return ret;

+ 2 - 2
examples/filters/custom_mf/custom_conversion_codelets.c

@@ -21,7 +21,7 @@
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 void cuda_to_cpu(void *buffers[], void *arg)
 void cuda_to_cpu(void *buffers[], void *arg)
 {
 {
-	unsigned int n = CUSTOM_GET_NX(buffers[0]);
+	int n = CUSTOM_GET_NX(buffers[0]);
 	float *x = (float*) CUSTOM_GET_X_PTR(buffers[0]);
 	float *x = (float*) CUSTOM_GET_X_PTR(buffers[0]);
 	float *y = (float*) CUSTOM_GET_Y_PTR(buffers[0]);
 	float *y = (float*) CUSTOM_GET_Y_PTR(buffers[0]);
 	struct point *aop;
 	struct point *aop;
@@ -60,7 +60,7 @@ struct starpu_codelet cuda_to_cpu_cl =
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 void opencl_to_cpu_cpu_func(void *buffers[], void *arg)
 void opencl_to_cpu_cpu_func(void *buffers[], void *arg)
 {
 {
-	unsigned int n = CUSTOM_GET_NX(buffers[0]);
+	int n = CUSTOM_GET_NX(buffers[0]);
 	float *x = (float *) CUSTOM_GET_OPENCL_X_PTR(buffers[0]);
 	float *x = (float *) CUSTOM_GET_OPENCL_X_PTR(buffers[0]);
 	struct point *aop;
 	struct point *aop;
 	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);
 	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);

+ 4 - 3
examples/filters/custom_mf/custom_interface.c

@@ -46,7 +46,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
 				    cl_event *event);
 				    cl_event *event);
 #endif /* !STARPU_USE_OPENCL */
 #endif /* !STARPU_USE_OPENCL */
 
 
-static struct starpu_data_copy_methods custom_copy_data_methods_s =
+static const struct starpu_data_copy_methods custom_copy_data_methods_s =
 {
 {
 	.ram_to_ram = NULL,
 	.ram_to_ram = NULL,
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -98,7 +98,7 @@ static struct starpu_data_interface_ops interface_custom_ops =
 	.get_size              = custom_interface_get_size,
 	.get_size              = custom_interface_get_size,
 	.footprint             = footprint_custom_interface_crc32,
 	.footprint             = footprint_custom_interface_crc32,
 	.compare               = NULL,
 	.compare               = NULL,
-	.interfaceid           = -1,
+	.interfaceid           = STARPU_UNKNOWN_INTERFACE_ID,
 	.interface_size        = sizeof(struct custom_data_interface),
 	.interface_size        = sizeof(struct custom_data_interface),
 	.display               = display_custom_interface,
 	.display               = display_custom_interface,
 	.is_multiformat        = 1,
 	.is_multiformat        = 1,
@@ -276,7 +276,8 @@ void custom_data_register(starpu_data_handle_t *handle,
 		.ops = format_ops
 		.ops = format_ops
 	};
 	};
 
 
-	if (interface_custom_ops.interfaceid == -1) {
+	if (interface_custom_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
+	{
 		interface_custom_ops.interfaceid = starpu_data_interface_get_next_id();
 		interface_custom_ops.interfaceid = starpu_data_interface_get_next_id();
 	}
 	}
 	starpu_data_register(handle, home_node, &custom, &interface_custom_ops);
 	starpu_data_register(handle, home_node, &custom, &interface_custom_ops);

+ 2 - 2
examples/filters/fblock_cpu.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -18,7 +18,7 @@
 
 
 void cpu_func(void *buffers[], void *cl_arg)
 void cpu_func(void *buffers[], void *cl_arg)
 {
 {
-        unsigned i, j, k;
+        int i, j, k;
         int *factor = (int *) cl_arg;
         int *factor = (int *) cl_arg;
 	int *block = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
 	int *block = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
 	int nx = (int)STARPU_BLOCK_GET_NX(buffers[0]);
 	int nx = (int)STARPU_BLOCK_GET_NX(buffers[0]);

+ 8 - 8
examples/filters/fblock_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Université de Bordeaux 1
  * Copyright (C) 2011  Université de Bordeaux 1
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -17,13 +17,13 @@
 
 
 #include <starpu.h>
 #include <starpu.h>
 
 
-#define CHECK_CL_SET_KERNEL_ARG(kernel, n, size, ptr)       \
-do                                                          \
-{							    \
-	int err;                                            \
-	err = clSetKernelArg(kernel, n, size, ptr);         \
-	if (err != CL_SUCCESS)                              \
-       		STARPU_OPENCL_REPORT_ERROR(err);            \
+#define CHECK_CL_SET_KERNEL_ARG(kernel, n, size, ptr)       	\
+do                                                          	\
+{								\
+	int check_err;                           	        \
+	check_err = clSetKernelArg(kernel, n, size, ptr);       \
+	if (check_err != CL_SUCCESS)                            \
+       		STARPU_OPENCL_REPORT_ERROR(check_err);          \
 } while (0)
 } while (0)
 
 
 extern struct starpu_opencl_program opencl_program;
 extern struct starpu_opencl_program opencl_program;

+ 3 - 3
examples/filters/fmatrix.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -43,9 +43,9 @@ void cpu_func(void *buffers[], void *cl_arg)
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-	unsigned i, j, n=1;
+	unsigned j, n=1;
         int matrix[NX*NY];
         int matrix[NX*NY];
-	int ret;
+	int ret, i;
 
 
         FPRINTF(stderr,"IN  Matrix: \n");
         FPRINTF(stderr,"IN  Matrix: \n");
         for(j=0 ; j<NY ; j++)
         for(j=0 ; j<NY ; j++)

+ 2 - 2
examples/filters/fvector.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -37,7 +37,7 @@ void cpu_func(void *buffers[], void *cl_arg)
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-	unsigned i;
+	int i;
         int vector[NX];
         int vector[NX];
         starpu_data_handle_t handle;
         starpu_data_handle_t handle;
         int factor=1;
         int factor=1;

+ 3 - 3
examples/filters/shadow.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2012  Université de Bordeaux 1
  * Copyright (C) 2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -91,11 +91,11 @@ void cuda_func(void *buffers[], void *cl_arg)
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-	unsigned i, j;
+	unsigned j;
         int vector[NX + 2*SHADOW];
         int vector[NX + 2*SHADOW];
         int vector2[NX + PARTS*2*SHADOW];
         int vector2[NX + PARTS*2*SHADOW];
 	starpu_data_handle_t handle, handle2;
 	starpu_data_handle_t handle, handle2;
-	int ret;
+	int ret, i;
 
 
         struct starpu_codelet cl =
         struct starpu_codelet cl =
 	{
 	{

+ 19 - 128
examples/interface/complex_interface.c

@@ -146,139 +146,30 @@ static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void
 	return 0;
 	return 0;
 }
 }
 
 
-#ifdef STARPU_USE_CUDA
-static int copy_cuda_async_sync(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, enum cudaMemcpyKind kind, cudaStream_t stream)
+static int copy_any_to_any(void *src_interface, unsigned src_node,
+			   void *dst_interface, unsigned dst_node,
+			   void *async_data)
 {
 {
 	struct starpu_complex_interface *src_complex = src_interface;
 	struct starpu_complex_interface *src_complex = src_interface;
 	struct starpu_complex_interface *dst_complex = dst_interface;
 	struct starpu_complex_interface *dst_complex = dst_interface;
-
-	cudaStream_t sstream = stream;
-	int ret;
-
-	ret = starpu_cuda_copy_async_sync((void *)src_complex->real, src_node, (void *)dst_complex->real, dst_node,
-					  src_complex->nx*sizeof(src_complex->real[0]), sstream, kind);
-	if (ret == 0) sstream = NULL;
-
-	ret = starpu_cuda_copy_async_sync((char *)src_complex->imaginary, src_node, (char *)dst_complex->imaginary, dst_node,
-					  src_complex->nx*sizeof(src_complex->imaginary[0]), sstream, kind);
+	int ret = 0;
+
+	if (starpu_interface_copy((uintptr_t) src_complex->real, 0, src_node,
+				    (uintptr_t) dst_complex->real, 0, dst_node,
+				     src_complex->nx*sizeof(src_complex->real[0]),
+				     async_data))
+		ret = -EAGAIN;
+	if (starpu_interface_copy((uintptr_t) src_complex->imaginary, 0, src_node,
+				    (uintptr_t) dst_complex->imaginary, 0, dst_node,
+				     src_complex->nx*sizeof(src_complex->imaginary[0]),
+				     async_data))
+		ret = -EAGAIN;
 	return ret;
 	return ret;
 }
 }
 
 
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-     return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, NULL);
-}
-
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
-}
-
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, NULL);
-}
-
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
-}
-#endif
-
-#ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	struct starpu_complex_interface *src_complex = src_interface;
-	struct starpu_complex_interface *dst_complex = dst_interface;
-	cl_int err;
-	int ret;
-
-	err = starpu_opencl_copy_ram_to_opencl(src_complex->real,
-					       src_node,
-					       (cl_mem) dst_complex->real,
-					       dst_node,
-					       src_complex->nx * sizeof(src_complex->real[0]),
-					       0,
-					       event,
-					       &ret);
-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
-		STARPU_OPENCL_REPORT_ERROR(err);
-	if (ret == 0)
-		event = NULL;
-
-	err = starpu_opencl_copy_ram_to_opencl(src_complex->imaginary,
-					       src_node,
-					       (cl_mem) dst_complex->imaginary,
-					       dst_node,
-					       src_complex->nx * sizeof(src_complex->imaginary[0]),
-					       0,
-					       event,
-					       &ret);
-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
-		STARPU_OPENCL_REPORT_ERROR(err);
-
-	return ret;
-}
-
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-        return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
-}
-
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	struct starpu_complex_interface *src_complex = src_interface;
-	struct starpu_complex_interface *dst_complex = dst_interface;
-	cl_int err;
-	int ret;
-
-	err = starpu_opencl_copy_opencl_to_ram((cl_mem) src_complex->real,
-					       src_node,
-					       dst_complex->real,
-					       dst_node,
-					       src_complex->nx * sizeof(src_complex->real[0]),
-					       0,
-					       event,
-					       &ret);
-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
-		STARPU_OPENCL_REPORT_ERROR(err);
-	if (ret == 0)
-		event = NULL;
-
-	err = starpu_opencl_copy_opencl_to_ram((cl_mem) src_complex->imaginary,
-					       src_node,
-					       dst_complex->imaginary,
-					       dst_node,
-					       src_complex->nx * sizeof(src_complex->imaginary[0]),
-					       0,
-					       event,
-					       &ret);
-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
-		STARPU_OPENCL_REPORT_ERROR(err);
-
-	return ret;
-}
-
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-        return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
-}
-#endif
-
-static struct starpu_data_copy_methods complex_copy_methods =
+static const struct starpu_data_copy_methods complex_copy_methods =
 {
 {
-#ifdef STARPU_USE_CUDA
-	.ram_to_cuda = copy_ram_to_cuda,
-	.cuda_to_ram = copy_cuda_to_ram,
-	.ram_to_cuda_async = copy_ram_to_cuda_async,
-	.cuda_to_ram_async = copy_cuda_to_ram_async,
-#endif
-#ifdef STARPU_USE_OPENCL
-	.ram_to_opencl = copy_ram_to_opencl,
-	.opencl_to_ram = copy_opencl_to_ram,
-	.ram_to_opencl_async = copy_ram_to_opencl_async,
-	.opencl_to_ram_async = copy_opencl_to_ram_async,
-#endif
+	.any_to_any = copy_any_to_any
 };
 };
 
 
 static struct starpu_data_interface_ops interface_complex_ops =
 static struct starpu_data_interface_ops interface_complex_ops =
@@ -289,7 +180,7 @@ static struct starpu_data_interface_ops interface_complex_ops =
 	.copy_methods = &complex_copy_methods,
 	.copy_methods = &complex_copy_methods,
 	.get_size = complex_get_size,
 	.get_size = complex_get_size,
 	.footprint = complex_footprint,
 	.footprint = complex_footprint,
-	.interfaceid = -1,
+	.interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
 	.interface_size = sizeof(struct starpu_complex_interface),
 	.interface_size = sizeof(struct starpu_complex_interface),
 	.handle_to_pointer = complex_handle_to_pointer,
 	.handle_to_pointer = complex_handle_to_pointer,
 	.pack_data = complex_pack_data,
 	.pack_data = complex_pack_data,
@@ -305,7 +196,7 @@ void starpu_complex_data_register(starpu_data_handle_t *handleptr, unsigned home
 		.nx = nx
 		.nx = nx
 	};
 	};
 
 
-	if (interface_complex_ops.interfaceid == -1)
+	if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
 	{
 	{
 		interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
 		interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
 	}
 	}

+ 3 - 3
examples/ppm_downscaler/ppm_downscaler.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -75,7 +75,7 @@ struct ppm_image *file_to_ppm(char *filename)
 	ret = fread(ppm->data, sizeof(struct ppm_color), ppm->ncols*ppm->nlines, file);
 	ret = fread(ppm->data, sizeof(struct ppm_color), ppm->ncols*ppm->nlines, file);
 	STARPU_ASSERT(ret == ppm->ncols*ppm->nlines);
 	STARPU_ASSERT(ret == ppm->ncols*ppm->nlines);
 
 
-	unsigned i;
+	int i;
 	for (i = 0; i < ppm->ncols*ppm->nlines; i++)
 	for (i = 0; i < ppm->ncols*ppm->nlines; i++)
 	{
 	{
 /*		fprintf(stderr, "READ (index %d) -> r %d g %d b %d\n", i, ppm->data[i].r, ppm->data[i].g, ppm->data[i].b); */
 /*		fprintf(stderr, "READ (index %d) -> r %d g %d b %d\n", i, ppm->data[i].r, ppm->data[i].g, ppm->data[i].b); */
@@ -121,7 +121,7 @@ void dummy_downscale(struct ppm_image *input_ppm, struct ppm_image *output_ppm)
 	struct ppm_color *in = input_ppm->data;
 	struct ppm_color *in = input_ppm->data;
 	struct ppm_color *out = output_ppm->data;
 	struct ppm_color *out = output_ppm->data;
 
 
-	unsigned line, col;
+	int line, col;
 	for (line = 0; line < output_ppm->nlines; line++)
 	for (line = 0; line < output_ppm->nlines; line++)
 	{
 	{
 		for (col = 0; col < output_ppm->ncols; col++)
 		for (col = 0; col < output_ppm->ncols; col++)

+ 2 - 2
examples/profiling/profiling.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010-2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -107,7 +107,7 @@ int main(int argc, char **argv)
 	FPRINTF(stderr, "Avg. length : %2.2lf us\n", (length_sum)/niter);
 	FPRINTF(stderr, "Avg. length : %2.2lf us\n", (length_sum)/niter);
 
 
 	/* Display the occupancy of all workers during the test */
 	/* Display the occupancy of all workers during the test */
-	int worker;
+	unsigned worker;
 	for (worker = 0; worker < starpu_worker_get_count(); worker++)
 	for (worker = 0; worker < starpu_worker_get_count(); worker++)
 	{
 	{
 		struct starpu_worker_profiling_info worker_info;
 		struct starpu_worker_profiling_info worker_info;

+ 2 - 2
examples/sched_ctx/sched_ctx.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010-2012  Université de Bordeaux 1
- * Copyright (C) 2010-2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -79,7 +79,7 @@ int main(int argc, char **argv)
 	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
 	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
 	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
 	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
 
 
-	unsigned i;
+	int i;
 	for (i = 0; i < ntasks/2; i++)
 	for (i = 0; i < ntasks/2; i++)
 	{
 	{
 		struct starpu_task *task = starpu_task_create();
 		struct starpu_task *task = starpu_task_create();

+ 1 - 1
examples/sched_ctx_utils/sched_ctx_utils.c

@@ -93,7 +93,7 @@ void* start_bench(void *val)
 	pthread_setspecific(key, &p->id);
 	pthread_setspecific(key, &p->id);
 
 
 	if(p->ctx != 0)
 	if(p->ctx != 0)
-		starpu_task_set_context(&p->ctx);
+		starpu_sched_ctx_set_context(&p->ctx);
 
 
 	for(i = 0; i < NSAMPLES; i++)
 	for(i = 0; i < NSAMPLES; i++)
 		p->bench(p->size, p->nblocks);
 		p->bench(p->size, p->nblocks);

+ 4 - 4
examples/scheduler/dummy_sched.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010-2012  Université de Bordeaux 1
- * Copyright (C) 2010-2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -28,7 +28,7 @@ typedef struct dummy_sched_data {
 
 
 static void init_dummy_sched(unsigned sched_ctx_id)
 static void init_dummy_sched(unsigned sched_ctx_id)
 {
 {
-	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_WORKER_LIST);
+	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_SCHED_CTX_WORKER_LIST);
 
 
 	struct dummy_sched_data *data = (struct dummy_sched_data*)malloc(sizeof(struct dummy_sched_data));
 	struct dummy_sched_data *data = (struct dummy_sched_data*)malloc(sizeof(struct dummy_sched_data));
 	
 	
@@ -70,7 +70,7 @@ static int push_task_dummy(struct starpu_task *task)
 	   of them would pop for tasks */
 	   of them would pop for tasks */
 	unsigned worker = 0;
 	unsigned worker = 0;
 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 		workers->init_iterator(workers, &it);
 
 
@@ -159,7 +159,7 @@ int main(int argc, char **argv)
 	ntasks /= 100;
 	ntasks /= 100;
 #endif
 #endif
 
 
-	unsigned i;
+	int i;
 	for (i = 0; i < ntasks; i++)
 	for (i = 0; i < ntasks; i++)
 	{
 	{
 		struct starpu_task *task = starpu_task_create();
 		struct starpu_task *task = starpu_task_create();

+ 2 - 2
examples/stencil/Makefile.am

@@ -13,10 +13,10 @@
 #
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 
-AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) @LIBS@
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) 
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
 
 
 if USE_MPI
 if USE_MPI
 LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la

+ 1 - 1
examples/stencil/life.c

@@ -20,7 +20,7 @@
 
 
 void life_update(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int nz, int ldy, int ldz, int iter)
 void life_update(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int nz, int ldy, int ldz, int iter)
 {
 {
-	unsigned x, y, z, num, alive;
+	int x, y, z, num, alive;
 
 
 	for (z = iter; z < nz - iter; z++)
 	for (z = iter; z < nz - iter; z++)
 	{
 	{

+ 5 - 5
examples/stencil/stencil-blocks.c

@@ -121,7 +121,7 @@ struct block_description *get_block_description(int z)
 	return &blocks[z];
 	return &blocks[z];
 }
 }
 
 
-unsigned get_block_mpi_node(int z)
+int get_block_mpi_node(int z)
 {
 {
 	z = (z + nbz)%nbz;
 	z = (z + nbz)%nbz;
 	return blocks[z].mpi_node;
 	return blocks[z].mpi_node;
@@ -277,7 +277,7 @@ void allocate_memory_on_node(int rank)
 	{
 	{
 		struct block_description *block = get_block_description(bz);
 		struct block_description *block = get_block_description(bz);
 
 
-		unsigned node = block->mpi_node;
+		int node = block->mpi_node;
 
 
 		unsigned size_bz = block_sizes_z[bz];
 		unsigned size_bz = block_sizes_z[bz];
 	
 	
@@ -301,7 +301,7 @@ void allocate_memory_on_node(int rank)
 		}
 		}
 
 
 		/* Boundary blocks : Top */
 		/* Boundary blocks : Top */
-		unsigned top_node = block->boundary_blocks[T]->mpi_node;
+		int top_node = block->boundary_blocks[T]->mpi_node;
 		if ((node == rank) || (top_node == rank))
 		if ((node == rank) || (top_node == rank))
 		{
 		{
 			allocate_block_on_node(&block->boundaries_handle[T][0], &block->boundaries[T][0],
 			allocate_block_on_node(&block->boundaries_handle[T][0], &block->boundaries[T][0],
@@ -311,7 +311,7 @@ void allocate_memory_on_node(int rank)
 		} 
 		} 
 
 
 		/* Boundary blocks : Bottom */
 		/* Boundary blocks : Bottom */
-		unsigned bottom_node = block->boundary_blocks[B]->mpi_node;
+		int bottom_node = block->boundary_blocks[B]->mpi_node;
 		if ((node == rank) || (bottom_node == rank))
 		if ((node == rank) || (bottom_node == rank))
 		{
 		{
 			allocate_block_on_node(&block->boundaries_handle[B][0], &block->boundaries[B][0],
 			allocate_block_on_node(&block->boundaries_handle[B][0], &block->boundaries[B][0],
@@ -330,7 +330,7 @@ void check(int rank)
 	{
 	{
 		struct block_description *block = get_block_description(bz);
 		struct block_description *block = get_block_description(bz);
 
 
-		unsigned node = block->mpi_node;
+		int node = block->mpi_node;
 
 
 		/* Main blocks */
 		/* Main blocks */
 		if (node == rank)
 		if (node == rank)

+ 11 - 11
examples/stencil/stencil-tasks.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -40,7 +40,7 @@
  */
  */
 
 
 /* R(z) = R(z+d) = local, just call the save kernel */
 /* R(z) = R(z+d) = local, just call the save kernel */
-static void create_task_save_local(unsigned iter, unsigned z, int dir, unsigned local_rank)
+static void create_task_save_local(unsigned iter, unsigned z, int dir, int local_rank)
 {
 {
 	struct starpu_task *save_task = starpu_task_create();
 	struct starpu_task *save_task = starpu_task_create();
 	struct block_description *descr = get_block_description(z);
 	struct block_description *descr = get_block_description(z);
@@ -81,7 +81,7 @@ static void send_done(void *arg)
 
 
 #ifdef STARPU_USE_MPI
 #ifdef STARPU_USE_MPI
 /* Post MPI send */
 /* Post MPI send */
-static void create_task_save_mpi_send(unsigned iter, unsigned z, int dir, unsigned local_rank)
+static void create_task_save_mpi_send(unsigned iter, unsigned z, int dir, int local_rank)
 {
 {
 	struct block_description *descr = get_block_description(z);
 	struct block_description *descr = get_block_description(z);
 	STARPU_ASSERT(descr->mpi_node == local_rank);
 	STARPU_ASSERT(descr->mpi_node == local_rank);
@@ -108,7 +108,7 @@ static void recv_done(void *arg)
 }
 }
 
 
 /* Post MPI recv */
 /* Post MPI recv */
-static void create_task_save_mpi_recv(unsigned iter, unsigned z, int dir, unsigned local_rank)
+static void create_task_save_mpi_recv(unsigned iter, unsigned z, int dir, int local_rank)
 {
 {
 	struct block_description *descr = get_block_description(z);
 	struct block_description *descr = get_block_description(z);
 	STARPU_ASSERT(descr->mpi_node != local_rank);
 	STARPU_ASSERT(descr->mpi_node != local_rank);
@@ -129,10 +129,10 @@ static void create_task_save_mpi_recv(unsigned iter, unsigned z, int dir, unsign
 /*
 /*
  * Schedule saving boundaries of blocks to communication buffers
  * Schedule saving boundaries of blocks to communication buffers
  */
  */
-void create_task_save(unsigned iter, unsigned z, int dir, unsigned local_rank)
+void create_task_save(unsigned iter, unsigned z, int dir, int local_rank)
 {
 {
-	unsigned node_z = get_block_mpi_node(z);
-	unsigned node_z_and_d = get_block_mpi_node(z+dir);
+	int node_z = get_block_mpi_node(z);
+	int node_z_and_d = get_block_mpi_node(z+dir);
 
 
 #ifdef STARPU_USE_MPI
 #ifdef STARPU_USE_MPI
 	if (node_z == local_rank)
 	if (node_z == local_rank)
@@ -168,7 +168,7 @@ void create_task_save(unsigned iter, unsigned z, int dir, unsigned local_rank)
  * Schedule update computation in computation buffer
  * Schedule update computation in computation buffer
  */
  */
 
 
-void create_task_update(unsigned iter, unsigned z, unsigned local_rank)
+void create_task_update(unsigned iter, unsigned z, int local_rank)
 {
 {
 	STARPU_ASSERT(iter != 0);
 	STARPU_ASSERT(iter != 0);
 
 
@@ -253,8 +253,8 @@ void create_start_task(int z, int dir)
  */
  */
 void create_tasks(int rank)
 void create_tasks(int rank)
 {
 {
-	unsigned iter;
-	unsigned bz;
+	int iter;
+	int bz;
 	int niter = get_niter();
 	int niter = get_niter();
 	int nbz = get_nbz();
 	int nbz = get_nbz();
 
 
@@ -288,7 +288,7 @@ void create_tasks(int rank)
  */
  */
 void wait_end_tasks(int rank)
 void wait_end_tasks(int rank)
 {
 {
-	unsigned bz;
+	int bz;
 	int nbz = get_nbz();
 	int nbz = get_nbz();
 
 
 	for (bz = 0; bz < nbz; bz++)
 	for (bz = 0; bz < nbz; bz++)

+ 4 - 4
examples/stencil/stencil.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010-2012  Université de Bordeaux 1
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -221,7 +221,7 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
 #ifdef STARPU_USE_MPI
 #ifdef STARPU_USE_MPI
-	starpu_mpi_initialize();
+	starpu_mpi_init(NULL, NULL, 0);
 #endif
 #endif
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
@@ -312,8 +312,8 @@ int main(int argc, char **argv)
 #if 1
 #if 1
 		unsigned nzblocks_per_process = (nbz + world_size - 1) / world_size;
 		unsigned nzblocks_per_process = (nbz + world_size - 1) / world_size;
 
 
-		unsigned bz, iter;
-		unsigned last;
+		int iter;
+		unsigned last, bz;
 		for (iter = 0; iter < who_runs_what_len; iter++)
 		for (iter = 0; iter < who_runs_what_len; iter++)
 		{
 		{
 			last = 1;
 			last = 1;

+ 6 - 6
examples/stencil/stencil.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010-2011  Université de Bordeaux 1
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -56,8 +56,8 @@ typedef enum
 struct block_description
 struct block_description
 {
 {
 	/* Which MPI node should process that block ? */
 	/* Which MPI node should process that block ? */
-	unsigned mpi_node;
-	
+	int mpi_node;
+
 	unsigned preferred_worker;
 	unsigned preferred_worker;
 
 
 	unsigned bz;
 	unsigned bz;
@@ -101,7 +101,7 @@ void check(int rank);
 
 
 void display_memory_consumption(int rank);
 void display_memory_consumption(int rank);
 
 
-unsigned get_block_mpi_node(int z);
+int get_block_mpi_node(int z);
 unsigned get_block_size(int z);
 unsigned get_block_size(int z);
 unsigned get_bind_tasks(void);
 unsigned get_bind_tasks(void);
 
 
@@ -111,8 +111,8 @@ unsigned get_ticks(void);
 
 
 unsigned global_workerid(unsigned local_workerid);
 unsigned global_workerid(unsigned local_workerid);
 
 
-void create_task_update(unsigned iter, unsigned z, unsigned local_rank);
-void create_task_save(unsigned iter, unsigned z, int dir, unsigned local_rank);
+void create_task_update(unsigned iter, unsigned z, int local_rank);
+void create_task_save(unsigned iter, unsigned z, int dir, int local_rank);
 
 
 extern int starpu_mpi_initialize(void);
 extern int starpu_mpi_initialize(void);
 extern int starpu_mpi_shutdown(void);
 extern int starpu_mpi_shutdown(void);

+ 1 - 1
examples/tag_example/tag_example2.c

@@ -76,7 +76,7 @@ static void tag_cleanup_grid(unsigned ni, unsigned iter)
 
 
 static int create_task_grid(unsigned iter)
 static int create_task_grid(unsigned iter)
 {
 {
-	int i;
+	unsigned i;
 	int ret;
 	int ret;
 
 
 /*	FPRINTF(stderr, "start iter %d ni %d...\n", iter, ni); */
 /*	FPRINTF(stderr, "start iter %d ni %d...\n", iter, ni); */

+ 1 - 2
include/starpu_cuda.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010-2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -39,7 +39,6 @@ void starpu_cuda_report_error(const char *func, const char *file, int line, cuda
 #define STARPU_CUDA_REPORT_ERROR(status) \
 #define STARPU_CUDA_REPORT_ERROR(status) \
 	starpu_cuda_report_error(__starpu_func__, __FILE__, __LINE__, status)
 	starpu_cuda_report_error(__starpu_func__, __FILE__, __LINE__, status)
 
 
-size_t starpu_cuda_get_global_mem_size(unsigned devid);
 cudaStream_t starpu_cuda_get_local_stream(void);
 cudaStream_t starpu_cuda_get_local_stream(void);
 
 
 const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid);
 const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid);

+ 1 - 1
include/starpu_data.h

@@ -85,7 +85,7 @@ int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node, e
 void starpu_data_release(starpu_data_handle_t handle);
 void starpu_data_release(starpu_data_handle_t handle);
 void starpu_data_release_on_node(starpu_data_handle_t handle, unsigned node);
 void starpu_data_release_on_node(starpu_data_handle_t handle, unsigned node);
 
 
-void starpu_malloc_set_align(size_t);
+void starpu_malloc_set_align(size_t align);
 int starpu_malloc(void **A, size_t dim);
 int starpu_malloc(void **A, size_t dim);
 int starpu_free(void *A);
 int starpu_free(void *A);
 void starpu_memory_display_stats();
 void starpu_memory_display_stats();

+ 6 - 1
include/starpu_data_interfaces.h

@@ -73,10 +73,15 @@ struct starpu_data_copy_methods
 	int (*opencl_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
 	int (*opencl_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
 	int (*opencl_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
 	int (*opencl_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
 #endif
 #endif
+
+	int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 };
 };
 
 
+int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data);
+
 enum starpu_data_interface_id
 enum starpu_data_interface_id
 {
 {
+	STARPU_UNKNOWN_INTERFACE_ID = -1,
 	STARPU_MATRIX_INTERFACE_ID=0,
 	STARPU_MATRIX_INTERFACE_ID=0,
 	STARPU_BLOCK_INTERFACE_ID=1,
 	STARPU_BLOCK_INTERFACE_ID=1,
 	STARPU_VECTOR_INTERFACE_ID=2,
 	STARPU_VECTOR_INTERFACE_ID=2,
@@ -99,7 +104,7 @@ struct starpu_data_interface_ops
 	/* Free data of the interface on a given node. */
 	/* Free data of the interface on a given node. */
 	void (*free_data_on_node)(void *data_interface, unsigned node);
 	void (*free_data_on_node)(void *data_interface, unsigned node);
 	/* ram/cuda/opencl synchronous and asynchronous transfer methods */
 	/* ram/cuda/opencl synchronous and asynchronous transfer methods */
-	struct starpu_data_copy_methods *copy_methods;
+	const struct starpu_data_copy_methods *copy_methods;
 	/* Return the current pointer (if any) for the handle on the given node. */
 	/* Return the current pointer (if any) for the handle on the given node. */
 	void * (*handle_to_pointer)(starpu_data_handle_t handle, unsigned node);
 	void * (*handle_to_pointer)(starpu_data_handle_t handle, unsigned node);
 	/* Return an estimation of the size of data, for performance models */
 	/* Return an estimation of the size of data, for performance models */

+ 2 - 3
include/starpu_opencl.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010-2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -53,7 +53,6 @@ struct starpu_opencl_program
 	cl_program programs[STARPU_MAXOPENCLDEVS];
 	cl_program programs[STARPU_MAXOPENCLDEVS];
 };
 };
 
 
-size_t starpu_opencl_get_global_mem_size(int devid);
 void starpu_opencl_get_context(int devid, cl_context *context);
 void starpu_opencl_get_context(int devid, cl_context *context);
 void starpu_opencl_get_device(int devid, cl_device_id *device);
 void starpu_opencl_get_device(int devid, cl_device_id *device);
 void starpu_opencl_get_queue(int devid, cl_command_queue *queue);
 void starpu_opencl_get_queue(int devid, cl_command_queue *queue);
@@ -108,7 +107,7 @@ cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node, void *
 
 
 cl_int starpu_opencl_copy_opencl_to_opencl(cl_mem src, unsigned src_node, size_t src_offset, cl_mem dst, unsigned dst_node, size_t dst_offset, size_t size, cl_event *event, int *ret);
 cl_int starpu_opencl_copy_opencl_to_opencl(cl_mem src, unsigned src_node, size_t src_offset, cl_mem dst, unsigned dst_node, size_t dst_offset, size_t size, cl_event *event, int *ret);
 
 
-cl_int starpu_opencl_copy_async_sync(uintptr_t src, unsigned src_node, size_t src_offset, uintptr_t dst, unsigned dst_node, size_t dst_offset, size_t size, cl_event *event);
+cl_int starpu_opencl_copy_async_sync(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, cl_event *event);
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }

+ 7 - 1
include/starpu_perfmodel.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  *
  *
@@ -102,6 +102,8 @@ struct starpu_perfmodel_history_entry
 #else
 #else
 	size_t size; /* in bytes */
 	size_t size; /* in bytes */
 #endif
 #endif
+
+	double flops; /* Provided by the application */
 };
 };
 
 
 struct starpu_perfmodel_history_list
 struct starpu_perfmodel_history_list
@@ -212,6 +214,10 @@ void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct star
 void starpu_bus_print_bandwidth(FILE *f);
 void starpu_bus_print_bandwidth(FILE *f);
 void starpu_bus_print_affinity(FILE *f);
 void starpu_bus_print_affinity(FILE *f);
 
 
+/* use bw & latency to compute the velocity of resources*/
+double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev);
+double starpu_get_latency_RAM_CUDA(unsigned cudadev);
+
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif

+ 21 - 25
include/starpu_sched_ctx.h

@@ -24,12 +24,8 @@ extern "C"
 {
 {
 #endif
 #endif
 
 
-#ifdef STARPU_DEVEL
-#  warning rename all objects to start with starpu_sched_ctx
-#endif
-
-//struct starpu_iterator;
-struct starpu_iterator
+//struct starpu_sched_ctx_iterator;
+struct starpu_sched_ctx_iterator
 {
 {
 	int cursor;
 	int cursor;
 };
 };
@@ -42,12 +38,12 @@ struct starpu_sched_ctx_worker_collection
 	void *workerids;
 	void *workerids;
 	/* the number of workers in the collection */
 	/* the number of workers in the collection */
 	unsigned nworkers;
 	unsigned nworkers;
-	/* the type of structure (STARPU_WORKER_LIST,...) */
+	/* the type of structure (STARPU_SCHED_CTX_WORKER_LIST,...) */
 	int type;
 	int type;
 	/* checks if there is another element in collection */
 	/* checks if there is another element in collection */
-	unsigned (*has_next)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_iterator *it);
+	unsigned (*has_next)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	/* return the next element in the collection */
 	/* return the next element in the collection */
-	int (*get_next)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_iterator *it);
+	int (*get_next)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	/* add a new element in the collection */
 	/* add a new element in the collection */
 	int (*add)(struct starpu_sched_ctx_worker_collection *workers, int worker);
 	int (*add)(struct starpu_sched_ctx_worker_collection *workers, int worker);
 	/* remove an element from the collection */
 	/* remove an element from the collection */
@@ -57,26 +53,26 @@ struct starpu_sched_ctx_worker_collection
 	/* free the structure */
 	/* free the structure */
 	void (*deinit)(struct starpu_sched_ctx_worker_collection *workers);
 	void (*deinit)(struct starpu_sched_ctx_worker_collection *workers);
 	/* initialize the cursor if there is one */
 	/* initialize the cursor if there is one */
-	void (*init_iterator)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_iterator *it);
+	void (*init_iterator)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 };
 };
 
 
 /* types of structures the worker collection can implement */
 /* types of structures the worker collection can implement */
-#define STARPU_WORKER_LIST 0
+#define STARPU_SCHED_CTX_WORKER_LIST 0
 
 
-struct starpu_performance_counters
+struct starpu_sched_ctx_performance_counters
 {
 {
 	void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time);
 	void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time);
 	void (*notify_idle_end)(unsigned sched_ctx_id, int worker);
 	void (*notify_idle_end)(unsigned sched_ctx_id, int worker);
 	void (*notify_pushed_task)(unsigned sched_ctx_id, int worker);
 	void (*notify_pushed_task)(unsigned sched_ctx_id, int worker);
-	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, double flops);
+	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, double flops, size_t data_size);
 	void (*notify_post_exec_hook)(unsigned sched_ctx_id, int taskid);
 	void (*notify_post_exec_hook)(unsigned sched_ctx_id, int taskid);
 	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint);
 	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint);
 };
 };
 
 
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
-void starpu_set_perf_counters(unsigned sched_ctx_id, struct starpu_performance_counters *perf_counters);
-void starpu_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops);
-void starpu_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
+void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, struct starpu_sched_ctx_performance_counters *perf_counters);
+void starpu_sched_ctx_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops, size_t data_size);
+void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 
 
 unsigned starpu_sched_ctx_create(const char *policy_name, int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name);
 unsigned starpu_sched_ctx_create(const char *policy_name, int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name);
@@ -102,16 +98,16 @@ void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id);
 struct starpu_sched_ctx_worker_collection* starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id);
 struct starpu_sched_ctx_worker_collection* starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id);
 
 
 #if !defined(_MSC_VER) && !defined(STARPU_SIMGRID)
 #if !defined(_MSC_VER) && !defined(STARPU_SIMGRID)
-pthread_mutex_t* starpu_get_changing_ctx_mutex(unsigned sched_ctx_id);
+pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
 #endif
 #endif
 
 
-void starpu_task_set_context(unsigned *sched_ctx_id);
+void starpu_sched_ctx_set_context(unsigned *sched_ctx_id);
 
 
-unsigned starpu_task_get_context(void);
+unsigned starpu_sched_ctx_get_context(void);
 
 
-void starpu_notify_hypervisor_exists(void);
+void starpu_sched_ctx_notify_hypervisor_exists(void);
 
 
-unsigned starpu_check_if_hypervisor_exists(void);
+unsigned starpu_sched_ctx_check_if_hypervisor_exists(void);
 
 
 unsigned starpu_sched_ctx_get_nworkers(unsigned sched_ctx_id);
 unsigned starpu_sched_ctx_get_nworkers(unsigned sched_ctx_id);
 
 
@@ -121,13 +117,13 @@ unsigned starpu_sched_ctx_contains_worker(int workerid, unsigned sched_ctx_id);
 
 
 unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid);
 unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid);
 
 
-unsigned starpu_is_ctxs_turn(int workerid, unsigned sched_ctx_id);
+unsigned starpu_sched_ctx_is_ctxs_turn(int workerid, unsigned sched_ctx_id);
 
 
-void starpu_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id);
+void starpu_sched_ctx_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id);
 
 
-double starpu_get_max_time_worker_on_ctx(void);
+double starpu_sched_ctx_get_max_time_worker_on_ctx(void);
 
 
-void starpu_stop_task_submission(void);
+void starpu_sched_ctx_stop_task_submission(void);
 
 
 void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor);
 void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor);
 
 

+ 3 - 0
include/starpu_task.h

@@ -321,6 +321,9 @@ int starpu_task_wait_for_all_in_ctx(unsigned sched_ctx_id);
 /* This function waits until there is no more ready task. */
 /* This function waits until there is no more ready task. */
 int starpu_task_wait_for_no_ready(void);
 int starpu_task_wait_for_no_ready(void);
 
 
+int starpu_task_nready(void);
+int starpu_task_nsubmitted(void);
+
 void starpu_codelet_init(struct starpu_codelet *cl);
 void starpu_codelet_init(struct starpu_codelet *cl);
 
 
 void starpu_display_codelet_stats(struct starpu_codelet *cl);
 void starpu_display_codelet_stats(struct starpu_codelet *cl);

+ 2 - 2
include/starpu_task_util.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -45,7 +45,7 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
 #define STARPU_DATA_ARRAY       (1<<11) /* Array of data handles */
 #define STARPU_DATA_ARRAY       (1<<11) /* Array of data handles */
 #define STARPU_TAG       (1<<12) /* Tag */
 #define STARPU_TAG       (1<<12) /* Tag */
 #define STARPU_HYPERVISOR_TAG	(1<<13)	/* Used to tag a task after whose execution we'll execute  a code */
 #define STARPU_HYPERVISOR_TAG	(1<<13)	/* Used to tag a task after whose execution we'll execute  a code */
-#define STARPU_HYPERVISOR_FLOPS	(1<<14)	/* Used to specify the number of flops needed to be executed by a task */
+#define STARPU_FLOPS	(1<<14)	/* Used to specify the number of flops needed to be executed by a task */
 
 
 /* Wrapper to create a task. */
 /* Wrapper to create a task. */
 int starpu_insert_task(struct starpu_codelet *cl, ...);
 int starpu_insert_task(struct starpu_codelet *cl, ...);

+ 4 - 4
include/starpu_util.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010-2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -207,17 +207,17 @@ static __inline int starpu_get_env_number(const char *str)
 	if (strval)
 	if (strval)
 	{
 	{
 		/* the env variable was actually set */
 		/* the env variable was actually set */
-		unsigned val;
+		long int val;
 		char *check;
 		char *check;
 
 
-		val = (int)strtol(strval, &check, 10);
+		val = strtol(strval, &check, 10);
 		if (*check) {
 		if (*check) {
 			fprintf(stderr,"The %s environment variable must contain an integer\n", str);
 			fprintf(stderr,"The %s environment variable must contain an integer\n", str);
 			STARPU_ABORT();
 			STARPU_ABORT();
 		}
 		}
 
 
 		/* fprintf(stderr, "ENV %s WAS %d\n", str, val); */
 		/* fprintf(stderr, "ENV %s WAS %d\n", str, val); */
-		return val;
+		return (int)val;
 	}
 	}
 	else
 	else
 	{
 	{

+ 1 - 1
mpi/examples/Makefile.am

@@ -75,7 +75,7 @@ NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -
 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
 endif
 endif
 
 
-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Werror=implicit
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)

+ 1 - 2
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -72,10 +72,9 @@ void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, in
 	struct timeval start;
 	struct timeval start;
 	struct timeval end;
 	struct timeval end;
 	starpu_data_handle_t **data_handles;
 	starpu_data_handle_t **data_handles;
-	int x, y;
+	unsigned x,y,i,j,k;
 
 
 	/* create all the DAG nodes */
 	/* create all the DAG nodes */
-	unsigned i,j,k;
 
 
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
 	for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));
 	for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));

+ 2 - 2
mpi/examples/matrix_decomposition/mpi_decomposition_params.c

@@ -25,8 +25,8 @@ unsigned nblocks = 16;
 unsigned nbigblocks = 2;
 unsigned nbigblocks = 2;
 unsigned noprio = 0;
 unsigned noprio = 0;
 unsigned display = 0;
 unsigned display = 0;
-unsigned dblockx = -1;
-unsigned dblocky = -1;
+int dblockx = -1;
+int dblocky = -1;
 
 
 void parse_args(int argc, char **argv, int nodes)
 void parse_args(int argc, char **argv, int nodes)
 {
 {

+ 3 - 3
mpi/examples/mpi_lu/plu_example.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
  * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -32,8 +32,8 @@
 static unsigned long size = 4096;
 static unsigned long size = 4096;
 static unsigned nblocks = 16;
 static unsigned nblocks = 16;
 static unsigned check = 0;
 static unsigned check = 0;
-static unsigned p = 1;
-static unsigned q = 1;
+static int p = 1;
+static int q = 1;
 static unsigned display = 0;
 static unsigned display = 0;
 
 
 #ifdef STARPU_HAVE_LIBNUMA
 #ifdef STARPU_HAVE_LIBNUMA

+ 2 - 2
mpi/examples/mpi_lu/pxlu.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -101,7 +101,7 @@ static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int m
 	int mpi_tag_array[world_size];
 	int mpi_tag_array[world_size];
 	starpu_data_handle_t handle_array[world_size];
 	starpu_data_handle_t handle_array[world_size];
 
 
-	unsigned r;
+	int r;
 	for (r = 0; r < world_size; r++)
 	for (r = 0; r < world_size; r++)
 	{
 	{
 		if (rank_mask[r]) {
 		if (rank_mask[r]) {

+ 1 - 1
mpi/src/Makefile.am

@@ -21,7 +21,7 @@ BUILT_SOURCES =
 
 
 CLEANFILES = *.gcno *.gcda *.linkinfo
 CLEANFILES = *.gcno *.gcda *.linkinfo
 
 
-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Werror=implicit
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/mpi/src
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/mpi/src
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)

+ 14 - 15
mpi/src/starpu_mpi.c

@@ -443,12 +443,12 @@ static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 	_STARPU_MPI_DEBUG("Test request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, _starpu_mpi_request_type(req->request_type), req->srcdst);
 	_STARPU_MPI_DEBUG("Test request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, _starpu_mpi_request_type(req->request_type), req->srcdst);
 
 
 	TRACE_MPI_UTESTING_BEGIN(req->srcdst, req->mpi_tag);
 	TRACE_MPI_UTESTING_BEGIN(req->srcdst, req->mpi_tag);
-	
+
 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
 	STARPU_ASSERT(req->ret == MPI_SUCCESS);
 	STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 
 	TRACE_MPI_UTESTING_END(req->srcdst, req->mpi_tag);
 	TRACE_MPI_UTESTING_END(req->srcdst, req->mpi_tag);
-	
+
 	if (*testing_req->flag)
 	if (*testing_req->flag)
 	{
 	{
 		testing_req->ret = req->ret;
 		testing_req->ret = req->ret;
@@ -841,6 +841,17 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		_starpu_mpi_print_thread_level_support(provided, " has been initialized with");
 		_starpu_mpi_print_thread_level_support(provided, " has been initialized with");
 	}
 	}
 
 
+	{
+	     int rank, worldsize;
+	     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	     MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
+	     TRACE_MPI_START(rank, worldsize);
+#ifdef STARPU_USE_FXT
+	     starpu_set_profiling_id(rank);
+#endif //STARPU_USE_FXT
+	}
+
+
 	/* notify the main thread that the progression thread is ready */
 	/* notify the main thread that the progression thread is ready */
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	running = 1;
 	running = 1;
@@ -862,7 +873,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 			_STARPU_MPI_DEBUG("NO MORE REQUESTS TO HANDLE\n");
 			_STARPU_MPI_DEBUG("NO MORE REQUESTS TO HANDLE\n");
 
 
 			TRACE_MPI_SLEEP_BEGIN();
 			TRACE_MPI_SLEEP_BEGIN();
-			
+
 			if (barrier_running)
 			if (barrier_running)
 				/* Tell mpi_barrier */
 				/* Tell mpi_barrier */
 				_STARPU_PTHREAD_COND_SIGNAL(&cond_finished);
 				_STARPU_PTHREAD_COND_SIGNAL(&cond_finished);
@@ -967,13 +978,6 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 	argc_argv->argc = argc;
 	argc_argv->argc = argc;
 	argc_argv->argv = argv;
 	argc_argv->argv = argv;
 
 
-	int rank, worldsize;
-
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
-
-	TRACE_MPI_START(rank,worldsize);
-
 	_STARPU_PTHREAD_CREATE("MPI progress", &progress_thread, NULL, _starpu_mpi_progress_thread_func, argc_argv);
 	_STARPU_PTHREAD_CREATE("MPI progress", &progress_thread, NULL, _starpu_mpi_progress_thread_func, argc_argv);
 
 
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
@@ -981,10 +985,6 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 		_STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
 		_STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
 
-#ifdef STARPU_USE_FXT
-	starpu_set_profiling_id(rank);
-#endif //STARPU_USE_FXT
-
 #ifdef USE_STARPU_ACTIVITY
 #ifdef USE_STARPU_ACTIVITY
 	hookid = starpu_progression_hook_register(progression_hook_func, NULL);
 	hookid = starpu_progression_hook_register(progression_hook_func, NULL);
 	STARPU_ASSERT(hookid >= 0);
 	STARPU_ASSERT(hookid >= 0);
@@ -1053,4 +1053,3 @@ int starpu_mpi_shutdown(void)
 
 
 	return 0;
 	return 0;
 }
 }
-

+ 2 - 2
mpi/src/starpu_mpi_stats.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -70,7 +70,7 @@ void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts)
 
 
 void _starpu_mpi_comm_amounts_display(int node)
 void _starpu_mpi_comm_amounts_display(int node)
 {
 {
-	unsigned dst;
+	int dst;
 	size_t sum = 0;
 	size_t sum = 0;
 
 
 	if (stats_enabled == 0) return;
 	if (stats_enabled == 0) return;

+ 1 - 1
mpi/tests/Makefile.am

@@ -64,7 +64,7 @@ NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -
 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
 endif
 endif
 
 
-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Werror=implicit
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_srcdir)/examples/
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_srcdir)/examples/
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)

+ 3 - 3
mpi/tests/mpi_detached_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -54,8 +54,8 @@ int main(int argc, char **argv)
 
 
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 
 	for (loop = 0; loop < nloops; loop++)
 	for (loop = 0; loop < nloops; loop++)

+ 3 - 3
mpi/tests/mpi_irecv.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -54,8 +54,8 @@ int main(int argc, char **argv)
 
 
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 
 	for (loop = 0; loop < nloops; loop++)
 	for (loop = 0; loop < nloops; loop++)

+ 3 - 3
mpi/tests/mpi_irecv_detached.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -69,8 +69,8 @@ int main(int argc, char **argv)
 
 
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 
 	for (loop = 0; loop < nloops; loop++)
 	for (loop = 0; loop < nloops; loop++)

+ 3 - 3
mpi/tests/mpi_isend.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -54,8 +54,8 @@ int main(int argc, char **argv)
 
 
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 
 	for (loop = 0; loop < nloops; loop++)
 	for (loop = 0; loop < nloops; loop++)

+ 3 - 3
mpi/tests/mpi_isend_detached.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -68,8 +68,8 @@ int main(int argc, char **argv)
 
 
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 
 	for (loop = 0; loop < nloops; loop++)
 	for (loop = 0; loop < nloops; loop++)

+ 3 - 3
mpi/tests/mpi_probe.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -69,8 +69,8 @@ int main(int argc, char **argv)
 
 
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 
 	for (loop = 0; loop < nloops; loop++)
 	for (loop = 0; loop < nloops; loop++)

+ 3 - 3
mpi/tests/mpi_test.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -55,8 +55,8 @@ int main(int argc, char **argv)
 
 
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 
 	for (loop = 0; loop < nloops; loop++)
 	for (loop = 0; loop < nloops; loop++)

+ 3 - 3
mpi/tests/pingpong.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -55,8 +55,8 @@ int main(int argc, char **argv)
 
 
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 
 	for (loop = 0; loop < nloops; loop++)
 	for (loop = 0; loop < nloops; loop++)

+ 8 - 8
mpi/tests/ring.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,7 +24,7 @@
 #  define NITER	2048
 #  define NITER	2048
 #endif
 #endif
 
 
-unsigned token = 42;
+int token = 42;
 starpu_data_handle_t token_handle;
 starpu_data_handle_t token_handle;
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -33,7 +33,7 @@ extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 
 
 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 {
 {
-	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 	(*tokenptr)++;
 	(*tokenptr)++;
 }
 }
 
 
@@ -82,13 +82,13 @@ int main(int argc, char **argv)
 	ret = starpu_mpi_init(NULL, NULL, 0);
 	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
-	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
 
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 
 
-	unsigned last_loop = nloops - 1;
-	unsigned last_rank = size - 1;
+	int last_loop = nloops - 1;
+	int last_rank = size - 1;
 
 
 	for (loop = 0; loop < nloops; loop++)
 	for (loop = 0; loop < nloops; loop++)
 	{
 	{

+ 8 - 8
mpi/tests/ring_async.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,7 +24,7 @@
 #  define NITER	2048
 #  define NITER	2048
 #endif
 #endif
 
 
-unsigned token = 42;
+int token = 42;
 starpu_data_handle_t token_handle;
 starpu_data_handle_t token_handle;
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -33,7 +33,7 @@ extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 
 
 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 {
 {
-	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 	(*tokenptr)++;
 	(*tokenptr)++;
 }
 }
 
 
@@ -82,13 +82,13 @@ int main(int argc, char **argv)
 	ret = starpu_mpi_init(NULL, NULL, 0);
 	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
-	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
 
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 
 
-	unsigned last_loop = nloops - 1;
-	unsigned last_rank = size - 1;
+	int last_loop = nloops - 1;
+	int last_rank = size - 1;
 
 
 	for (loop = 0; loop < nloops; loop++)
 	for (loop = 0; loop < nloops; loop++)
 	{
 	{

+ 8 - 8
mpi/tests/ring_async_implicit.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,7 +24,7 @@
 #  define NITER	2048
 #  define NITER	2048
 #endif
 #endif
 
 
-unsigned token = 42;
+int token = 42;
 starpu_data_handle_t token_handle;
 starpu_data_handle_t token_handle;
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -33,7 +33,7 @@ extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 
 
 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 {
 {
-	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 	(*tokenptr)++;
 	(*tokenptr)++;
 }
 }
 
 
@@ -80,13 +80,13 @@ int main(int argc, char **argv)
 	}
 	}
 
 
 
 
-	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
 
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 
 
-	unsigned last_loop = nloops - 1;
-	unsigned last_rank = size - 1;
+	int last_loop = nloops - 1;
+	int last_rank = size - 1;
 
 
 	for (loop = 0; loop < nloops; loop++)
 	for (loop = 0; loop < nloops; loop++)
 	{
 	{

+ 2 - 2
sched_ctx_hypervisor/examples/Makefile.am

@@ -13,9 +13,9 @@
 #
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 
-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS)
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(top_builddir)/sched_ctx_hypervisor/src/libsched_ctx_hypervisor.la
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(top_builddir)/sched_ctx_hypervisor/src/libsched_ctx_hypervisor.la
-AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/examples -I$(top_builddir)/include -I$(top_srcdir)/sched_ctx_hypervisor/include
+AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/examples -I$(top_builddir)/include -I$(top_srcdir)/sched_ctx_hypervisor/include -I$(top_srcdir)/sched_ctx_hypervisor/examples
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_GLPK_LDFLAGS)
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_GLPK_LDFLAGS)
 
 
 if !NO_BLAS_LIB
 if !NO_BLAS_LIB

+ 3 - 3
sched_ctx_hypervisor/examples/app_driven_test/app_driven_test.c

@@ -48,7 +48,7 @@ int tag = 1;
 void* start_thread(void *arg)
 void* start_thread(void *arg)
 {
 {
 	unsigned sched_ctx = *((unsigned*)arg);
 	unsigned sched_ctx = *((unsigned*)arg);
-	starpu_task_set_context(&sched_ctx);
+	starpu_sched_ctx_set_context(&sched_ctx);
 
 
 	struct starpu_task *task[10];
 	struct starpu_task *task[10];
 	struct params params[10];
 	struct params params[10];
@@ -115,8 +115,8 @@ int main()
 	policy.name = "app_driven";
 	policy.name = "app_driven";
 	void *perf_counters = sched_ctx_hypervisor_init(&policy);
 	void *perf_counters = sched_ctx_hypervisor_init(&policy);
 
 
-	starpu_set_perf_counters(sched_ctx1, (struct starpu_performance_counters*)perf_counters);
-	starpu_set_perf_counters(sched_ctx2, (struct starpu_performance_counters*)perf_counters);
+	starpu_sched_ctx_set_perf_counters(sched_ctx1, (struct starpu_sched_ctx_performance_counters*)perf_counters);
+	starpu_sched_ctx_set_perf_counters(sched_ctx2, (struct starpu_sched_ctx_performance_counters*)perf_counters);
 	sched_ctx_hypervisor_register_ctx(sched_ctx1, 0.0);
 	sched_ctx_hypervisor_register_ctx(sched_ctx1, 0.0);
 	sched_ctx_hypervisor_register_ctx(sched_ctx2, 0.0);
 	sched_ctx_hypervisor_register_ctx(sched_ctx2, 0.0);
 
 

+ 4 - 4
sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c

@@ -97,7 +97,7 @@ void* start_bench(void *val)
 	pthread_setspecific(key, &p->id);
 	pthread_setspecific(key, &p->id);
 
 
 	if(p->ctx != 0)
 	if(p->ctx != 0)
-		starpu_task_set_context(&p->ctx);
+		starpu_sched_ctx_set_context(&p->ctx);
 
 
 	for(i = 0; i < NSAMPLES; i++)
 	for(i = 0; i < NSAMPLES; i++)
 		p->bench(p->mat[i], p->size, p->nblocks);
 		p->bench(p->mat[i], p->size, p->nblocks);
@@ -241,7 +241,7 @@ void construct_contexts(void (*bench)(float*, unsigned, unsigned))
 	struct sched_ctx_hypervisor_policy policy;
 	struct sched_ctx_hypervisor_policy policy;
 	policy.custom = 0;
 	policy.custom = 0;
 	policy.name = "idle";
 	policy.name = "idle";
-	struct starpu_performance_counters *perf_counters = sched_ctx_hypervisor_init(&policy);
+	struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx_hypervisor_init(&policy);
 	int nworkers1 = cpu1 + gpu + gpu1;
 	int nworkers1 = cpu1 + gpu + gpu1;
 	int nworkers2 = cpu2 + gpu + gpu2;
 	int nworkers2 = cpu2 + gpu + gpu2;
 	unsigned n_all_gpus = gpu + gpu1 + gpu2;
 	unsigned n_all_gpus = gpu + gpu1 + gpu2;
@@ -267,7 +267,7 @@ void construct_contexts(void (*bench)(float*, unsigned, unsigned))
 		p1.workers[i] = i;
 		p1.workers[i] = i;
 
 
 	p1.ctx = starpu_sched_ctx_create("heft", p1.workers, nworkers1, "sched_ctx1");
 	p1.ctx = starpu_sched_ctx_create("heft", p1.workers, nworkers1, "sched_ctx1");
-	starpu_set_perf_counters(p1.ctx, perf_counters);
+	starpu_sched_ctx_set_perf_counters(p1.ctx, perf_counters);
 	p2.the_other_ctx = (int)p1.ctx;
 	p2.the_other_ctx = (int)p1.ctx;
 	p1.nworkers = nworkers1;
 	p1.nworkers = nworkers1;
 	sched_ctx_hypervisor_register_ctx(p1.ctx, 0.0);
 	sched_ctx_hypervisor_register_ctx(p1.ctx, 0.0);
@@ -303,7 +303,7 @@ void construct_contexts(void (*bench)(float*, unsigned, unsigned))
 	/* 	p2.workers[k++] = i; */
 	/* 	p2.workers[k++] = i; */
 
 
 	p2.ctx = starpu_sched_ctx_create("heft", p2.workers, 0, "sched_ctx2");
 	p2.ctx = starpu_sched_ctx_create("heft", p2.workers, 0, "sched_ctx2");
-	starpu_set_perf_counters(p2.ctx, perf_counters);
+	starpu_sched_ctx_set_perf_counters(p2.ctx, perf_counters);
 	p1.the_other_ctx = (int)p2.ctx;
 	p1.the_other_ctx = (int)p2.ctx;
 	p2.nworkers = 0;
 	p2.nworkers = 0;
 	sched_ctx_hypervisor_register_ctx(p2.ctx, 0.0);
 	sched_ctx_hypervisor_register_ctx(p2.ctx, 0.0);

+ 1 - 0
sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.h

@@ -30,3 +30,4 @@ void end_contexts(void);
 void start_2benchs(void (*bench)(float *mat, unsigned size, unsigned nblocks));
 void start_2benchs(void (*bench)(float *mat, unsigned size, unsigned nblocks));
 void start_1stbench(void (*bench)(float *mat, unsigned size, unsigned nblocks));
 void start_1stbench(void (*bench)(float *mat, unsigned size, unsigned nblocks));
 void start_2ndbench(void (*bench)(float *mat, unsigned size, unsigned nblocks));
 void start_2ndbench(void (*bench)(float *mat, unsigned size, unsigned nblocks));
+void set_hypervisor_conf(int event, int task_tag);

+ 7 - 1
sched_ctx_hypervisor/include/sched_ctx_hypervisor.h

@@ -127,6 +127,12 @@ struct sched_ctx_hypervisor_wrapper
 	/* number of flops executed since last resizing */
 	/* number of flops executed since last resizing */
 	double elapsed_flops[STARPU_NMAXWORKERS];
 	double elapsed_flops[STARPU_NMAXWORKERS];
 
 
+	/* data quantity executed on each worker in this ctx */
+	size_t elapsed_data[STARPU_NMAXWORKERS];
+
+	/* nr of tasks executed on each worker in this ctx */
+	int elapsed_tasks[STARPU_NMAXWORKERS];
+
 	/* the average speed of workers when they belonged to this context */
 	/* the average speed of workers when they belonged to this context */
 	double ref_velocity[STARPU_NMAXWORKERS];
 	double ref_velocity[STARPU_NMAXWORKERS];
 
 
@@ -168,7 +174,7 @@ struct sched_ctx_hypervisor_policy
 	void (*end_ctx)(unsigned sched_ctx);
 	void (*end_ctx)(unsigned sched_ctx);
 };
 };
 
 
-struct starpu_performance_counters *sched_ctx_hypervisor_init(struct sched_ctx_hypervisor_policy *policy);
+struct starpu_sched_ctx_performance_counters *sched_ctx_hypervisor_init(struct sched_ctx_hypervisor_policy *policy);
 
 
 void sched_ctx_hypervisor_shutdown(void);
 void sched_ctx_hypervisor_shutdown(void);
 
 

+ 2 - 4
sched_ctx_hypervisor/src/Makefile.am

@@ -12,11 +12,9 @@
 #
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 
-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
 LIBS = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 LIBS = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
-
-AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include/starpu/$(STARPU_EFFECTIVE_VERSION)/ -I$(top_builddir)/src/ -I$(top_srcdir)/src/ -I$(top_srcdir)/sched_ctx_hypervisor/include/
-
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src/ -I$(top_builddir)/include -I$(top_srcdir)/sched_ctx_hypervisor/include/ -I$(top_srcdir)/sched_ctx_hypervisor/src
 AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
 AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
 
 
 lib_LTLIBRARIES = libsched_ctx_hypervisor.la
 lib_LTLIBRARIES = libsched_ctx_hypervisor.la

+ 9 - 9
sched_ctx_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c

@@ -32,10 +32,12 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 	int *sched_ctxs = in_sched_ctxs == NULL ? sched_ctx_hypervisor_get_sched_ctxs() : in_sched_ctxs;
 	int *sched_ctxs = in_sched_ctxs == NULL ? sched_ctx_hypervisor_get_sched_ctxs() : in_sched_ctxs;
 	
 	
 	int w,s;
 	int w,s;
-	struct sched_ctx_hypervisor_wrapper* sc_w = NULL;
 
 
+	struct sched_ctx_hypervisor_wrapper* sc_w = NULL;
+	double total_flops = 0.0;
 	for(s = 0; s < ns; s++)
 	for(s = 0; s < ns; s++)
 	{
 	{
+		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
 		for(w = 0; w < nw; w++)
 		for(w = 0; w < nw; w++)
 		{
 		{
 			w_in_s[s][w] = 0.0;
 			w_in_s[s][w] = 0.0;
@@ -44,7 +46,6 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 			draft_flops_on_w[s][w] = 0.0;
 			draft_flops_on_w[s][w] = 0.0;
 			int worker = workers == NULL ? w : workers[w];
 			int worker = workers == NULL ? w : workers[w];
 
 
-			sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
 			velocity[s][w] = _get_velocity_per_worker(sc_w, worker);
 			velocity[s][w] = _get_velocity_per_worker(sc_w, worker);
 			if(velocity[s][w] == -1.0)
 			if(velocity[s][w] == -1.0)
 			{
 			{
@@ -53,21 +54,20 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 				if(velocity[s][w] == -1.0)
 				if(velocity[s][w] == -1.0)
 					velocity[s][w] = sc_w->ref_velocity[worker];
 					velocity[s][w] = sc_w->ref_velocity[worker];
 				if(velocity[s][w] == -1.0)
 				if(velocity[s][w] == -1.0)
-					velocity[s][w] = arch == STARPU_CPU_WORKER ? 5.0 : 150.0;
+					velocity[s][w] = arch == STARPU_CPU_WORKER ? 5.0 : 100.0;
 			}
 			}
 			
 			
-			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
+//			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
 		}
 		}
 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sched_ctxs[s]);
 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sched_ctxs[s]);
 		flops[s] = config->ispeed_ctx_sample/1000000000; /* in gflops */
 		flops[s] = config->ispeed_ctx_sample/1000000000; /* in gflops */
 	}
 	}
-
-
+	
 	/* take the exec time of the slowest ctx 
 	/* take the exec time of the slowest ctx 
 	   as starting point and then try to minimize it
 	   as starting point and then try to minimize it
 	   as increasing it a little for the faster ctxs */
 	   as increasing it a little for the faster ctxs */
 	double tmax = _get_slowest_ctx_exec_time();
 	double tmax = _get_slowest_ctx_exec_time();
-	double smallest_tmax = _get_fastest_ctx_exec_time(); //tmax - 0.5*tmax;
+ 	double smallest_tmax = _get_fastest_ctx_exec_time(); //tmax - 0.5*tmax; 
 //	printf("tmax %lf smallest %lf\n", tmax, smallest_tmax);
 //	printf("tmax %lf smallest %lf\n", tmax, smallest_tmax);
 
 
 	double res = 1.0;
 	double res = 1.0;
@@ -413,8 +413,8 @@ static void ispeed_lp_end_ctx(unsigned sched_ctx)
 {
 {
 	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
 	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
 	int worker;
 	int worker;
-	for(worker = 0; worker < 12; worker++)
-		printf("%d/%d: speed %lf\n", worker, sched_ctx, sc_w->ref_velocity[worker]);
+/* 	for(worker = 0; worker < 12; worker++) */
+/* 		printf("%d/%d: speed %lf\n", worker, sched_ctx, sc_w->ref_velocity[worker]); */
 
 
 	return;
 	return;
 }
 }

+ 1 - 1
sched_ctx_hypervisor/src/hypervisor_policies/ispeed_policy.c

@@ -77,7 +77,7 @@ static int* _get_slowest_workers(unsigned sched_ctx, int *nworkers, enum starpu_
 	int worker;
 	int worker;
 	int considered = 0;
 	int considered = 0;
 
 
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 		workers->init_iterator(workers, &it);
 
 

+ 13 - 6
sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.c

@@ -283,6 +283,12 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 		{
 		{
 			tmp_nw_move[w] = 0;
 			tmp_nw_move[w] = 0;
 			tmp_nw_add[w] = 0;
 			tmp_nw_add[w] = 0;
+			int i;
+			for(i = 0; i < STARPU_NMAXWORKERS; i++)
+			{
+				tmp_workers_move[w][i] = -1;
+				tmp_workers_add[w][i] = -1;
+			}
 		}
 		}
 
 
 		/* find workers that ctx s has to give away */
 		/* find workers that ctx s has to give away */
@@ -363,6 +369,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 				int nw_add = 0;
 				int nw_add = 0;
 
 
 				int w;
 				int w;
+				int j = 0, k = 0;
 				for(w = 0; w < nw; w++)
 				for(w = 0; w < nw; w++)
 				{
 				{
 					enum starpu_archtype arch = STARPU_ANY_WORKER;
 					enum starpu_archtype arch = STARPU_ANY_WORKER;
@@ -375,7 +382,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 					if( nw_needed > 0 && tmp_nw_move[w] > 0)
 					if( nw_needed > 0 && tmp_nw_move[w] > 0)
 					{
 					{
 						nw_move += nw_needed >= tmp_nw_move[w] ? tmp_nw_move[w] : nw_needed;
 						nw_move += nw_needed >= tmp_nw_move[w] ? tmp_nw_move[w] : nw_needed;
-						int i = 0, j = 0;
+						int i = 0;
 						for(i = 0; i < STARPU_NMAXWORKERS; i++)
 						for(i = 0; i < STARPU_NMAXWORKERS; i++)
 						{
 						{
 							if(tmp_workers_move[w][i] != -1)
 							if(tmp_workers_move[w][i] != -1)
@@ -395,14 +402,14 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 					if(diff > 0.3 && tmp_nw_add[w] != 0)
 					if(diff > 0.3 && tmp_nw_add[w] != 0)
 					{
 					{
 						nw_add = tmp_nw_add[w];
 						nw_add = tmp_nw_add[w];
-						int i = 0, j = 0;
+						int i = 0;
 						for(i = 0; i < STARPU_NMAXWORKERS; i++)
 						for(i = 0; i < STARPU_NMAXWORKERS; i++)
 						{
 						{
 							if(tmp_workers_add[w][i] != -1)
 							if(tmp_workers_add[w][i] != -1)
 							{
 							{
-								workers_add[j++] = tmp_workers_add[w][i];
+								workers_add[k++] = tmp_workers_add[w][i];
 								tmp_workers_add[w][i] = -1;
 								tmp_workers_add[w][i] = -1;
-								if(j == nw_add)
+								if(k == nw_add)
 									break;
 									break;
 							}
 							}
 						}
 						}
@@ -413,7 +420,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 				
 				
 				if(nw_move > 0)
 				if(nw_move > 0)
 				{
 				{
-					sched_ctx_hypervisor_move_workers(sched_ctxs[s], sched_ctxs[s2], workers_move, nw_move, 1);
+					sched_ctx_hypervisor_move_workers(sched_ctxs[s], sched_ctxs[s2], workers_move, nw_move, 0);
 					nw_move = 0;
 					nw_move = 0;
 				}
 				}
 
 
@@ -452,7 +459,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 		}
 		}
 
 
 		if(nw_move > 0)
 		if(nw_move > 0)
-			sched_ctx_hypervisor_remove_workers_from_sched_ctx(workers_move, nw_move, sched_ctxs[s], 1);
+			sched_ctx_hypervisor_remove_workers_from_sched_ctx(workers_move, nw_move, sched_ctxs[s], 0);
 	}
 	}
 }
 }
 
 

+ 26 - 19
sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c

@@ -28,7 +28,7 @@ static int _compute_priority(unsigned sched_ctx)
 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
 	int worker;
 	int worker;
 
 
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 		workers->init_iterator(workers, &it);
 
 
@@ -113,7 +113,7 @@ int* _get_first_workers(unsigned sched_ctx, int *nworkers, enum starpu_archtype
 	int worker;
 	int worker;
 	int considered = 0;
 	int considered = 0;
 
 
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 		workers->init_iterator(workers, &it);
 
 
@@ -169,7 +169,6 @@ int* _get_first_workers(unsigned sched_ctx, int *nworkers, enum starpu_archtype
 		}
 		}
 	}
 	}
 
 
-
 	return curr_workers;
 	return curr_workers;
 }
 }
 
 
@@ -181,7 +180,7 @@ unsigned _get_potential_nworkers(struct sched_ctx_hypervisor_policy_config *conf
 	unsigned potential_workers = 0;
 	unsigned potential_workers = 0;
 	int worker;
 	int worker;
 
 
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 		workers->init_iterator(workers, &it);
 	while(workers->has_next(workers, &it))
 	while(workers->has_next(workers, &it))
@@ -304,7 +303,7 @@ static double _get_best_elapsed_flops(struct sched_ctx_hypervisor_wrapper* sc_w,
 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
         int worker;
         int worker;
 
 
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 	if(workers->init_iterator)
                 workers->init_iterator(workers, &it);
                 workers->init_iterator(workers, &it);
 
 
@@ -330,7 +329,7 @@ static double _get_ispeed_sample_for_type_of_worker(struct sched_ctx_hypervisor_
 
 
 	double avg = 0.0;
 	double avg = 0.0;
 	int n = 0;
 	int n = 0;
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 	if(workers->init_iterator)
                 workers->init_iterator(workers, &it);
                 workers->init_iterator(workers, &it);
 
 
@@ -356,7 +355,7 @@ static double _get_ispeed_sample_for_sched_ctx(unsigned sched_ctx)
         
         
 	int worker;
 	int worker;
 	double ispeed_sample = 0.0;
 	double ispeed_sample = 0.0;
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 
 
 	if(workers->init_iterator)
 	if(workers->init_iterator)
                 workers->init_iterator(workers, &it);
                 workers->init_iterator(workers, &it);
@@ -401,7 +400,7 @@ double _get_slowest_ctx_exec_time(void)
 	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
 	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
 	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
 	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
 
 
-	double curr_time = starpu_timing_now();
+/* 	double curr_time = starpu_timing_now(); */
 	double slowest_time = 0.0;
 	double slowest_time = 0.0;
 
 
 	int s;
 	int s;
@@ -410,18 +409,13 @@ double _get_slowest_ctx_exec_time(void)
 	{
 	{
 		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
 		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
 
 
-/*                 double elapsed_time = curr_time - sc_w->start_time; */
-/* 		if(elapsed_time > slowest_time) */
-/* 			slowest_time = elapsed_time; */
-
-//		double elapsed_flops = sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
+//		double elapsed_time  = (curr_time - sc_w->start_time)/1000000;
 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
-                double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
+		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
 		if(elapsed_time > slowest_time)
 		if(elapsed_time > slowest_time)
 			slowest_time = elapsed_time;
 			slowest_time = elapsed_time;
 
 
         }
         }
-//	return slowest_time / 1000000.0;
 	return slowest_time;
 	return slowest_time;
 }
 }
 
 
@@ -431,7 +425,7 @@ double _get_fastest_ctx_exec_time(void)
 	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
 	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
 
 
 	double curr_time = starpu_timing_now();
 	double curr_time = starpu_timing_now();
-	double fastest_time = curr_time;
+ 	double fastest_time = curr_time;
 
 
 	int s;
 	int s;
 	struct sched_ctx_hypervisor_wrapper* sc_w;		
 	struct sched_ctx_hypervisor_wrapper* sc_w;		
@@ -440,13 +434,13 @@ double _get_fastest_ctx_exec_time(void)
 		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
 		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
 
 
 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
-                double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
-
+		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
+		
 		if(elapsed_time < fastest_time)
 		if(elapsed_time < fastest_time)
 			fastest_time = elapsed_time;
 			fastest_time = elapsed_time;
 
 
         }
         }
-//	return fastest_time / 1000000.0;
+
 	return fastest_time;
 	return fastest_time;
 }
 }
 
 
@@ -457,6 +451,8 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
 		return -1.0;
 		return -1.0;
 
 
         double elapsed_flops = sc_w->elapsed_flops[worker] / 1000000000.0; /*in gflops */
         double elapsed_flops = sc_w->elapsed_flops[worker] / 1000000000.0; /*in gflops */
+	size_t elapsed_data_used = sc_w->elapsed_data[worker];
+	int elapsed_tasks = sc_w->elapsed_tasks[worker];
 	struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
 	struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
 	double sample = config->ispeed_w_sample[worker] / 1000000000.0; /*in gflops */
 	double sample = config->ispeed_w_sample[worker] / 1000000000.0; /*in gflops */
 
 
@@ -479,6 +475,17 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
         {
         {
                 double curr_time = starpu_timing_now();
                 double curr_time = starpu_timing_now();
                 double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
                 double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
+ 		enum starpu_archtype arch = starpu_worker_get_type(worker);
+		if(arch == STARPU_CUDA_WORKER)
+		{
+			double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker);
+			elapsed_time +=  (elapsed_data_used / transfer_velocity) / 1000000 ;
+			double latency = starpu_get_latency_RAM_CUDA(worker);
+//			printf("%d/%d: latency %lf elapsed_time before %lf ntasks %d\n", worker, sc_w->sched_ctx, latency, elapsed_time, elapsed_tasks);
+			elapsed_time += (elapsed_tasks * latency)/1000000;
+//			printf("elapsed time after %lf \n", elapsed_time);
+		}
+			
                 double vel  = (elapsed_flops/elapsed_time);/* in Gflops/s */
                 double vel  = (elapsed_flops/elapsed_time);/* in Gflops/s */
 		sc_w->ref_velocity[worker] = sc_w->ref_velocity[worker] > 0.0 ? (sc_w->ref_velocity[worker] + vel) / 2 : vel; 
 		sc_w->ref_velocity[worker] = sc_w->ref_velocity[worker] > 0.0 ? (sc_w->ref_velocity[worker] + vel) / 2 : vel; 
                 return vel;
                 return vel;

+ 25 - 22
sched_ctx_hypervisor/src/sched_ctx_hypervisor.c

@@ -19,11 +19,11 @@
 #include <starpu_config.h>
 #include <starpu_config.h>
 
 
 unsigned imposed_resize = 0;
 unsigned imposed_resize = 0;
-struct starpu_performance_counters* perf_counters = NULL;
+struct starpu_sched_ctx_performance_counters* perf_counters = NULL;
 
 
 static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time);
 static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time);
 static void notify_pushed_task(unsigned sched_ctx, int worker);
 static void notify_pushed_task(unsigned sched_ctx, int worker);
-static void notify_poped_task(unsigned sched_ctx, int worker, double flops);
+static void notify_poped_task(unsigned sched_ctx, int worker, double flops, size_t data_size);
 static void notify_post_exec_hook(unsigned sched_ctx, int taskid);
 static void notify_post_exec_hook(unsigned sched_ctx, int taskid);
 static void notify_idle_end(unsigned sched_ctx, int  worker);
 static void notify_idle_end(unsigned sched_ctx, int  worker);
 static void notify_submitted_job(struct starpu_task *task, unsigned footprint);
 static void notify_submitted_job(struct starpu_task *task, unsigned footprint);
@@ -125,7 +125,7 @@ static struct sched_ctx_hypervisor_policy *_select_hypervisor_policy(struct sche
 
 
 
 
 /* initializez the performance counters that starpu will use to retrive hints for resizing */
 /* initializez the performance counters that starpu will use to retrive hints for resizing */
-struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_hypervisor_policy *hypervisor_policy)
+struct starpu_sched_ctx_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_hypervisor_policy *hypervisor_policy)
 {
 {
 	hypervisor.min_tasks = 0;
 	hypervisor.min_tasks = 0;
 	hypervisor.nsched_ctxs = 0;
 	hypervisor.nsched_ctxs = 0;
@@ -158,6 +158,8 @@ struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_h
 			hypervisor.sched_ctx_w[i].pushed_tasks[j] = 0;
 			hypervisor.sched_ctx_w[i].pushed_tasks[j] = 0;
 			hypervisor.sched_ctx_w[i].poped_tasks[j] = 0;
 			hypervisor.sched_ctx_w[i].poped_tasks[j] = 0;
 			hypervisor.sched_ctx_w[i].elapsed_flops[j] = 0.0;
 			hypervisor.sched_ctx_w[i].elapsed_flops[j] = 0.0;
+			hypervisor.sched_ctx_w[i].elapsed_data[j] = 0;
+			hypervisor.sched_ctx_w[i].elapsed_tasks[j] = 0;
 			hypervisor.sched_ctx_w[i].total_elapsed_flops[j] = 0.0;
 			hypervisor.sched_ctx_w[i].total_elapsed_flops[j] = 0.0;
 			hypervisor.sched_ctx_w[i].worker_to_be_removed[j] = 0;
 			hypervisor.sched_ctx_w[i].worker_to_be_removed[j] = 0;
 			hypervisor.sched_ctx_w[i].ref_velocity[j] = -1.0;
 			hypervisor.sched_ctx_w[i].ref_velocity[j] = -1.0;
@@ -167,7 +169,7 @@ struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_h
 	struct sched_ctx_hypervisor_policy *selected_hypervisor_policy = _select_hypervisor_policy(hypervisor_policy);
 	struct sched_ctx_hypervisor_policy *selected_hypervisor_policy = _select_hypervisor_policy(hypervisor_policy);
 	_load_hypervisor_policy(selected_hypervisor_policy);
 	_load_hypervisor_policy(selected_hypervisor_policy);
 
 
-	perf_counters = (struct starpu_performance_counters*)malloc(sizeof(struct starpu_performance_counters));
+	perf_counters = (struct starpu_sched_ctx_performance_counters*)malloc(sizeof(struct starpu_sched_ctx_performance_counters));
 	perf_counters->notify_idle_cycle = notify_idle_cycle;
 	perf_counters->notify_idle_cycle = notify_idle_cycle;
 	perf_counters->notify_pushed_task = notify_pushed_task;
 	perf_counters->notify_pushed_task = notify_pushed_task;
 	perf_counters->notify_poped_task = notify_poped_task;
 	perf_counters->notify_poped_task = notify_poped_task;
@@ -175,7 +177,7 @@ struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_h
 	perf_counters->notify_idle_end = notify_idle_end;
 	perf_counters->notify_idle_end = notify_idle_end;
 	perf_counters->notify_submitted_job = notify_submitted_job;
 	perf_counters->notify_submitted_job = notify_submitted_job;
 
 
-	starpu_notify_hypervisor_exists();
+	starpu_sched_ctx_notify_hypervisor_exists();
 
 
 	return perf_counters;
 	return perf_counters;
 }
 }
@@ -346,7 +348,7 @@ int sched_ctx_hypervisor_get_nworkers_ctx(unsigned sched_ctx, enum starpu_archty
 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
 	int worker;
 	int worker;
 
 
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 		workers->init_iterator(workers, &it);
 
 
@@ -364,7 +366,14 @@ static void _set_elapsed_flops_per_sched_ctx(unsigned sched_ctx, double val)
 {
 {
 	int i;
 	int i;
 	for(i = 0; i < STARPU_NMAXWORKERS; i++)
 	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	{
 		hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[i] = val;
 		hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[i] = val;
+		if(val == 0)
+		{
+			hypervisor.sched_ctx_w[sched_ctx].elapsed_data[i] = 0;
+			hypervisor.sched_ctx_w[sched_ctx].elapsed_tasks[i] = 0;
+		}
+	}
 }
 }
 
 
 double sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(struct sched_ctx_hypervisor_wrapper* sc_w)
 double sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(struct sched_ctx_hypervisor_wrapper* sc_w)
@@ -396,7 +405,7 @@ void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sche
 	sender_sc_w->start_time = start_time;
 	sender_sc_w->start_time = start_time;
 	sender_sc_w->remaining_flops = sender_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sender_sc_w);
 	sender_sc_w->remaining_flops = sender_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sender_sc_w);
 	_set_elapsed_flops_per_sched_ctx(sender_sched_ctx, 0.0);
 	_set_elapsed_flops_per_sched_ctx(sender_sched_ctx, 0.0);
-	
+
 	receiver_sc_w->start_time = start_time;
 	receiver_sc_w->start_time = start_time;
 	receiver_sc_w->remaining_flops = receiver_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(receiver_sc_w);
 	receiver_sc_w->remaining_flops = receiver_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(receiver_sc_w);
 	_set_elapsed_flops_per_sched_ctx(receiver_sched_ctx, 0.0);
 	_set_elapsed_flops_per_sched_ctx(receiver_sched_ctx, 0.0);
@@ -410,19 +419,11 @@ void sched_ctx_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned recei
 	{
 	{
 		_print_current_time();
 		_print_current_time();
 		int j;
 		int j;
-		printf("resize ctx %d with", sender_sched_ctx);
+		printf("resize ctx %d with %d workers", sender_sched_ctx, nworkers_to_move);
 		for(j = 0; j < nworkers_to_move; j++)
 		for(j = 0; j < nworkers_to_move; j++)
 			printf(" %d", workers_to_move[j]);
 			printf(" %d", workers_to_move[j]);
 		printf("\n");
 		printf("\n");
 
 
-/* 		int *cpus = (int*) malloc(nworkers_to_move * sizeof(int)); */
-/* 		int ncpus; */
-
-/* 		_get_cpus(workers_to_move, nworkers_to_move, cpus, &ncpus); */
-
-/* //		if(ncpus != 0) */
-/* //			starpu_sched_ctx_remove_workers(cpus, ncpus, sender_sched_ctx); */
-
 		starpu_sched_ctx_add_workers(workers_to_move, nworkers_to_move, receiver_sched_ctx);
 		starpu_sched_ctx_add_workers(workers_to_move, nworkers_to_move, receiver_sched_ctx);
 
 
 		if(now)
 		if(now)
@@ -622,11 +623,11 @@ static unsigned _ack_resize_completed(unsigned sched_ctx, int worker)
 			   whatever the application says */
 			   whatever the application says */
 			if(!((hypervisor.resize[sender_sched_ctx] == 0 || hypervisor.resize[receiver_sched_ctx] == 0) && imposed_resize))
 			if(!((hypervisor.resize[sender_sched_ctx] == 0 || hypervisor.resize[receiver_sched_ctx] == 0) && imposed_resize))
 			{
 			{
-				int j;
-				printf("remove after ack from ctx %d:", sender_sched_ctx);
-				for(j = 0; j < nmoved_workers; j++)
-					printf(" %d", moved_workers[j]);
-				printf("\n");
+/* 				int j; */
+/* 				printf("remove after ack from ctx %d:", sender_sched_ctx); */
+/* 				for(j = 0; j < nmoved_workers; j++) */
+/* 					printf(" %d", moved_workers[j]); */
+/* 				printf("\n"); */
 
 
 				starpu_sched_ctx_remove_workers(moved_workers, nmoved_workers, sender_sched_ctx);
 				starpu_sched_ctx_remove_workers(moved_workers, nmoved_workers, sender_sched_ctx);
 
 
@@ -715,10 +716,12 @@ static void notify_pushed_task(unsigned sched_ctx, int worker)
 }
 }
 
 
 /* notifies the hypervisor that a task was poped from the queue of the worker */
 /* notifies the hypervisor that a task was poped from the queue of the worker */
-static void notify_poped_task(unsigned sched_ctx, int worker, double elapsed_flops)
+static void notify_poped_task(unsigned sched_ctx, int worker, double elapsed_flops, size_t data_size)
 {
 {
 	hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker]++;
 	hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker]++;
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[worker] += elapsed_flops;
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[worker] += elapsed_flops;
+	hypervisor.sched_ctx_w[sched_ctx].elapsed_data[worker] += data_size ;
+	hypervisor.sched_ctx_w[sched_ctx].elapsed_tasks[worker]++ ;
 	hypervisor.sched_ctx_w[sched_ctx].total_elapsed_flops[worker] += elapsed_flops;
 	hypervisor.sched_ctx_w[sched_ctx].total_elapsed_flops[worker] += elapsed_flops;
 	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= elapsed_flops; //sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]);
 	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= elapsed_flops; //sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]);
 
 

+ 1 - 1
socl/examples/Makefile.am

@@ -14,7 +14,7 @@
 #
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 
-AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS)
 LIBS = $(top_builddir)/socl/src/libsocl-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 LIBS = $(top_builddir)/socl/src/libsocl-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
 
 

+ 7 - 4
socl/src/cl_enqueuendrangekernel.c

@@ -164,9 +164,6 @@ soclEnqueueNDRangeKernel(cl_command_queue cq,
       cl_event beforeEvent, afterEvent, totalEvent;
       cl_event beforeEvent, afterEvent, totalEvent;
 
 
       totalEvent = event_create();
       totalEvent = event_create();
-      totalEvent->prof_start = _socl_nanotime();
-      totalEvent->prof_submit = totalEvent->prof_start;
-      totalEvent->prof_queued = totalEvent->prof_start;
       gc_entity_store(&totalEvent->cq, cq);
       gc_entity_store(&totalEvent->cq, cq);
 
 
       command_marker cmd = command_marker_create();
       command_marker cmd = command_marker_create();
@@ -197,7 +194,7 @@ soclEnqueueNDRangeKernel(cl_command_queue cq,
          /* Store perf */
          /* Store perf */
          cl_ulong start,end;
          cl_ulong start,end;
          soclGetEventProfilingInfo(beforeEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &start, NULL);
          soclGetEventProfilingInfo(beforeEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &start, NULL);
-         soclGetEventProfilingInfo(afterEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &end, NULL);
+         soclGetEventProfilingInfo(afterEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
          soclReleaseEvent(afterEvent);
          soclReleaseEvent(afterEvent);
 
 
          kernel->split_perfs[iter] = end-start;
          kernel->split_perfs[iter] = end-start;
@@ -205,6 +202,12 @@ soclEnqueueNDRangeKernel(cl_command_queue cq,
          pthread_mutex_unlock(&kernel->split_lock);
          pthread_mutex_unlock(&kernel->split_lock);
 
 
          event_complete(totalEvent);
          event_complete(totalEvent);
+
+         totalEvent->prof_start = start;
+         totalEvent->prof_submit = start;
+         totalEvent->prof_queued = start;
+         totalEvent->prof_end = end;
+
          RETURN_EVENT(totalEvent,event);
          RETURN_EVENT(totalEvent,event);
       } else {
       } else {
          soclReleaseEvent(totalEvent);
          soclReleaseEvent(totalEvent);

+ 1 - 1
src/Makefile.am

@@ -49,7 +49,7 @@ endif STARPU_HAVE_WINDOWS
 
 
 lib_LTLIBRARIES = libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 lib_LTLIBRARIES = libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 
 
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CPPFLAGS = -I$(top_srcdir)/include/ -DBUILDING_STARPU -Werror=implicit
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CPPFLAGS = -I$(top_srcdir)/include/ -DBUILDING_STARPU
 
 
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CFLAGS = $(GLOBAL_AM_CFLAGS) $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS)
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CFLAGS = $(GLOBAL_AM_CFLAGS) $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS)
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = -lm $(HWLOC_LIBS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LIBS) $(STARPU_GLPK_LDFLAGS)
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = -lm $(HWLOC_LIBS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LIBS) $(STARPU_GLPK_LDFLAGS)

+ 10 - 2
src/core/jobs.c

@@ -144,6 +144,15 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
 
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 
 
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	int workerid = starpu_worker_get_id();
+	int i;
+	size_t data_size = 0;
+	for(i = 0; i < STARPU_NMAXBUFS; i++)
+		if(task->handles[i] != NULL)
+			data_size += _starpu_data_get_size(task->handles[i]);
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
+
 	/* We release handle reference count */
 	/* We release handle reference count */
 	if (task->cl)
 	if (task->cl)
 	{
 	{
@@ -210,8 +219,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 	{
 	{
 		_starpu_sched_post_exec_hook(task);
 		_starpu_sched_post_exec_hook(task);
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
-		int workerid = starpu_worker_get_id();
-		starpu_call_poped_task_cb(workerid, task->sched_ctx, task->flops);
+		starpu_sched_ctx_call_poped_task_cb(workerid, task->sched_ctx, task->flops, data_size);
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 	}
 	}
 
 

+ 14 - 4
src/core/perfmodel/perfmodel_bus.c

@@ -1344,6 +1344,16 @@ static void write_bus_bandwidth_file_content(void)
 }
 }
 #endif /* STARPU_SIMGRID */
 #endif /* STARPU_SIMGRID */
 
 
+double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev)
+{
+	return bandwidth_matrix[0][cudadev+1];
+}
+
+double starpu_get_latency_RAM_CUDA(unsigned cudadev)
+{
+	return latency_matrix[0][cudadev+1];
+}
+
 void starpu_bus_print_bandwidth(FILE *f)
 void starpu_bus_print_bandwidth(FILE *f)
 {
 {
 	unsigned src, dst, maxnode;
 	unsigned src, dst, maxnode;
@@ -1397,14 +1407,14 @@ void starpu_bus_print_bandwidth(FILE *f)
 	{
 	{
 		struct dev_timing *timing;
 		struct dev_timing *timing;
 		struct _starpu_machine_config *config = _starpu_get_machine_config();
 		struct _starpu_machine_config *config = _starpu_get_machine_config();
-		int ncpus = _starpu_topology_get_nhwcpu(config);
-		int cpu;
+		unsigned config_ncpus = _starpu_topology_get_nhwcpu(config);
+		unsigned cpu;
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		if (src <= ncuda)
 		if (src <= ncuda)
 		{
 		{
 			fprintf(f, "CUDA %d\t", src-1);
 			fprintf(f, "CUDA %d\t", src-1);
-			for (cpu = 0; cpu < ncpus; cpu++)
+			for (cpu = 0; cpu < config_ncpus; cpu++)
 			{
 			{
 				timing = &cudadev_timing_per_cpu[src*STARPU_MAXCPUS+cpu];
 				timing = &cudadev_timing_per_cpu[src*STARPU_MAXCPUS+cpu];
 				if (timing->timing_htod)
 				if (timing->timing_htod)
@@ -1420,7 +1430,7 @@ void starpu_bus_print_bandwidth(FILE *f)
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 		{
 		{
 			fprintf(f, "OpenCL%d\t", src-ncuda-1);
 			fprintf(f, "OpenCL%d\t", src-ncuda-1);
-			for (cpu = 0; cpu < ncpus; cpu++)
+			for (cpu = 0; cpu < config_ncpus; cpu++)
 			{
 			{
 				timing = &opencldev_timing_per_cpu[(src-ncuda)*STARPU_MAXCPUS+cpu];
 				timing = &opencldev_timing_per_cpu[(src-ncuda)*STARPU_MAXCPUS+cpu];
 				if (timing->timing_htod)
 				if (timing->timing_htod)

+ 31 - 14
src/core/perfmodel/perfmodel_history.c

@@ -180,7 +180,7 @@ static void scan_reg_model(FILE *f, struct starpu_perfmodel_regression_model *re
 
 
 static void dump_history_entry(FILE *f, struct starpu_perfmodel_history_entry *entry)
 static void dump_history_entry(FILE *f, struct starpu_perfmodel_history_entry *entry)
 {
 {
-	fprintf(f, "%08x\t%-15lu\t%-15le\t%-15le\t%-15le\t%-15le\t%u\n", entry->footprint, (unsigned long) entry->size, entry->mean, entry->deviation, entry->sum, entry->sum2, entry->nsample);
+	fprintf(f, "%08x\t%-15lu\t%-15le\t%-15le\t%-15le\t%-15le\t%-15le\t%u\n", entry->footprint, (unsigned long) entry->size, entry->flops, entry->mean, entry->deviation, entry->sum, entry->sum2, entry->nsample);
 }
 }
 
 
 static void scan_history_entry(FILE *f, struct starpu_perfmodel_history_entry *entry)
 static void scan_history_entry(FILE *f, struct starpu_perfmodel_history_entry *entry)
@@ -192,28 +192,36 @@ static void scan_history_entry(FILE *f, struct starpu_perfmodel_history_entry *e
 	/* In case entry is NULL, we just drop these values */
 	/* In case entry is NULL, we just drop these values */
 	unsigned nsample;
 	unsigned nsample;
 	uint32_t footprint;
 	uint32_t footprint;
-#ifdef STARPU_HAVE_WINDOWS
-	unsigned size; /* in bytes */
-#else
-	size_t size; /* in bytes */
-#endif
+	unsigned long size; /* in bytes */
+	double flops;
 	double mean;
 	double mean;
 	double deviation;
 	double deviation;
 	double sum;
 	double sum;
 	double sum2;
 	double sum2;
 
 
+	char line[256];
+	char *ret;
+
+	ret = fgets(line, sizeof(line), f);
+	STARPU_ASSERT(ret);
+	STARPU_ASSERT(strchr(line, '\n'));
+
 	/* Read the values from the file */
 	/* Read the values from the file */
-	res = fscanf(f, "%x\t%"
-#ifndef STARPU_HAVE_WINDOWS
-	"z"
-#endif
-	"u\t%le\t%le\t%le\t%le\t%u\n", &footprint, &size, &mean, &deviation, &sum, &sum2, &nsample);
-	STARPU_ASSERT_MSG(res == 7, "Incorrect performance model file");
+	res = sscanf(line, "%x\t%lu\t%le\t%le\t%le\t%le\t%le\t%u", &footprint, &size, &flops, &mean, &deviation, &sum, &sum2, &nsample);
+
+	if (res != 8)
+	{
+		flops = 0.;
+		/* Read the values from the file */
+		res = sscanf(line, "%x\t%lu\t%le\t%le\t%le\t%le\t%u", &footprint, &size, &mean, &deviation, &sum, &sum2, &nsample);
+		STARPU_ASSERT_MSG(res == 7, "Incorrect performance model file");
+	}
 
 
 	if (entry)
 	if (entry)
 	{
 	{
 		entry->footprint = footprint;
 		entry->footprint = footprint;
 		entry->size = size;
 		entry->size = size;
+		entry->flops = flops;
 		entry->mean = mean;
 		entry->mean = mean;
 		entry->deviation = deviation;
 		entry->deviation = deviation;
 		entry->sum = sum;
 		entry->sum = sum;
@@ -393,7 +401,7 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, un
 	/* Dump the history into the model file in case it is necessary */
 	/* Dump the history into the model file in case it is necessary */
 	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
 	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
 	{
 	{
-		fprintf(f, "# hash\t\tsize\t\tmean\t\tdev\t\tsum\t\tsum2\t\tn\n");
+		fprintf(f, "# hash\t\tsize\t\tflops\t\tmean\t\tdev\t\tsum\t\tsum2\t\tn\n");
 		ptr = per_arch_model->list;
 		ptr = per_arch_model->list;
 		while (ptr)
 		while (ptr)
 		{
 		{
@@ -956,7 +964,7 @@ int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *mo
 			char *symbol2 = strdup(symbol);
 			char *symbol2 = strdup(symbol);
 			symbol2[dot-symbol] = '\0';
 			symbol2[dot-symbol] = '\0';
 			int ret;
 			int ret;
-			fprintf(stderr,"note: loading history from %s instead of %s\n", symbol2, symbol);
+			_STARPU_DISP("note: loading history from %s instead of %s\n", symbol2, symbol);
 			ret = starpu_perfmodel_load_symbol(symbol2,model);
 			ret = starpu_perfmodel_load_symbol(symbol2,model);
 			free(symbol2);
 			free(symbol2);
 			return ret;
 			return ret;
@@ -1152,6 +1160,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 				entry->sum2 = measured*measured;
 				entry->sum2 = measured*measured;
 
 
 				entry->size = _starpu_job_get_data_size(model, arch, nimpl, j);
 				entry->size = _starpu_job_get_data_size(model, arch, nimpl, j);
+				entry->flops = j->task->flops;
 
 
 				entry->footprint = key;
 				entry->footprint = key;
 				entry->nsample = 1;
 				entry->nsample = 1;
@@ -1168,6 +1177,14 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 				unsigned n = entry->nsample;
 				unsigned n = entry->nsample;
 				entry->mean = entry->sum / n;
 				entry->mean = entry->sum / n;
 				entry->deviation = sqrt((entry->sum2 - (entry->sum*entry->sum)/n)/n);
 				entry->deviation = sqrt((entry->sum2 - (entry->sum*entry->sum)/n)/n);
+				if (j->task->flops != 0.)
+				{
+					if (entry->flops == 0.)
+						entry->flops = j->task->flops;
+					else if (entry->flops != j->task->flops)
+						/* Incoherent flops! forget about trying to record flops */
+						entry->flops = NAN;
+				}
 			}
 			}
 
 
 			STARPU_ASSERT(entry);
 			STARPU_ASSERT(entry);

+ 4 - 4
src/core/perfmodel/perfmodel_print.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011, 2013  Université de Bordeaux 1
  * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  *
  *
@@ -38,8 +38,8 @@ void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per
 			if (!parameter)
 			if (!parameter)
 			{
 			{
 				/* There isn't a parameter that is explicitely requested, so we display all parameters */
 				/* There isn't a parameter that is explicitely requested, so we display all parameters */
-				printf("%08x\t%-15lu\t%-15le\t%-15le\t%u\n", entry->footprint,
-					(unsigned long) entry->size, entry->mean, entry->deviation, entry->nsample);
+				printf("%08x\t%-15lu\t%-15le\t%-15le\t%-15le\t%u\n", entry->footprint,
+					(unsigned long) entry->size, entry->flops, entry->mean, entry->deviation, entry->nsample);
 			}
 			}
 			else
 			else
 			{
 			{
@@ -230,7 +230,7 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 		nmatched = sscanf(arch, "cuda_%d", &gpuid);
 		nmatched = sscanf(arch, "cuda_%d", &gpuid);
 		if (nmatched == 1)
 		if (nmatched == 1)
 		{
 		{
-			unsigned archid = STARPU_CUDA_DEFAULT+ gpuid;
+			int archid = STARPU_CUDA_DEFAULT+ gpuid;
 			unsigned implid;
 			unsigned implid;
 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
 				starpu_perfmodel_print(model, (enum starpu_perf_archtype) archid, implid, parameter, footprint, output);
 				starpu_perfmodel_print(model, (enum starpu_perf_archtype) archid, implid, parameter, footprint, output);

+ 35 - 30
src/core/sched_ctx.c

@@ -40,9 +40,9 @@ static void _starpu_worker_gets_into_ctx(unsigned sched_ctx_id, struct _starpu_w
 	unsigned worker_sched_ctx_id = _starpu_worker_get_sched_ctx_id(worker, sched_ctx_id);
 	unsigned worker_sched_ctx_id = _starpu_worker_get_sched_ctx_id(worker, sched_ctx_id);
 	/* the worker was planning to go away in another ctx but finally he changed his mind & 
 	/* the worker was planning to go away in another ctx but finally he changed his mind & 
 	   he's staying */
 	   he's staying */
-	if(worker_sched_ctx_id  == STARPU_NMAX_SCHED_CTXS)
+	if (worker_sched_ctx_id  == STARPU_NMAX_SCHED_CTXS)
 	{
 	{
-		unsigned worker_sched_ctx_id = _starpu_worker_get_first_free_sched_ctx(worker);
+		worker_sched_ctx_id = _starpu_worker_get_first_free_sched_ctx(worker);
 		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 		/* add context to worker */
 		/* add context to worker */
 		worker->sched_ctx[worker_sched_ctx_id] = sched_ctx;
 		worker->sched_ctx[worker_sched_ctx_id] = sched_ctx;
@@ -124,7 +124,7 @@ static void _starpu_update_workers_without_ctx(int *workerids, int nworkers, int
 	return;
 	return;
 }
 }
 
 
-void starpu_stop_task_submission()
+void starpu_sched_ctx_stop_task_submission()
 {
 {
 	_starpu_exclude_task_from_dag(&stop_submission_task);
 	_starpu_exclude_task_from_dag(&stop_submission_task);
 	_starpu_task_submit_internally(&stop_submission_task);
 	_starpu_task_submit_internally(&stop_submission_task);
@@ -442,7 +442,7 @@ unsigned starpu_sched_ctx_create(const char *policy_name, int *workerids,
 }
 }
 
 
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
-void starpu_set_perf_counters(unsigned sched_ctx_id, struct starpu_performance_counters *perf_counters)
+void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, struct starpu_sched_ctx_performance_counters *perf_counters)
 {
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	sched_ctx->perf_counters = perf_counters;
 	sched_ctx->perf_counters = perf_counters;
@@ -716,6 +716,8 @@ void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 {
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	int finished = _starpu_barrier_counter_decrement_until_empty_counter(&sched_ctx->tasks_barrier);
 	int finished = _starpu_barrier_counter_decrement_until_empty_counter(&sched_ctx->tasks_barrier);
+/*when finished decrementing the tasks if the user signaled he will not submit tasks anymore
+  we can move all its workers to the inheritor context */
 	if(finished && sched_ctx->inheritor != STARPU_NMAX_SCHED_CTXS)
 	if(finished && sched_ctx->inheritor != STARPU_NMAX_SCHED_CTXS)
 	{
 	{
 		_STARPU_PTHREAD_MUTEX_LOCK(&finished_submit_mutex);
 		_STARPU_PTHREAD_MUTEX_LOCK(&finished_submit_mutex);
@@ -723,17 +725,20 @@ void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 		{
 		{
 			_STARPU_PTHREAD_MUTEX_UNLOCK(&finished_submit_mutex);
 			_STARPU_PTHREAD_MUTEX_UNLOCK(&finished_submit_mutex);
 
 
-			_STARPU_PTHREAD_MUTEX_LOCK(&changing_ctx_mutex[sched_ctx->id]);
-			int *workerids = NULL;
-			unsigned nworkers = _get_workers_list(sched_ctx, &workerids);
-
-			if(nworkers > 0)
+			/* take care the context is not deleted or changed at the same time */
+			_STARPU_PTHREAD_MUTEX_LOCK(&changing_ctx_mutex[sched_ctx_id]);
+			if(sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
 			{
 			{
-				starpu_sched_ctx_add_workers(workerids, nworkers, sched_ctx->inheritor);
-				free(workerids);
+				int *workerids = NULL;
+				unsigned nworkers = _get_workers_list(sched_ctx, &workerids);
+				
+				if(nworkers > 0)
+				{
+					starpu_sched_ctx_add_workers(workerids, nworkers, sched_ctx->inheritor);
+					free(workerids);
+				}
 			}
 			}
-
-			_STARPU_PTHREAD_MUTEX_UNLOCK(&changing_ctx_mutex[sched_ctx->id]);
+			_STARPU_PTHREAD_MUTEX_UNLOCK(&changing_ctx_mutex[sched_ctx_id]);
 
 
 			return;
 			return;
 		}
 		}
@@ -748,12 +753,12 @@ void _starpu_increment_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 	_starpu_barrier_counter_increment(&sched_ctx->tasks_barrier);
 	_starpu_barrier_counter_increment(&sched_ctx->tasks_barrier);
 }
 }
 
 
-void starpu_task_set_context(unsigned *sched_ctx)
+void starpu_sched_ctx_set_context(unsigned *sched_ctx)
 {
 {
 	pthread_setspecific(sched_ctx_key, (void*)sched_ctx);
 	pthread_setspecific(sched_ctx_key, (void*)sched_ctx);
 }
 }
 
 
-unsigned starpu_task_get_context()
+unsigned starpu_sched_ctx_get_context()
 {
 {
 	unsigned *sched_ctx = (unsigned*)pthread_getspecific(sched_ctx_key);
 	unsigned *sched_ctx = (unsigned*)pthread_getspecific(sched_ctx_key);
 	if(sched_ctx == NULL)
 	if(sched_ctx == NULL)
@@ -762,12 +767,12 @@ unsigned starpu_task_get_context()
 	return *sched_ctx;
 	return *sched_ctx;
 }
 }
 
 
-void starpu_notify_hypervisor_exists()
+void starpu_sched_ctx_notify_hypervisor_exists()
 {
 {
 	with_hypervisor = 1;
 	with_hypervisor = 1;
 }
 }
 
 
-unsigned starpu_check_if_hypervisor_exists()
+unsigned starpu_sched_ctx_check_if_hypervisor_exists()
 {
 {
 	return with_hypervisor;
 	return with_hypervisor;
 }
 }
@@ -797,7 +802,7 @@ struct starpu_sched_ctx_worker_collection* starpu_sched_ctx_create_worker_collec
 
 
 	switch(worker_collection_type)
 	switch(worker_collection_type)
 	{
 	{
-	case STARPU_WORKER_LIST:
+	case STARPU_SCHED_CTX_WORKER_LIST:
 		sched_ctx->workers->has_next = worker_list.has_next;
 		sched_ctx->workers->has_next = worker_list.has_next;
 		sched_ctx->workers->get_next = worker_list.get_next;
 		sched_ctx->workers->get_next = worker_list.get_next;
 		sched_ctx->workers->add = worker_list.add;
 		sched_ctx->workers->add = worker_list.add;
@@ -805,7 +810,7 @@ struct starpu_sched_ctx_worker_collection* starpu_sched_ctx_create_worker_collec
 		sched_ctx->workers->init = worker_list.init;
 		sched_ctx->workers->init = worker_list.init;
 		sched_ctx->workers->deinit = worker_list.deinit;
 		sched_ctx->workers->deinit = worker_list.deinit;
 		sched_ctx->workers->init_iterator = worker_list.init_iterator;
 		sched_ctx->workers->init_iterator = worker_list.init_iterator;
-		sched_ctx->workers->type = STARPU_WORKER_LIST;
+		sched_ctx->workers->type = STARPU_SCHED_CTX_WORKER_LIST;
 		break;
 		break;
 	}
 	}
 
 
@@ -818,7 +823,7 @@ static unsigned _get_workers_list(struct _starpu_sched_ctx *sched_ctx, int **wor
 	*workerids = (int*)malloc(workers->nworkers*sizeof(int));
 	*workerids = (int*)malloc(workers->nworkers*sizeof(int));
 	int worker;
 	int worker;
 	unsigned nworkers = 0;
 	unsigned nworkers = 0;
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 		workers->init_iterator(workers, &it);
 
 
@@ -851,7 +856,7 @@ int starpu_get_workers_of_sched_ctx(unsigned sched_ctx_id, int *pus, enum starpu
 	int worker;
 	int worker;
 
 
 	int npus = 0;
 	int npus = 0;
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 		workers->init_iterator(workers, &it);
 
 
@@ -866,7 +871,7 @@ int starpu_get_workers_of_sched_ctx(unsigned sched_ctx_id, int *pus, enum starpu
 	return npus;
 	return npus;
 }
 }
 
 
-_starpu_pthread_mutex_t* starpu_get_changing_ctx_mutex(unsigned sched_ctx_id)
+_starpu_pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id)
 {
 {
 	return &changing_ctx_mutex[sched_ctx_id];
 	return &changing_ctx_mutex[sched_ctx_id];
 }
 }
@@ -891,7 +896,7 @@ unsigned starpu_sched_ctx_get_nshared_workers(unsigned sched_ctx_id, unsigned sc
         int worker, worker2;
         int worker, worker2;
         int shared_workers = 0;
         int shared_workers = 0;
 
 
-	struct starpu_iterator it1, it2;
+	struct starpu_sched_ctx_iterator it1, it2;
         if(workers->init_iterator)
         if(workers->init_iterator)
                 workers->init_iterator(workers, &it1);
                 workers->init_iterator(workers, &it1);
 
 
@@ -926,7 +931,7 @@ unsigned starpu_sched_ctx_contains_worker(int workerid, unsigned sched_ctx_id)
         struct starpu_sched_ctx_worker_collection *workers = sched_ctx->workers;
         struct starpu_sched_ctx_worker_collection *workers = sched_ctx->workers;
         int worker;
         int worker;
 
 
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
         if(workers->init_iterator)
         if(workers->init_iterator)
                 workers->init_iterator(workers, &it);
                 workers->init_iterator(workers, &it);
 
 
@@ -963,7 +968,7 @@ unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid)
 	return worker->nsched_ctxs > 1;
 	return worker->nsched_ctxs > 1;
 }
 }
 
 
-unsigned starpu_is_ctxs_turn(int workerid, unsigned sched_ctx_id)
+unsigned starpu_sched_ctx_is_ctxs_turn(int workerid, unsigned sched_ctx_id)
 {
 {
 	if(max_time_worker_on_ctx == -1.0) return 1;
 	if(max_time_worker_on_ctx == -1.0) return 1;
 
 
@@ -971,7 +976,7 @@ unsigned starpu_is_ctxs_turn(int workerid, unsigned sched_ctx_id)
 	return worker->active_ctx == sched_ctx_id;
 	return worker->active_ctx == sched_ctx_id;
 }
 }
 
 
-void starpu_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id)
+void starpu_sched_ctx_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id)
 {
 {
 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
 
 
@@ -996,7 +1001,7 @@ void starpu_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id)
 	}
 	}
 }
 }
 
 
-double starpu_get_max_time_worker_on_ctx(void)
+double starpu_sched_ctx_get_max_time_worker_on_ctx(void)
 {
 {
 	return max_time_worker_on_ctx;
 	return max_time_worker_on_ctx;
 }
 }
@@ -1020,15 +1025,15 @@ void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id)
 
 
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 
 
-void starpu_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops)
+void starpu_sched_ctx_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops, size_t data_size)
 {
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	if(sched_ctx != NULL && sched_ctx_id != 0 && sched_ctx_id != STARPU_NMAX_SCHED_CTXS
 	if(sched_ctx != NULL && sched_ctx_id != 0 && sched_ctx_id != STARPU_NMAX_SCHED_CTXS
 	   && sched_ctx->perf_counters != NULL)
 	   && sched_ctx->perf_counters != NULL)
-		sched_ctx->perf_counters->notify_poped_task(sched_ctx_id, workerid, flops);
+		sched_ctx->perf_counters->notify_poped_task(sched_ctx_id, workerid, flops, data_size);
 }
 }
 
 
-void starpu_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)
+void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)
 {
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 
 

+ 2 - 2
src/core/sched_ctx.h

@@ -91,7 +91,7 @@ struct _starpu_sched_ctx
 
 
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 	/* a structure containing a series of performance counters determining the resize procedure */
 	/* a structure containing a series of performance counters determining the resize procedure */
-	struct starpu_performance_counters *perf_counters;
+	struct starpu_sched_ctx_performance_counters *perf_counters;
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 };
 };
 
 
@@ -139,7 +139,7 @@ void _starpu_worker_gets_out_of_ctx(unsigned sched_ctx_id, struct _starpu_worker
 unsigned _starpu_worker_belongs_to_a_sched_ctx(int workerid, unsigned sched_ctx_id);
 unsigned _starpu_worker_belongs_to_a_sched_ctx(int workerid, unsigned sched_ctx_id);
 
 
 #if defined(_MSC_VER) || defined(STARPU_SIMGRID)
 #if defined(_MSC_VER) || defined(STARPU_SIMGRID)
-_starpu_pthread_mutex_t* starpu_get_changing_ctx_mutex(unsigned sched_ctx_id);
+_starpu_pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
 #endif
 #endif
 
 
 #endif // __SCHED_CONTEXT_H__
 #endif // __SCHED_CONTEXT_H__

+ 15 - 16
src/core/sched_policy.c

@@ -225,7 +225,7 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 	}
 	}
 
 
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
-	starpu_call_pushed_task_cb(workerid, task->sched_ctx);
+	starpu_sched_ctx_call_pushed_task_cb(workerid, task->sched_ctx);
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 
 
 	if (is_basic_worker)
 	if (is_basic_worker)
@@ -233,7 +233,6 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 		unsigned node = starpu_worker_get_memory_node(workerid);
 		unsigned node = starpu_worker_get_memory_node(workerid);
 		if (_starpu_task_uses_multiformat_handles(task))
 		if (_starpu_task_uses_multiformat_handles(task))
 		{
 		{
-			unsigned i;
 			for (i = 0; i < task->cl->nbuffers; i++)
 			for (i = 0; i < task->cl->nbuffers; i++)
 			{
 			{
 				struct starpu_task *conversion_task;
 				struct starpu_task *conversion_task;
@@ -269,24 +268,24 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 
 
 		int ret = 0;
 		int ret = 0;
 
 
-		struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
-		j->task_size = worker_size;
-		j->combined_workerid = workerid;
-		j->active_task_alias_count = 0;
+		struct _starpu_job *job = _starpu_get_job_associated_to_task(task);
+		job->task_size = worker_size;
+		job->combined_workerid = workerid;
+		job->active_task_alias_count = 0;
 
 
-		_STARPU_PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
-		_STARPU_PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
+		_STARPU_PTHREAD_BARRIER_INIT(&job->before_work_barrier, NULL, worker_size);
+		_STARPU_PTHREAD_BARRIER_INIT(&job->after_work_barrier, NULL, worker_size);
 
 
 		/* Note: we have to call that early, or else the task may have
 		/* Note: we have to call that early, or else the task may have
 		 * disappeared already */
 		 * disappeared already */
 		_starpu_push_task_end(task);
 		_starpu_push_task_end(task);
 
 
-		int i;
-		for (i = 0; i < worker_size; i++)
+		int j;
+		for (j = 0; j < worker_size; j++)
 		{
 		{
 			struct starpu_task *alias = _starpu_create_task_alias(task);
 			struct starpu_task *alias = _starpu_create_task_alias(task);
 
 
-			worker = _starpu_get_worker_struct(combined_workerid[i]);
+			worker = _starpu_get_worker_struct(combined_workerid[j]);
 			ret |= _starpu_push_local_task(worker, alias, 0);
 			ret |= _starpu_push_local_task(worker, alias, 0);
 		}
 		}
 
 
@@ -299,14 +298,14 @@ static int _starpu_nworkers_able_to_execute_task(struct starpu_task *task, struc
 	int worker = -1, nworkers = 0;
 	int worker = -1, nworkers = 0;
 	struct starpu_sched_ctx_worker_collection *workers = sched_ctx->workers;
 	struct starpu_sched_ctx_worker_collection *workers = sched_ctx->workers;
 
 
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 		workers->init_iterator(workers, &it);
 
 
 	while(workers->has_next(workers, &it))
 	while(workers->has_next(workers, &it))
 	{
 	{
 		worker = workers->get_next(workers, &it);
 		worker = workers->get_next(workers, &it);
-		if (starpu_worker_can_execute_task(worker, task, 0) && starpu_is_ctxs_turn(worker, sched_ctx->id))
+		if (starpu_worker_can_execute_task(worker, task, 0) && starpu_sched_ctx_is_ctxs_turn(worker, sched_ctx->id))
 			nworkers++;
 			nworkers++;
 	}
 	}
 
 
@@ -563,7 +562,7 @@ pick:
 	{
 	{
 		struct _starpu_sched_ctx *sched_ctx;
 		struct _starpu_sched_ctx *sched_ctx;
 
 
-		unsigned lucky_ctx = STARPU_NMAX_SCHED_CTXS;
+		//unsigned lucky_ctx = STARPU_NMAX_SCHED_CTXS;
 
 
 		int been_here[STARPU_NMAX_SCHED_CTXS];
 		int been_here[STARPU_NMAX_SCHED_CTXS];
 		int i;
 		int i;
@@ -582,7 +581,7 @@ pick:
 				if (sched_ctx->sched_policy && sched_ctx->sched_policy->pop_task)
 				if (sched_ctx->sched_policy && sched_ctx->sched_policy->pop_task)
 				{
 				{
 					task = sched_ctx->sched_policy->pop_task(sched_ctx->id);
 					task = sched_ctx->sched_policy->pop_task(sched_ctx->id);
-					lucky_ctx = sched_ctx->id;
+					//lucky_ctx = sched_ctx->id;
 				}
 				}
 			}
 			}
 
 
@@ -605,7 +604,7 @@ pick:
 
 
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 	struct _starpu_sched_ctx *sched_ctx = NULL;
 	struct _starpu_sched_ctx *sched_ctx = NULL;
-	struct starpu_performance_counters *perf_counters = NULL;
+	struct starpu_sched_ctx_performance_counters *perf_counters = NULL;
 	int j;
 	int j;
 	for(j = 0; j < STARPU_NMAX_SCHED_CTXS; j++)
 	for(j = 0; j < STARPU_NMAX_SCHED_CTXS; j++)
 	{
 	{

+ 12 - 2
src/core/task.c

@@ -365,7 +365,7 @@ int starpu_task_submit(struct starpu_task *task)
 
 
 	if (task->sched_ctx == 0 && nsched_ctxs != 1 && !j->exclude_from_dag)
 	if (task->sched_ctx == 0 && nsched_ctxs != 1 && !j->exclude_from_dag)
 	{
 	{
-		set_sched_ctx = starpu_task_get_context();
+		set_sched_ctx = starpu_sched_ctx_get_context();
 		if (set_sched_ctx != STARPU_NMAX_SCHED_CTXS)
 		if (set_sched_ctx != STARPU_NMAX_SCHED_CTXS)
 			task->sched_ctx = set_sched_ctx;
 			task->sched_ctx = set_sched_ctx;
 	}
 	}
@@ -633,7 +633,7 @@ void starpu_display_codelet_stats(struct starpu_codelet *cl)
 int starpu_task_wait_for_all(void)
 int starpu_task_wait_for_all(void)
 {
 {
 	unsigned nsched_ctxs = _starpu_get_nsched_ctxs();
 	unsigned nsched_ctxs = _starpu_get_nsched_ctxs();
-	unsigned sched_ctx_id = nsched_ctxs == 1 ? 0 : starpu_task_get_context();
+	unsigned sched_ctx_id = nsched_ctxs == 1 ? 0 : starpu_sched_ctx_get_context();
 
 
 	/* if there is no indication about which context to wait,
 	/* if there is no indication about which context to wait,
 	   we wait for all tasks submitted to starpu */
 	   we wait for all tasks submitted to starpu */
@@ -745,6 +745,11 @@ static void _starpu_increment_nsubmitted_tasks(void)
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
 }
 }
 
 
+int starpu_task_nsubmitted(void)
+{
+	return nsubmitted;
+}
+
 void _starpu_increment_nready_tasks(void)
 void _starpu_increment_nready_tasks(void)
 {
 {
 	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
@@ -765,6 +770,11 @@ void _starpu_decrement_nready_tasks(void)
 
 
 }
 }
 
 
+int starpu_task_nready(void)
+{
+	return nready;
+}
+
 void _starpu_initialize_current_task_key(void)
 void _starpu_initialize_current_task_key(void)
 {
 {
 	_STARPU_PTHREAD_KEY_CREATE(&current_task_key, NULL);
 	_STARPU_PTHREAD_KEY_CREATE(&current_task_key, NULL);

+ 24 - 24
src/core/workers.c

@@ -84,15 +84,15 @@ static uint32_t _starpu_worker_exists_and_can_execute(struct starpu_task *task,
 			switch (arch)
 			switch (arch)
 			{
 			{
 			case STARPU_CPU_WORKER:
 			case STARPU_CPU_WORKER:
-				if (task->cl->cpu_funcs[i] != NULL)
+				if (task->cl->cpu_funcs[impl] != NULL)
 					test_implementation = 1;
 					test_implementation = 1;
 				break;
 				break;
 			case STARPU_CUDA_WORKER:
 			case STARPU_CUDA_WORKER:
-				if (task->cl->cuda_funcs[i] != NULL)
+				if (task->cl->cuda_funcs[impl] != NULL)
 					test_implementation = 1;
 					test_implementation = 1;
 				break;
 				break;
 			case STARPU_OPENCL_WORKER:
 			case STARPU_OPENCL_WORKER:
-				if (task->cl->opencl_funcs[i] != NULL)
+				if (task->cl->opencl_funcs[impl] != NULL)
 					test_implementation = 1;
 					test_implementation = 1;
 				break;
 				break;
 			default:
 			default:
@@ -340,14 +340,14 @@ void _starpu_worker_init(struct _starpu_worker *worker, unsigned fut_key)
 
 
 }
 }
 
 
-static void _starpu_launch_drivers(struct _starpu_machine_config *config)
+static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 {
 {
-	config->running = 1;
-	config->submitting = 1;
+	pconfig->running = 1;
+	pconfig->submitting = 1;
 
 
 	_STARPU_PTHREAD_KEY_CREATE(&worker_key, NULL);
 	_STARPU_PTHREAD_KEY_CREATE(&worker_key, NULL);
 
 
-	unsigned nworkers = config->topology.nworkers;
+	unsigned nworkers = pconfig->topology.nworkers;
 
 
 	/* Launch workers asynchronously */
 	/* Launch workers asynchronously */
 	unsigned cpu = 0, cuda = 0;
 	unsigned cpu = 0, cuda = 0;
@@ -368,9 +368,9 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
 
 	for (worker = 0; worker < nworkers; worker++)
 	for (worker = 0; worker < nworkers; worker++)
 	{
 	{
-		struct _starpu_worker *workerarg = &config->workers[worker];
+		struct _starpu_worker *workerarg = &pconfig->workers[worker];
 
 
-		workerarg->config = config;
+		workerarg->config = pconfig;
 
 
 		_starpu_barrier_counter_init(&workerarg->tasks_barrier, 0);
 		_starpu_barrier_counter_init(&workerarg->tasks_barrier, 0);
 
 
@@ -388,7 +388,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 		workerarg->run_by_starpu = 1;
 		workerarg->run_by_starpu = 1;
 		workerarg->worker_is_running = 0;
 		workerarg->worker_is_running = 0;
 		workerarg->worker_is_initialized = 0;
 		workerarg->worker_is_initialized = 0;
-		
+
 		int ctx;
 		int ctx;
 		for(ctx = 0; ctx < STARPU_NMAX_SCHED_CTXS; ctx++)
 		for(ctx = 0; ctx < STARPU_NMAX_SCHED_CTXS; ctx++)
 			workerarg->removed_from_ctx[ctx] = 0;
 			workerarg->removed_from_ctx[ctx] = 0;
@@ -419,7 +419,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 			case STARPU_CPU_WORKER:
 			case STARPU_CPU_WORKER:
 				workerarg->set = NULL;
 				workerarg->set = NULL;
 				driver.id.cpu_id = cpu;
 				driver.id.cpu_id = cpu;
-				if (_starpu_may_launch_driver(config->conf, &driver))
+				if (_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
 				{
 					_STARPU_PTHREAD_CREATE_ON(
 					_STARPU_PTHREAD_CREATE_ON(
 						workerarg->name,
 						workerarg->name,
@@ -446,7 +446,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 			case STARPU_CUDA_WORKER:
 			case STARPU_CUDA_WORKER:
 				workerarg->set = NULL;
 				workerarg->set = NULL;
 				driver.id.cuda_id = cuda;
 				driver.id.cuda_id = cuda;
-				if (_starpu_may_launch_driver(config->conf, &driver))
+				if (_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
 				{
 					_STARPU_PTHREAD_CREATE_ON(
 					_STARPU_PTHREAD_CREATE_ON(
 						workerarg->name,
 						workerarg->name,
@@ -473,7 +473,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 			case STARPU_OPENCL_WORKER:
 			case STARPU_OPENCL_WORKER:
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
 				starpu_opencl_get_device(workerarg->devid, &driver.id.opencl_id);
 				starpu_opencl_get_device(workerarg->devid, &driver.id.opencl_id);
-				if (!_starpu_may_launch_driver(config->conf, &driver))
+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
 				{
 					workerarg->run_by_starpu = 0;
 					workerarg->run_by_starpu = 0;
 					break;
 					break;
@@ -504,7 +504,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 	cuda = 0;
 	cuda = 0;
 	for (worker = 0; worker < nworkers; worker++)
 	for (worker = 0; worker < nworkers; worker++)
 	{
 	{
-		struct _starpu_worker *workerarg = &config->workers[worker];
+		struct _starpu_worker *workerarg = &pconfig->workers[worker];
 		struct starpu_driver driver;
 		struct starpu_driver driver;
 		driver.type = workerarg->arch;
 		driver.type = workerarg->arch;
 
 
@@ -512,7 +512,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 		{
 		{
 			case STARPU_CPU_WORKER:
 			case STARPU_CPU_WORKER:
 				driver.id.cpu_id = cpu;
 				driver.id.cpu_id = cpu;
-				if (!_starpu_may_launch_driver(config->conf, &driver))
+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
 				{
 					cpu++;
 					cpu++;
 					break;
 					break;
@@ -526,7 +526,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 				break;
 				break;
 			case STARPU_CUDA_WORKER:
 			case STARPU_CUDA_WORKER:
 				driver.id.cuda_id = cuda;
 				driver.id.cuda_id = cuda;
-				if (!_starpu_may_launch_driver(config->conf, &driver))
+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
 				{
 					cuda++;
 					cuda++;
 					break;
 					break;
@@ -542,7 +542,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 			case STARPU_OPENCL_WORKER:
 			case STARPU_OPENCL_WORKER:
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
 				starpu_opencl_get_device(workerarg->devid, &driver.id.opencl_id);
 				starpu_opencl_get_device(workerarg->devid, &driver.id.opencl_id);
-				if (!_starpu_may_launch_driver(config->conf, &driver))
+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
 					break;
 					break;
 #endif
 #endif
 				_STARPU_DEBUG("waiting for worker %u initialization\n", worker);
 				_STARPU_DEBUG("waiting for worker %u initialization\n", worker);
@@ -817,19 +817,19 @@ void starpu_profiling_init()
  * Handle runtime termination
  * Handle runtime termination
  */
  */
 
 
-static void _starpu_terminate_workers(struct _starpu_machine_config *config)
+static void _starpu_terminate_workers(struct _starpu_machine_config *pconfig)
 {
 {
 	int status STARPU_ATTRIBUTE_UNUSED;
 	int status STARPU_ATTRIBUTE_UNUSED;
 	unsigned workerid;
 	unsigned workerid;
 
 
-	for (workerid = 0; workerid < config->topology.nworkers; workerid++)
+	for (workerid = 0; workerid < pconfig->topology.nworkers; workerid++)
 	{
 	{
 		starpu_wake_all_blocked_workers();
 		starpu_wake_all_blocked_workers();
 
 
 		_STARPU_DEBUG("wait for worker %u\n", workerid);
 		_STARPU_DEBUG("wait for worker %u\n", workerid);
 
 
-		struct _starpu_worker_set *set = config->workers[workerid].set;
-		struct _starpu_worker *worker = &config->workers[workerid];
+		struct _starpu_worker_set *set = pconfig->workers[workerid].set;
+		struct _starpu_worker *worker = &pconfig->workers[workerid];
 
 
 		/* in case StarPU termination code is called from a callback,
 		/* in case StarPU termination code is called from a callback,
  		 * we have to check if pthread_self() is the worker itself */
  		 * we have to check if pthread_self() is the worker itself */
@@ -914,10 +914,10 @@ unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED)
 #endif
 #endif
 }
 }
 
 
-static void _starpu_kill_all_workers(struct _starpu_machine_config *config)
+static void _starpu_kill_all_workers(struct _starpu_machine_config *pconfig)
 {
 {
 	/* set the flag which will tell workers to stop */
 	/* set the flag which will tell workers to stop */
-	config->running = 0;
+	pconfig->running = 0;
 	/* running is just protected by a memory barrier */
 	/* running is just protected by a memory barrier */
 	STARPU_WMB();
 	STARPU_WMB();
 	starpu_wake_all_blocked_workers();
 	starpu_wake_all_blocked_workers();
@@ -1299,7 +1299,7 @@ int starpu_worker_get_nids_ctx_free_by_type(enum starpu_archtype type, int *work
 				if(config.sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
 				if(config.sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
 				{
 				{
 					struct starpu_sched_ctx_worker_collection *workers = config.sched_ctxs[s].workers;
 					struct starpu_sched_ctx_worker_collection *workers = config.sched_ctxs[s].workers;
-					struct starpu_iterator it;
+					struct starpu_sched_ctx_iterator it;
 					if(workers->init_iterator)
 					if(workers->init_iterator)
 						workers->init_iterator(workers, &it);
 						workers->init_iterator(workers, &it);
 
 

+ 1 - 0
src/datawizard/coherency.c

@@ -458,6 +458,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 			}
 			}
 		}
 		}
 		else
 		else
+			/* The last request will perform the callback after termination */
 			_starpu_data_request_append_callback(r, callback_func, callback_arg);
 			_starpu_data_request_append_callback(r, callback_func, callback_arg);
 
 
 
 

+ 147 - 35
src/datawizard/copy_driver.c

@@ -134,8 +134,10 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 	{
 	{
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
 		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
 		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
-		STARPU_ASSERT(copy_methods->ram_to_ram);
-		copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
+		if (copy_methods->ram_to_ram)
+			copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
 		break;
 		break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
@@ -143,11 +145,15 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 #if !defined(HAVE_CUDA_MEMCPY_PEER)
 #if !defined(HAVE_CUDA_MEMCPY_PEER)
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
 #endif
 #endif
-		STARPU_ASSERT(copy_methods->cuda_to_ram);
-		if (!req || !copy_methods->cuda_to_ram_async)
+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
+				!(copy_methods->cuda_to_ram_async || copy_methods->any_to_any))
 		{
 		{
 			/* this is not associated to a request so it's synchronous */
 			/* this is not associated to a request so it's synchronous */
-			copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
+			STARPU_ASSERT(copy_methods->cuda_to_ram || copy_methods->any_to_any);
+			if (copy_methods->cuda_to_ram)
+				copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
+			else
+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
 		}
 		}
 		else
 		else
 		{
 		{
@@ -156,7 +162,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 
 
 			stream = starpu_cuda_get_local_out_transfer_stream();
 			stream = starpu_cuda_get_local_out_transfer_stream();
-			ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
+			if (copy_methods->cuda_to_ram_async)
+				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
 
 
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
@@ -168,11 +180,15 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 #if !defined(HAVE_CUDA_MEMCPY_PEER)
 #if !defined(HAVE_CUDA_MEMCPY_PEER)
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
 #endif
 #endif
-		STARPU_ASSERT(copy_methods->ram_to_cuda);
-		if (!req || !copy_methods->ram_to_cuda_async)
+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
+				!(copy_methods->ram_to_cuda_async || copy_methods->any_to_any))
 		{
 		{
 			/* this is not associated to a request so it's synchronous */
 			/* this is not associated to a request so it's synchronous */
-			copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
+			STARPU_ASSERT(copy_methods->ram_to_cuda || copy_methods->any_to_any);
+			if (copy_methods->ram_to_cuda)
+				copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
+			else
+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
 		}
 		}
 		else
 		else
 		{
 		{
@@ -182,7 +198,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 				STARPU_CUDA_REPORT_ERROR(cures);
 				STARPU_CUDA_REPORT_ERROR(cures);
 
 
 			stream = starpu_cuda_get_local_in_transfer_stream();
 			stream = starpu_cuda_get_local_in_transfer_stream();
-			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+			if (copy_methods->ram_to_cuda_async)
+				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
 
 
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 			if (STARPU_UNLIKELY(cures != cudaSuccess))
 			if (STARPU_UNLIKELY(cures != cudaSuccess))
@@ -191,12 +213,15 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 		break;
 		break;
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
 		/* CUDA - CUDA transfer */
 		/* CUDA - CUDA transfer */
-		STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->cuda_to_cuda_async);
-		if (!req || !copy_methods->cuda_to_cuda_async)
+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
+				!(copy_methods->cuda_to_cuda_async || copy_methods->any_to_any))
 		{
 		{
-			STARPU_ASSERT(copy_methods->cuda_to_cuda);
+			STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->any_to_any);
 			/* this is not associated to a request so it's synchronous */
 			/* this is not associated to a request so it's synchronous */
-			copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
+			if (copy_methods->cuda_to_cuda)
+				copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
+			else
+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
 		}
 		}
 		else
 		else
 		{
 		{
@@ -205,7 +230,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 
 
 			stream = starpu_cuda_get_local_peer_transfer_stream();
 			stream = starpu_cuda_get_local_peer_transfer_stream();
-			ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+			if (copy_methods->cuda_to_cuda_async)
+				ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
 
 
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
@@ -215,54 +246,77 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
 		/* OpenCL -> RAM */
 		/* OpenCL -> RAM */
-		if (_starpu_memory_node_get_local_key() == src_node)
+		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
+				!(copy_methods->opencl_to_ram_async || copy_methods->any_to_any))
 		{
 		{
-			STARPU_ASSERT(copy_methods->opencl_to_ram);
-			if (!req || !copy_methods->opencl_to_ram_async)
-			{
-				/* this is not associated to a request so it's synchronous */
+			STARPU_ASSERT(copy_methods->opencl_to_ram || copy_methods->any_to_any);
+			/* this is not associated to a request so it's synchronous */
+			if (copy_methods->opencl_to_ram)
 				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
 				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
-			}
 			else
 			else
-			{
-				req->async_channel.type = STARPU_OPENCL_RAM;
-				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
-			}
+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
 		}
 		}
 		else
 		else
 		{
 		{
-			/* we should not have a blocking call ! */
-			STARPU_ABORT();
+			req->async_channel.type = STARPU_OPENCL_RAM;
+			if (copy_methods->opencl_to_ram_async)
+				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
 		}
 		}
 		break;
 		break;
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
 		/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
 		/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
-		STARPU_ASSERT(copy_methods->ram_to_opencl);
-		if (!req || !copy_methods->ram_to_opencl_async)
+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
+				!(copy_methods->ram_to_opencl_async || copy_methods->any_to_any))
 		{
 		{
+			STARPU_ASSERT(copy_methods->ram_to_opencl || copy_methods->any_to_any);
 			/* this is not associated to a request so it's synchronous */
 			/* this is not associated to a request so it's synchronous */
-			copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
+			if (copy_methods->ram_to_opencl)
+				copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
+			else
+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
 		}
 		}
 		else
 		else
 		{
 		{
 			req->async_channel.type = STARPU_OPENCL_RAM;
 			req->async_channel.type = STARPU_OPENCL_RAM;
-			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+			if (copy_methods->ram_to_opencl_async)
+				ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
 		}
 		}
 		break;
 		break;
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
 		/* STARPU_OPENCL_RAM -> STARPU_OPENCL_RAM */
 		/* STARPU_OPENCL_RAM -> STARPU_OPENCL_RAM */
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node || _starpu_memory_node_get_local_key() == src_node);
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node || _starpu_memory_node_get_local_key() == src_node);
-		STARPU_ASSERT(copy_methods->opencl_to_opencl);
-		if (!req || !copy_methods->opencl_to_opencl_async)
+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
+				!(copy_methods->opencl_to_opencl_async || copy_methods->any_to_any))
 		{
 		{
+			STARPU_ASSERT(copy_methods->opencl_to_opencl || copy_methods->any_to_any);
 			/* this is not associated to a request so it's synchronous */
 			/* this is not associated to a request so it's synchronous */
-			copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
+			if (copy_methods->opencl_to_opencl)
+				copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
+			else
+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
 		}
 		}
 		else
 		else
 		{
 		{
 			req->async_channel.type = STARPU_OPENCL_RAM;
 			req->async_channel.type = STARPU_OPENCL_RAM;
-			ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+			if (copy_methods->opencl_to_opencl_async)
+				ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
 		}
 		}
 		break;
 		break;
 #endif
 #endif
@@ -331,6 +385,64 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 	return 0;
 	return 0;
 }
 }
 
 
+/* This can be used by interfaces to easily transfer a piece of data without
+ * caring about the particular CUDA/OpenCL methods.  */
+
+int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data)
+{
+	struct _starpu_async_channel *async_channel = async_data;
+	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
+	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
+
+	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
+	{
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
+		memcpy((void *) dst + dst_offset, (void *) src + src_offset, size);
+		return 0;
+
+#ifdef STARPU_USE_CUDA
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
+		return starpu_cuda_copy_async_sync(
+				(void*) src + src_offset, src_node,
+				(void*) dst + dst_offset, dst_node,
+				size,
+				async_channel?starpu_cuda_get_local_out_transfer_stream():NULL,
+				cudaMemcpyDeviceToHost);
+
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
+		return starpu_cuda_copy_async_sync(
+				(void*) src + src_offset, src_node,
+				(void*) dst + dst_offset, dst_node,
+				size,
+				async_channel?starpu_cuda_get_local_in_transfer_stream():NULL,
+				cudaMemcpyHostToDevice);
+
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
+		return starpu_cuda_copy_async_sync(
+				(void*) src + src_offset, src_node,
+				(void*) dst + dst_offset, dst_node,
+				size,
+				async_channel?starpu_cuda_get_local_peer_transfer_stream():NULL,
+				cudaMemcpyDeviceToDevice);
+
+#endif
+#ifdef STARPU_USE_OPENCL
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
+		return starpu_opencl_copy_async_sync(
+				src, src_offset, src_node,
+				dst, dst_offset, dst_node,
+				size,
+				&async_channel->event.opencl_event);
+#endif
+	default:
+		STARPU_ABORT();
+		return -1;
+	}
+	return 0;
+}
+
 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel)
 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel)
 {
 {
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID

+ 14 - 127
src/datawizard/interfaces/bcsr_interface.c

@@ -31,31 +31,11 @@
  * BCSR : blocked CSR, we use blocks of size (r x c)
  * BCSR : blocked CSR, we use blocks of size (r x c)
  */
  */
 
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-#ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-#endif
-#ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-#endif
-
-static struct starpu_data_copy_methods bcsr_copy_data_methods_s =
+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
+
+static const struct starpu_data_copy_methods bcsr_copy_data_methods_s =
 {
 {
-	.ram_to_ram = copy_ram_to_ram,
-#ifdef STARPU_USE_CUDA
-	.ram_to_cuda = copy_ram_to_cuda,
-	.cuda_to_ram = copy_cuda_to_ram,
-	.cuda_to_cuda = copy_cuda_to_cuda,
-#endif
-#ifdef STARPU_USE_OPENCL
-	.ram_to_opencl = copy_ram_to_opencl,
-	.opencl_to_ram = copy_opencl_to_ram,
-	.opencl_to_opencl = copy_opencl_to_opencl,
-#endif
+	.any_to_any = copy_any_to_any,
 };
 };
 
 
 static void register_bcsr_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
 static void register_bcsr_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
@@ -315,105 +295,7 @@ static void free_bcsr_buffer_on_node(void *data_interface, unsigned node)
 	starpu_free_buffer_on_node(node, (uintptr_t) bcsr_interface->rowptr, (nrow+1)*sizeof(uint32_t));
 	starpu_free_buffer_on_node(node, (uintptr_t) bcsr_interface->rowptr, (nrow+1)*sizeof(uint32_t));
 }
 }
 
 
-#ifdef STARPU_USE_CUDA
-static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
-{
-	struct starpu_bcsr_interface *src_bcsr = src_interface;
-	struct starpu_bcsr_interface *dst_bcsr = dst_interface;
-
-	uint32_t nnz = src_bcsr->nnz;
-	uint32_t nrow = src_bcsr->nrow;
-	size_t elemsize = src_bcsr->elemsize;
-
-	uint32_t r = src_bcsr->r;
-	uint32_t c = src_bcsr->c;
-
-	cudaError_t cures;
-
-	cures = cudaMemcpy((char *)dst_bcsr->nzval, (char *)src_bcsr->nzval, nnz*r*c*elemsize, kind);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	cures = cudaMemcpy((char *)dst_bcsr->colind, (char *)src_bcsr->colind, nnz*sizeof(uint32_t), kind);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	cures = cudaMemcpy((char *)dst_bcsr->rowptr, (char *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
-
-	return 0;
-}
-
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
-}
-
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
-}
-
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
-}
-#endif // STARPU_USE_CUDA
-
-#ifdef STARPU_USE_OPENCL
-static int copy_opencl_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	struct starpu_bcsr_interface *src_bcsr = src_interface;
-	struct starpu_bcsr_interface *dst_bcsr = dst_interface;
-
-	uint32_t nnz = src_bcsr->nnz;
-	uint32_t nrow = src_bcsr->nrow;
-	size_t elemsize = src_bcsr->elemsize;
-
-	uint32_t r = src_bcsr->r;
-	uint32_t c = src_bcsr->c;
-
-        int err;
-
-	err = starpu_opencl_copy_async_sync(src_bcsr->nzval, src_node, 0, dst_bcsr->nzval, dst_node, 0, nnz*r*c*elemsize, NULL);
-	if (STARPU_UNLIKELY(err))
-		STARPU_OPENCL_REPORT_ERROR(err);
-
-	err = starpu_opencl_copy_async_sync((uintptr_t)src_bcsr->colind, src_node, 0, (uintptr_t)dst_bcsr->colind, dst_node, 0, nnz*sizeof(uint32_t), NULL);
-	if (STARPU_UNLIKELY(err))
-		STARPU_OPENCL_REPORT_ERROR(err);
-
-	err = starpu_opencl_copy_async_sync((uintptr_t)src_bcsr->rowptr, src_node, 0, (uintptr_t)dst_bcsr->rowptr, dst_node, 0, (nrow+1)*sizeof(uint32_t), NULL);
-	if (STARPU_UNLIKELY(err))
-		STARPU_OPENCL_REPORT_ERROR(err);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
-
-	return 0;
-}
-
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
-}
-
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
-}
-
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
-}
-
-#endif // STARPU_USE_OPENCL
-
-/* as not all platform easily have a BLAS lib installed ... */
-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
 {
 {
 	struct starpu_bcsr_interface *src_bcsr = (struct starpu_bcsr_interface *) src_interface;
 	struct starpu_bcsr_interface *src_bcsr = (struct starpu_bcsr_interface *) src_interface;
 	struct starpu_bcsr_interface *dst_bcsr = (struct starpu_bcsr_interface *) dst_interface;
 	struct starpu_bcsr_interface *dst_bcsr = (struct starpu_bcsr_interface *) dst_interface;
@@ -425,13 +307,18 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 	uint32_t r = src_bcsr->r;
 	uint32_t r = src_bcsr->r;
 	uint32_t c = src_bcsr->c;
 	uint32_t c = src_bcsr->c;
 
 
-	memcpy((void *)dst_bcsr->nzval, (void *)src_bcsr->nzval, nnz*elemsize*r*c);
+	int ret = 0;
+
+	if (starpu_interface_copy(src_bcsr->nzval, 0, src_node, dst_bcsr->nzval, 0, dst_node, nnz*elemsize*r*c, async_data))
+		ret = -EAGAIN;
 
 
-	memcpy((void *)dst_bcsr->colind, (void *)src_bcsr->colind, nnz*sizeof(uint32_t));
+	if (starpu_interface_copy((uintptr_t)src_bcsr->colind, 0, src_node, (uintptr_t)dst_bcsr->colind, 0, dst_node, nnz*sizeof(uint32_t), async_data))
+		ret = -EAGAIN;
 
 
-	memcpy((void *)dst_bcsr->rowptr, (void *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t));
+	if (starpu_interface_copy((uintptr_t)src_bcsr->rowptr, 0, src_node, (uintptr_t)dst_bcsr->rowptr, 0, dst_node, (nrow+1)*sizeof(uint32_t), async_data))
+		ret = -EAGAIN;
 
 
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize*r*c + (nnz+nrow+1)*sizeof(uint32_t));
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize*r*c + (nnz+nrow+1)*sizeof(uint32_t));
 
 
-	return 0;
+	return ret;
 }
 }

+ 9 - 7
src/datawizard/interfaces/block_interface.c

@@ -44,7 +44,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 #endif
 #endif
 
 
-static struct starpu_data_copy_methods block_copy_data_methods_s =
+static const struct starpu_data_copy_methods block_copy_data_methods_s =
 {
 {
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_ram = copy_ram_to_ram,
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -350,7 +350,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	}
 	}
 	else
 	else
 	{
 	{
-		/* Default case: we transfer all lines one by one: ny*nz transfers */
+		/* Default case: we transfer all blocks one by one: nz transfers */
 		unsigned layer;
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		for (layer = 0; layer < src_block->nz; layer++)
 		{
 		{
@@ -420,7 +420,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 	}
 	}
 	else
 	else
 	{
 	{
-		/* Default case: we transfer all lines one by one: ny*nz transfers */
+		/* Default case: we transfer all blocks one by one: nz 2D transfers */
 		unsigned layer;
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		for (layer = 0; layer < src_block->nz; layer++)
 		{
 		{
@@ -514,8 +514,8 @@ static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_
 		/* Is that a single contiguous buffer ? */
 		/* Is that a single contiguous buffer ? */
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
 		{
-			ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_node, src_block->offset,
-								dst_block->dev_handle, dst_node, dst_block->offset,
+			ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_block->offset, src_node,
+								dst_block->dev_handle, dst_block->offset, dst_node,
 							       src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
 							       src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
 							       event);
 							       event);
                 }
                 }
@@ -535,10 +535,12 @@ static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_
                         unsigned j;
                         unsigned j;
                         for(j=0 ; j<src_block->ny ; j++)
                         for(j=0 ; j<src_block->ny ; j++)
 			{
 			{
-				ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_node,
+				ret = starpu_opencl_copy_async_sync(src_block->dev_handle,
 								    src_block->offset + layer*src_block->ldz*src_block->elemsize + j*src_block->ldy*src_block->elemsize,
 								    src_block->offset + layer*src_block->ldz*src_block->elemsize + j*src_block->ldy*src_block->elemsize,
-								    dst_block->dev_handle, dst_node,
+								    src_node,
+								    dst_block->dev_handle,
 								    dst_block->offset + layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize,
 								    dst_block->offset + layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize,
+								    dst_node,
 								       src_block->nx*src_block->elemsize,
 								       src_block->nx*src_block->elemsize,
 								       event);
 								       event);
                         }
                         }

+ 19 - 247
src/datawizard/interfaces/coo_interface.c

@@ -19,190 +19,36 @@
 #include <datawizard/memalloc.h>
 #include <datawizard/memalloc.h>
 
 
 static int
 static int
-copy_ram_to_ram(void *src_interface, STARPU_ATTRIBUTE_UNUSED unsigned src_node,
-		void *dst_interface, STARPU_ATTRIBUTE_UNUSED unsigned dst_node)
+copy_any_to_any(void *src_interface, unsigned src_node,
+		void *dst_interface, unsigned dst_node, void *async_data)
 {
 {
 	size_t size = 0;
 	size_t size = 0;
 	struct starpu_coo_interface *src_coo, *dst_coo;
 	struct starpu_coo_interface *src_coo, *dst_coo;
-
-	src_coo = (struct starpu_coo_interface *) src_interface;
-	dst_coo = (struct starpu_coo_interface *) dst_interface;
-
-	size = src_coo->n_values * sizeof(src_coo->columns[0]);
-	memcpy((void *) dst_coo->columns, (void *) src_coo->columns, size);
-
-	/* sizeof(src_coo->columns[0]) == sizeof(src_coo->rows[0]) */
-	memcpy((void *) dst_coo->rows, (void *) src_coo->rows, size);
-
-	size = src_coo->n_values * src_coo->elemsize;
-	memcpy((void *) dst_coo->values, (void *) src_coo->values, size);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node,
-		src_coo->n_values *
-		(2 * sizeof(src_coo->rows[0]) + src_coo->elemsize));
-
-	return 0;
-}
-
-#ifdef STARPU_USE_CUDA
-static int
-copy_cuda_async_sync(void *src_interface, unsigned src_node,
-		     void *dst_interface, unsigned dst_node,
-		     cudaStream_t stream, enum cudaMemcpyKind kind)
-{
-	int ret;
-	size_t size = 0;
-	struct starpu_coo_interface *src_coo, *dst_coo;
-
-	src_coo = (struct starpu_coo_interface *) src_interface;
-	dst_coo = (struct starpu_coo_interface *) dst_interface;
-
-	size = src_coo->n_values * sizeof(src_coo->columns[0]);
-	ret = starpu_cuda_copy_async_sync(
-		(void *) src_coo->columns,
-		src_node,
-		(void *) dst_coo->columns,
-		dst_node,
-		size,
-		stream,
-		kind);
-	if (ret == 0)
-		stream = NULL;
-
-	/* sizeof(src_coo->columns[0]) == sizeof(src_coo->rows[0]) */
-	ret = starpu_cuda_copy_async_sync(
-		(void *) src_coo->rows,
-		src_node,
-		(void *) dst_coo->rows,
-		dst_node,
-		size,
-		stream,
-		kind);
-	if (ret == 0)
-		stream = NULL;
-
-	size = src_coo->n_values * src_coo->elemsize;
-	ret = starpu_cuda_copy_async_sync(
-		(void *) src_coo->values,
-		src_node,
-		(void *) dst_coo->values,
-		dst_node,
-		size,
-		stream,
-		kind);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node,
-		src_coo->n_values *
-		(2 * sizeof(src_coo->rows[0]) + src_coo->elemsize));
-	return ret;
-}
-
-static int
-copy_ram_to_cuda(void *src_interface, unsigned src_node,
-		 void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node,
-				    dst_interface, dst_node,
-				    NULL, cudaMemcpyHostToDevice);
-}
-
-static int
-copy_cuda_to_ram(void *src_interface, unsigned src_node,
-		 void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node,
-				    dst_interface, dst_node,
-				    NULL, cudaMemcpyDeviceToHost);
-}
-
-static int
-copy_ram_to_cuda_async(void *src_interface, unsigned src_node,
-		       void *dst_interface, unsigned dst_node,
-		       cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node,
-				    dst_interface, dst_node,
-				    stream, cudaMemcpyHostToDevice);
-}
-
-static int
-copy_cuda_to_ram_async(void *src_interface, unsigned src_node,
-		       void *dst_interface, unsigned dst_node,
-		       cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node,
-				    dst_interface, dst_node,
-				    stream, cudaMemcpyDeviceToHost);
-}
-
-static int
-copy_cuda_to_cuda(void *src_interface, unsigned src_node,
-		  void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node,
-				    dst_interface, dst_node,
-				    NULL, cudaMemcpyDeviceToDevice);
-}
-
-#ifdef NO_STRIDE
-static int
-copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
-			void *dst_interface, unsigned dst_node,
-			cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node,
-				    dst_interface, dst_node,
-				    stream, cudaMemcpyDeviceToDevice);
-}
-#endif /* !NO_STRIDE */
-#endif /* !STARPU_USE_CUDA */
-
-#ifdef STARPU_USE_OPENCL
-static int
-copy_opencl_common(void *src_interface, unsigned src_node,
-			 void *dst_interface, unsigned dst_node,
-			 cl_event *event)
-{
 	int ret = 0;
 	int ret = 0;
-	size_t size = 0;
-	struct starpu_coo_interface *src_coo, *dst_coo;
 
 
 	src_coo = (struct starpu_coo_interface *) src_interface;
 	src_coo = (struct starpu_coo_interface *) src_interface;
 	dst_coo = (struct starpu_coo_interface *) dst_interface;
 	dst_coo = (struct starpu_coo_interface *) dst_interface;
 
 
-
 	size = src_coo->n_values * sizeof(src_coo->columns[0]);
 	size = src_coo->n_values * sizeof(src_coo->columns[0]);
-	ret = starpu_opencl_copy_async_sync(
-		(uintptr_t) src_coo->columns,
-		src_node,
-		0,
-		(uintptr_t) dst_coo->columns,
-		dst_node,
-		0,
-		size,
-		NULL);
+	if (starpu_interface_copy(
+		(uintptr_t) src_coo->columns, 0, src_node,
+		(uintptr_t) dst_coo->columns, 0, dst_node,
+		size, async_data))
+		ret = -EAGAIN;
 
 
 	/* sizeof(src_coo->columns[0]) == sizeof(src_coo->rows[0]) */
 	/* sizeof(src_coo->columns[0]) == sizeof(src_coo->rows[0]) */
-	ret = starpu_opencl_copy_async_sync(
-		(uintptr_t) src_coo->rows,
-		src_node,
-		0,
-		(uintptr_t) dst_coo->rows,
-		dst_node,
-		0,
-		size,
-		NULL);
+	if (starpu_interface_copy(
+		(uintptr_t) src_coo->rows, 0, src_node,
+		(uintptr_t) dst_coo->rows, 0, dst_node,
+		size, async_data))
+		ret = -EAGAIN;
 
 
 	size = src_coo->n_values * src_coo->elemsize;
 	size = src_coo->n_values * src_coo->elemsize;
-	ret = starpu_opencl_copy_async_sync(
-		src_coo->values,
-		src_node,
-		0,
-		(uintptr_t) dst_coo->values,
-		dst_node,
-		0,
-		size,
-		event);
+	if (starpu_interface_copy(
+		src_coo->values, 0, src_node,
+		dst_coo->values, 0, dst_node,
+		size, async_data))
+		ret = -EAGAIN;
 
 
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node,
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node,
 		src_coo->n_values *
 		src_coo->n_values *
@@ -211,83 +57,9 @@ copy_opencl_common(void *src_interface, unsigned src_node,
 	return ret;
 	return ret;
 }
 }
 
 
-static int
-copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
-			 void *dst_interface, unsigned dst_node,
-			 cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int
-copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
-			 void *dst_interface, unsigned dst_node,
-			 cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int
-copy_opencl_to_opencl_async(void *src_interface, unsigned src_node,
-			 void *dst_interface, unsigned dst_node,
-			 cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int
-copy_ram_to_opencl(void *src_interface, unsigned src_node,
-		   void *dst_interface, unsigned dst_node)
-{
-	return copy_ram_to_opencl_async(src_interface, src_node,
-					dst_interface, dst_node,
-					NULL);
-}
-static int
-copy_opencl_to_ram(void *src_interface, unsigned src_node,
-		   void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_to_ram_async(src_interface, src_node,
-					dst_interface, dst_node,
-					NULL);
-}
-static int
-copy_opencl_to_opencl(void *src_interface, unsigned src_node,
-		   void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_to_opencl_async(src_interface, src_node,
-					dst_interface, dst_node,
-					NULL);
-}
-#endif /* !STARPU_USE_OPENCL */
-
-static struct starpu_data_copy_methods coo_copy_data_methods =
+static const struct starpu_data_copy_methods coo_copy_data_methods =
 {
 {
-	.ram_to_ram          = copy_ram_to_ram,
-#ifdef STARPU_USE_CUDA
-	.ram_to_cuda         = copy_ram_to_cuda,
-	.cuda_to_ram         = copy_cuda_to_ram,
-	.ram_to_cuda_async   = copy_ram_to_cuda_async,
-	.cuda_to_ram_async   = copy_cuda_to_ram_async,
-	.cuda_to_cuda        = copy_cuda_to_cuda,
-#ifdef NO_STRIDE
-	.cuda_to_cuda_async  = copy_cuda_to_cuda_async,
-#endif
-#else
-#ifdef STARPU_SIMGRID
-#ifdef NO_STRIDE
-	/* Enable GPU-GPU transfers in simgrid */
-	.cuda_to_cuda_async = 1,
-#endif
-#endif
-#endif /* !STARPU_USE_CUDA */
-#ifdef STARPU_USE_OPENCL
-	.ram_to_opencl       = copy_ram_to_opencl,
-	.opencl_to_ram       = copy_opencl_to_ram,
-	.opencl_to_opencl    = copy_opencl_to_opencl,
-	.ram_to_opencl_async = copy_ram_to_opencl_async,
-	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
-#endif /* !STARPU_USE_OPENCL */
+	.any_to_any          = copy_any_to_any,
 };
 };
 
 
 static void
 static void

+ 13 - 220
src/datawizard/interfaces/csr_interface.c

@@ -28,42 +28,11 @@
 #include <starpu_opencl.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-#ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
-#endif
-#ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-#endif
-
-static struct starpu_data_copy_methods csr_copy_data_methods_s =
+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
+
+static const struct starpu_data_copy_methods csr_copy_data_methods_s =
 {
 {
-	.ram_to_ram = copy_ram_to_ram,
-#ifdef STARPU_USE_CUDA
-	.ram_to_cuda = copy_ram_to_cuda,
-	.cuda_to_ram = copy_cuda_to_ram,
-	.cuda_to_cuda = copy_cuda_to_cuda,
-	.ram_to_cuda_async = copy_ram_to_cuda_async,
-	.cuda_to_ram_async = copy_cuda_to_ram_async,
-	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
-#else
-#ifdef STARPU_SIMGRID
-	/* Enable GPU-GPU transfers in simgrid */
-	.cuda_to_cuda_async = 1,
-#endif
-#endif
-#ifdef STARPU_USE_OPENCL
-	.ram_to_opencl = copy_ram_to_opencl,
-	.opencl_to_ram = copy_opencl_to_ram,
-	.opencl_to_opencl = copy_opencl_to_opencl,
-#endif
+	.any_to_any = copy_any_to_any,
 };
 };
 
 
 static void register_csr_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
 static void register_csr_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
@@ -293,188 +262,8 @@ static void free_csr_buffer_on_node(void *data_interface, unsigned node)
 	starpu_free_buffer_on_node(node, (uintptr_t) csr_interface->rowptr, (nrow+1)*sizeof(uint32_t));
 	starpu_free_buffer_on_node(node, (uintptr_t) csr_interface->rowptr, (nrow+1)*sizeof(uint32_t));
 }
 }
 
 
-#ifdef STARPU_USE_CUDA
-static int copy_cuda_async_sync(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind, cudaStream_t stream)
-{
-	struct starpu_csr_interface *src_csr = src_interface;
-	struct starpu_csr_interface *dst_csr = dst_interface;
-
-	uint32_t nnz = src_csr->nnz;
-	uint32_t nrow = src_csr->nrow;
-	size_t elemsize = src_csr->elemsize;
-
-	cudaStream_t sstream = stream;
-	int ret;
-
-	ret = starpu_cuda_copy_async_sync((void *)src_csr->nzval, src_node, (void *)dst_csr->nzval, dst_node, nnz*elemsize, sstream, kind);
-	if (ret == 0) sstream = NULL;
-
-	ret = starpu_cuda_copy_async_sync((void *)src_csr->colind, src_node, (void *)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), sstream, kind);
-	if (ret == 0) sstream = NULL;
-
-	ret = starpu_cuda_copy_async_sync((void *)src_csr->rowptr, src_node, (void *)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), sstream, kind);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
-	return ret;
-}
-
-static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-				void *dst_interface STARPU_ATTRIBUTE_UNUSED, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream STARPU_ATTRIBUTE_UNUSED)
-{
-#ifdef HAVE_CUDA_MEMCPY_PEER
-	struct starpu_csr_interface *src_csr = src_interface;
-	struct starpu_csr_interface *dst_csr = dst_interface;
-
-	uint32_t nnz = src_csr->nnz;
-	uint32_t nrow = src_csr->nrow;
-	size_t elemsize = src_csr->elemsize;
-
-	cudaError_t cures;
-
-	int src_dev = _starpu_memory_node_get_devid(src_node);
-	int dst_dev = _starpu_memory_node_get_devid(dst_node);
-
-	int synchronous_fallback = 0;
-
-	_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-	cures = cudaMemcpyPeerAsync((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize, stream);
-	if (cures)
-	{
-		synchronous_fallback = 1;
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-
-	if (!synchronous_fallback)
-	{
-		cures = cudaMemcpyPeerAsync((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t), stream);
-	}
-
-	if (synchronous_fallback || cures != cudaSuccess)
-	{
-		synchronous_fallback = 1;
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-
-	if (!synchronous_fallback)
-	{
-		cures = cudaMemcpyPeerAsync((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t), stream);
-	}
-
-	if (synchronous_fallback || cures != cudaSuccess)
-	{
-		synchronous_fallback = 1;
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-
-	if (synchronous_fallback)
-	{
-		_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
-		return 0;
-	}
-	else
-	{
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		return -EAGAIN;
-	}
-#else
-	/* Illegal without Peer tranfers */
-	STARPU_ABORT();
-	return 0;
-#endif
-}
-
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, NULL);
-}
-
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, NULL);
-}
-
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, NULL);
-}
-
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
-}
-
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
-}
-
-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	if (src_node == dst_node)
-		return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, stream);
-	else
-		return copy_cuda_peer_async(src_interface, src_node, dst_interface, dst_node, stream);
-}
-
-#endif // STARPU_USE_CUDA
-
-#ifdef STARPU_USE_OPENCL
-static int copy_opencl_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	struct starpu_csr_interface *src_csr = src_interface;
-	struct starpu_csr_interface *dst_csr = dst_interface;
-
-	uint32_t nnz = src_csr->nnz;
-	uint32_t nrow = src_csr->nrow;
-	size_t elemsize = src_csr->elemsize;
-
-        int err;
-
-	err = starpu_opencl_copy_async_sync(src_csr->nzval, src_node, 0, dst_csr->nzval, dst_node, 0, nnz*elemsize, NULL);
-	if (STARPU_UNLIKELY(err))
-                STARPU_OPENCL_REPORT_ERROR(err);
-
-	err = starpu_opencl_copy_async_sync((uintptr_t)src_csr->colind, src_node, 0, (uintptr_t)dst_csr->colind, dst_node, 0, nnz*sizeof(uint32_t), NULL);
-        if (STARPU_UNLIKELY(err))
-                STARPU_OPENCL_REPORT_ERROR(err);
-
-	err = starpu_opencl_copy_async_sync((uintptr_t)src_csr->rowptr, src_node, 0, (uintptr_t)dst_csr->rowptr, dst_node, 0, (nrow+1)*sizeof(uint32_t), NULL);
-	if (STARPU_UNLIKELY(err))
-                STARPU_OPENCL_REPORT_ERROR(err);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
-
-	return 0;
-}
-
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
-}
-
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
-}
-
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
-}
-
-#endif // STARPU_USE_OPENCL
-
 /* as not all platform easily have a BLAS lib installed ... */
 /* as not all platform easily have a BLAS lib installed ... */
-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
 {
 {
 	struct starpu_csr_interface *src_csr = (struct starpu_csr_interface *) src_interface;
 	struct starpu_csr_interface *src_csr = (struct starpu_csr_interface *) src_interface;
 	struct starpu_csr_interface *dst_csr = (struct starpu_csr_interface *) dst_interface;
 	struct starpu_csr_interface *dst_csr = (struct starpu_csr_interface *) dst_interface;
@@ -482,14 +271,18 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 	uint32_t nnz = src_csr->nnz;
 	uint32_t nnz = src_csr->nnz;
 	uint32_t nrow = src_csr->nrow;
 	uint32_t nrow = src_csr->nrow;
 	size_t elemsize = src_csr->elemsize;
 	size_t elemsize = src_csr->elemsize;
+	int ret = 0;
 
 
-	memcpy((void *)dst_csr->nzval, (void *)src_csr->nzval, nnz*elemsize);
+	if (starpu_interface_copy(src_csr->nzval, 0, src_node, dst_csr->nzval, 0, dst_node, nnz*elemsize, async_data))
+		ret = -EAGAIN;
 
 
-	memcpy((void *)dst_csr->colind, (void *)src_csr->colind, nnz*sizeof(uint32_t));
+	if (starpu_interface_copy((uintptr_t)src_csr->colind, 0, src_node, (uintptr_t)dst_csr->colind, 0, dst_node, nnz*sizeof(uint32_t), async_data))
+		ret = -EAGAIN;
 
 
-	memcpy((void *)dst_csr->rowptr, (void *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t));
+	if (starpu_interface_copy((uintptr_t)src_csr->rowptr, 0, src_node, (uintptr_t)dst_csr->rowptr, 0, dst_node, (nrow+1)*sizeof(uint32_t), async_data))
+		ret = -EAGAIN;
 
 
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 
-	return 0;
+	return ret;
 }
 }

+ 2 - 36
src/datawizard/interfaces/data_interface.c

@@ -290,41 +290,6 @@ void starpu_data_register(starpu_data_handle_t *handleptr, unsigned home_node,
 	STARPU_ASSERT(handleptr);
 	STARPU_ASSERT(handleptr);
 	*handleptr = handle;
 	*handleptr = handle;
 
 
-	int asynchronous_copy_disabled = starpu_asynchronous_copy_disabled();
-	if (STARPU_UNLIKELY(asynchronous_copy_disabled))
-	{
-#ifdef STARPU_USE_CUDA
-		ops->copy_methods->ram_to_cuda_async = NULL;
-		ops->copy_methods->cuda_to_ram_async = NULL;
-		ops->copy_methods->cuda_to_cuda_async = NULL;
-#endif
-#ifdef STARPU_USE_OPENCL
-		ops->copy_methods->ram_to_opencl_async = NULL;
-		ops->copy_methods->opencl_to_ram_async = NULL;
-		ops->copy_methods->opencl_to_opencl_async = NULL;
-#endif
-	}
-
-#ifdef STARPU_USE_CUDA
-	int asynchronous_cuda_copy_disabled = starpu_asynchronous_cuda_copy_disabled();
-	if (STARPU_UNLIKELY(asynchronous_cuda_copy_disabled))
-	{
-		ops->copy_methods->ram_to_cuda_async = NULL;
-		ops->copy_methods->cuda_to_ram_async = NULL;
-		ops->copy_methods->cuda_to_cuda_async = NULL;
-	}
-#endif
-
-#ifdef STARPU_USE_OPENCL
-	int asynchronous_opencl_copy_disabled = starpu_asynchronous_opencl_copy_disabled();
-	if (STARPU_UNLIKELY(asynchronous_opencl_copy_disabled))
-	{
-		ops->copy_methods->ram_to_opencl_async = NULL;
-		ops->copy_methods->opencl_to_ram_async = NULL;
-		ops->copy_methods->opencl_to_opencl_async = NULL;
-	}
-#endif
-
 	/* fill the interface fields with the appropriate method */
 	/* fill the interface fields with the appropriate method */
 	STARPU_ASSERT(ops->register_data_handle);
 	STARPU_ASSERT(ops->register_data_handle);
 	ops->register_data_handle(handle, home_node, data_interface);
 	ops->register_data_handle(handle, home_node, data_interface);
@@ -618,7 +583,8 @@ void starpu_data_unregister_no_coherency(starpu_data_handle_t handle)
 	_starpu_data_unregister(handle, 0);
 	_starpu_data_unregister(handle, 0);
 }
 }
 
 
-void starpu_data_unregister_submit(starpu_data_handle_t handle) {
+void starpu_data_unregister_submit(starpu_data_handle_t handle)
+{
 	_starpu_spin_lock(&handle->header_lock);
 	_starpu_spin_lock(&handle->header_lock);
 	STARPU_ASSERT_MSG(!handle->lazy_unregister, "data must not be unregistered twice");
 	STARPU_ASSERT_MSG(!handle->lazy_unregister, "data must not be unregistered twice");
 	handle->lazy_unregister = 1;
 	handle->lazy_unregister = 1;

+ 3 - 3
src/datawizard/interfaces/matrix_interface.c

@@ -48,7 +48,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 #endif
 #endif
 
 
-static struct starpu_data_copy_methods matrix_copy_data_methods_s =
+static const struct starpu_data_copy_methods matrix_copy_data_methods_s =
 {
 {
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_ram = copy_ram_to_ram,
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -516,8 +516,8 @@ static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_
 
 
 	STARPU_ASSERT_MSG((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx), "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
 	STARPU_ASSERT_MSG((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx), "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
 
 
-	ret = starpu_opencl_copy_async_sync(src_matrix->dev_handle, src_node, src_matrix->offset,
-					    dst_matrix->dev_handle, dst_node, dst_matrix->offset,
+	ret = starpu_opencl_copy_async_sync(src_matrix->dev_handle, src_matrix->offset, src_node,
+					    dst_matrix->dev_handle, dst_matrix->offset, dst_node,
 					    src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
 					    src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
 					    event);
 					    event);
 
 

+ 1 - 1
src/datawizard/interfaces/multiformat_interface.c

@@ -42,7 +42,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cl_event *event);
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cl_event *event);
 #endif
 #endif
 
 
-static struct starpu_data_copy_methods multiformat_copy_data_methods_s =
+static const struct starpu_data_copy_methods multiformat_copy_data_methods_s =
 {
 {
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_ram = copy_ram_to_ram,
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA

+ 0 - 0
src/datawizard/interfaces/variable_interface.c


Některé soubory nejsou zobrazeny, neboť je v těchto rozdílových datech změněno mnoho souborů