Marc Sergent лет назад: 13
Родитель
Сommit
f4fb7d745d
100 измененных файлов с 965 добавлено и 1272 удалено
  1. 12 9
      ChangeLog
  2. 6 6
      Makefile.am
  3. 21 1
      configure.ac
  4. 7 4
      doc/Makefile.am
  5. 53 60
      doc/chapters/advanced-api.texi
  6. 19 9
      doc/chapters/basic-api.texi
  7. 14 2
      doc/chapters/configuration.texi
  8. 44 1
      doc/chapters/mpi-support.texi
  9. 17 7
      doc/chapters/perf-feedback.texi
  10. 6 1
      doc/chapters/perf-optimization.texi
  11. 4 4
      doc/chapters/sched_ctx_hypervisor.texi
  12. 1 1
      examples/Makefile.am
  13. 2 2
      examples/basic_examples/block_cpu.c
  14. 58 0
      examples/cholesky/cholesky.h
  15. 10 1
      examples/cholesky/cholesky_grain_tag.c
  16. 6 2
      examples/cholesky/cholesky_implicit.c
  17. 10 1
      examples/cholesky/cholesky_tag.c
  18. 10 1
      examples/cholesky/cholesky_tile_tag.c
  19. 2 2
      examples/filters/custom_mf/custom_conversion_codelets.c
  20. 4 3
      examples/filters/custom_mf/custom_interface.c
  21. 2 2
      examples/filters/fblock_cpu.c
  22. 8 8
      examples/filters/fblock_opencl.c
  23. 3 3
      examples/filters/fmatrix.c
  24. 2 2
      examples/filters/fvector.c
  25. 3 3
      examples/filters/shadow.c
  26. 19 128
      examples/interface/complex_interface.c
  27. 3 3
      examples/ppm_downscaler/ppm_downscaler.c
  28. 2 2
      examples/profiling/profiling.c
  29. 2 2
      examples/sched_ctx/sched_ctx.c
  30. 1 1
      examples/sched_ctx_utils/sched_ctx_utils.c
  31. 4 4
      examples/scheduler/dummy_sched.c
  32. 2 2
      examples/stencil/Makefile.am
  33. 1 1
      examples/stencil/life.c
  34. 5 5
      examples/stencil/stencil-blocks.c
  35. 11 11
      examples/stencil/stencil-tasks.c
  36. 4 4
      examples/stencil/stencil.c
  37. 6 6
      examples/stencil/stencil.h
  38. 1 1
      examples/tag_example/tag_example2.c
  39. 1 2
      include/starpu_cuda.h
  40. 1 1
      include/starpu_data.h
  41. 6 1
      include/starpu_data_interfaces.h
  42. 2 3
      include/starpu_opencl.h
  43. 7 1
      include/starpu_perfmodel.h
  44. 21 25
      include/starpu_sched_ctx.h
  45. 3 0
      include/starpu_task.h
  46. 2 2
      include/starpu_task_util.h
  47. 4 4
      include/starpu_util.h
  48. 1 1
      mpi/examples/Makefile.am
  49. 1 2
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
  50. 2 2
      mpi/examples/matrix_decomposition/mpi_decomposition_params.c
  51. 3 3
      mpi/examples/mpi_lu/plu_example.c
  52. 2 2
      mpi/examples/mpi_lu/pxlu.c
  53. 1 1
      mpi/src/Makefile.am
  54. 14 15
      mpi/src/starpu_mpi.c
  55. 2 2
      mpi/src/starpu_mpi_stats.c
  56. 1 1
      mpi/tests/Makefile.am
  57. 3 3
      mpi/tests/mpi_detached_tag.c
  58. 3 3
      mpi/tests/mpi_irecv.c
  59. 3 3
      mpi/tests/mpi_irecv_detached.c
  60. 3 3
      mpi/tests/mpi_isend.c
  61. 3 3
      mpi/tests/mpi_isend_detached.c
  62. 3 3
      mpi/tests/mpi_probe.c
  63. 3 3
      mpi/tests/mpi_test.c
  64. 3 3
      mpi/tests/pingpong.c
  65. 8 8
      mpi/tests/ring.c
  66. 8 8
      mpi/tests/ring_async.c
  67. 8 8
      mpi/tests/ring_async_implicit.c
  68. 2 2
      sched_ctx_hypervisor/examples/Makefile.am
  69. 3 3
      sched_ctx_hypervisor/examples/app_driven_test/app_driven_test.c
  70. 4 4
      sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c
  71. 1 0
      sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.h
  72. 7 1
      sched_ctx_hypervisor/include/sched_ctx_hypervisor.h
  73. 2 4
      sched_ctx_hypervisor/src/Makefile.am
  74. 9 9
      sched_ctx_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c
  75. 1 1
      sched_ctx_hypervisor/src/hypervisor_policies/ispeed_policy.c
  76. 13 6
      sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.c
  77. 26 19
      sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c
  78. 25 22
      sched_ctx_hypervisor/src/sched_ctx_hypervisor.c
  79. 1 1
      socl/examples/Makefile.am
  80. 7 4
      socl/src/cl_enqueuendrangekernel.c
  81. 1 1
      src/Makefile.am
  82. 10 2
      src/core/jobs.c
  83. 14 4
      src/core/perfmodel/perfmodel_bus.c
  84. 31 14
      src/core/perfmodel/perfmodel_history.c
  85. 4 4
      src/core/perfmodel/perfmodel_print.c
  86. 35 30
      src/core/sched_ctx.c
  87. 2 2
      src/core/sched_ctx.h
  88. 15 16
      src/core/sched_policy.c
  89. 12 2
      src/core/task.c
  90. 24 24
      src/core/workers.c
  91. 1 0
      src/datawizard/coherency.c
  92. 147 35
      src/datawizard/copy_driver.c
  93. 14 127
      src/datawizard/interfaces/bcsr_interface.c
  94. 9 7
      src/datawizard/interfaces/block_interface.c
  95. 19 247
      src/datawizard/interfaces/coo_interface.c
  96. 13 220
      src/datawizard/interfaces/csr_interface.c
  97. 2 36
      src/datawizard/interfaces/data_interface.c
  98. 3 3
      src/datawizard/interfaces/matrix_interface.c
  99. 1 1
      src/datawizard/interfaces/multiformat_interface.c
  100. 0 0
      src/datawizard/interfaces/variable_interface.c

+ 12 - 9
ChangeLog

@@ -93,6 +93,18 @@ New features:
   * Introduce new variables STARPU_LIMIT_CUDA_devid_MEM and
     STARPU_LIMIT_OPENCL_devid_MEM to limit memory per specific device
 
+Small features:
+  * Add starpu_worker_get_by_type and starpu_worker_get_by_devid
+  * Add starpu_fxt_stop_profiling/starpu_fxt_start_profiling which permits to
+    pause trace recording.
+  * Add trace_buffer_size configuration field to permit to specify the tracing
+    buffer size.
+  * Add starpu_codelet_profile and starpu_codelet_histo_profile, tools which draw
+    the profile of a codelet.
+  * File STARPU-REVISION --- containing the SVN revision number from which
+    StarPU was compiled --- is installed in the share/doc/starpu directory
+  * starpu_perfmodel_plot can now directly draw GFlops curves.
+
 Changes:
   * Fix the block filter functions.
   * Fix StarPU-MPI on Darwin.
@@ -127,15 +139,6 @@ Changes:
   * StarPU can now use poti to generate paje traces.
   * Rename scheduling policy "parallel greedy" to "parallel eager"
 
-Small features:
-  * Add starpu_worker_get_by_type and starpu_worker_get_by_devid
-  * Add starpu_fxt_stop_profiling/starpu_fxt_start_profiling which permits to
-  pause trace recording.
-  * Add trace_buffer_size configuration field to permit to specify the tracing
-  buffer size.
-  * Add starpu_codelet_profile and starpu_codelet_histo_profile, tools which draw
-  the profile of a codelet.
-
 Small changes:
   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is
 	still available for compatibility reasons.

+ 6 - 6
Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2012  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -91,7 +91,7 @@ all-local:
 	cd starpu-top ; $(QMAKE) ; $(MAKE)
 clean-local:
 	cd starpu-top ; $(QMAKE) ; $(MAKE) clean ; $(RM) Makefile
-	$(RM) starpu_top.1 starpu-top/starpu_top
+	$(RM) starpu-top/starpu_top.1 starpu-top/starpu_top
 # TODO: resources
 install-exec-local:
 	$(MKDIR_P) $(DESTDIR)$(bindir)
@@ -102,10 +102,10 @@ uninstall-local:
 	$(RM) starpu-top/Makefile
 
 if STARPU_HAVE_HELP2MAN
-starpu_top.1: starpu-top/starpu_top$(EXEEXT)
+starpu-top/starpu_top.1: starpu-top/starpu_top$(EXEEXT)
 	help2man --no-discard-stderr -N --output=$@ starpu-top/starpu_top$(EXEEXT)
 dist_man1_MANS =\
-	starpu_top.1
+	starpu-top/starpu_top.1
 endif
 endif
 
@@ -114,8 +114,8 @@ txtdir = ${prefix}
 else
 txtdir = ${docdir}
 endif
-txt_DATA = AUTHORS COPYING.LGPL README
-EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION build-aux/svn2cl.xsl
+txt_DATA = AUTHORS COPYING.LGPL README STARPU-REVISION
+EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION STARPU-REVISION build-aux/svn2cl.xsl
 
 include starpu-top/extradist
 

+ 21 - 1
configure.ac

@@ -81,6 +81,21 @@ AC_C_RESTRICT
 # Check if bash is available
 AC_CHECK_PROGS([BASH], [bash])
 
+# Check whether subversion is installed
+AC_PATH_PROG(svnversioncommand, svnversion)
+
+# use svnversion to record the current repository revision only if
+# subversion is installed and we are in a working copy
+if test "$svnversioncommand" = "" || test `LC_ALL=C $svnversioncommand -n $srcdir` = "exported" ; then
+   if test -f $srcdir/STARPU-REVISION ; then
+      cp $srcdir/STARPU-REVISION .
+   else
+      echo "unknown" > ./STARPU-REVISION
+   fi
+else
+   LC_ALL=C svnversion $srcdir > ./STARPU-REVISION
+fi
+
 AC_PATH_PROGS([STARPU_MS_LIB], [lib])
 AC_ARG_VAR([STARPU_MS_LIB], [Path to Microsoft's Visual Studio `lib' tool])
 AM_CONDITIONAL([STARPU_HAVE_MS_LIB], [test "x$STARPU_MS_LIB" != "x"])
@@ -1327,12 +1342,17 @@ AC_DEFUN([IS_SUPPORTED_CFLAG],
 IS_SUPPORTED_CFLAG(-W)
 IS_SUPPORTED_CFLAG(-Wall)
 IS_SUPPORTED_CFLAG(-Wextra)
-AC_SUBST(GLOBAL_AM_CFLAGS)
+IS_SUPPORTED_CFLAG(-Werror=implicit)
 
 if test "x$STARPU_DEVEL" != x; then
 	AC_DEFINE(STARPU_DEVEL, [1], [enable developer warnings])
+	IS_SUPPORTED_CFLAG(-Wunused)
+	IS_SUPPORTED_CFLAG(-Wundef)
+	IS_SUPPORTED_CFLAG(-Wshadow)
 fi
 
+AC_SUBST(GLOBAL_AM_CFLAGS)
+
 # Same value as Automake's, for use in other places.
 pkglibdir="\${libdir}/$PACKAGE"
 AC_SUBST([pkglibdir])

+ 7 - 4
doc/Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009, 2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 #
 # Permission is granted to copy, distribute and/or modify this document
 # under the terms of the GNU Free Documentation License, Version 1.3
@@ -12,7 +12,7 @@
 
 info_TEXINFOS = starpu.texi
 
-starpu_TEXINFOS = chapters/advanced-api.texi \
+chapters =	chapters/advanced-api.texi \
 	chapters/benchmarks.texi \
 	chapters/configuration.texi \
 	chapters/perf-feedback.texi \
@@ -35,9 +35,12 @@ starpu_TEXINFOS = chapters/advanced-api.texi \
 	chapters/using.texi \
 	chapters/vector_scal_opencl.texi \
 	chapters/socl.texi \
-	chapters/version.texi \
 	chapters/sched_ctx_hypervisor.texi
 
+starpu_TEXINFOS = 		\
+	chapters/version.texi 	\
+	$(chapters)
+
 MAINTAINERCLEANFILES = starpu.pdf starpu.html
 
 EXTRA_DIST = starpu.css
@@ -50,7 +53,7 @@ AM_MAKEINFOHTMLFLAGS = --css-include=$(top_srcdir)/doc/starpu.css --no-headers -
 uninstall-local:
 	$(RM) $(DESTDIR)$(infodir)/dir
 
-chapters/version.texi:
+chapters/version.texi: $(chapters)
 	@-for f in $(starpu_TEXINFOS) ; do \
                 if test -f $(top_srcdir)/doc/$$f ; then stat --format=%Y $(top_srcdir)/doc/$$f 2>/dev/null ; fi \
         done | sort -r | head -1 > timestamp

+ 53 - 60
doc/chapters/advanced-api.texi

@@ -38,7 +38,7 @@ The arguments following the codelets can be of the following types:
 @item
 the specific values @code{STARPU_VALUE}, @code{STARPU_CALLBACK},
 @code{STARPU_CALLBACK_ARG}, @code{STARPU_CALLBACK_WITH_ARG},
-@code{STARPU_PRIORITY}, @code{STARPU_TAG}, followed by the appropriated objects
+@code{STARPU_PRIORITY}, @code{STARPU_TAG}, @code{STARPU_FLOPS}, followed by the appropriated objects
 as defined below.
 @end itemize
 
@@ -85,6 +85,12 @@ this macro is used when calling @code{starpu_insert_task}, and must be
 followed by a tag.
 @end defmac
 
+@defmac STARPU_FLOPS
+this macro is used when calling @code{starpu_insert_task}, and must be followed
+by an amount of floating point operations, as a double. The user may have to
+explicitly cast into double, otherwise parameter passing will not work.
+@end defmac
+
 @deftypefun void starpu_codelet_pack_args ({char **}@var{arg_buffer}, {size_t *}@var{arg_buffer_size}, ...)
 Pack arguments of type @code{STARPU_VALUE} into a buffer which can be
 given to a codelet and later unpacked with the function
@@ -165,24 +171,6 @@ to the world size. Communications statistics must be enabled
 @node Communication
 @subsection Communication
 
-The standard point to point communications of MPI have been
-implemented. The semantic is similar to the MPI one, but adapted to
-the DSM provided by StarPU. A MPI request will only be submitted when
-the data is available in the main memory of the node submitting the
-request.
-
-There is two types of asynchronous communications: the classic
-asynchronous communications and the detached communications. The
-classic asynchronous communications (@code{starpu_mpi_isend} and
-@code{starpu_mpi_irecv}) need to be followed by a call to
-@code{starpu_mpi_wait} or to @code{starpu_mpi_test} to wait for or to
-test the completion of the communication. Waiting for or testing the
-completion of detached communications is not possible, this is done
-internally by StarPU-MPI, on completion, the resources are
-automatically released. This mechanism is similar to the pthread
-detach state attribute which determines whether a thread will be
-created in a joinable or a detached state.
-
 @deftypefun int starpu_mpi_send (starpu_data_handle_t @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm})
 Performs a standard-mode, blocking send of @var{data_handle} to the
 node @var{dest} using the message tag @code{mpi_tag} within the
@@ -354,51 +342,56 @@ Unpack the data handle from the contiguous buffer at the address @code{ptr} of s
 @end deftp
 
 @deftp {Data Type} {struct starpu_data_copy_methods}
-Defines the per-interface methods.
+Defines the per-interface methods. If the @code{any_to_any} method is provided,
+it will be used by default if no more specific method is provided. It can still
+be useful to provide more specific method in case of e.g. available particular
+CUDA or OpenCL support.
+
 @table @asis
-@item @code{int @{ram,cuda,opencl@}_to_@{ram,cuda,opencl@}(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
+@item @code{int (*@{ram,cuda,opencl@}_to_@{ram,cuda,opencl@})(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
 These 12 functions define how to copy data from the @var{src_interface}
 interface on the @var{src_node} node to the @var{dst_interface} interface
 on the @var{dst_node} node. They return 0 on success.
 
-@item @code{int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
-Define how to copy data from the @var{src_interface} interface on the
-@var{src_node} node (in RAM) to the @var{dst_interface} interface on the
-@var{dst_node} node (on a CUDA device), using the given @var{stream}. Return 0
-on success.
-
-@item @code{int (*cuda_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
-Define how to copy data from the @var{src_interface} interface on the
-@var{src_node} node (on a CUDA device) to the @var{dst_interface} interface on the
-@var{dst_node} node (in RAM), using the given @var{stream}. Return 0
-on success.
-
-@item @code{int (*cuda_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
+@item @code{int (*@{ram,cuda@}_to_@{ram,cuda@}_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
+These 3 functions (@code{ram_to_ram} is not among these) define how to copy
+data from the @var{src_interface} interface on the @var{src_node} node to the
+@var{dst_interface} interface on the @var{dst_node} node, using the given
+@var{stream}. Must return 0 if the transfer was actually completed completely
+synchronously, or -EAGAIN if at least some transfers are still ongoing and
+should be awaited for by the core.
+
+@item @code{int (*@{ram,opencl@}_to_@{ram,opencl@}_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
+These 3 functions (@code{ram_to_ram} is not among them) define how to copy
+data from the @var{src_interface} interface on the @var{src_node} node to the
+@var{dst_interface} interface on the @var{dst_node} node, by recording in
+@var{event}, a pointer to a cl_event, the event of the last submitted transfer.
+Must return 0 if the transfer was actually completed completely synchronously,
+or -EAGAIN if at least some transfers are still ongoing and should be awaited
+for by the core.
+
+@item @code{int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)}
 Define how to copy data from the @var{src_interface} interface on the
-@var{src_node} node (on a CUDA device) to the @var{dst_interface} interface on
-the @var{dst_node} node (on another CUDA device), using the given @var{stream}.
-Return 0 on success.
+@var{src_node} node to the @var{dst_interface} interface on the @var{dst_node}
+node. This is meant to be implemented through the @var{starpu_interface_copy}
+helper, to which @var{async_data} should be passed as such, and will be used to
+manage asynchronicity. This must return -EAGAIN if any of the
+@var{starpu_interface_copy} calls has returned -EAGAIN (i.e. at least some
+transfer is still ongoing), and return 0 otherwise.
 
-@item @code{int (*ram_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
-Define how to copy data from the @var{src_interface} interface on the
-@var{src_node} node (in RAM) to the @var{dst_interface} interface on the
-@var{dst_node} node (on an OpenCL device), using @var{event}, a pointer to a
-cl_event. Return 0 on success.
-
-@item @code{int (*opencl_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
-Define how to copy data from the @var{src_interface} interface on the
-@var{src_node} node (on an OpenCL device) to the @var{dst_interface} interface
-on the @var{dst_node} node (in RAM), using the given @var{event}, a pointer to
-a cl_event. Return 0 on success.
-
-@item @code{int (*opencl_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
-Define how to copy data from the @var{src_interface} interface on the
-@var{src_node} node (on an OpenCL device) to the @var{dst_interface} interface
-on the @var{dst_node} node (on another OpenCL device), using the given
-@var{event}, a pointer to a cl_event. Return 0 on success.
 @end table
 @end deftp
 
+@deftypefun int starpu_interface_copy (uintptr_t @var{src}, size_t @var{src_offset}, unsigned @var{src_node}, uintptr_t @var{dst}, size_t @var{dst_offset}, unsigned @var{dst_node}, size_t @var{size}, {void *}@var{async_data})
+Copy @var{size} bytes from byte offset @var{src_offset} of @var{src} on
+@var{src_node} to byte offset @var{dst_offset} of @var{dst} on @var{dst_node}.
+This is to be used in the @var{any_to_any} copy method, which is provided with
+the @var{async_data} to be pased to @var{starpu_interface_copy}. this returns
+-EAGAIN if the transfer is still ongoing, or 0 if the transfer is already
+completed.
+@end deftypefun
+
+
 @deftypefun uint32_t starpu_crc32_be_n ({void *}@var{input}, size_t @var{n}, uint32_t @var{inputcrc})
 Compute the CRC of a byte buffer seeded by the inputcrc "current
 state". The return value should be considered as the new "current
@@ -457,7 +450,7 @@ void starpu_complex_data_register(starpu_data_handle_t *handle,
                 .nx = nx
         @};
 
-        if (interface_complex_ops.interfaceid == -1)
+        if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
         @{
                 interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
         @}
@@ -483,7 +476,7 @@ static struct starpu_data_interface_ops interface_complex_ops =
         .copy_methods = &complex_copy_methods,
         .get_size = complex_get_size,
         .footprint = complex_footprint,
-        .interfaceid = -1,
+        .interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
         .interface_size = sizeof(struct starpu_complex_interface),
 @};
 @end smallexample
@@ -837,7 +830,7 @@ The number of workerids
 @item @code{pthread_key_t cursor_key} (optional)
 The cursor needed to iterate the collection (depending on the data structure)
 @item @code{int type}
-The type of structure (currently STARPU_WORKER_LIST is the only one available)
+The type of structure (currently STARPU_SCHED_CTX_WORKER_LIST is the only one available)
 @item @code{unsigned (*has_next)(struct starpu_sched_ctx_worker_collection *workers)}
 Checks if there is a next worker
 @item @code{int (*get_next)(struct starpu_sched_ctx_worker_collection *workers)}
@@ -870,15 +863,15 @@ Delete the worker collection of the specified scheduling context
 Return the worker collection managed by the indicated context
 @end deftypefun
 
-@deftypefun pthread_mutex_t* starpu_get_changing_ctx_mutex (unsigned @var{sched_ctx_id})
+@deftypefun pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex (unsigned @var{sched_ctx_id})
 TODO
 @end deftypefun
 
-@deftypefun void starpu_task_set_context (unsigned *@var{sched_ctx_id})
+@deftypefun void starpu_sched_ctx_set_context (unsigned *@var{sched_ctx_id})
 Set the scheduling context the subsequent tasks will be submitted to
 @end deftypefun
 
-@deftypefun unsigned starpu_task_get_context (void)
+@deftypefun unsigned starpu_sched_ctx_get_context (void)
 Return the scheduling context the tasks are currently submitted to
 @end deftypefun
 

+ 19 - 9
doc/chapters/basic-api.texi

@@ -1849,6 +1849,11 @@ A pointer to the next task. This should only be used by StarPU.
 This is only used for tasks that use multiformat handle. This should only be
 used by StarPU.
 
+@item @code{double flops}
+This can be set to the number of floating points operations that the task
+will have to achieve. This is useful for easily getting GFlops curves from
+@code{starpu_perfmodel_plot}, and for the hypervisor load balancing.
+
 @item @code{void *starpu_private}
 This is private to StarPU, do not modify. If the task is allocated by hand
 (without starpu_task_create), this field should be set to NULL.
@@ -1857,6 +1862,7 @@ This is private to StarPU, do not modify. If the task is allocated by hand
 This field is set when initializing a task. It prevents a task from being
 submitted if it has not been properly initialized.
 @end table
+
 @end deftp
 
 @deftypefun void starpu_task_init ({struct starpu_task} *@var{task})
@@ -1939,6 +1945,18 @@ This function blocks until all the tasks that were submitted are terminated. It
 does not destroy these tasks.
 @end deftypefun
 
+@deftypefun int starpu_task_nready (void)
+@end deftypefun
+
+@deftypefun int starpu_task_nsubmitted (void)
+Return the number of submitted tasks which have not completed yet.
+@end deftypefun
+
+@deftypefun int starpu_task_nready (void)
+Return the number of submitted tasks which are ready for execution are already
+executing. It thus does not include tasks waiting for dependencies.
+@end deftypefun
+
 @deftypefun {struct starpu_task *} starpu_task_get_current (void)
 This function returns the task currently executed by the worker, or
 NULL if it is called either from a thread that is not a task or simply
@@ -2489,10 +2507,6 @@ This function returns a pointer to device properties for worker @var{workerid}
 (assumed to be a CUDA worker).
 @end deftypefun
 
-@deftypefun size_t starpu_cuda_get_global_mem_size (unsigned @var{devid})
-Return the size of the global memory of CUDA device @var{devid}.
-@end deftypefun
-
 @deftypefun void starpu_cuda_report_error ({const char *}@var{func}, {const char *}@var{file}, int @var{line}, cudaError_t @var{status})
 Report a CUDA error.
 @end deftypefun
@@ -2560,10 +2574,6 @@ OpenCL as shown in @ref{Full source code for the 'Scaling a Vector' example}.
 @node Writing OpenCL kernels
 @subsection Writing OpenCL kernels
 
-@deftypefun size_t starpu_opencl_get_global_mem_size (int @var{devid})
-Return the size of global device memory in bytes.
-@end deftypefun
-
 @deftypefun void starpu_opencl_get_context (int @var{devid}, {cl_context *}@var{context})
 Places the OpenCL context of the device designated by @var{devid} into @var{context}.
 @end deftypefun
@@ -2780,7 +2790,7 @@ otherwise. The integer pointed to by @var{ret} is set to -EAGAIN if the asynchro
 was successful, or to 0 if event was NULL.
 @end deftypefun
 
-@deftypefun cl_int starpu_opencl_copy_async_sync (cl_mem @var{src}, unsigned @var{src_node}, size_t @var{src_offset}, cl_mem @var{dst}, unsigned @var{dst_node}, size_t @var{dst_offset}, size_t @var{size}, {cl_event *}@var{event})
+@deftypefun cl_int starpu_opencl_copy_async_sync (uintptr_t @var{src}, size_t @var{src_offset}, unsigned @var{src_node}, uintptr_t @var{dst}, size_t @var{dst_offset}, unsigned @var{dst_node}, size_t @var{size}, {cl_event *}@var{event})
 Copy @var{size} bytes from byte offset @var{src_offset} of
 @var{src} on @var{src_node} to byte offset @var{dst_offset} of @var{dst} on
 @var{dst_node}. if @var{event} is NULL, the copy is synchronous, i.e the queue is

+ 14 - 2
doc/chapters/configuration.texi

@@ -417,8 +417,20 @@ the @code{starpu_conf} structure passed to @code{starpu_init} is set.
 
 @defvr {Environment variable} @code{STARPU_SINGLE_COMBINED_WORKER}
 If set, StarPU will create several workers which won't be able to work
-concurrently. It will create combined workers which size goes from 1 to the
-total number of CPU workers in the system.
+concurrently. It will by default create combined workers which size goes from 1
+to the total number of CPU workers in the system. @code{STARPU_MIN_WORKERSIZE}
+and @code{STARPU_MAX_WORKERSIZE} can be used to change this default.
+@end defvr
+
+@defvr {Environment variable} @code{STARPU_MIN_WORKERSIZE}
+When @code{STARPU_SINGLE_COMBINED_WORKER} is set, @code{STARPU_MIN_WORKERSIZE}
+permits to specify the minimum size of the combined workers (instead of the default 1)
+@end defvr
+
+@defvr {Environment variable} @code{STARPU_MAX_WORKERSIZE}
+When @code{STARPU_SINGLE_COMBINED_WORKER} is set, @code{STARPU_MAX_WORKERSIZE}
+permits to specify the minimum size of the combined workers (instead of the
+number of CPU workers in the system)
 @end defvr
 
 @defvr {Environment variable} STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER

+ 44 - 1
doc/chapters/mpi-support.texi

@@ -21,6 +21,7 @@ according to the task graph and an application-provided distribution.
 
 @menu
 * Simple Example::
+* Point to point communication::
 * Exchanging User Defined Data Interface::
 * MPI Insert Task Utility::
 * MPI Collective Operations::
@@ -120,7 +121,49 @@ int main(int argc, char **argv)
 @end smallexample
 @end cartouche
 
-@page
+@node Point to point communication
+@section Point to point communication
+
+The standard point to point communications of MPI have been
+implemented. The semantic is similar to the MPI one, but adapted to
+the DSM provided by StarPU. A MPI request will only be submitted when
+the data is available in the main memory of the node submitting the
+request.
+
+There is two types of asynchronous communications: the classic
+asynchronous communications and the detached communications. The
+classic asynchronous communications (@code{starpu_mpi_isend} and
+@code{starpu_mpi_irecv}) need to be followed by a call to
+@code{starpu_mpi_wait} or to @code{starpu_mpi_test} to wait for or to
+test the completion of the communication. Waiting for or testing the
+completion of detached communications is not possible, this is done
+internally by StarPU-MPI, on completion, the resources are
+automatically released. This mechanism is similar to the pthread
+detach state attribute which determines whether a thread will be
+created in a joinable or a detached state.
+
+For any communication, the call of the function will result in the
+creation of a StarPU-MPI request, the function
+@code{starpu_data_acquire_cb} is then called to asynchronously request
+StarPU to fetch the data in main memory; when the data is available in
+main memory, a StarPU-MPI function is called to put the new request in
+the list of the ready requests.
+
+The StarPU-MPI progression thread regularly polls this list of ready
+requests. For each new ready request, the appropriate function is
+called to post the corresponding MPI call. For example, calling
+@code{starpu_mpi_isend} will result in posting @code{MPI_Isend}. If
+the request is marked as detached, the request will be put in the list
+of detached requests.
+
+The StarPU-MPI progression thread also polls the list of detached
+requests. For each detached request, it regularly tests the completion
+of the MPI request by calling @code{MPI_Test}. On completion, the data
+handle is released, and if a callback was defined, it is called.
+
+@ref{Communication} gives the list of all the point to point
+communications defined in StarPU-MPI.
+
 @node Exchanging User Defined Data Interface
 @section Exchanging User Defined Data Interface
 

+ 17 - 7
doc/chapters/perf-feedback.texi

@@ -411,7 +411,7 @@ display the regression formula, and in the case of non-linear regression, the
 same performance log as for history-based performance models:
 
 @example
-$ starpu_perfmodel_display -s non_linear_memset_regression_based.type
+$ starpu_perfmodel_display -s non_linear_memset_regression_based
 performance model for cpu_impl_0
 	Regression : #sample = 1400
 	Linear: y = alpha size ^ beta
@@ -429,15 +429,25 @@ a3d3725e	4096           	4.763200e+00   	7.650928e-01   	100
 ...
 @end example
 
-The @code{starpu_perfmodel_plot} tool can be used to draw performance models.
-It writes a @code{.gp} file in the current directory, to be run in the
-@code{gnuplot} tool, which shows the corresponding curve.
-
 The same can also be achieved by using StarPU's library API, see
 @ref{Performance Model API} and notably the @code{starpu_perfmodel_load_symbol}
 function. The source code of the @code{starpu_perfmodel_display} tool can be a
 useful example.
 
+The @code{starpu_perfmodel_plot} tool can be used to draw performance models.
+It writes a @code{.gp} file in the current directory, to be run in the
+@code{gnuplot} tool, which shows the corresponding curve.
+
+When the @code{flops} field of tasks is set, @code{starpu_perfmodel_plot} can
+directly draw a GFlops curve, by simply adding the @code{-f} option:
+
+@example
+$ starpu_perfmodel_display -f -s chol_model_11
+@end example
+
+This will however disable displaying the regression model, for which we can not
+compute GFlops.
+
 When the FxT trace file @code{filename} has been generated, it is possible to
 get a profiling of each codelet by calling:
 @example
@@ -453,10 +463,10 @@ This is also available in the @code{starpu_perfmodel_plot} tool, by passing it
 the fxt trace:
 
 @example
-$ starpu_perfmodel_display -s non_linear_memset_regression_based.type -i /tmp/prof_file_foo_0
+$ starpu_perfmodel_plot -s non_linear_memset_regression_based -i /tmp/prof_file_foo_0
 @end example
 
-It willd produce a @code{.gp} file which contains both the performance model
+It will produce a @code{.gp} file which contains both the performance model
 curves, and the profiling measurements.
 
 If you have the R statistical tool installed, you can additionally use

+ 6 - 1
doc/chapters/perf-optimization.texi

@@ -228,7 +228,7 @@ int workerids[3] = @{1, 3, 10@};
 int id_ctx = starpu_sched_ctx_create("heft", workerids, 3, "my_ctx");
 
 /* @b{let StarPU know that the folowing tasks will be submitted to this context} */
-starpu_task_set_context(id);
+starpu_sched_ctx_set_task_context(id);
 
 /* @b{submit the task to StarPU} */
 starpu_task_submit(task);
@@ -548,6 +548,11 @@ The number of devices can be chosen as usual with @code{STARPU_NCPU},
 cpus can be arbitrarily chosen. The number of CUDA and OpenCL devices have to be
 lower than the real number on the current machine.
 
+The amount of simulated GPU memory is for now unbound by default, but
+it can be chosen by hand through the @code{STARPU_LIMIT_CUDA_MEM},
+@code{STARPU_LIMIT_CUDA_devid_MEM}, @code{STARPU_LIMIT_OPENCL_MEM}, and
+@code{STARPU_LIMIT_OPENCL_devid_MEM} environment variables.
+
 The Simgrid default stack size is small; to increase it use the
 parameter @code{--cfg=contexts/stack_size}, for example:
 

+ 4 - 4
doc/chapters/sched_ctx_hypervisor.texi

@@ -27,7 +27,7 @@ Basic strategies of resizing scheduling contexts already exist but a platform fo
 @section Managing the hypervisor
 There is a single hypervisor that is in charge of resizing contexts and the resizing strategy is chosen at the initialization of the hypervisor. A single resize can be done at a time.
 
-@deftypefun {struct starpu_performance_counters *} sched_ctx_hypervisor_init ({struct sched_ctx_hypervisor_policy *} @var{policy})
+@deftypefun {struct starpu_sched_ctx_performance_counters *} sched_ctx_hypervisor_init ({struct sched_ctx_hypervisor_policy *} @var{policy})
 Initializes the hypervisor to use the strategy provided as parameter and creates the performance counters (see @pxref{Performance Counters}).
 These performance counters represent actually some callbacks that will be used by the contexts to notify the information needed by the hypervisor.
 @end deftypefun
@@ -200,7 +200,7 @@ or
 @smallexample
 starpu_insert_task(&codelet,
                     ...,
-                    STARPU_FLOPS, 100,
+                    STARPU_FLOPS, (double) 100,
                     0);
 @end smallexample
 @end cartouche
@@ -210,8 +210,8 @@ starpu_insert_task(&codelet,
 
 The Scheduling Context Hypervisor Plugin provides a series of performance counters to StarPU. By incrementing them, StarPU can help the hypervisor in the resizing decision making process.
 
-@deftp {Data Type} {struct starpu_performance_counters}
-@anchor{struct starpu_performance_counters}
+@deftp {Data Type} {struct starpu_sched_ctx_performance_counters}
+@anchor{struct starpu_sched_ctx_performance_counters}
 
 @table @asis
 @item @code{void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time)}

+ 1 - 1
examples/Makefile.am

@@ -16,7 +16,7 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) -Werror=implicit
+AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
 AM_CXXFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include

+ 2 - 2
examples/basic_examples/block_cpu.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,7 +26,7 @@ void cpu_codelet(void *descr[], void *_args)
         unsigned ldy = STARPU_BLOCK_GET_LDY(descr[0]);
         unsigned ldz = STARPU_BLOCK_GET_LDZ(descr[0]);
         float *multiplier = (float *)_args;
-        unsigned i, j, k;
+        int i, j, k;
 
         for(k=0; k<nz ; k++)
 	{

+ 58 - 0
examples/cholesky/cholesky.h

@@ -55,6 +55,64 @@
 #define BLAS3_FLOP(n1,n2,n3)    \
         (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
 
+/* This is from magma
+
+  -- Innovative Computing Laboratory
+  -- Electrical Engineering and Computer Science Department
+  -- University of Tennessee
+  -- (C) Copyright 2009
+
+  Redistribution  and  use  in  source and binary forms, with or without
+  modification,  are  permitted  provided  that the following conditions
+  are met:
+
+  * Redistributions  of  source  code  must  retain  the above copyright
+    notice,  this  list  of  conditions  and  the  following  disclaimer.
+  * Redistributions  in  binary  form must reproduce the above copyright
+    notice,  this list of conditions and the following disclaimer in the
+    documentation  and/or other materials provided with the distribution.
+  * Neither  the  name of the University of Tennessee, Knoxville nor the
+    names of its contributors may be used to endorse or promote products
+    derived from this software without specific prior written permission.
+
+  THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
+  LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  */
+
+#define FMULS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) + 0.5) * (double)(__n) + (1. / 3.)))
+#define FADDS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n)      ) * (double)(__n) - (1. / 6.)))
+
+#define FLOPS_SPOTRF(__n) (     FMULS_POTRF((__n)) +       FADDS_POTRF((__n)) )
+
+#define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.))
+#define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.))
+
+#define FMULS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FMULS_TRMM_2((__m), (__n)) :*/ FMULS_TRMM_2((__n), (__m)) )
+#define FADDS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FADDS_TRMM_2((__m), (__n)) :*/ FADDS_TRMM_2((__n), (__m)) )
+
+#define FMULS_TRSM FMULS_TRMM
+#define FADDS_TRSM FMULS_TRMM
+
+#define FLOPS_STRSM(__m, __n) (     FMULS_TRSM((__m), (__n)) +       FADDS_TRSM((__m), (__n)) )
+
+
+#define FMULS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
+#define FADDS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
+
+#define FLOPS_SGEMM(__m, __n, __k) (     FMULS_GEMM((__m), (__n), (__k)) +       FADDS_GEMM((__m), (__n), (__k)) )
+
+/* End of magma code */
+
 static unsigned size = 4*1024;
 static unsigned nblocks = 16;
 static unsigned nbigblocks = 8;

+ 10 - 1
examples/cholesky/cholesky_grain_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -68,6 +68,9 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 		starpu_tag_declare_deps(TAG11_AUX(k, reclevel), 1, TAG22_AUX(k-1, k, k, reclevel));
 	}
 
+	int n = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SPOTRF(n);
+
 	return task;
 }
 
@@ -110,6 +113,9 @@ static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, un
 		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 1, TAG11_AUX(k, reclevel));
 	}
 
+	int n = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_STRSM(n, n);
+
 	ret = starpu_task_submit(task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	return ret;
@@ -157,6 +163,9 @@ static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, un
 		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 2, TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
 	}
 
+	int n = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SGEMM(n, n, n);
+
 	ret = starpu_task_submit(task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	return ret;

+ 6 - 2
examples/cholesky/cholesky_implicit.c

@@ -85,6 +85,8 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	double end;
 
 	unsigned i,j,k;
+	unsigned long n = starpu_matrix_get_nx(dataA);
+	unsigned long nn = n/nblocks;
 
 	int prio_level = noprio?STARPU_DEFAULT_PRIO:STARPU_MAX_PRIO;
 
@@ -101,6 +103,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 					 STARPU_PRIORITY, prio_level,
 					 STARPU_RW, sdatakk,
 					 STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
+					 STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
 					 0);
 		if (ret == -ENODEV) return 77;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
@@ -113,6 +116,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 						 STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
 						 STARPU_R, sdatakk,
 						 STARPU_RW, sdatakj,
+						 STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
 						 0);
 			if (ret == -ENODEV) return 77;
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
@@ -129,6 +133,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 								 STARPU_R, sdataki,
 								 STARPU_R, sdatakj,
 								 STARPU_RW, sdataij,
+								 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
 								 0);
 					if (ret == -ENODEV) return 77;
 					STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
@@ -144,9 +149,8 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	end = starpu_timing_now();
 
 	double timing = end - start;
-	unsigned long n = starpu_matrix_get_nx(dataA);
 
-	double flop = (1.0f*n*n*n)/3.0f;
+	double flop = FLOPS_SPOTRF(n);
 
 	if(with_ctxs || with_noctxs || chole1 || chole2)
 		update_sched_ctx_timing_results((flop/timing/1000.0f), (timing/1000000.0f));

+ 10 - 1
examples/cholesky/cholesky_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -69,6 +69,9 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
 	}
 
+	int n = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SPOTRF(n);
+
 	return task;
 }
 
@@ -109,6 +112,9 @@ static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
 		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
 	}
 
+	int n = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_STRSM(n, n);
+
 	int ret = starpu_task_submit(task);
         if (STARPU_UNLIKELY(ret == -ENODEV))
 	{
@@ -158,6 +164,9 @@ static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, u
 		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
 	}
 
+	int n = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SGEMM(n, n, n);
+
 	int ret = starpu_task_submit(task);
         if (STARPU_UNLIKELY(ret == -ENODEV))
 	{

+ 10 - 1
examples/cholesky/cholesky_tile_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -71,6 +71,9 @@ static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
 	}
 
+	int n = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SPOTRF(n);
+
 	return task;
 }
 
@@ -113,6 +116,9 @@ static int create_task_21(unsigned k, unsigned j)
 		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
 	}
 
+	int n = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_STRSM(n, n);
+
 	ret = starpu_task_submit(task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	return ret;
@@ -160,6 +166,9 @@ static int create_task_22(unsigned k, unsigned i, unsigned j)
 		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
 	}
 
+	int n = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SGEMM(n, n, n);
+
 	ret = starpu_task_submit(task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	return ret;

+ 2 - 2
examples/filters/custom_mf/custom_conversion_codelets.c

@@ -21,7 +21,7 @@
 #ifdef STARPU_USE_CUDA
 void cuda_to_cpu(void *buffers[], void *arg)
 {
-	unsigned int n = CUSTOM_GET_NX(buffers[0]);
+	int n = CUSTOM_GET_NX(buffers[0]);
 	float *x = (float*) CUSTOM_GET_X_PTR(buffers[0]);
 	float *y = (float*) CUSTOM_GET_Y_PTR(buffers[0]);
 	struct point *aop;
@@ -60,7 +60,7 @@ struct starpu_codelet cuda_to_cpu_cl =
 #ifdef STARPU_USE_OPENCL
 void opencl_to_cpu_cpu_func(void *buffers[], void *arg)
 {
-	unsigned int n = CUSTOM_GET_NX(buffers[0]);
+	int n = CUSTOM_GET_NX(buffers[0]);
 	float *x = (float *) CUSTOM_GET_OPENCL_X_PTR(buffers[0]);
 	struct point *aop;
 	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);

+ 4 - 3
examples/filters/custom_mf/custom_interface.c

@@ -46,7 +46,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
 				    cl_event *event);
 #endif /* !STARPU_USE_OPENCL */
 
-static struct starpu_data_copy_methods custom_copy_data_methods_s =
+static const struct starpu_data_copy_methods custom_copy_data_methods_s =
 {
 	.ram_to_ram = NULL,
 #ifdef STARPU_USE_CUDA
@@ -98,7 +98,7 @@ static struct starpu_data_interface_ops interface_custom_ops =
 	.get_size              = custom_interface_get_size,
 	.footprint             = footprint_custom_interface_crc32,
 	.compare               = NULL,
-	.interfaceid           = -1,
+	.interfaceid           = STARPU_UNKNOWN_INTERFACE_ID,
 	.interface_size        = sizeof(struct custom_data_interface),
 	.display               = display_custom_interface,
 	.is_multiformat        = 1,
@@ -276,7 +276,8 @@ void custom_data_register(starpu_data_handle_t *handle,
 		.ops = format_ops
 	};
 
-	if (interface_custom_ops.interfaceid == -1) {
+	if (interface_custom_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
+	{
 		interface_custom_ops.interfaceid = starpu_data_interface_get_next_id();
 	}
 	starpu_data_register(handle, home_node, &custom, &interface_custom_ops);

+ 2 - 2
examples/filters/fblock_cpu.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -18,7 +18,7 @@
 
 void cpu_func(void *buffers[], void *cl_arg)
 {
-        unsigned i, j, k;
+        int i, j, k;
         int *factor = (int *) cl_arg;
 	int *block = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
 	int nx = (int)STARPU_BLOCK_GET_NX(buffers[0]);

+ 8 - 8
examples/filters/fblock_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -17,13 +17,13 @@
 
 #include <starpu.h>
 
-#define CHECK_CL_SET_KERNEL_ARG(kernel, n, size, ptr)       \
-do                                                          \
-{							    \
-	int err;                                            \
-	err = clSetKernelArg(kernel, n, size, ptr);         \
-	if (err != CL_SUCCESS)                              \
-       		STARPU_OPENCL_REPORT_ERROR(err);            \
+#define CHECK_CL_SET_KERNEL_ARG(kernel, n, size, ptr)       	\
+do                                                          	\
+{								\
+	int check_err;                           	        \
+	check_err = clSetKernelArg(kernel, n, size, ptr);       \
+	if (check_err != CL_SUCCESS)                            \
+       		STARPU_OPENCL_REPORT_ERROR(check_err);          \
 } while (0)
 
 extern struct starpu_opencl_program opencl_program;

+ 3 - 3
examples/filters/fmatrix.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -43,9 +43,9 @@ void cpu_func(void *buffers[], void *cl_arg)
 
 int main(int argc, char **argv)
 {
-	unsigned i, j, n=1;
+	unsigned j, n=1;
         int matrix[NX*NY];
-	int ret;
+	int ret, i;
 
         FPRINTF(stderr,"IN  Matrix: \n");
         for(j=0 ; j<NY ; j++)

+ 2 - 2
examples/filters/fvector.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -37,7 +37,7 @@ void cpu_func(void *buffers[], void *cl_arg)
 
 int main(int argc, char **argv)
 {
-	unsigned i;
+	int i;
         int vector[NX];
         starpu_data_handle_t handle;
         int factor=1;

+ 3 - 3
examples/filters/shadow.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -91,11 +91,11 @@ void cuda_func(void *buffers[], void *cl_arg)
 
 int main(int argc, char **argv)
 {
-	unsigned i, j;
+	unsigned j;
         int vector[NX + 2*SHADOW];
         int vector2[NX + PARTS*2*SHADOW];
 	starpu_data_handle_t handle, handle2;
-	int ret;
+	int ret, i;
 
         struct starpu_codelet cl =
 	{

+ 19 - 128
examples/interface/complex_interface.c

@@ -146,139 +146,30 @@ static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void
 	return 0;
 }
 
-#ifdef STARPU_USE_CUDA
-static int copy_cuda_async_sync(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, enum cudaMemcpyKind kind, cudaStream_t stream)
+static int copy_any_to_any(void *src_interface, unsigned src_node,
+			   void *dst_interface, unsigned dst_node,
+			   void *async_data)
 {
 	struct starpu_complex_interface *src_complex = src_interface;
 	struct starpu_complex_interface *dst_complex = dst_interface;
-
-	cudaStream_t sstream = stream;
-	int ret;
-
-	ret = starpu_cuda_copy_async_sync((void *)src_complex->real, src_node, (void *)dst_complex->real, dst_node,
-					  src_complex->nx*sizeof(src_complex->real[0]), sstream, kind);
-	if (ret == 0) sstream = NULL;
-
-	ret = starpu_cuda_copy_async_sync((char *)src_complex->imaginary, src_node, (char *)dst_complex->imaginary, dst_node,
-					  src_complex->nx*sizeof(src_complex->imaginary[0]), sstream, kind);
+	int ret = 0;
+
+	if (starpu_interface_copy((uintptr_t) src_complex->real, 0, src_node,
+				    (uintptr_t) dst_complex->real, 0, dst_node,
+				     src_complex->nx*sizeof(src_complex->real[0]),
+				     async_data))
+		ret = -EAGAIN;
+	if (starpu_interface_copy((uintptr_t) src_complex->imaginary, 0, src_node,
+				    (uintptr_t) dst_complex->imaginary, 0, dst_node,
+				     src_complex->nx*sizeof(src_complex->imaginary[0]),
+				     async_data))
+		ret = -EAGAIN;
 	return ret;
 }
 
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-     return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, NULL);
-}
-
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
-}
-
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, NULL);
-}
-
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
-}
-#endif
-
-#ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	struct starpu_complex_interface *src_complex = src_interface;
-	struct starpu_complex_interface *dst_complex = dst_interface;
-	cl_int err;
-	int ret;
-
-	err = starpu_opencl_copy_ram_to_opencl(src_complex->real,
-					       src_node,
-					       (cl_mem) dst_complex->real,
-					       dst_node,
-					       src_complex->nx * sizeof(src_complex->real[0]),
-					       0,
-					       event,
-					       &ret);
-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
-		STARPU_OPENCL_REPORT_ERROR(err);
-	if (ret == 0)
-		event = NULL;
-
-	err = starpu_opencl_copy_ram_to_opencl(src_complex->imaginary,
-					       src_node,
-					       (cl_mem) dst_complex->imaginary,
-					       dst_node,
-					       src_complex->nx * sizeof(src_complex->imaginary[0]),
-					       0,
-					       event,
-					       &ret);
-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
-		STARPU_OPENCL_REPORT_ERROR(err);
-
-	return ret;
-}
-
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-        return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
-}
-
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	struct starpu_complex_interface *src_complex = src_interface;
-	struct starpu_complex_interface *dst_complex = dst_interface;
-	cl_int err;
-	int ret;
-
-	err = starpu_opencl_copy_opencl_to_ram((cl_mem) src_complex->real,
-					       src_node,
-					       dst_complex->real,
-					       dst_node,
-					       src_complex->nx * sizeof(src_complex->real[0]),
-					       0,
-					       event,
-					       &ret);
-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
-		STARPU_OPENCL_REPORT_ERROR(err);
-	if (ret == 0)
-		event = NULL;
-
-	err = starpu_opencl_copy_opencl_to_ram((cl_mem) src_complex->imaginary,
-					       src_node,
-					       dst_complex->imaginary,
-					       dst_node,
-					       src_complex->nx * sizeof(src_complex->imaginary[0]),
-					       0,
-					       event,
-					       &ret);
-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
-		STARPU_OPENCL_REPORT_ERROR(err);
-
-	return ret;
-}
-
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-        return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
-}
-#endif
-
-static struct starpu_data_copy_methods complex_copy_methods =
+static const struct starpu_data_copy_methods complex_copy_methods =
 {
-#ifdef STARPU_USE_CUDA
-	.ram_to_cuda = copy_ram_to_cuda,
-	.cuda_to_ram = copy_cuda_to_ram,
-	.ram_to_cuda_async = copy_ram_to_cuda_async,
-	.cuda_to_ram_async = copy_cuda_to_ram_async,
-#endif
-#ifdef STARPU_USE_OPENCL
-	.ram_to_opencl = copy_ram_to_opencl,
-	.opencl_to_ram = copy_opencl_to_ram,
-	.ram_to_opencl_async = copy_ram_to_opencl_async,
-	.opencl_to_ram_async = copy_opencl_to_ram_async,
-#endif
+	.any_to_any = copy_any_to_any
 };
 
 static struct starpu_data_interface_ops interface_complex_ops =
@@ -289,7 +180,7 @@ static struct starpu_data_interface_ops interface_complex_ops =
 	.copy_methods = &complex_copy_methods,
 	.get_size = complex_get_size,
 	.footprint = complex_footprint,
-	.interfaceid = -1,
+	.interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
 	.interface_size = sizeof(struct starpu_complex_interface),
 	.handle_to_pointer = complex_handle_to_pointer,
 	.pack_data = complex_pack_data,
@@ -305,7 +196,7 @@ void starpu_complex_data_register(starpu_data_handle_t *handleptr, unsigned home
 		.nx = nx
 	};
 
-	if (interface_complex_ops.interfaceid == -1)
+	if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
 	{
 		interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
 	}

+ 3 - 3
examples/ppm_downscaler/ppm_downscaler.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -75,7 +75,7 @@ struct ppm_image *file_to_ppm(char *filename)
 	ret = fread(ppm->data, sizeof(struct ppm_color), ppm->ncols*ppm->nlines, file);
 	STARPU_ASSERT(ret == ppm->ncols*ppm->nlines);
 
-	unsigned i;
+	int i;
 	for (i = 0; i < ppm->ncols*ppm->nlines; i++)
 	{
 /*		fprintf(stderr, "READ (index %d) -> r %d g %d b %d\n", i, ppm->data[i].r, ppm->data[i].g, ppm->data[i].b); */
@@ -121,7 +121,7 @@ void dummy_downscale(struct ppm_image *input_ppm, struct ppm_image *output_ppm)
 	struct ppm_color *in = input_ppm->data;
 	struct ppm_color *out = output_ppm->data;
 
-	unsigned line, col;
+	int line, col;
 	for (line = 0; line < output_ppm->nlines; line++)
 	{
 		for (col = 0; col < output_ppm->ncols; col++)

+ 2 - 2
examples/profiling/profiling.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -107,7 +107,7 @@ int main(int argc, char **argv)
 	FPRINTF(stderr, "Avg. length : %2.2lf us\n", (length_sum)/niter);
 
 	/* Display the occupancy of all workers during the test */
-	int worker;
+	unsigned worker;
 	for (worker = 0; worker < starpu_worker_get_count(); worker++)
 	{
 		struct starpu_worker_profiling_info worker_info;

+ 2 - 2
examples/sched_ctx/sched_ctx.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2012  Université de Bordeaux 1
- * Copyright (C) 2010-2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -79,7 +79,7 @@ int main(int argc, char **argv)
 	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
 	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
 
-	unsigned i;
+	int i;
 	for (i = 0; i < ntasks/2; i++)
 	{
 		struct starpu_task *task = starpu_task_create();

+ 1 - 1
examples/sched_ctx_utils/sched_ctx_utils.c

@@ -93,7 +93,7 @@ void* start_bench(void *val)
 	pthread_setspecific(key, &p->id);
 
 	if(p->ctx != 0)
-		starpu_task_set_context(&p->ctx);
+		starpu_sched_ctx_set_context(&p->ctx);
 
 	for(i = 0; i < NSAMPLES; i++)
 		p->bench(p->size, p->nblocks);

+ 4 - 4
examples/scheduler/dummy_sched.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2012  Université de Bordeaux 1
- * Copyright (C) 2010-2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -28,7 +28,7 @@ typedef struct dummy_sched_data {
 
 static void init_dummy_sched(unsigned sched_ctx_id)
 {
-	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_WORKER_LIST);
+	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_SCHED_CTX_WORKER_LIST);
 
 	struct dummy_sched_data *data = (struct dummy_sched_data*)malloc(sizeof(struct dummy_sched_data));
 	
@@ -70,7 +70,7 @@ static int push_task_dummy(struct starpu_task *task)
 	   of them would pop for tasks */
 	unsigned worker = 0;
 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 
@@ -159,7 +159,7 @@ int main(int argc, char **argv)
 	ntasks /= 100;
 #endif
 
-	unsigned i;
+	int i;
 	for (i = 0; i < ntasks; i++)
 	{
 		struct starpu_task *task = starpu_task_create();

+ 2 - 2
examples/stencil/Makefile.am

@@ -13,10 +13,10 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) 
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
 
 if USE_MPI
 LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la

+ 1 - 1
examples/stencil/life.c

@@ -20,7 +20,7 @@
 
 void life_update(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int nz, int ldy, int ldz, int iter)
 {
-	unsigned x, y, z, num, alive;
+	int x, y, z, num, alive;
 
 	for (z = iter; z < nz - iter; z++)
 	{

+ 5 - 5
examples/stencil/stencil-blocks.c

@@ -121,7 +121,7 @@ struct block_description *get_block_description(int z)
 	return &blocks[z];
 }
 
-unsigned get_block_mpi_node(int z)
+int get_block_mpi_node(int z)
 {
 	z = (z + nbz)%nbz;
 	return blocks[z].mpi_node;
@@ -277,7 +277,7 @@ void allocate_memory_on_node(int rank)
 	{
 		struct block_description *block = get_block_description(bz);
 
-		unsigned node = block->mpi_node;
+		int node = block->mpi_node;
 
 		unsigned size_bz = block_sizes_z[bz];
 	
@@ -301,7 +301,7 @@ void allocate_memory_on_node(int rank)
 		}
 
 		/* Boundary blocks : Top */
-		unsigned top_node = block->boundary_blocks[T]->mpi_node;
+		int top_node = block->boundary_blocks[T]->mpi_node;
 		if ((node == rank) || (top_node == rank))
 		{
 			allocate_block_on_node(&block->boundaries_handle[T][0], &block->boundaries[T][0],
@@ -311,7 +311,7 @@ void allocate_memory_on_node(int rank)
 		} 
 
 		/* Boundary blocks : Bottom */
-		unsigned bottom_node = block->boundary_blocks[B]->mpi_node;
+		int bottom_node = block->boundary_blocks[B]->mpi_node;
 		if ((node == rank) || (bottom_node == rank))
 		{
 			allocate_block_on_node(&block->boundaries_handle[B][0], &block->boundaries[B][0],
@@ -330,7 +330,7 @@ void check(int rank)
 	{
 		struct block_description *block = get_block_description(bz);
 
-		unsigned node = block->mpi_node;
+		int node = block->mpi_node;
 
 		/* Main blocks */
 		if (node == rank)

+ 11 - 11
examples/stencil/stencil-tasks.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -40,7 +40,7 @@
  */
 
 /* R(z) = R(z+d) = local, just call the save kernel */
-static void create_task_save_local(unsigned iter, unsigned z, int dir, unsigned local_rank)
+static void create_task_save_local(unsigned iter, unsigned z, int dir, int local_rank)
 {
 	struct starpu_task *save_task = starpu_task_create();
 	struct block_description *descr = get_block_description(z);
@@ -81,7 +81,7 @@ static void send_done(void *arg)
 
 #ifdef STARPU_USE_MPI
 /* Post MPI send */
-static void create_task_save_mpi_send(unsigned iter, unsigned z, int dir, unsigned local_rank)
+static void create_task_save_mpi_send(unsigned iter, unsigned z, int dir, int local_rank)
 {
 	struct block_description *descr = get_block_description(z);
 	STARPU_ASSERT(descr->mpi_node == local_rank);
@@ -108,7 +108,7 @@ static void recv_done(void *arg)
 }
 
 /* Post MPI recv */
-static void create_task_save_mpi_recv(unsigned iter, unsigned z, int dir, unsigned local_rank)
+static void create_task_save_mpi_recv(unsigned iter, unsigned z, int dir, int local_rank)
 {
 	struct block_description *descr = get_block_description(z);
 	STARPU_ASSERT(descr->mpi_node != local_rank);
@@ -129,10 +129,10 @@ static void create_task_save_mpi_recv(unsigned iter, unsigned z, int dir, unsign
 /*
  * Schedule saving boundaries of blocks to communication buffers
  */
-void create_task_save(unsigned iter, unsigned z, int dir, unsigned local_rank)
+void create_task_save(unsigned iter, unsigned z, int dir, int local_rank)
 {
-	unsigned node_z = get_block_mpi_node(z);
-	unsigned node_z_and_d = get_block_mpi_node(z+dir);
+	int node_z = get_block_mpi_node(z);
+	int node_z_and_d = get_block_mpi_node(z+dir);
 
 #ifdef STARPU_USE_MPI
 	if (node_z == local_rank)
@@ -168,7 +168,7 @@ void create_task_save(unsigned iter, unsigned z, int dir, unsigned local_rank)
  * Schedule update computation in computation buffer
  */
 
-void create_task_update(unsigned iter, unsigned z, unsigned local_rank)
+void create_task_update(unsigned iter, unsigned z, int local_rank)
 {
 	STARPU_ASSERT(iter != 0);
 
@@ -253,8 +253,8 @@ void create_start_task(int z, int dir)
  */
 void create_tasks(int rank)
 {
-	unsigned iter;
-	unsigned bz;
+	int iter;
+	int bz;
 	int niter = get_niter();
 	int nbz = get_nbz();
 
@@ -288,7 +288,7 @@ void create_tasks(int rank)
  */
 void wait_end_tasks(int rank)
 {
-	unsigned bz;
+	int bz;
 	int nbz = get_nbz();
 
 	for (bz = 0; bz < nbz; bz++)

+ 4 - 4
examples/stencil/stencil.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010-2012  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -221,7 +221,7 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 #ifdef STARPU_USE_MPI
-	starpu_mpi_initialize();
+	starpu_mpi_init(NULL, NULL, 0);
 #endif
 
 #ifdef STARPU_USE_OPENCL
@@ -312,8 +312,8 @@ int main(int argc, char **argv)
 #if 1
 		unsigned nzblocks_per_process = (nbz + world_size - 1) / world_size;
 
-		unsigned bz, iter;
-		unsigned last;
+		int iter;
+		unsigned last, bz;
 		for (iter = 0; iter < who_runs_what_len; iter++)
 		{
 			last = 1;

+ 6 - 6
examples/stencil/stencil.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010-2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -56,8 +56,8 @@ typedef enum
 struct block_description
 {
 	/* Which MPI node should process that block ? */
-	unsigned mpi_node;
-	
+	int mpi_node;
+
 	unsigned preferred_worker;
 
 	unsigned bz;
@@ -101,7 +101,7 @@ void check(int rank);
 
 void display_memory_consumption(int rank);
 
-unsigned get_block_mpi_node(int z);
+int get_block_mpi_node(int z);
 unsigned get_block_size(int z);
 unsigned get_bind_tasks(void);
 
@@ -111,8 +111,8 @@ unsigned get_ticks(void);
 
 unsigned global_workerid(unsigned local_workerid);
 
-void create_task_update(unsigned iter, unsigned z, unsigned local_rank);
-void create_task_save(unsigned iter, unsigned z, int dir, unsigned local_rank);
+void create_task_update(unsigned iter, unsigned z, int local_rank);
+void create_task_save(unsigned iter, unsigned z, int dir, int local_rank);
 
 extern int starpu_mpi_initialize(void);
 extern int starpu_mpi_shutdown(void);

+ 1 - 1
examples/tag_example/tag_example2.c

@@ -76,7 +76,7 @@ static void tag_cleanup_grid(unsigned ni, unsigned iter)
 
 static int create_task_grid(unsigned iter)
 {
-	int i;
+	unsigned i;
 	int ret;
 
 /*	FPRINTF(stderr, "start iter %d ni %d...\n", iter, ni); */

+ 1 - 2
include/starpu_cuda.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -39,7 +39,6 @@ void starpu_cuda_report_error(const char *func, const char *file, int line, cuda
 #define STARPU_CUDA_REPORT_ERROR(status) \
 	starpu_cuda_report_error(__starpu_func__, __FILE__, __LINE__, status)
 
-size_t starpu_cuda_get_global_mem_size(unsigned devid);
 cudaStream_t starpu_cuda_get_local_stream(void);
 
 const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid);

+ 1 - 1
include/starpu_data.h

@@ -85,7 +85,7 @@ int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node, e
 void starpu_data_release(starpu_data_handle_t handle);
 void starpu_data_release_on_node(starpu_data_handle_t handle, unsigned node);
 
-void starpu_malloc_set_align(size_t);
+void starpu_malloc_set_align(size_t align);
 int starpu_malloc(void **A, size_t dim);
 int starpu_free(void *A);
 void starpu_memory_display_stats();

+ 6 - 1
include/starpu_data_interfaces.h

@@ -73,10 +73,15 @@ struct starpu_data_copy_methods
 	int (*opencl_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
 	int (*opencl_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
 #endif
+
+	int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 };
 
+int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data);
+
 enum starpu_data_interface_id
 {
+	STARPU_UNKNOWN_INTERFACE_ID = -1,
 	STARPU_MATRIX_INTERFACE_ID=0,
 	STARPU_BLOCK_INTERFACE_ID=1,
 	STARPU_VECTOR_INTERFACE_ID=2,
@@ -99,7 +104,7 @@ struct starpu_data_interface_ops
 	/* Free data of the interface on a given node. */
 	void (*free_data_on_node)(void *data_interface, unsigned node);
 	/* ram/cuda/opencl synchronous and asynchronous transfer methods */
-	struct starpu_data_copy_methods *copy_methods;
+	const struct starpu_data_copy_methods *copy_methods;
 	/* Return the current pointer (if any) for the handle on the given node. */
 	void * (*handle_to_pointer)(starpu_data_handle_t handle, unsigned node);
 	/* Return an estimation of the size of data, for performance models */

+ 2 - 3
include/starpu_opencl.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -53,7 +53,6 @@ struct starpu_opencl_program
 	cl_program programs[STARPU_MAXOPENCLDEVS];
 };
 
-size_t starpu_opencl_get_global_mem_size(int devid);
 void starpu_opencl_get_context(int devid, cl_context *context);
 void starpu_opencl_get_device(int devid, cl_device_id *device);
 void starpu_opencl_get_queue(int devid, cl_command_queue *queue);
@@ -108,7 +107,7 @@ cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node, void *
 
 cl_int starpu_opencl_copy_opencl_to_opencl(cl_mem src, unsigned src_node, size_t src_offset, cl_mem dst, unsigned dst_node, size_t dst_offset, size_t size, cl_event *event, int *ret);
 
-cl_int starpu_opencl_copy_async_sync(uintptr_t src, unsigned src_node, size_t src_offset, uintptr_t dst, unsigned dst_node, size_t dst_offset, size_t size, cl_event *event);
+cl_int starpu_opencl_copy_async_sync(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, cl_event *event);
 
 #ifdef __cplusplus
 }

+ 7 - 1
include/starpu_perfmodel.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
@@ -102,6 +102,8 @@ struct starpu_perfmodel_history_entry
 #else
 	size_t size; /* in bytes */
 #endif
+
+	double flops; /* Provided by the application */
 };
 
 struct starpu_perfmodel_history_list
@@ -212,6 +214,10 @@ void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct star
 void starpu_bus_print_bandwidth(FILE *f);
 void starpu_bus_print_affinity(FILE *f);
 
+/* use bw & latency to compute the velocity of resources*/
+double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev);
+double starpu_get_latency_RAM_CUDA(unsigned cudadev);
+
 #ifdef __cplusplus
 }
 #endif

+ 21 - 25
include/starpu_sched_ctx.h

@@ -24,12 +24,8 @@ extern "C"
 {
 #endif
 
-#ifdef STARPU_DEVEL
-#  warning rename all objects to start with starpu_sched_ctx
-#endif
-
-//struct starpu_iterator;
-struct starpu_iterator
+//struct starpu_sched_ctx_iterator;
+struct starpu_sched_ctx_iterator
 {
 	int cursor;
 };
@@ -42,12 +38,12 @@ struct starpu_sched_ctx_worker_collection
 	void *workerids;
 	/* the number of workers in the collection */
 	unsigned nworkers;
-	/* the type of structure (STARPU_WORKER_LIST,...) */
+	/* the type of structure (STARPU_SCHED_CTX_WORKER_LIST,...) */
 	int type;
 	/* checks if there is another element in collection */
-	unsigned (*has_next)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_iterator *it);
+	unsigned (*has_next)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	/* return the next element in the collection */
-	int (*get_next)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_iterator *it);
+	int (*get_next)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	/* add a new element in the collection */
 	int (*add)(struct starpu_sched_ctx_worker_collection *workers, int worker);
 	/* remove an element from the collection */
@@ -57,26 +53,26 @@ struct starpu_sched_ctx_worker_collection
 	/* free the structure */
 	void (*deinit)(struct starpu_sched_ctx_worker_collection *workers);
 	/* initialize the cursor if there is one */
-	void (*init_iterator)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_iterator *it);
+	void (*init_iterator)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 };
 
 /* types of structures the worker collection can implement */
-#define STARPU_WORKER_LIST 0
+#define STARPU_SCHED_CTX_WORKER_LIST 0
 
-struct starpu_performance_counters
+struct starpu_sched_ctx_performance_counters
 {
 	void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time);
 	void (*notify_idle_end)(unsigned sched_ctx_id, int worker);
 	void (*notify_pushed_task)(unsigned sched_ctx_id, int worker);
-	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, double flops);
+	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, double flops, size_t data_size);
 	void (*notify_post_exec_hook)(unsigned sched_ctx_id, int taskid);
 	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint);
 };
 
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
-void starpu_set_perf_counters(unsigned sched_ctx_id, struct starpu_performance_counters *perf_counters);
-void starpu_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops);
-void starpu_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
+void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, struct starpu_sched_ctx_performance_counters *perf_counters);
+void starpu_sched_ctx_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops, size_t data_size);
+void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 
 unsigned starpu_sched_ctx_create(const char *policy_name, int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name);
@@ -102,16 +98,16 @@ void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id);
 struct starpu_sched_ctx_worker_collection* starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id);
 
 #if !defined(_MSC_VER) && !defined(STARPU_SIMGRID)
-pthread_mutex_t* starpu_get_changing_ctx_mutex(unsigned sched_ctx_id);
+pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
 #endif
 
-void starpu_task_set_context(unsigned *sched_ctx_id);
+void starpu_sched_ctx_set_context(unsigned *sched_ctx_id);
 
-unsigned starpu_task_get_context(void);
+unsigned starpu_sched_ctx_get_context(void);
 
-void starpu_notify_hypervisor_exists(void);
+void starpu_sched_ctx_notify_hypervisor_exists(void);
 
-unsigned starpu_check_if_hypervisor_exists(void);
+unsigned starpu_sched_ctx_check_if_hypervisor_exists(void);
 
 unsigned starpu_sched_ctx_get_nworkers(unsigned sched_ctx_id);
 
@@ -121,13 +117,13 @@ unsigned starpu_sched_ctx_contains_worker(int workerid, unsigned sched_ctx_id);
 
 unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid);
 
-unsigned starpu_is_ctxs_turn(int workerid, unsigned sched_ctx_id);
+unsigned starpu_sched_ctx_is_ctxs_turn(int workerid, unsigned sched_ctx_id);
 
-void starpu_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id);
+void starpu_sched_ctx_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id);
 
-double starpu_get_max_time_worker_on_ctx(void);
+double starpu_sched_ctx_get_max_time_worker_on_ctx(void);
 
-void starpu_stop_task_submission(void);
+void starpu_sched_ctx_stop_task_submission(void);
 
 void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor);
 

+ 3 - 0
include/starpu_task.h

@@ -321,6 +321,9 @@ int starpu_task_wait_for_all_in_ctx(unsigned sched_ctx_id);
 /* This function waits until there is no more ready task. */
 int starpu_task_wait_for_no_ready(void);
 
+int starpu_task_nready(void);
+int starpu_task_nsubmitted(void);
+
 void starpu_codelet_init(struct starpu_codelet *cl);
 
 void starpu_display_codelet_stats(struct starpu_codelet *cl);

+ 2 - 2
include/starpu_task_util.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -45,7 +45,7 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
 #define STARPU_DATA_ARRAY       (1<<11) /* Array of data handles */
 #define STARPU_TAG       (1<<12) /* Tag */
 #define STARPU_HYPERVISOR_TAG	(1<<13)	/* Used to tag a task after whose execution we'll execute  a code */
-#define STARPU_HYPERVISOR_FLOPS	(1<<14)	/* Used to specify the number of flops needed to be executed by a task */
+#define STARPU_FLOPS	(1<<14)	/* Used to specify the number of flops needed to be executed by a task */
 
 /* Wrapper to create a task. */
 int starpu_insert_task(struct starpu_codelet *cl, ...);

+ 4 - 4
include/starpu_util.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -207,17 +207,17 @@ static __inline int starpu_get_env_number(const char *str)
 	if (strval)
 	{
 		/* the env variable was actually set */
-		unsigned val;
+		long int val;
 		char *check;
 
-		val = (int)strtol(strval, &check, 10);
+		val = strtol(strval, &check, 10);
 		if (*check) {
 			fprintf(stderr,"The %s environment variable must contain an integer\n", str);
 			STARPU_ABORT();
 		}
 
 		/* fprintf(stderr, "ENV %s WAS %d\n", str, val); */
-		return val;
+		return (int)val;
 	}
 	else
 	{

+ 1 - 1
mpi/examples/Makefile.am

@@ -75,7 +75,7 @@ NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -
 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
 endif
 
-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Werror=implicit
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)

+ 1 - 2
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -72,10 +72,9 @@ void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, in
 	struct timeval start;
 	struct timeval end;
 	starpu_data_handle_t **data_handles;
-	int x, y;
+	unsigned x,y,i,j,k;
 
 	/* create all the DAG nodes */
-	unsigned i,j,k;
 
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
 	for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));

+ 2 - 2
mpi/examples/matrix_decomposition/mpi_decomposition_params.c

@@ -25,8 +25,8 @@ unsigned nblocks = 16;
 unsigned nbigblocks = 2;
 unsigned noprio = 0;
 unsigned display = 0;
-unsigned dblockx = -1;
-unsigned dblocky = -1;
+int dblockx = -1;
+int dblocky = -1;
 
 void parse_args(int argc, char **argv, int nodes)
 {

+ 3 - 3
mpi/examples/mpi_lu/plu_example.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -32,8 +32,8 @@
 static unsigned long size = 4096;
 static unsigned nblocks = 16;
 static unsigned check = 0;
-static unsigned p = 1;
-static unsigned q = 1;
+static int p = 1;
+static int q = 1;
 static unsigned display = 0;
 
 #ifdef STARPU_HAVE_LIBNUMA

+ 2 - 2
mpi/examples/mpi_lu/pxlu.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -101,7 +101,7 @@ static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int m
 	int mpi_tag_array[world_size];
 	starpu_data_handle_t handle_array[world_size];
 
-	unsigned r;
+	int r;
 	for (r = 0; r < world_size; r++)
 	{
 		if (rank_mask[r]) {

+ 1 - 1
mpi/src/Makefile.am

@@ -21,7 +21,7 @@ BUILT_SOURCES =
 
 CLEANFILES = *.gcno *.gcda *.linkinfo
 
-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Werror=implicit
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/mpi/src
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)

+ 14 - 15
mpi/src/starpu_mpi.c

@@ -443,12 +443,12 @@ static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 	_STARPU_MPI_DEBUG("Test request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, _starpu_mpi_request_type(req->request_type), req->srcdst);
 
 	TRACE_MPI_UTESTING_BEGIN(req->srcdst, req->mpi_tag);
-	
+
 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
 	STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 	TRACE_MPI_UTESTING_END(req->srcdst, req->mpi_tag);
-	
+
 	if (*testing_req->flag)
 	{
 		testing_req->ret = req->ret;
@@ -841,6 +841,17 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		_starpu_mpi_print_thread_level_support(provided, " has been initialized with");
 	}
 
+	{
+	     int rank, worldsize;
+	     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	     MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
+	     TRACE_MPI_START(rank, worldsize);
+#ifdef STARPU_USE_FXT
+	     starpu_set_profiling_id(rank);
+#endif //STARPU_USE_FXT
+	}
+
+
 	/* notify the main thread that the progression thread is ready */
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	running = 1;
@@ -862,7 +873,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 			_STARPU_MPI_DEBUG("NO MORE REQUESTS TO HANDLE\n");
 
 			TRACE_MPI_SLEEP_BEGIN();
-			
+
 			if (barrier_running)
 				/* Tell mpi_barrier */
 				_STARPU_PTHREAD_COND_SIGNAL(&cond_finished);
@@ -967,13 +978,6 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 	argc_argv->argc = argc;
 	argc_argv->argv = argv;
 
-	int rank, worldsize;
-
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
-
-	TRACE_MPI_START(rank,worldsize);
-
 	_STARPU_PTHREAD_CREATE("MPI progress", &progress_thread, NULL, _starpu_mpi_progress_thread_func, argc_argv);
 
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
@@ -981,10 +985,6 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 		_STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
-#ifdef STARPU_USE_FXT
-	starpu_set_profiling_id(rank);
-#endif //STARPU_USE_FXT
-
 #ifdef USE_STARPU_ACTIVITY
 	hookid = starpu_progression_hook_register(progression_hook_func, NULL);
 	STARPU_ASSERT(hookid >= 0);
@@ -1053,4 +1053,3 @@ int starpu_mpi_shutdown(void)
 
 	return 0;
 }
-

+ 2 - 2
mpi/src/starpu_mpi_stats.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -70,7 +70,7 @@ void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts)
 
 void _starpu_mpi_comm_amounts_display(int node)
 {
-	unsigned dst;
+	int dst;
 	size_t sum = 0;
 
 	if (stats_enabled == 0) return;

+ 1 - 1
mpi/tests/Makefile.am

@@ -64,7 +64,7 @@ NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -
 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
 endif
 
-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Werror=implicit
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_srcdir)/examples/
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)

+ 3 - 3
mpi/tests/mpi_detached_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -54,8 +54,8 @@ int main(int argc, char **argv)
 
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 	for (loop = 0; loop < nloops; loop++)

+ 3 - 3
mpi/tests/mpi_irecv.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -54,8 +54,8 @@ int main(int argc, char **argv)
 
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 	for (loop = 0; loop < nloops; loop++)

+ 3 - 3
mpi/tests/mpi_irecv_detached.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -69,8 +69,8 @@ int main(int argc, char **argv)
 
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 	for (loop = 0; loop < nloops; loop++)

+ 3 - 3
mpi/tests/mpi_isend.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -54,8 +54,8 @@ int main(int argc, char **argv)
 
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 	for (loop = 0; loop < nloops; loop++)

+ 3 - 3
mpi/tests/mpi_isend_detached.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -68,8 +68,8 @@ int main(int argc, char **argv)
 
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 	for (loop = 0; loop < nloops; loop++)

+ 3 - 3
mpi/tests/mpi_probe.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -69,8 +69,8 @@ int main(int argc, char **argv)
 
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 	for (loop = 0; loop < nloops; loop++)

+ 3 - 3
mpi/tests/mpi_test.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -55,8 +55,8 @@ int main(int argc, char **argv)
 
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 	for (loop = 0; loop < nloops; loop++)

+ 3 - 3
mpi/tests/pingpong.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -55,8 +55,8 @@ int main(int argc, char **argv)
 
 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 	for (loop = 0; loop < nloops; loop++)

+ 8 - 8
mpi/tests/ring.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,7 +24,7 @@
 #  define NITER	2048
 #endif
 
-unsigned token = 42;
+int token = 42;
 starpu_data_handle_t token_handle;
 
 #ifdef STARPU_USE_CUDA
@@ -33,7 +33,7 @@ extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 
 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 {
-	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 	(*tokenptr)++;
 }
 
@@ -82,13 +82,13 @@ int main(int argc, char **argv)
 	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
-	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 
-	unsigned last_loop = nloops - 1;
-	unsigned last_rank = size - 1;
+	int last_loop = nloops - 1;
+	int last_rank = size - 1;
 
 	for (loop = 0; loop < nloops; loop++)
 	{

+ 8 - 8
mpi/tests/ring_async.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,7 +24,7 @@
 #  define NITER	2048
 #endif
 
-unsigned token = 42;
+int token = 42;
 starpu_data_handle_t token_handle;
 
 #ifdef STARPU_USE_CUDA
@@ -33,7 +33,7 @@ extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 
 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 {
-	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 	(*tokenptr)++;
 }
 
@@ -82,13 +82,13 @@ int main(int argc, char **argv)
 	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
-	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 
-	unsigned last_loop = nloops - 1;
-	unsigned last_rank = size - 1;
+	int last_loop = nloops - 1;
+	int last_rank = size - 1;
 
 	for (loop = 0; loop < nloops; loop++)
 	{

+ 8 - 8
mpi/tests/ring_async_implicit.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,7 +24,7 @@
 #  define NITER	2048
 #endif
 
-unsigned token = 42;
+int token = 42;
 starpu_data_handle_t token_handle;
 
 #ifdef STARPU_USE_CUDA
@@ -33,7 +33,7 @@ extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 
 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 {
-	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 	(*tokenptr)++;
 }
 
@@ -80,13 +80,13 @@ int main(int argc, char **argv)
 	}
 
 
-	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
 
-	unsigned nloops = NITER;
-	unsigned loop;
+	int nloops = NITER;
+	int loop;
 
-	unsigned last_loop = nloops - 1;
-	unsigned last_rank = size - 1;
+	int last_loop = nloops - 1;
+	int last_rank = size - 1;
 
 	for (loop = 0; loop < nloops; loop++)
 	{

+ 2 - 2
sched_ctx_hypervisor/examples/Makefile.am

@@ -13,9 +13,9 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS)
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(top_builddir)/sched_ctx_hypervisor/src/libsched_ctx_hypervisor.la
-AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/examples -I$(top_builddir)/include -I$(top_srcdir)/sched_ctx_hypervisor/include
+AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/examples -I$(top_builddir)/include -I$(top_srcdir)/sched_ctx_hypervisor/include -I$(top_srcdir)/sched_ctx_hypervisor/examples
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_GLPK_LDFLAGS)
 
 if !NO_BLAS_LIB

+ 3 - 3
sched_ctx_hypervisor/examples/app_driven_test/app_driven_test.c

@@ -48,7 +48,7 @@ int tag = 1;
 void* start_thread(void *arg)
 {
 	unsigned sched_ctx = *((unsigned*)arg);
-	starpu_task_set_context(&sched_ctx);
+	starpu_sched_ctx_set_context(&sched_ctx);
 
 	struct starpu_task *task[10];
 	struct params params[10];
@@ -115,8 +115,8 @@ int main()
 	policy.name = "app_driven";
 	void *perf_counters = sched_ctx_hypervisor_init(&policy);
 
-	starpu_set_perf_counters(sched_ctx1, (struct starpu_performance_counters*)perf_counters);
-	starpu_set_perf_counters(sched_ctx2, (struct starpu_performance_counters*)perf_counters);
+	starpu_sched_ctx_set_perf_counters(sched_ctx1, (struct starpu_sched_ctx_performance_counters*)perf_counters);
+	starpu_sched_ctx_set_perf_counters(sched_ctx2, (struct starpu_sched_ctx_performance_counters*)perf_counters);
 	sched_ctx_hypervisor_register_ctx(sched_ctx1, 0.0);
 	sched_ctx_hypervisor_register_ctx(sched_ctx2, 0.0);
 

+ 4 - 4
sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c

@@ -97,7 +97,7 @@ void* start_bench(void *val)
 	pthread_setspecific(key, &p->id);
 
 	if(p->ctx != 0)
-		starpu_task_set_context(&p->ctx);
+		starpu_sched_ctx_set_context(&p->ctx);
 
 	for(i = 0; i < NSAMPLES; i++)
 		p->bench(p->mat[i], p->size, p->nblocks);
@@ -241,7 +241,7 @@ void construct_contexts(void (*bench)(float*, unsigned, unsigned))
 	struct sched_ctx_hypervisor_policy policy;
 	policy.custom = 0;
 	policy.name = "idle";
-	struct starpu_performance_counters *perf_counters = sched_ctx_hypervisor_init(&policy);
+	struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx_hypervisor_init(&policy);
 	int nworkers1 = cpu1 + gpu + gpu1;
 	int nworkers2 = cpu2 + gpu + gpu2;
 	unsigned n_all_gpus = gpu + gpu1 + gpu2;
@@ -267,7 +267,7 @@ void construct_contexts(void (*bench)(float*, unsigned, unsigned))
 		p1.workers[i] = i;
 
 	p1.ctx = starpu_sched_ctx_create("heft", p1.workers, nworkers1, "sched_ctx1");
-	starpu_set_perf_counters(p1.ctx, perf_counters);
+	starpu_sched_ctx_set_perf_counters(p1.ctx, perf_counters);
 	p2.the_other_ctx = (int)p1.ctx;
 	p1.nworkers = nworkers1;
 	sched_ctx_hypervisor_register_ctx(p1.ctx, 0.0);
@@ -303,7 +303,7 @@ void construct_contexts(void (*bench)(float*, unsigned, unsigned))
 	/* 	p2.workers[k++] = i; */
 
 	p2.ctx = starpu_sched_ctx_create("heft", p2.workers, 0, "sched_ctx2");
-	starpu_set_perf_counters(p2.ctx, perf_counters);
+	starpu_sched_ctx_set_perf_counters(p2.ctx, perf_counters);
 	p1.the_other_ctx = (int)p2.ctx;
 	p2.nworkers = 0;
 	sched_ctx_hypervisor_register_ctx(p2.ctx, 0.0);

+ 1 - 0
sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.h

@@ -30,3 +30,4 @@ void end_contexts(void);
 void start_2benchs(void (*bench)(float *mat, unsigned size, unsigned nblocks));
 void start_1stbench(void (*bench)(float *mat, unsigned size, unsigned nblocks));
 void start_2ndbench(void (*bench)(float *mat, unsigned size, unsigned nblocks));
+void set_hypervisor_conf(int event, int task_tag);

+ 7 - 1
sched_ctx_hypervisor/include/sched_ctx_hypervisor.h

@@ -127,6 +127,12 @@ struct sched_ctx_hypervisor_wrapper
 	/* number of flops executed since last resizing */
 	double elapsed_flops[STARPU_NMAXWORKERS];
 
+	/* data quantity executed on each worker in this ctx */
+	size_t elapsed_data[STARPU_NMAXWORKERS];
+
+	/* nr of tasks executed on each worker in this ctx */
+	int elapsed_tasks[STARPU_NMAXWORKERS];
+
 	/* the average speed of workers when they belonged to this context */
 	double ref_velocity[STARPU_NMAXWORKERS];
 
@@ -168,7 +174,7 @@ struct sched_ctx_hypervisor_policy
 	void (*end_ctx)(unsigned sched_ctx);
 };
 
-struct starpu_performance_counters *sched_ctx_hypervisor_init(struct sched_ctx_hypervisor_policy *policy);
+struct starpu_sched_ctx_performance_counters *sched_ctx_hypervisor_init(struct sched_ctx_hypervisor_policy *policy);
 
 void sched_ctx_hypervisor_shutdown(void);
 

+ 2 - 4
sched_ctx_hypervisor/src/Makefile.am

@@ -12,11 +12,9 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
 LIBS = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
-
-AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include/starpu/$(STARPU_EFFECTIVE_VERSION)/ -I$(top_builddir)/src/ -I$(top_srcdir)/src/ -I$(top_srcdir)/sched_ctx_hypervisor/include/
-
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src/ -I$(top_builddir)/include -I$(top_srcdir)/sched_ctx_hypervisor/include/ -I$(top_srcdir)/sched_ctx_hypervisor/src
 AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
 
 lib_LTLIBRARIES = libsched_ctx_hypervisor.la

+ 9 - 9
sched_ctx_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c

@@ -32,10 +32,12 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 	int *sched_ctxs = in_sched_ctxs == NULL ? sched_ctx_hypervisor_get_sched_ctxs() : in_sched_ctxs;
 	
 	int w,s;
-	struct sched_ctx_hypervisor_wrapper* sc_w = NULL;
 
+	struct sched_ctx_hypervisor_wrapper* sc_w = NULL;
+	double total_flops = 0.0;
 	for(s = 0; s < ns; s++)
 	{
+		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
 		for(w = 0; w < nw; w++)
 		{
 			w_in_s[s][w] = 0.0;
@@ -44,7 +46,6 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 			draft_flops_on_w[s][w] = 0.0;
 			int worker = workers == NULL ? w : workers[w];
 
-			sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
 			velocity[s][w] = _get_velocity_per_worker(sc_w, worker);
 			if(velocity[s][w] == -1.0)
 			{
@@ -53,21 +54,20 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 				if(velocity[s][w] == -1.0)
 					velocity[s][w] = sc_w->ref_velocity[worker];
 				if(velocity[s][w] == -1.0)
-					velocity[s][w] = arch == STARPU_CPU_WORKER ? 5.0 : 150.0;
+					velocity[s][w] = arch == STARPU_CPU_WORKER ? 5.0 : 100.0;
 			}
 			
-			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
+//			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
 		}
 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sched_ctxs[s]);
 		flops[s] = config->ispeed_ctx_sample/1000000000; /* in gflops */
 	}
-
-
+	
 	/* take the exec time of the slowest ctx 
 	   as starting point and then try to minimize it
 	   as increasing it a little for the faster ctxs */
 	double tmax = _get_slowest_ctx_exec_time();
-	double smallest_tmax = _get_fastest_ctx_exec_time(); //tmax - 0.5*tmax;
+ 	double smallest_tmax = _get_fastest_ctx_exec_time(); //tmax - 0.5*tmax; 
 //	printf("tmax %lf smallest %lf\n", tmax, smallest_tmax);
 
 	double res = 1.0;
@@ -413,8 +413,8 @@ static void ispeed_lp_end_ctx(unsigned sched_ctx)
 {
 	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
 	int worker;
-	for(worker = 0; worker < 12; worker++)
-		printf("%d/%d: speed %lf\n", worker, sched_ctx, sc_w->ref_velocity[worker]);
+/* 	for(worker = 0; worker < 12; worker++) */
+/* 		printf("%d/%d: speed %lf\n", worker, sched_ctx, sc_w->ref_velocity[worker]); */
 
 	return;
 }

+ 1 - 1
sched_ctx_hypervisor/src/hypervisor_policies/ispeed_policy.c

@@ -77,7 +77,7 @@ static int* _get_slowest_workers(unsigned sched_ctx, int *nworkers, enum starpu_
 	int worker;
 	int considered = 0;
 
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 

+ 13 - 6
sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.c

@@ -283,6 +283,12 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 		{
 			tmp_nw_move[w] = 0;
 			tmp_nw_add[w] = 0;
+			int i;
+			for(i = 0; i < STARPU_NMAXWORKERS; i++)
+			{
+				tmp_workers_move[w][i] = -1;
+				tmp_workers_add[w][i] = -1;
+			}
 		}
 
 		/* find workers that ctx s has to give away */
@@ -363,6 +369,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 				int nw_add = 0;
 
 				int w;
+				int j = 0, k = 0;
 				for(w = 0; w < nw; w++)
 				{
 					enum starpu_archtype arch = STARPU_ANY_WORKER;
@@ -375,7 +382,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 					if( nw_needed > 0 && tmp_nw_move[w] > 0)
 					{
 						nw_move += nw_needed >= tmp_nw_move[w] ? tmp_nw_move[w] : nw_needed;
-						int i = 0, j = 0;
+						int i = 0;
 						for(i = 0; i < STARPU_NMAXWORKERS; i++)
 						{
 							if(tmp_workers_move[w][i] != -1)
@@ -395,14 +402,14 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 					if(diff > 0.3 && tmp_nw_add[w] != 0)
 					{
 						nw_add = tmp_nw_add[w];
-						int i = 0, j = 0;
+						int i = 0;
 						for(i = 0; i < STARPU_NMAXWORKERS; i++)
 						{
 							if(tmp_workers_add[w][i] != -1)
 							{
-								workers_add[j++] = tmp_workers_add[w][i];
+								workers_add[k++] = tmp_workers_add[w][i];
 								tmp_workers_add[w][i] = -1;
-								if(j == nw_add)
+								if(k == nw_add)
 									break;
 							}
 						}
@@ -413,7 +420,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 				
 				if(nw_move > 0)
 				{
-					sched_ctx_hypervisor_move_workers(sched_ctxs[s], sched_ctxs[s2], workers_move, nw_move, 1);
+					sched_ctx_hypervisor_move_workers(sched_ctxs[s], sched_ctxs[s2], workers_move, nw_move, 0);
 					nw_move = 0;
 				}
 
@@ -452,7 +459,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 		}
 
 		if(nw_move > 0)
-			sched_ctx_hypervisor_remove_workers_from_sched_ctx(workers_move, nw_move, sched_ctxs[s], 1);
+			sched_ctx_hypervisor_remove_workers_from_sched_ctx(workers_move, nw_move, sched_ctxs[s], 0);
 	}
 }
 

+ 26 - 19
sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c

@@ -28,7 +28,7 @@ static int _compute_priority(unsigned sched_ctx)
 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
 	int worker;
 
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 
@@ -113,7 +113,7 @@ int* _get_first_workers(unsigned sched_ctx, int *nworkers, enum starpu_archtype
 	int worker;
 	int considered = 0;
 
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 
@@ -169,7 +169,6 @@ int* _get_first_workers(unsigned sched_ctx, int *nworkers, enum starpu_archtype
 		}
 	}
 
-
 	return curr_workers;
 }
 
@@ -181,7 +180,7 @@ unsigned _get_potential_nworkers(struct sched_ctx_hypervisor_policy_config *conf
 	unsigned potential_workers = 0;
 	int worker;
 
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 	while(workers->has_next(workers, &it))
@@ -304,7 +303,7 @@ static double _get_best_elapsed_flops(struct sched_ctx_hypervisor_wrapper* sc_w,
 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
         int worker;
 
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
                 workers->init_iterator(workers, &it);
 
@@ -330,7 +329,7 @@ static double _get_ispeed_sample_for_type_of_worker(struct sched_ctx_hypervisor_
 
 	double avg = 0.0;
 	int n = 0;
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
                 workers->init_iterator(workers, &it);
 
@@ -356,7 +355,7 @@ static double _get_ispeed_sample_for_sched_ctx(unsigned sched_ctx)
         
 	int worker;
 	double ispeed_sample = 0.0;
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 
 	if(workers->init_iterator)
                 workers->init_iterator(workers, &it);
@@ -401,7 +400,7 @@ double _get_slowest_ctx_exec_time(void)
 	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
 	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
 
-	double curr_time = starpu_timing_now();
+/* 	double curr_time = starpu_timing_now(); */
 	double slowest_time = 0.0;
 
 	int s;
@@ -410,18 +409,13 @@ double _get_slowest_ctx_exec_time(void)
 	{
 		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
 
-/*                 double elapsed_time = curr_time - sc_w->start_time; */
-/* 		if(elapsed_time > slowest_time) */
-/* 			slowest_time = elapsed_time; */
-
-//		double elapsed_flops = sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
+//		double elapsed_time  = (curr_time - sc_w->start_time)/1000000;
 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
-                double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
+		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
 		if(elapsed_time > slowest_time)
 			slowest_time = elapsed_time;
 
         }
-//	return slowest_time / 1000000.0;
 	return slowest_time;
 }
 
@@ -431,7 +425,7 @@ double _get_fastest_ctx_exec_time(void)
 	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
 
 	double curr_time = starpu_timing_now();
-	double fastest_time = curr_time;
+ 	double fastest_time = curr_time;
 
 	int s;
 	struct sched_ctx_hypervisor_wrapper* sc_w;		
@@ -440,13 +434,13 @@ double _get_fastest_ctx_exec_time(void)
 		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
 
 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
-                double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
-
+		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
+		
 		if(elapsed_time < fastest_time)
 			fastest_time = elapsed_time;
 
         }
-//	return fastest_time / 1000000.0;
+
 	return fastest_time;
 }
 
@@ -457,6 +451,8 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
 		return -1.0;
 
         double elapsed_flops = sc_w->elapsed_flops[worker] / 1000000000.0; /*in gflops */
+	size_t elapsed_data_used = sc_w->elapsed_data[worker];
+	int elapsed_tasks = sc_w->elapsed_tasks[worker];
 	struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
 	double sample = config->ispeed_w_sample[worker] / 1000000000.0; /*in gflops */
 
@@ -479,6 +475,17 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
         {
                 double curr_time = starpu_timing_now();
                 double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
+ 		enum starpu_archtype arch = starpu_worker_get_type(worker);
+		if(arch == STARPU_CUDA_WORKER)
+		{
+			double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker);
+			elapsed_time +=  (elapsed_data_used / transfer_velocity) / 1000000 ;
+			double latency = starpu_get_latency_RAM_CUDA(worker);
+//			printf("%d/%d: latency %lf elapsed_time before %lf ntasks %d\n", worker, sc_w->sched_ctx, latency, elapsed_time, elapsed_tasks);
+			elapsed_time += (elapsed_tasks * latency)/1000000;
+//			printf("elapsed time after %lf \n", elapsed_time);
+		}
+			
                 double vel  = (elapsed_flops/elapsed_time);/* in Gflops/s */
 		sc_w->ref_velocity[worker] = sc_w->ref_velocity[worker] > 0.0 ? (sc_w->ref_velocity[worker] + vel) / 2 : vel; 
                 return vel;

+ 25 - 22
sched_ctx_hypervisor/src/sched_ctx_hypervisor.c

@@ -19,11 +19,11 @@
 #include <starpu_config.h>
 
 unsigned imposed_resize = 0;
-struct starpu_performance_counters* perf_counters = NULL;
+struct starpu_sched_ctx_performance_counters* perf_counters = NULL;
 
 static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time);
 static void notify_pushed_task(unsigned sched_ctx, int worker);
-static void notify_poped_task(unsigned sched_ctx, int worker, double flops);
+static void notify_poped_task(unsigned sched_ctx, int worker, double flops, size_t data_size);
 static void notify_post_exec_hook(unsigned sched_ctx, int taskid);
 static void notify_idle_end(unsigned sched_ctx, int  worker);
 static void notify_submitted_job(struct starpu_task *task, unsigned footprint);
@@ -125,7 +125,7 @@ static struct sched_ctx_hypervisor_policy *_select_hypervisor_policy(struct sche
 
 
 /* initializez the performance counters that starpu will use to retrive hints for resizing */
-struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_hypervisor_policy *hypervisor_policy)
+struct starpu_sched_ctx_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_hypervisor_policy *hypervisor_policy)
 {
 	hypervisor.min_tasks = 0;
 	hypervisor.nsched_ctxs = 0;
@@ -158,6 +158,8 @@ struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_h
 			hypervisor.sched_ctx_w[i].pushed_tasks[j] = 0;
 			hypervisor.sched_ctx_w[i].poped_tasks[j] = 0;
 			hypervisor.sched_ctx_w[i].elapsed_flops[j] = 0.0;
+			hypervisor.sched_ctx_w[i].elapsed_data[j] = 0;
+			hypervisor.sched_ctx_w[i].elapsed_tasks[j] = 0;
 			hypervisor.sched_ctx_w[i].total_elapsed_flops[j] = 0.0;
 			hypervisor.sched_ctx_w[i].worker_to_be_removed[j] = 0;
 			hypervisor.sched_ctx_w[i].ref_velocity[j] = -1.0;
@@ -167,7 +169,7 @@ struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_h
 	struct sched_ctx_hypervisor_policy *selected_hypervisor_policy = _select_hypervisor_policy(hypervisor_policy);
 	_load_hypervisor_policy(selected_hypervisor_policy);
 
-	perf_counters = (struct starpu_performance_counters*)malloc(sizeof(struct starpu_performance_counters));
+	perf_counters = (struct starpu_sched_ctx_performance_counters*)malloc(sizeof(struct starpu_sched_ctx_performance_counters));
 	perf_counters->notify_idle_cycle = notify_idle_cycle;
 	perf_counters->notify_pushed_task = notify_pushed_task;
 	perf_counters->notify_poped_task = notify_poped_task;
@@ -175,7 +177,7 @@ struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_h
 	perf_counters->notify_idle_end = notify_idle_end;
 	perf_counters->notify_submitted_job = notify_submitted_job;
 
-	starpu_notify_hypervisor_exists();
+	starpu_sched_ctx_notify_hypervisor_exists();
 
 	return perf_counters;
 }
@@ -346,7 +348,7 @@ int sched_ctx_hypervisor_get_nworkers_ctx(unsigned sched_ctx, enum starpu_archty
 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
 	int worker;
 
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 
@@ -364,7 +366,14 @@ static void _set_elapsed_flops_per_sched_ctx(unsigned sched_ctx, double val)
 {
 	int i;
 	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	{
 		hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[i] = val;
+		if(val == 0)
+		{
+			hypervisor.sched_ctx_w[sched_ctx].elapsed_data[i] = 0;
+			hypervisor.sched_ctx_w[sched_ctx].elapsed_tasks[i] = 0;
+		}
+	}
 }
 
 double sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(struct sched_ctx_hypervisor_wrapper* sc_w)
@@ -396,7 +405,7 @@ void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sche
 	sender_sc_w->start_time = start_time;
 	sender_sc_w->remaining_flops = sender_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sender_sc_w);
 	_set_elapsed_flops_per_sched_ctx(sender_sched_ctx, 0.0);
-	
+
 	receiver_sc_w->start_time = start_time;
 	receiver_sc_w->remaining_flops = receiver_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(receiver_sc_w);
 	_set_elapsed_flops_per_sched_ctx(receiver_sched_ctx, 0.0);
@@ -410,19 +419,11 @@ void sched_ctx_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned recei
 	{
 		_print_current_time();
 		int j;
-		printf("resize ctx %d with", sender_sched_ctx);
+		printf("resize ctx %d with %d workers", sender_sched_ctx, nworkers_to_move);
 		for(j = 0; j < nworkers_to_move; j++)
 			printf(" %d", workers_to_move[j]);
 		printf("\n");
 
-/* 		int *cpus = (int*) malloc(nworkers_to_move * sizeof(int)); */
-/* 		int ncpus; */
-
-/* 		_get_cpus(workers_to_move, nworkers_to_move, cpus, &ncpus); */
-
-/* //		if(ncpus != 0) */
-/* //			starpu_sched_ctx_remove_workers(cpus, ncpus, sender_sched_ctx); */
-
 		starpu_sched_ctx_add_workers(workers_to_move, nworkers_to_move, receiver_sched_ctx);
 
 		if(now)
@@ -622,11 +623,11 @@ static unsigned _ack_resize_completed(unsigned sched_ctx, int worker)
 			   whatever the application says */
 			if(!((hypervisor.resize[sender_sched_ctx] == 0 || hypervisor.resize[receiver_sched_ctx] == 0) && imposed_resize))
 			{
-				int j;
-				printf("remove after ack from ctx %d:", sender_sched_ctx);
-				for(j = 0; j < nmoved_workers; j++)
-					printf(" %d", moved_workers[j]);
-				printf("\n");
+/* 				int j; */
+/* 				printf("remove after ack from ctx %d:", sender_sched_ctx); */
+/* 				for(j = 0; j < nmoved_workers; j++) */
+/* 					printf(" %d", moved_workers[j]); */
+/* 				printf("\n"); */
 
 				starpu_sched_ctx_remove_workers(moved_workers, nmoved_workers, sender_sched_ctx);
 
@@ -715,10 +716,12 @@ static void notify_pushed_task(unsigned sched_ctx, int worker)
 }
 
 /* notifies the hypervisor that a task was poped from the queue of the worker */
-static void notify_poped_task(unsigned sched_ctx, int worker, double elapsed_flops)
+static void notify_poped_task(unsigned sched_ctx, int worker, double elapsed_flops, size_t data_size)
 {
 	hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker]++;
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[worker] += elapsed_flops;
+	hypervisor.sched_ctx_w[sched_ctx].elapsed_data[worker] += data_size ;
+	hypervisor.sched_ctx_w[sched_ctx].elapsed_tasks[worker]++ ;
 	hypervisor.sched_ctx_w[sched_ctx].total_elapsed_flops[worker] += elapsed_flops;
 	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= elapsed_flops; //sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]);
 

+ 1 - 1
socl/examples/Makefile.am

@@ -14,7 +14,7 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS)
 LIBS = $(top_builddir)/socl/src/libsocl-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
 

+ 7 - 4
socl/src/cl_enqueuendrangekernel.c

@@ -164,9 +164,6 @@ soclEnqueueNDRangeKernel(cl_command_queue cq,
       cl_event beforeEvent, afterEvent, totalEvent;
 
       totalEvent = event_create();
-      totalEvent->prof_start = _socl_nanotime();
-      totalEvent->prof_submit = totalEvent->prof_start;
-      totalEvent->prof_queued = totalEvent->prof_start;
       gc_entity_store(&totalEvent->cq, cq);
 
       command_marker cmd = command_marker_create();
@@ -197,7 +194,7 @@ soclEnqueueNDRangeKernel(cl_command_queue cq,
          /* Store perf */
          cl_ulong start,end;
          soclGetEventProfilingInfo(beforeEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &start, NULL);
-         soclGetEventProfilingInfo(afterEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &end, NULL);
+         soclGetEventProfilingInfo(afterEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
          soclReleaseEvent(afterEvent);
 
          kernel->split_perfs[iter] = end-start;
@@ -205,6 +202,12 @@ soclEnqueueNDRangeKernel(cl_command_queue cq,
          pthread_mutex_unlock(&kernel->split_lock);
 
          event_complete(totalEvent);
+
+         totalEvent->prof_start = start;
+         totalEvent->prof_submit = start;
+         totalEvent->prof_queued = start;
+         totalEvent->prof_end = end;
+
          RETURN_EVENT(totalEvent,event);
       } else {
          soclReleaseEvent(totalEvent);

+ 1 - 1
src/Makefile.am

@@ -49,7 +49,7 @@ endif STARPU_HAVE_WINDOWS
 
 lib_LTLIBRARIES = libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CPPFLAGS = -I$(top_srcdir)/include/ -DBUILDING_STARPU -Werror=implicit
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CPPFLAGS = -I$(top_srcdir)/include/ -DBUILDING_STARPU
 
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CFLAGS = $(GLOBAL_AM_CFLAGS) $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS)
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = -lm $(HWLOC_LIBS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LIBS) $(STARPU_GLPK_LDFLAGS)

+ 10 - 2
src/core/jobs.c

@@ -144,6 +144,15 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	int workerid = starpu_worker_get_id();
+	int i;
+	size_t data_size = 0;
+	for(i = 0; i < STARPU_NMAXBUFS; i++)
+		if(task->handles[i] != NULL)
+			data_size += _starpu_data_get_size(task->handles[i]);
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
+
 	/* We release handle reference count */
 	if (task->cl)
 	{
@@ -210,8 +219,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 	{
 		_starpu_sched_post_exec_hook(task);
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
-		int workerid = starpu_worker_get_id();
-		starpu_call_poped_task_cb(workerid, task->sched_ctx, task->flops);
+		starpu_sched_ctx_call_poped_task_cb(workerid, task->sched_ctx, task->flops, data_size);
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 	}
 

+ 14 - 4
src/core/perfmodel/perfmodel_bus.c

@@ -1344,6 +1344,16 @@ static void write_bus_bandwidth_file_content(void)
 }
 #endif /* STARPU_SIMGRID */
 
+double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev)
+{
+	return bandwidth_matrix[0][cudadev+1];
+}
+
+double starpu_get_latency_RAM_CUDA(unsigned cudadev)
+{
+	return latency_matrix[0][cudadev+1];
+}
+
 void starpu_bus_print_bandwidth(FILE *f)
 {
 	unsigned src, dst, maxnode;
@@ -1397,14 +1407,14 @@ void starpu_bus_print_bandwidth(FILE *f)
 	{
 		struct dev_timing *timing;
 		struct _starpu_machine_config *config = _starpu_get_machine_config();
-		int ncpus = _starpu_topology_get_nhwcpu(config);
-		int cpu;
+		unsigned config_ncpus = _starpu_topology_get_nhwcpu(config);
+		unsigned cpu;
 
 #ifdef STARPU_USE_CUDA
 		if (src <= ncuda)
 		{
 			fprintf(f, "CUDA %d\t", src-1);
-			for (cpu = 0; cpu < ncpus; cpu++)
+			for (cpu = 0; cpu < config_ncpus; cpu++)
 			{
 				timing = &cudadev_timing_per_cpu[src*STARPU_MAXCPUS+cpu];
 				if (timing->timing_htod)
@@ -1420,7 +1430,7 @@ void starpu_bus_print_bandwidth(FILE *f)
 #ifdef STARPU_USE_OPENCL
 		{
 			fprintf(f, "OpenCL%d\t", src-ncuda-1);
-			for (cpu = 0; cpu < ncpus; cpu++)
+			for (cpu = 0; cpu < config_ncpus; cpu++)
 			{
 				timing = &opencldev_timing_per_cpu[(src-ncuda)*STARPU_MAXCPUS+cpu];
 				if (timing->timing_htod)

+ 31 - 14
src/core/perfmodel/perfmodel_history.c

@@ -180,7 +180,7 @@ static void scan_reg_model(FILE *f, struct starpu_perfmodel_regression_model *re
 
 static void dump_history_entry(FILE *f, struct starpu_perfmodel_history_entry *entry)
 {
-	fprintf(f, "%08x\t%-15lu\t%-15le\t%-15le\t%-15le\t%-15le\t%u\n", entry->footprint, (unsigned long) entry->size, entry->mean, entry->deviation, entry->sum, entry->sum2, entry->nsample);
+	fprintf(f, "%08x\t%-15lu\t%-15le\t%-15le\t%-15le\t%-15le\t%-15le\t%u\n", entry->footprint, (unsigned long) entry->size, entry->flops, entry->mean, entry->deviation, entry->sum, entry->sum2, entry->nsample);
 }
 
 static void scan_history_entry(FILE *f, struct starpu_perfmodel_history_entry *entry)
@@ -192,28 +192,36 @@ static void scan_history_entry(FILE *f, struct starpu_perfmodel_history_entry *e
 	/* In case entry is NULL, we just drop these values */
 	unsigned nsample;
 	uint32_t footprint;
-#ifdef STARPU_HAVE_WINDOWS
-	unsigned size; /* in bytes */
-#else
-	size_t size; /* in bytes */
-#endif
+	unsigned long size; /* in bytes */
+	double flops;
 	double mean;
 	double deviation;
 	double sum;
 	double sum2;
 
+	char line[256];
+	char *ret;
+
+	ret = fgets(line, sizeof(line), f);
+	STARPU_ASSERT(ret);
+	STARPU_ASSERT(strchr(line, '\n'));
+
 	/* Read the values from the file */
-	res = fscanf(f, "%x\t%"
-#ifndef STARPU_HAVE_WINDOWS
-	"z"
-#endif
-	"u\t%le\t%le\t%le\t%le\t%u\n", &footprint, &size, &mean, &deviation, &sum, &sum2, &nsample);
-	STARPU_ASSERT_MSG(res == 7, "Incorrect performance model file");
+	res = sscanf(line, "%x\t%lu\t%le\t%le\t%le\t%le\t%le\t%u", &footprint, &size, &flops, &mean, &deviation, &sum, &sum2, &nsample);
+
+	if (res != 8)
+	{
+		flops = 0.;
+		/* Read the values from the file */
+		res = sscanf(line, "%x\t%lu\t%le\t%le\t%le\t%le\t%u", &footprint, &size, &mean, &deviation, &sum, &sum2, &nsample);
+		STARPU_ASSERT_MSG(res == 7, "Incorrect performance model file");
+	}
 
 	if (entry)
 	{
 		entry->footprint = footprint;
 		entry->size = size;
+		entry->flops = flops;
 		entry->mean = mean;
 		entry->deviation = deviation;
 		entry->sum = sum;
@@ -393,7 +401,7 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, un
 	/* Dump the history into the model file in case it is necessary */
 	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
 	{
-		fprintf(f, "# hash\t\tsize\t\tmean\t\tdev\t\tsum\t\tsum2\t\tn\n");
+		fprintf(f, "# hash\t\tsize\t\tflops\t\tmean\t\tdev\t\tsum\t\tsum2\t\tn\n");
 		ptr = per_arch_model->list;
 		while (ptr)
 		{
@@ -956,7 +964,7 @@ int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *mo
 			char *symbol2 = strdup(symbol);
 			symbol2[dot-symbol] = '\0';
 			int ret;
-			fprintf(stderr,"note: loading history from %s instead of %s\n", symbol2, symbol);
+			_STARPU_DISP("note: loading history from %s instead of %s\n", symbol2, symbol);
 			ret = starpu_perfmodel_load_symbol(symbol2,model);
 			free(symbol2);
 			return ret;
@@ -1152,6 +1160,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 				entry->sum2 = measured*measured;
 
 				entry->size = _starpu_job_get_data_size(model, arch, nimpl, j);
+				entry->flops = j->task->flops;
 
 				entry->footprint = key;
 				entry->nsample = 1;
@@ -1168,6 +1177,14 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 				unsigned n = entry->nsample;
 				entry->mean = entry->sum / n;
 				entry->deviation = sqrt((entry->sum2 - (entry->sum*entry->sum)/n)/n);
+				if (j->task->flops != 0.)
+				{
+					if (entry->flops == 0.)
+						entry->flops = j->task->flops;
+					else if (entry->flops != j->task->flops)
+						/* Incoherent flops! forget about trying to record flops */
+						entry->flops = NAN;
+				}
 			}
 
 			STARPU_ASSERT(entry);

+ 4 - 4
src/core/perfmodel/perfmodel_print.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011, 2013  Université de Bordeaux 1
  * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
@@ -38,8 +38,8 @@ void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per
 			if (!parameter)
 			{
 				/* There isn't a parameter that is explicitely requested, so we display all parameters */
-				printf("%08x\t%-15lu\t%-15le\t%-15le\t%u\n", entry->footprint,
-					(unsigned long) entry->size, entry->mean, entry->deviation, entry->nsample);
+				printf("%08x\t%-15lu\t%-15le\t%-15le\t%-15le\t%u\n", entry->footprint,
+					(unsigned long) entry->size, entry->flops, entry->mean, entry->deviation, entry->nsample);
 			}
 			else
 			{
@@ -230,7 +230,7 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 		nmatched = sscanf(arch, "cuda_%d", &gpuid);
 		if (nmatched == 1)
 		{
-			unsigned archid = STARPU_CUDA_DEFAULT+ gpuid;
+			int archid = STARPU_CUDA_DEFAULT+ gpuid;
 			unsigned implid;
 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
 				starpu_perfmodel_print(model, (enum starpu_perf_archtype) archid, implid, parameter, footprint, output);

+ 35 - 30
src/core/sched_ctx.c

@@ -40,9 +40,9 @@ static void _starpu_worker_gets_into_ctx(unsigned sched_ctx_id, struct _starpu_w
 	unsigned worker_sched_ctx_id = _starpu_worker_get_sched_ctx_id(worker, sched_ctx_id);
 	/* the worker was planning to go away in another ctx but finally he changed his mind & 
 	   he's staying */
-	if(worker_sched_ctx_id  == STARPU_NMAX_SCHED_CTXS)
+	if (worker_sched_ctx_id  == STARPU_NMAX_SCHED_CTXS)
 	{
-		unsigned worker_sched_ctx_id = _starpu_worker_get_first_free_sched_ctx(worker);
+		worker_sched_ctx_id = _starpu_worker_get_first_free_sched_ctx(worker);
 		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 		/* add context to worker */
 		worker->sched_ctx[worker_sched_ctx_id] = sched_ctx;
@@ -124,7 +124,7 @@ static void _starpu_update_workers_without_ctx(int *workerids, int nworkers, int
 	return;
 }
 
-void starpu_stop_task_submission()
+void starpu_sched_ctx_stop_task_submission()
 {
 	_starpu_exclude_task_from_dag(&stop_submission_task);
 	_starpu_task_submit_internally(&stop_submission_task);
@@ -442,7 +442,7 @@ unsigned starpu_sched_ctx_create(const char *policy_name, int *workerids,
 }
 
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
-void starpu_set_perf_counters(unsigned sched_ctx_id, struct starpu_performance_counters *perf_counters)
+void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, struct starpu_sched_ctx_performance_counters *perf_counters)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	sched_ctx->perf_counters = perf_counters;
@@ -716,6 +716,8 @@ void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	int finished = _starpu_barrier_counter_decrement_until_empty_counter(&sched_ctx->tasks_barrier);
+/*when finished decrementing the tasks if the user signaled he will not submit tasks anymore
+  we can move all its workers to the inheritor context */
 	if(finished && sched_ctx->inheritor != STARPU_NMAX_SCHED_CTXS)
 	{
 		_STARPU_PTHREAD_MUTEX_LOCK(&finished_submit_mutex);
@@ -723,17 +725,20 @@ void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 		{
 			_STARPU_PTHREAD_MUTEX_UNLOCK(&finished_submit_mutex);
 
-			_STARPU_PTHREAD_MUTEX_LOCK(&changing_ctx_mutex[sched_ctx->id]);
-			int *workerids = NULL;
-			unsigned nworkers = _get_workers_list(sched_ctx, &workerids);
-
-			if(nworkers > 0)
+			/* take care the context is not deleted or changed at the same time */
+			_STARPU_PTHREAD_MUTEX_LOCK(&changing_ctx_mutex[sched_ctx_id]);
+			if(sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
 			{
-				starpu_sched_ctx_add_workers(workerids, nworkers, sched_ctx->inheritor);
-				free(workerids);
+				int *workerids = NULL;
+				unsigned nworkers = _get_workers_list(sched_ctx, &workerids);
+				
+				if(nworkers > 0)
+				{
+					starpu_sched_ctx_add_workers(workerids, nworkers, sched_ctx->inheritor);
+					free(workerids);
+				}
 			}
-
-			_STARPU_PTHREAD_MUTEX_UNLOCK(&changing_ctx_mutex[sched_ctx->id]);
+			_STARPU_PTHREAD_MUTEX_UNLOCK(&changing_ctx_mutex[sched_ctx_id]);
 
 			return;
 		}
@@ -748,12 +753,12 @@ void _starpu_increment_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 	_starpu_barrier_counter_increment(&sched_ctx->tasks_barrier);
 }
 
-void starpu_task_set_context(unsigned *sched_ctx)
+void starpu_sched_ctx_set_context(unsigned *sched_ctx)
 {
 	pthread_setspecific(sched_ctx_key, (void*)sched_ctx);
 }
 
-unsigned starpu_task_get_context()
+unsigned starpu_sched_ctx_get_context()
 {
 	unsigned *sched_ctx = (unsigned*)pthread_getspecific(sched_ctx_key);
 	if(sched_ctx == NULL)
@@ -762,12 +767,12 @@ unsigned starpu_task_get_context()
 	return *sched_ctx;
 }
 
-void starpu_notify_hypervisor_exists()
+void starpu_sched_ctx_notify_hypervisor_exists()
 {
 	with_hypervisor = 1;
 }
 
-unsigned starpu_check_if_hypervisor_exists()
+unsigned starpu_sched_ctx_check_if_hypervisor_exists()
 {
 	return with_hypervisor;
 }
@@ -797,7 +802,7 @@ struct starpu_sched_ctx_worker_collection* starpu_sched_ctx_create_worker_collec
 
 	switch(worker_collection_type)
 	{
-	case STARPU_WORKER_LIST:
+	case STARPU_SCHED_CTX_WORKER_LIST:
 		sched_ctx->workers->has_next = worker_list.has_next;
 		sched_ctx->workers->get_next = worker_list.get_next;
 		sched_ctx->workers->add = worker_list.add;
@@ -805,7 +810,7 @@ struct starpu_sched_ctx_worker_collection* starpu_sched_ctx_create_worker_collec
 		sched_ctx->workers->init = worker_list.init;
 		sched_ctx->workers->deinit = worker_list.deinit;
 		sched_ctx->workers->init_iterator = worker_list.init_iterator;
-		sched_ctx->workers->type = STARPU_WORKER_LIST;
+		sched_ctx->workers->type = STARPU_SCHED_CTX_WORKER_LIST;
 		break;
 	}
 
@@ -818,7 +823,7 @@ static unsigned _get_workers_list(struct _starpu_sched_ctx *sched_ctx, int **wor
 	*workerids = (int*)malloc(workers->nworkers*sizeof(int));
 	int worker;
 	unsigned nworkers = 0;
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 
@@ -851,7 +856,7 @@ int starpu_get_workers_of_sched_ctx(unsigned sched_ctx_id, int *pus, enum starpu
 	int worker;
 
 	int npus = 0;
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 
@@ -866,7 +871,7 @@ int starpu_get_workers_of_sched_ctx(unsigned sched_ctx_id, int *pus, enum starpu
 	return npus;
 }
 
-_starpu_pthread_mutex_t* starpu_get_changing_ctx_mutex(unsigned sched_ctx_id)
+_starpu_pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id)
 {
 	return &changing_ctx_mutex[sched_ctx_id];
 }
@@ -891,7 +896,7 @@ unsigned starpu_sched_ctx_get_nshared_workers(unsigned sched_ctx_id, unsigned sc
         int worker, worker2;
         int shared_workers = 0;
 
-	struct starpu_iterator it1, it2;
+	struct starpu_sched_ctx_iterator it1, it2;
         if(workers->init_iterator)
                 workers->init_iterator(workers, &it1);
 
@@ -926,7 +931,7 @@ unsigned starpu_sched_ctx_contains_worker(int workerid, unsigned sched_ctx_id)
         struct starpu_sched_ctx_worker_collection *workers = sched_ctx->workers;
         int worker;
 
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
         if(workers->init_iterator)
                 workers->init_iterator(workers, &it);
 
@@ -963,7 +968,7 @@ unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid)
 	return worker->nsched_ctxs > 1;
 }
 
-unsigned starpu_is_ctxs_turn(int workerid, unsigned sched_ctx_id)
+unsigned starpu_sched_ctx_is_ctxs_turn(int workerid, unsigned sched_ctx_id)
 {
 	if(max_time_worker_on_ctx == -1.0) return 1;
 
@@ -971,7 +976,7 @@ unsigned starpu_is_ctxs_turn(int workerid, unsigned sched_ctx_id)
 	return worker->active_ctx == sched_ctx_id;
 }
 
-void starpu_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id)
+void starpu_sched_ctx_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id)
 {
 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
 
@@ -996,7 +1001,7 @@ void starpu_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id)
 	}
 }
 
-double starpu_get_max_time_worker_on_ctx(void)
+double starpu_sched_ctx_get_max_time_worker_on_ctx(void)
 {
 	return max_time_worker_on_ctx;
 }
@@ -1020,15 +1025,15 @@ void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id)
 
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 
-void starpu_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops)
+void starpu_sched_ctx_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops, size_t data_size)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	if(sched_ctx != NULL && sched_ctx_id != 0 && sched_ctx_id != STARPU_NMAX_SCHED_CTXS
 	   && sched_ctx->perf_counters != NULL)
-		sched_ctx->perf_counters->notify_poped_task(sched_ctx_id, workerid, flops);
+		sched_ctx->perf_counters->notify_poped_task(sched_ctx_id, workerid, flops, data_size);
 }
 
-void starpu_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)
+void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 

+ 2 - 2
src/core/sched_ctx.h

@@ -91,7 +91,7 @@ struct _starpu_sched_ctx
 
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 	/* a structure containing a series of performance counters determining the resize procedure */
-	struct starpu_performance_counters *perf_counters;
+	struct starpu_sched_ctx_performance_counters *perf_counters;
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 };
 
@@ -139,7 +139,7 @@ void _starpu_worker_gets_out_of_ctx(unsigned sched_ctx_id, struct _starpu_worker
 unsigned _starpu_worker_belongs_to_a_sched_ctx(int workerid, unsigned sched_ctx_id);
 
 #if defined(_MSC_VER) || defined(STARPU_SIMGRID)
-_starpu_pthread_mutex_t* starpu_get_changing_ctx_mutex(unsigned sched_ctx_id);
+_starpu_pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
 #endif
 
 #endif // __SCHED_CONTEXT_H__

+ 15 - 16
src/core/sched_policy.c

@@ -225,7 +225,7 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 	}
 
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
-	starpu_call_pushed_task_cb(workerid, task->sched_ctx);
+	starpu_sched_ctx_call_pushed_task_cb(workerid, task->sched_ctx);
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 
 	if (is_basic_worker)
@@ -233,7 +233,6 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 		unsigned node = starpu_worker_get_memory_node(workerid);
 		if (_starpu_task_uses_multiformat_handles(task))
 		{
-			unsigned i;
 			for (i = 0; i < task->cl->nbuffers; i++)
 			{
 				struct starpu_task *conversion_task;
@@ -269,24 +268,24 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 
 		int ret = 0;
 
-		struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
-		j->task_size = worker_size;
-		j->combined_workerid = workerid;
-		j->active_task_alias_count = 0;
+		struct _starpu_job *job = _starpu_get_job_associated_to_task(task);
+		job->task_size = worker_size;
+		job->combined_workerid = workerid;
+		job->active_task_alias_count = 0;
 
-		_STARPU_PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
-		_STARPU_PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
+		_STARPU_PTHREAD_BARRIER_INIT(&job->before_work_barrier, NULL, worker_size);
+		_STARPU_PTHREAD_BARRIER_INIT(&job->after_work_barrier, NULL, worker_size);
 
 		/* Note: we have to call that early, or else the task may have
 		 * disappeared already */
 		_starpu_push_task_end(task);
 
-		int i;
-		for (i = 0; i < worker_size; i++)
+		int j;
+		for (j = 0; j < worker_size; j++)
 		{
 			struct starpu_task *alias = _starpu_create_task_alias(task);
 
-			worker = _starpu_get_worker_struct(combined_workerid[i]);
+			worker = _starpu_get_worker_struct(combined_workerid[j]);
 			ret |= _starpu_push_local_task(worker, alias, 0);
 		}
 
@@ -299,14 +298,14 @@ static int _starpu_nworkers_able_to_execute_task(struct starpu_task *task, struc
 	int worker = -1, nworkers = 0;
 	struct starpu_sched_ctx_worker_collection *workers = sched_ctx->workers;
 
-	struct starpu_iterator it;
+	struct starpu_sched_ctx_iterator it;
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 
 	while(workers->has_next(workers, &it))
 	{
 		worker = workers->get_next(workers, &it);
-		if (starpu_worker_can_execute_task(worker, task, 0) && starpu_is_ctxs_turn(worker, sched_ctx->id))
+		if (starpu_worker_can_execute_task(worker, task, 0) && starpu_sched_ctx_is_ctxs_turn(worker, sched_ctx->id))
 			nworkers++;
 	}
 
@@ -563,7 +562,7 @@ pick:
 	{
 		struct _starpu_sched_ctx *sched_ctx;
 
-		unsigned lucky_ctx = STARPU_NMAX_SCHED_CTXS;
+		//unsigned lucky_ctx = STARPU_NMAX_SCHED_CTXS;
 
 		int been_here[STARPU_NMAX_SCHED_CTXS];
 		int i;
@@ -582,7 +581,7 @@ pick:
 				if (sched_ctx->sched_policy && sched_ctx->sched_policy->pop_task)
 				{
 					task = sched_ctx->sched_policy->pop_task(sched_ctx->id);
-					lucky_ctx = sched_ctx->id;
+					//lucky_ctx = sched_ctx->id;
 				}
 			}
 
@@ -605,7 +604,7 @@ pick:
 
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 	struct _starpu_sched_ctx *sched_ctx = NULL;
-	struct starpu_performance_counters *perf_counters = NULL;
+	struct starpu_sched_ctx_performance_counters *perf_counters = NULL;
 	int j;
 	for(j = 0; j < STARPU_NMAX_SCHED_CTXS; j++)
 	{

+ 12 - 2
src/core/task.c

@@ -365,7 +365,7 @@ int starpu_task_submit(struct starpu_task *task)
 
 	if (task->sched_ctx == 0 && nsched_ctxs != 1 && !j->exclude_from_dag)
 	{
-		set_sched_ctx = starpu_task_get_context();
+		set_sched_ctx = starpu_sched_ctx_get_context();
 		if (set_sched_ctx != STARPU_NMAX_SCHED_CTXS)
 			task->sched_ctx = set_sched_ctx;
 	}
@@ -633,7 +633,7 @@ void starpu_display_codelet_stats(struct starpu_codelet *cl)
 int starpu_task_wait_for_all(void)
 {
 	unsigned nsched_ctxs = _starpu_get_nsched_ctxs();
-	unsigned sched_ctx_id = nsched_ctxs == 1 ? 0 : starpu_task_get_context();
+	unsigned sched_ctx_id = nsched_ctxs == 1 ? 0 : starpu_sched_ctx_get_context();
 
 	/* if there is no indication about which context to wait,
 	   we wait for all tasks submitted to starpu */
@@ -745,6 +745,11 @@ static void _starpu_increment_nsubmitted_tasks(void)
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
 }
 
+int starpu_task_nsubmitted(void)
+{
+	return nsubmitted;
+}
+
 void _starpu_increment_nready_tasks(void)
 {
 	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
@@ -765,6 +770,11 @@ void _starpu_decrement_nready_tasks(void)
 
 }
 
+int starpu_task_nready(void)
+{
+	return nready;
+}
+
 void _starpu_initialize_current_task_key(void)
 {
 	_STARPU_PTHREAD_KEY_CREATE(&current_task_key, NULL);

+ 24 - 24
src/core/workers.c

@@ -84,15 +84,15 @@ static uint32_t _starpu_worker_exists_and_can_execute(struct starpu_task *task,
 			switch (arch)
 			{
 			case STARPU_CPU_WORKER:
-				if (task->cl->cpu_funcs[i] != NULL)
+				if (task->cl->cpu_funcs[impl] != NULL)
 					test_implementation = 1;
 				break;
 			case STARPU_CUDA_WORKER:
-				if (task->cl->cuda_funcs[i] != NULL)
+				if (task->cl->cuda_funcs[impl] != NULL)
 					test_implementation = 1;
 				break;
 			case STARPU_OPENCL_WORKER:
-				if (task->cl->opencl_funcs[i] != NULL)
+				if (task->cl->opencl_funcs[impl] != NULL)
 					test_implementation = 1;
 				break;
 			default:
@@ -340,14 +340,14 @@ void _starpu_worker_init(struct _starpu_worker *worker, unsigned fut_key)
 
 }
 
-static void _starpu_launch_drivers(struct _starpu_machine_config *config)
+static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 {
-	config->running = 1;
-	config->submitting = 1;
+	pconfig->running = 1;
+	pconfig->submitting = 1;
 
 	_STARPU_PTHREAD_KEY_CREATE(&worker_key, NULL);
 
-	unsigned nworkers = config->topology.nworkers;
+	unsigned nworkers = pconfig->topology.nworkers;
 
 	/* Launch workers asynchronously */
 	unsigned cpu = 0, cuda = 0;
@@ -368,9 +368,9 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
 	for (worker = 0; worker < nworkers; worker++)
 	{
-		struct _starpu_worker *workerarg = &config->workers[worker];
+		struct _starpu_worker *workerarg = &pconfig->workers[worker];
 
-		workerarg->config = config;
+		workerarg->config = pconfig;
 
 		_starpu_barrier_counter_init(&workerarg->tasks_barrier, 0);
 
@@ -388,7 +388,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 		workerarg->run_by_starpu = 1;
 		workerarg->worker_is_running = 0;
 		workerarg->worker_is_initialized = 0;
-		
+
 		int ctx;
 		for(ctx = 0; ctx < STARPU_NMAX_SCHED_CTXS; ctx++)
 			workerarg->removed_from_ctx[ctx] = 0;
@@ -419,7 +419,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 			case STARPU_CPU_WORKER:
 				workerarg->set = NULL;
 				driver.id.cpu_id = cpu;
-				if (_starpu_may_launch_driver(config->conf, &driver))
+				if (_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
 					_STARPU_PTHREAD_CREATE_ON(
 						workerarg->name,
@@ -446,7 +446,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 			case STARPU_CUDA_WORKER:
 				workerarg->set = NULL;
 				driver.id.cuda_id = cuda;
-				if (_starpu_may_launch_driver(config->conf, &driver))
+				if (_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
 					_STARPU_PTHREAD_CREATE_ON(
 						workerarg->name,
@@ -473,7 +473,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 			case STARPU_OPENCL_WORKER:
 #ifndef STARPU_SIMGRID
 				starpu_opencl_get_device(workerarg->devid, &driver.id.opencl_id);
-				if (!_starpu_may_launch_driver(config->conf, &driver))
+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
 					workerarg->run_by_starpu = 0;
 					break;
@@ -504,7 +504,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 	cuda = 0;
 	for (worker = 0; worker < nworkers; worker++)
 	{
-		struct _starpu_worker *workerarg = &config->workers[worker];
+		struct _starpu_worker *workerarg = &pconfig->workers[worker];
 		struct starpu_driver driver;
 		driver.type = workerarg->arch;
 
@@ -512,7 +512,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 		{
 			case STARPU_CPU_WORKER:
 				driver.id.cpu_id = cpu;
-				if (!_starpu_may_launch_driver(config->conf, &driver))
+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
 					cpu++;
 					break;
@@ -526,7 +526,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 				break;
 			case STARPU_CUDA_WORKER:
 				driver.id.cuda_id = cuda;
-				if (!_starpu_may_launch_driver(config->conf, &driver))
+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
 					cuda++;
 					break;
@@ -542,7 +542,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 			case STARPU_OPENCL_WORKER:
 #ifndef STARPU_SIMGRID
 				starpu_opencl_get_device(workerarg->devid, &driver.id.opencl_id);
-				if (!_starpu_may_launch_driver(config->conf, &driver))
+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
 					break;
 #endif
 				_STARPU_DEBUG("waiting for worker %u initialization\n", worker);
@@ -817,19 +817,19 @@ void starpu_profiling_init()
  * Handle runtime termination
  */
 
-static void _starpu_terminate_workers(struct _starpu_machine_config *config)
+static void _starpu_terminate_workers(struct _starpu_machine_config *pconfig)
 {
 	int status STARPU_ATTRIBUTE_UNUSED;
 	unsigned workerid;
 
-	for (workerid = 0; workerid < config->topology.nworkers; workerid++)
+	for (workerid = 0; workerid < pconfig->topology.nworkers; workerid++)
 	{
 		starpu_wake_all_blocked_workers();
 
 		_STARPU_DEBUG("wait for worker %u\n", workerid);
 
-		struct _starpu_worker_set *set = config->workers[workerid].set;
-		struct _starpu_worker *worker = &config->workers[workerid];
+		struct _starpu_worker_set *set = pconfig->workers[workerid].set;
+		struct _starpu_worker *worker = &pconfig->workers[workerid];
 
 		/* in case StarPU termination code is called from a callback,
  		 * we have to check if pthread_self() is the worker itself */
@@ -914,10 +914,10 @@ unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED)
 #endif
 }
 
-static void _starpu_kill_all_workers(struct _starpu_machine_config *config)
+static void _starpu_kill_all_workers(struct _starpu_machine_config *pconfig)
 {
 	/* set the flag which will tell workers to stop */
-	config->running = 0;
+	pconfig->running = 0;
 	/* running is just protected by a memory barrier */
 	STARPU_WMB();
 	starpu_wake_all_blocked_workers();
@@ -1299,7 +1299,7 @@ int starpu_worker_get_nids_ctx_free_by_type(enum starpu_archtype type, int *work
 				if(config.sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
 				{
 					struct starpu_sched_ctx_worker_collection *workers = config.sched_ctxs[s].workers;
-					struct starpu_iterator it;
+					struct starpu_sched_ctx_iterator it;
 					if(workers->init_iterator)
 						workers->init_iterator(workers, &it);
 

+ 1 - 0
src/datawizard/coherency.c

@@ -458,6 +458,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 			}
 		}
 		else
+			/* The last request will perform the callback after termination */
 			_starpu_data_request_append_callback(r, callback_func, callback_arg);
 
 

+ 147 - 35
src/datawizard/copy_driver.c

@@ -134,8 +134,10 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 	{
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
 		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
-		STARPU_ASSERT(copy_methods->ram_to_ram);
-		copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
+		if (copy_methods->ram_to_ram)
+			copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
 		break;
 #ifdef STARPU_USE_CUDA
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
@@ -143,11 +145,15 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 #if !defined(HAVE_CUDA_MEMCPY_PEER)
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
 #endif
-		STARPU_ASSERT(copy_methods->cuda_to_ram);
-		if (!req || !copy_methods->cuda_to_ram_async)
+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
+				!(copy_methods->cuda_to_ram_async || copy_methods->any_to_any))
 		{
 			/* this is not associated to a request so it's synchronous */
-			copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
+			STARPU_ASSERT(copy_methods->cuda_to_ram || copy_methods->any_to_any);
+			if (copy_methods->cuda_to_ram)
+				copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
+			else
+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
 		}
 		else
 		{
@@ -156,7 +162,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 
 			stream = starpu_cuda_get_local_out_transfer_stream();
-			ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
+			if (copy_methods->cuda_to_ram_async)
+				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
 
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
@@ -168,11 +180,15 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 #if !defined(HAVE_CUDA_MEMCPY_PEER)
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
 #endif
-		STARPU_ASSERT(copy_methods->ram_to_cuda);
-		if (!req || !copy_methods->ram_to_cuda_async)
+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
+				!(copy_methods->ram_to_cuda_async || copy_methods->any_to_any))
 		{
 			/* this is not associated to a request so it's synchronous */
-			copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
+			STARPU_ASSERT(copy_methods->ram_to_cuda || copy_methods->any_to_any);
+			if (copy_methods->ram_to_cuda)
+				copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
+			else
+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
 		}
 		else
 		{
@@ -182,7 +198,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 				STARPU_CUDA_REPORT_ERROR(cures);
 
 			stream = starpu_cuda_get_local_in_transfer_stream();
-			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+			if (copy_methods->ram_to_cuda_async)
+				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
 
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 			if (STARPU_UNLIKELY(cures != cudaSuccess))
@@ -191,12 +213,15 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 		break;
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
 		/* CUDA - CUDA transfer */
-		STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->cuda_to_cuda_async);
-		if (!req || !copy_methods->cuda_to_cuda_async)
+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
+				!(copy_methods->cuda_to_cuda_async || copy_methods->any_to_any))
 		{
-			STARPU_ASSERT(copy_methods->cuda_to_cuda);
+			STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->any_to_any);
 			/* this is not associated to a request so it's synchronous */
-			copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
+			if (copy_methods->cuda_to_cuda)
+				copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
+			else
+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
 		}
 		else
 		{
@@ -205,7 +230,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 
 			stream = starpu_cuda_get_local_peer_transfer_stream();
-			ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+			if (copy_methods->cuda_to_cuda_async)
+				ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
 
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
@@ -215,54 +246,77 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 #ifdef STARPU_USE_OPENCL
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
 		/* OpenCL -> RAM */
-		if (_starpu_memory_node_get_local_key() == src_node)
+		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
+				!(copy_methods->opencl_to_ram_async || copy_methods->any_to_any))
 		{
-			STARPU_ASSERT(copy_methods->opencl_to_ram);
-			if (!req || !copy_methods->opencl_to_ram_async)
-			{
-				/* this is not associated to a request so it's synchronous */
+			STARPU_ASSERT(copy_methods->opencl_to_ram || copy_methods->any_to_any);
+			/* this is not associated to a request so it's synchronous */
+			if (copy_methods->opencl_to_ram)
 				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
-			}
 			else
-			{
-				req->async_channel.type = STARPU_OPENCL_RAM;
-				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
-			}
+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
 		}
 		else
 		{
-			/* we should not have a blocking call ! */
-			STARPU_ABORT();
+			req->async_channel.type = STARPU_OPENCL_RAM;
+			if (copy_methods->opencl_to_ram_async)
+				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
 		}
 		break;
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
 		/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
-		STARPU_ASSERT(copy_methods->ram_to_opencl);
-		if (!req || !copy_methods->ram_to_opencl_async)
+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
+				!(copy_methods->ram_to_opencl_async || copy_methods->any_to_any))
 		{
+			STARPU_ASSERT(copy_methods->ram_to_opencl || copy_methods->any_to_any);
 			/* this is not associated to a request so it's synchronous */
-			copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
+			if (copy_methods->ram_to_opencl)
+				copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
+			else
+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
 		}
 		else
 		{
 			req->async_channel.type = STARPU_OPENCL_RAM;
-			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+			if (copy_methods->ram_to_opencl_async)
+				ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
 		}
 		break;
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
 		/* STARPU_OPENCL_RAM -> STARPU_OPENCL_RAM */
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node || _starpu_memory_node_get_local_key() == src_node);
-		STARPU_ASSERT(copy_methods->opencl_to_opencl);
-		if (!req || !copy_methods->opencl_to_opencl_async)
+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
+				!(copy_methods->opencl_to_opencl_async || copy_methods->any_to_any))
 		{
+			STARPU_ASSERT(copy_methods->opencl_to_opencl || copy_methods->any_to_any);
 			/* this is not associated to a request so it's synchronous */
-			copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
+			if (copy_methods->opencl_to_opencl)
+				copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
+			else
+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
 		}
 		else
 		{
 			req->async_channel.type = STARPU_OPENCL_RAM;
-			ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+			if (copy_methods->opencl_to_opencl_async)
+				ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
 		}
 		break;
 #endif
@@ -331,6 +385,64 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 	return 0;
 }
 
+/* This can be used by interfaces to easily transfer a piece of data without
+ * caring about the particular CUDA/OpenCL methods.  */
+
+int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data)
+{
+	struct _starpu_async_channel *async_channel = async_data;
+	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
+	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
+
+	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
+	{
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
+		memcpy((void *) dst + dst_offset, (void *) src + src_offset, size);
+		return 0;
+
+#ifdef STARPU_USE_CUDA
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
+		return starpu_cuda_copy_async_sync(
+				(void*) src + src_offset, src_node,
+				(void*) dst + dst_offset, dst_node,
+				size,
+				async_channel?starpu_cuda_get_local_out_transfer_stream():NULL,
+				cudaMemcpyDeviceToHost);
+
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
+		return starpu_cuda_copy_async_sync(
+				(void*) src + src_offset, src_node,
+				(void*) dst + dst_offset, dst_node,
+				size,
+				async_channel?starpu_cuda_get_local_in_transfer_stream():NULL,
+				cudaMemcpyHostToDevice);
+
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
+		return starpu_cuda_copy_async_sync(
+				(void*) src + src_offset, src_node,
+				(void*) dst + dst_offset, dst_node,
+				size,
+				async_channel?starpu_cuda_get_local_peer_transfer_stream():NULL,
+				cudaMemcpyDeviceToDevice);
+
+#endif
+#ifdef STARPU_USE_OPENCL
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
+		return starpu_opencl_copy_async_sync(
+				src, src_offset, src_node,
+				dst, dst_offset, dst_node,
+				size,
+				&async_channel->event.opencl_event);
+#endif
+	default:
+		STARPU_ABORT();
+		return -1;
+	}
+	return 0;
+}
+
 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel)
 {
 #ifdef STARPU_SIMGRID

+ 14 - 127
src/datawizard/interfaces/bcsr_interface.c

@@ -31,31 +31,11 @@
  * BCSR : blocked CSR, we use blocks of size (r x c)
  */
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-#ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-#endif
-#ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-#endif
-
-static struct starpu_data_copy_methods bcsr_copy_data_methods_s =
+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
+
+static const struct starpu_data_copy_methods bcsr_copy_data_methods_s =
 {
-	.ram_to_ram = copy_ram_to_ram,
-#ifdef STARPU_USE_CUDA
-	.ram_to_cuda = copy_ram_to_cuda,
-	.cuda_to_ram = copy_cuda_to_ram,
-	.cuda_to_cuda = copy_cuda_to_cuda,
-#endif
-#ifdef STARPU_USE_OPENCL
-	.ram_to_opencl = copy_ram_to_opencl,
-	.opencl_to_ram = copy_opencl_to_ram,
-	.opencl_to_opencl = copy_opencl_to_opencl,
-#endif
+	.any_to_any = copy_any_to_any,
 };
 
 static void register_bcsr_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
@@ -315,105 +295,7 @@ static void free_bcsr_buffer_on_node(void *data_interface, unsigned node)
 	starpu_free_buffer_on_node(node, (uintptr_t) bcsr_interface->rowptr, (nrow+1)*sizeof(uint32_t));
 }
 
-#ifdef STARPU_USE_CUDA
-static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
-{
-	struct starpu_bcsr_interface *src_bcsr = src_interface;
-	struct starpu_bcsr_interface *dst_bcsr = dst_interface;
-
-	uint32_t nnz = src_bcsr->nnz;
-	uint32_t nrow = src_bcsr->nrow;
-	size_t elemsize = src_bcsr->elemsize;
-
-	uint32_t r = src_bcsr->r;
-	uint32_t c = src_bcsr->c;
-
-	cudaError_t cures;
-
-	cures = cudaMemcpy((char *)dst_bcsr->nzval, (char *)src_bcsr->nzval, nnz*r*c*elemsize, kind);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	cures = cudaMemcpy((char *)dst_bcsr->colind, (char *)src_bcsr->colind, nnz*sizeof(uint32_t), kind);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	cures = cudaMemcpy((char *)dst_bcsr->rowptr, (char *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
-
-	return 0;
-}
-
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
-}
-
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
-}
-
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
-}
-#endif // STARPU_USE_CUDA
-
-#ifdef STARPU_USE_OPENCL
-static int copy_opencl_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	struct starpu_bcsr_interface *src_bcsr = src_interface;
-	struct starpu_bcsr_interface *dst_bcsr = dst_interface;
-
-	uint32_t nnz = src_bcsr->nnz;
-	uint32_t nrow = src_bcsr->nrow;
-	size_t elemsize = src_bcsr->elemsize;
-
-	uint32_t r = src_bcsr->r;
-	uint32_t c = src_bcsr->c;
-
-        int err;
-
-	err = starpu_opencl_copy_async_sync(src_bcsr->nzval, src_node, 0, dst_bcsr->nzval, dst_node, 0, nnz*r*c*elemsize, NULL);
-	if (STARPU_UNLIKELY(err))
-		STARPU_OPENCL_REPORT_ERROR(err);
-
-	err = starpu_opencl_copy_async_sync((uintptr_t)src_bcsr->colind, src_node, 0, (uintptr_t)dst_bcsr->colind, dst_node, 0, nnz*sizeof(uint32_t), NULL);
-	if (STARPU_UNLIKELY(err))
-		STARPU_OPENCL_REPORT_ERROR(err);
-
-	err = starpu_opencl_copy_async_sync((uintptr_t)src_bcsr->rowptr, src_node, 0, (uintptr_t)dst_bcsr->rowptr, dst_node, 0, (nrow+1)*sizeof(uint32_t), NULL);
-	if (STARPU_UNLIKELY(err))
-		STARPU_OPENCL_REPORT_ERROR(err);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
-
-	return 0;
-}
-
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
-}
-
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
-}
-
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
-}
-
-#endif // STARPU_USE_OPENCL
-
-/* as not all platform easily have a BLAS lib installed ... */
-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
 {
 	struct starpu_bcsr_interface *src_bcsr = (struct starpu_bcsr_interface *) src_interface;
 	struct starpu_bcsr_interface *dst_bcsr = (struct starpu_bcsr_interface *) dst_interface;
@@ -425,13 +307,18 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 	uint32_t r = src_bcsr->r;
 	uint32_t c = src_bcsr->c;
 
-	memcpy((void *)dst_bcsr->nzval, (void *)src_bcsr->nzval, nnz*elemsize*r*c);
+	int ret = 0;
+
+	if (starpu_interface_copy(src_bcsr->nzval, 0, src_node, dst_bcsr->nzval, 0, dst_node, nnz*elemsize*r*c, async_data))
+		ret = -EAGAIN;
 
-	memcpy((void *)dst_bcsr->colind, (void *)src_bcsr->colind, nnz*sizeof(uint32_t));
+	if (starpu_interface_copy((uintptr_t)src_bcsr->colind, 0, src_node, (uintptr_t)dst_bcsr->colind, 0, dst_node, nnz*sizeof(uint32_t), async_data))
+		ret = -EAGAIN;
 
-	memcpy((void *)dst_bcsr->rowptr, (void *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t));
+	if (starpu_interface_copy((uintptr_t)src_bcsr->rowptr, 0, src_node, (uintptr_t)dst_bcsr->rowptr, 0, dst_node, (nrow+1)*sizeof(uint32_t), async_data))
+		ret = -EAGAIN;
 
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize*r*c + (nnz+nrow+1)*sizeof(uint32_t));
 
-	return 0;
+	return ret;
 }

+ 9 - 7
src/datawizard/interfaces/block_interface.c

@@ -44,7 +44,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 #endif
 
-static struct starpu_data_copy_methods block_copy_data_methods_s =
+static const struct starpu_data_copy_methods block_copy_data_methods_s =
 {
 	.ram_to_ram = copy_ram_to_ram,
 #ifdef STARPU_USE_CUDA
@@ -350,7 +350,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	}
 	else
 	{
-		/* Default case: we transfer all lines one by one: ny*nz transfers */
+		/* Default case: we transfer all blocks one by one: nz transfers */
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		{
@@ -420,7 +420,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 	}
 	else
 	{
-		/* Default case: we transfer all lines one by one: ny*nz transfers */
+		/* Default case: we transfer all blocks one by one: nz 2D transfers */
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		{
@@ -514,8 +514,8 @@ static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_
 		/* Is that a single contiguous buffer ? */
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
-			ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_node, src_block->offset,
-								dst_block->dev_handle, dst_node, dst_block->offset,
+			ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_block->offset, src_node,
+								dst_block->dev_handle, dst_block->offset, dst_node,
 							       src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
 							       event);
                 }
@@ -535,10 +535,12 @@ static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_
                         unsigned j;
                         for(j=0 ; j<src_block->ny ; j++)
 			{
-				ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_node,
+				ret = starpu_opencl_copy_async_sync(src_block->dev_handle,
 								    src_block->offset + layer*src_block->ldz*src_block->elemsize + j*src_block->ldy*src_block->elemsize,
-								    dst_block->dev_handle, dst_node,
+								    src_node,
+								    dst_block->dev_handle,
 								    dst_block->offset + layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize,
+								    dst_node,
 								       src_block->nx*src_block->elemsize,
 								       event);
                         }

+ 19 - 247
src/datawizard/interfaces/coo_interface.c

@@ -19,190 +19,36 @@
 #include <datawizard/memalloc.h>
 
 static int
-copy_ram_to_ram(void *src_interface, STARPU_ATTRIBUTE_UNUSED unsigned src_node,
-		void *dst_interface, STARPU_ATTRIBUTE_UNUSED unsigned dst_node)
+copy_any_to_any(void *src_interface, unsigned src_node,
+		void *dst_interface, unsigned dst_node, void *async_data)
 {
 	size_t size = 0;
 	struct starpu_coo_interface *src_coo, *dst_coo;
-
-	src_coo = (struct starpu_coo_interface *) src_interface;
-	dst_coo = (struct starpu_coo_interface *) dst_interface;
-
-	size = src_coo->n_values * sizeof(src_coo->columns[0]);
-	memcpy((void *) dst_coo->columns, (void *) src_coo->columns, size);
-
-	/* sizeof(src_coo->columns[0]) == sizeof(src_coo->rows[0]) */
-	memcpy((void *) dst_coo->rows, (void *) src_coo->rows, size);
-
-	size = src_coo->n_values * src_coo->elemsize;
-	memcpy((void *) dst_coo->values, (void *) src_coo->values, size);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node,
-		src_coo->n_values *
-		(2 * sizeof(src_coo->rows[0]) + src_coo->elemsize));
-
-	return 0;
-}
-
-#ifdef STARPU_USE_CUDA
-static int
-copy_cuda_async_sync(void *src_interface, unsigned src_node,
-		     void *dst_interface, unsigned dst_node,
-		     cudaStream_t stream, enum cudaMemcpyKind kind)
-{
-	int ret;
-	size_t size = 0;
-	struct starpu_coo_interface *src_coo, *dst_coo;
-
-	src_coo = (struct starpu_coo_interface *) src_interface;
-	dst_coo = (struct starpu_coo_interface *) dst_interface;
-
-	size = src_coo->n_values * sizeof(src_coo->columns[0]);
-	ret = starpu_cuda_copy_async_sync(
-		(void *) src_coo->columns,
-		src_node,
-		(void *) dst_coo->columns,
-		dst_node,
-		size,
-		stream,
-		kind);
-	if (ret == 0)
-		stream = NULL;
-
-	/* sizeof(src_coo->columns[0]) == sizeof(src_coo->rows[0]) */
-	ret = starpu_cuda_copy_async_sync(
-		(void *) src_coo->rows,
-		src_node,
-		(void *) dst_coo->rows,
-		dst_node,
-		size,
-		stream,
-		kind);
-	if (ret == 0)
-		stream = NULL;
-
-	size = src_coo->n_values * src_coo->elemsize;
-	ret = starpu_cuda_copy_async_sync(
-		(void *) src_coo->values,
-		src_node,
-		(void *) dst_coo->values,
-		dst_node,
-		size,
-		stream,
-		kind);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node,
-		src_coo->n_values *
-		(2 * sizeof(src_coo->rows[0]) + src_coo->elemsize));
-	return ret;
-}
-
-static int
-copy_ram_to_cuda(void *src_interface, unsigned src_node,
-		 void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node,
-				    dst_interface, dst_node,
-				    NULL, cudaMemcpyHostToDevice);
-}
-
-static int
-copy_cuda_to_ram(void *src_interface, unsigned src_node,
-		 void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node,
-				    dst_interface, dst_node,
-				    NULL, cudaMemcpyDeviceToHost);
-}
-
-static int
-copy_ram_to_cuda_async(void *src_interface, unsigned src_node,
-		       void *dst_interface, unsigned dst_node,
-		       cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node,
-				    dst_interface, dst_node,
-				    stream, cudaMemcpyHostToDevice);
-}
-
-static int
-copy_cuda_to_ram_async(void *src_interface, unsigned src_node,
-		       void *dst_interface, unsigned dst_node,
-		       cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node,
-				    dst_interface, dst_node,
-				    stream, cudaMemcpyDeviceToHost);
-}
-
-static int
-copy_cuda_to_cuda(void *src_interface, unsigned src_node,
-		  void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node,
-				    dst_interface, dst_node,
-				    NULL, cudaMemcpyDeviceToDevice);
-}
-
-#ifdef NO_STRIDE
-static int
-copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
-			void *dst_interface, unsigned dst_node,
-			cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node,
-				    dst_interface, dst_node,
-				    stream, cudaMemcpyDeviceToDevice);
-}
-#endif /* !NO_STRIDE */
-#endif /* !STARPU_USE_CUDA */
-
-#ifdef STARPU_USE_OPENCL
-static int
-copy_opencl_common(void *src_interface, unsigned src_node,
-			 void *dst_interface, unsigned dst_node,
-			 cl_event *event)
-{
 	int ret = 0;
-	size_t size = 0;
-	struct starpu_coo_interface *src_coo, *dst_coo;
 
 	src_coo = (struct starpu_coo_interface *) src_interface;
 	dst_coo = (struct starpu_coo_interface *) dst_interface;
 
-
 	size = src_coo->n_values * sizeof(src_coo->columns[0]);
-	ret = starpu_opencl_copy_async_sync(
-		(uintptr_t) src_coo->columns,
-		src_node,
-		0,
-		(uintptr_t) dst_coo->columns,
-		dst_node,
-		0,
-		size,
-		NULL);
+	if (starpu_interface_copy(
+		(uintptr_t) src_coo->columns, 0, src_node,
+		(uintptr_t) dst_coo->columns, 0, dst_node,
+		size, async_data))
+		ret = -EAGAIN;
 
 	/* sizeof(src_coo->columns[0]) == sizeof(src_coo->rows[0]) */
-	ret = starpu_opencl_copy_async_sync(
-		(uintptr_t) src_coo->rows,
-		src_node,
-		0,
-		(uintptr_t) dst_coo->rows,
-		dst_node,
-		0,
-		size,
-		NULL);
+	if (starpu_interface_copy(
+		(uintptr_t) src_coo->rows, 0, src_node,
+		(uintptr_t) dst_coo->rows, 0, dst_node,
+		size, async_data))
+		ret = -EAGAIN;
 
 	size = src_coo->n_values * src_coo->elemsize;
-	ret = starpu_opencl_copy_async_sync(
-		src_coo->values,
-		src_node,
-		0,
-		(uintptr_t) dst_coo->values,
-		dst_node,
-		0,
-		size,
-		event);
+	if (starpu_interface_copy(
+		src_coo->values, 0, src_node,
+		dst_coo->values, 0, dst_node,
+		size, async_data))
+		ret = -EAGAIN;
 
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node,
 		src_coo->n_values *
@@ -211,83 +57,9 @@ copy_opencl_common(void *src_interface, unsigned src_node,
 	return ret;
 }
 
-static int
-copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
-			 void *dst_interface, unsigned dst_node,
-			 cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int
-copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
-			 void *dst_interface, unsigned dst_node,
-			 cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int
-copy_opencl_to_opencl_async(void *src_interface, unsigned src_node,
-			 void *dst_interface, unsigned dst_node,
-			 cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int
-copy_ram_to_opencl(void *src_interface, unsigned src_node,
-		   void *dst_interface, unsigned dst_node)
-{
-	return copy_ram_to_opencl_async(src_interface, src_node,
-					dst_interface, dst_node,
-					NULL);
-}
-static int
-copy_opencl_to_ram(void *src_interface, unsigned src_node,
-		   void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_to_ram_async(src_interface, src_node,
-					dst_interface, dst_node,
-					NULL);
-}
-static int
-copy_opencl_to_opencl(void *src_interface, unsigned src_node,
-		   void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_to_opencl_async(src_interface, src_node,
-					dst_interface, dst_node,
-					NULL);
-}
-#endif /* !STARPU_USE_OPENCL */
-
-static struct starpu_data_copy_methods coo_copy_data_methods =
+static const struct starpu_data_copy_methods coo_copy_data_methods =
 {
-	.ram_to_ram          = copy_ram_to_ram,
-#ifdef STARPU_USE_CUDA
-	.ram_to_cuda         = copy_ram_to_cuda,
-	.cuda_to_ram         = copy_cuda_to_ram,
-	.ram_to_cuda_async   = copy_ram_to_cuda_async,
-	.cuda_to_ram_async   = copy_cuda_to_ram_async,
-	.cuda_to_cuda        = copy_cuda_to_cuda,
-#ifdef NO_STRIDE
-	.cuda_to_cuda_async  = copy_cuda_to_cuda_async,
-#endif
-#else
-#ifdef STARPU_SIMGRID
-#ifdef NO_STRIDE
-	/* Enable GPU-GPU transfers in simgrid */
-	.cuda_to_cuda_async = 1,
-#endif
-#endif
-#endif /* !STARPU_USE_CUDA */
-#ifdef STARPU_USE_OPENCL
-	.ram_to_opencl       = copy_ram_to_opencl,
-	.opencl_to_ram       = copy_opencl_to_ram,
-	.opencl_to_opencl    = copy_opencl_to_opencl,
-	.ram_to_opencl_async = copy_ram_to_opencl_async,
-	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
-#endif /* !STARPU_USE_OPENCL */
+	.any_to_any          = copy_any_to_any,
 };
 
 static void

+ 13 - 220
src/datawizard/interfaces/csr_interface.c

@@ -28,42 +28,11 @@
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-#ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
-#endif
-#ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-#endif
-
-static struct starpu_data_copy_methods csr_copy_data_methods_s =
+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
+
+static const struct starpu_data_copy_methods csr_copy_data_methods_s =
 {
-	.ram_to_ram = copy_ram_to_ram,
-#ifdef STARPU_USE_CUDA
-	.ram_to_cuda = copy_ram_to_cuda,
-	.cuda_to_ram = copy_cuda_to_ram,
-	.cuda_to_cuda = copy_cuda_to_cuda,
-	.ram_to_cuda_async = copy_ram_to_cuda_async,
-	.cuda_to_ram_async = copy_cuda_to_ram_async,
-	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
-#else
-#ifdef STARPU_SIMGRID
-	/* Enable GPU-GPU transfers in simgrid */
-	.cuda_to_cuda_async = 1,
-#endif
-#endif
-#ifdef STARPU_USE_OPENCL
-	.ram_to_opencl = copy_ram_to_opencl,
-	.opencl_to_ram = copy_opencl_to_ram,
-	.opencl_to_opencl = copy_opencl_to_opencl,
-#endif
+	.any_to_any = copy_any_to_any,
 };
 
 static void register_csr_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
@@ -293,188 +262,8 @@ static void free_csr_buffer_on_node(void *data_interface, unsigned node)
 	starpu_free_buffer_on_node(node, (uintptr_t) csr_interface->rowptr, (nrow+1)*sizeof(uint32_t));
 }
 
-#ifdef STARPU_USE_CUDA
-static int copy_cuda_async_sync(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind, cudaStream_t stream)
-{
-	struct starpu_csr_interface *src_csr = src_interface;
-	struct starpu_csr_interface *dst_csr = dst_interface;
-
-	uint32_t nnz = src_csr->nnz;
-	uint32_t nrow = src_csr->nrow;
-	size_t elemsize = src_csr->elemsize;
-
-	cudaStream_t sstream = stream;
-	int ret;
-
-	ret = starpu_cuda_copy_async_sync((void *)src_csr->nzval, src_node, (void *)dst_csr->nzval, dst_node, nnz*elemsize, sstream, kind);
-	if (ret == 0) sstream = NULL;
-
-	ret = starpu_cuda_copy_async_sync((void *)src_csr->colind, src_node, (void *)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), sstream, kind);
-	if (ret == 0) sstream = NULL;
-
-	ret = starpu_cuda_copy_async_sync((void *)src_csr->rowptr, src_node, (void *)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), sstream, kind);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
-	return ret;
-}
-
-static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-				void *dst_interface STARPU_ATTRIBUTE_UNUSED, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream STARPU_ATTRIBUTE_UNUSED)
-{
-#ifdef HAVE_CUDA_MEMCPY_PEER
-	struct starpu_csr_interface *src_csr = src_interface;
-	struct starpu_csr_interface *dst_csr = dst_interface;
-
-	uint32_t nnz = src_csr->nnz;
-	uint32_t nrow = src_csr->nrow;
-	size_t elemsize = src_csr->elemsize;
-
-	cudaError_t cures;
-
-	int src_dev = _starpu_memory_node_get_devid(src_node);
-	int dst_dev = _starpu_memory_node_get_devid(dst_node);
-
-	int synchronous_fallback = 0;
-
-	_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-	cures = cudaMemcpyPeerAsync((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize, stream);
-	if (cures)
-	{
-		synchronous_fallback = 1;
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-
-	if (!synchronous_fallback)
-	{
-		cures = cudaMemcpyPeerAsync((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t), stream);
-	}
-
-	if (synchronous_fallback || cures != cudaSuccess)
-	{
-		synchronous_fallback = 1;
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-
-	if (!synchronous_fallback)
-	{
-		cures = cudaMemcpyPeerAsync((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t), stream);
-	}
-
-	if (synchronous_fallback || cures != cudaSuccess)
-	{
-		synchronous_fallback = 1;
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-
-	if (synchronous_fallback)
-	{
-		_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
-		return 0;
-	}
-	else
-	{
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		return -EAGAIN;
-	}
-#else
-	/* Illegal without Peer tranfers */
-	STARPU_ABORT();
-	return 0;
-#endif
-}
-
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, NULL);
-}
-
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, NULL);
-}
-
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, NULL);
-}
-
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
-}
-
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
-}
-
-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	if (src_node == dst_node)
-		return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, stream);
-	else
-		return copy_cuda_peer_async(src_interface, src_node, dst_interface, dst_node, stream);
-}
-
-#endif // STARPU_USE_CUDA
-
-#ifdef STARPU_USE_OPENCL
-static int copy_opencl_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	struct starpu_csr_interface *src_csr = src_interface;
-	struct starpu_csr_interface *dst_csr = dst_interface;
-
-	uint32_t nnz = src_csr->nnz;
-	uint32_t nrow = src_csr->nrow;
-	size_t elemsize = src_csr->elemsize;
-
-        int err;
-
-	err = starpu_opencl_copy_async_sync(src_csr->nzval, src_node, 0, dst_csr->nzval, dst_node, 0, nnz*elemsize, NULL);
-	if (STARPU_UNLIKELY(err))
-                STARPU_OPENCL_REPORT_ERROR(err);
-
-	err = starpu_opencl_copy_async_sync((uintptr_t)src_csr->colind, src_node, 0, (uintptr_t)dst_csr->colind, dst_node, 0, nnz*sizeof(uint32_t), NULL);
-        if (STARPU_UNLIKELY(err))
-                STARPU_OPENCL_REPORT_ERROR(err);
-
-	err = starpu_opencl_copy_async_sync((uintptr_t)src_csr->rowptr, src_node, 0, (uintptr_t)dst_csr->rowptr, dst_node, 0, (nrow+1)*sizeof(uint32_t), NULL);
-	if (STARPU_UNLIKELY(err))
-                STARPU_OPENCL_REPORT_ERROR(err);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
-
-	return 0;
-}
-
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
-}
-
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
-}
-
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
-}
-
-#endif // STARPU_USE_OPENCL
-
 /* as not all platform easily have a BLAS lib installed ... */
-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
 {
 	struct starpu_csr_interface *src_csr = (struct starpu_csr_interface *) src_interface;
 	struct starpu_csr_interface *dst_csr = (struct starpu_csr_interface *) dst_interface;
@@ -482,14 +271,18 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 	uint32_t nnz = src_csr->nnz;
 	uint32_t nrow = src_csr->nrow;
 	size_t elemsize = src_csr->elemsize;
+	int ret = 0;
 
-	memcpy((void *)dst_csr->nzval, (void *)src_csr->nzval, nnz*elemsize);
+	if (starpu_interface_copy(src_csr->nzval, 0, src_node, dst_csr->nzval, 0, dst_node, nnz*elemsize, async_data))
+		ret = -EAGAIN;
 
-	memcpy((void *)dst_csr->colind, (void *)src_csr->colind, nnz*sizeof(uint32_t));
+	if (starpu_interface_copy((uintptr_t)src_csr->colind, 0, src_node, (uintptr_t)dst_csr->colind, 0, dst_node, nnz*sizeof(uint32_t), async_data))
+		ret = -EAGAIN;
 
-	memcpy((void *)dst_csr->rowptr, (void *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t));
+	if (starpu_interface_copy((uintptr_t)src_csr->rowptr, 0, src_node, (uintptr_t)dst_csr->rowptr, 0, dst_node, (nrow+1)*sizeof(uint32_t), async_data))
+		ret = -EAGAIN;
 
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
-	return 0;
+	return ret;
 }

+ 2 - 36
src/datawizard/interfaces/data_interface.c

@@ -290,41 +290,6 @@ void starpu_data_register(starpu_data_handle_t *handleptr, unsigned home_node,
 	STARPU_ASSERT(handleptr);
 	*handleptr = handle;
 
-	int asynchronous_copy_disabled = starpu_asynchronous_copy_disabled();
-	if (STARPU_UNLIKELY(asynchronous_copy_disabled))
-	{
-#ifdef STARPU_USE_CUDA
-		ops->copy_methods->ram_to_cuda_async = NULL;
-		ops->copy_methods->cuda_to_ram_async = NULL;
-		ops->copy_methods->cuda_to_cuda_async = NULL;
-#endif
-#ifdef STARPU_USE_OPENCL
-		ops->copy_methods->ram_to_opencl_async = NULL;
-		ops->copy_methods->opencl_to_ram_async = NULL;
-		ops->copy_methods->opencl_to_opencl_async = NULL;
-#endif
-	}
-
-#ifdef STARPU_USE_CUDA
-	int asynchronous_cuda_copy_disabled = starpu_asynchronous_cuda_copy_disabled();
-	if (STARPU_UNLIKELY(asynchronous_cuda_copy_disabled))
-	{
-		ops->copy_methods->ram_to_cuda_async = NULL;
-		ops->copy_methods->cuda_to_ram_async = NULL;
-		ops->copy_methods->cuda_to_cuda_async = NULL;
-	}
-#endif
-
-#ifdef STARPU_USE_OPENCL
-	int asynchronous_opencl_copy_disabled = starpu_asynchronous_opencl_copy_disabled();
-	if (STARPU_UNLIKELY(asynchronous_opencl_copy_disabled))
-	{
-		ops->copy_methods->ram_to_opencl_async = NULL;
-		ops->copy_methods->opencl_to_ram_async = NULL;
-		ops->copy_methods->opencl_to_opencl_async = NULL;
-	}
-#endif
-
 	/* fill the interface fields with the appropriate method */
 	STARPU_ASSERT(ops->register_data_handle);
 	ops->register_data_handle(handle, home_node, data_interface);
@@ -618,7 +583,8 @@ void starpu_data_unregister_no_coherency(starpu_data_handle_t handle)
 	_starpu_data_unregister(handle, 0);
 }
 
-void starpu_data_unregister_submit(starpu_data_handle_t handle) {
+void starpu_data_unregister_submit(starpu_data_handle_t handle)
+{
 	_starpu_spin_lock(&handle->header_lock);
 	STARPU_ASSERT_MSG(!handle->lazy_unregister, "data must not be unregistered twice");
 	handle->lazy_unregister = 1;

+ 3 - 3
src/datawizard/interfaces/matrix_interface.c

@@ -48,7 +48,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 #endif
 
-static struct starpu_data_copy_methods matrix_copy_data_methods_s =
+static const struct starpu_data_copy_methods matrix_copy_data_methods_s =
 {
 	.ram_to_ram = copy_ram_to_ram,
 #ifdef STARPU_USE_CUDA
@@ -516,8 +516,8 @@ static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_
 
 	STARPU_ASSERT_MSG((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx), "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
 
-	ret = starpu_opencl_copy_async_sync(src_matrix->dev_handle, src_node, src_matrix->offset,
-					    dst_matrix->dev_handle, dst_node, dst_matrix->offset,
+	ret = starpu_opencl_copy_async_sync(src_matrix->dev_handle, src_matrix->offset, src_node,
+					    dst_matrix->dev_handle, dst_matrix->offset, dst_node,
 					    src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
 					    event);
 

+ 1 - 1
src/datawizard/interfaces/multiformat_interface.c

@@ -42,7 +42,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cl_event *event);
 #endif
 
-static struct starpu_data_copy_methods multiformat_copy_data_methods_s =
+static const struct starpu_data_copy_methods multiformat_copy_data_methods_s =
 {
 	.ram_to_ram = copy_ram_to_ram,
 #ifdef STARPU_USE_CUDA

+ 0 - 0
src/datawizard/interfaces/variable_interface.c


Некоторые файлы не были показаны из-за большого количества измененных файлов